@dniku
Created February 12, 2015 16:13
Using gpu device 0: GRID K520
error: XDG_RUNTIME_DIR not set in the environment.
error: XDG_RUNTIME_DIR not set in the environment.
(train.py:2434): Gdk-CRITICAL **: gdk_cursor_new_for_display: assertion 'GDK_IS_DISPLAY (display)' failed
/home/ubuntu/pylearn2/pylearn2/utils/image.py:16: UserWarning: Unable to import matplotlib. Some features unavailable. Original exception: constructor returned NULL
"Original exception: " + str(matplotlib_exception))
(train.py:2434): Gdk-CRITICAL **: gdk_cursor_new_for_display: assertion 'GDK_IS_DISPLAY (display)' failed
/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/__init__.py:66: UserWarning: You are using probably a too old Theano version. That will cause compilation crash. If so, update Theano.
"You are using probably a too old Theano version. That"
Input shape: (28, 28)
Detector space: (21, 21)
/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <cuda_runtime.h>
#include <nvmatrix_kernels.cuh>

__global__ void kTile(const float* src, float* tgt, const uint srcWidth, const uint srcHeight, const uint tgtWidth, const uint tgtHeight) {
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    const int numThreads = blockDim.x * gridDim.x;
    // const unsigned int numEls = tgtWidth * tgtHeight;
    for (uint i = idx; i < tgtWidth * tgtHeight; i += numThreads) {
        const uint y = i / tgtWidth;
        const uint x = i % tgtWidth;
        const uint srcY = y % srcHeight;
        const uint srcX = x % srcWidth;
        tgt[i] = src[srcY * srcWidth + srcX];
    }
}
43
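/*
 * Host-side usage sketch (illustration only, not part of the original file):
 * tiles a 2x3 source into a 4x6 target by launching kTile directly.  The
 * launch configuration here is arbitrary; NVMatrix::tile() below uses
 * NUM_TILE_BLOCKS / NUM_TILE_THREADS_PER_BLOCK instead.
 */
#if 0
void tileExample() {
    const uint srcW = 3, srcH = 2, tgtW = 6, tgtH = 4;
    const float hostSrc[6] = {1, 2, 3, 4, 5, 6};
    float *devSrc, *devTgt;
    cudaMalloc((void**) &devSrc, sizeof(hostSrc));
    cudaMalloc((void**) &devTgt, tgtW * tgtH * sizeof(float));
    cudaMemcpy(devSrc, hostSrc, sizeof(hostSrc), cudaMemcpyHostToDevice);
    // One block of 128 threads covers all 24 outputs; the grid-stride
    // loop in kTile makes surplus threads exit immediately.
    kTile<<<1, 128>>>(devSrc, devTgt, srcW, srcH, tgtW, tgtH);
    cudaDeviceSynchronize();
}
#endif
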
__global__ void kDotProduct_r(float* a, float* b, float* target, const uint numCols, const uint numElements) {
    __shared__ float shmem[DP_BLOCKSIZE];

    uint eidx = DP_BLOCKSIZE * blockIdx.x + threadIdx.x;
    shmem[threadIdx.x] = 0;
    if (eidx < numCols) {
        for (; eidx < numElements; eidx += numCols) {
            shmem[threadIdx.x] += a[eidx] * b[eidx];
        }
    }
    __syncthreads();
    // Standard shared-memory tree reduction; the halving sequence starting
    // at 256 implies DP_BLOCKSIZE == 512.
    if (threadIdx.x < 256) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 256];
    }
    __syncthreads();
    if (threadIdx.x < 128) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 128];
    }
    __syncthreads();
    if (threadIdx.x < 64) {
        shmem[threadIdx.x] += shmem[threadIdx.x + 64];
    }
    __syncthreads();
    if (threadIdx.x < 32) {
        // Within a single warp no __syncthreads() is needed, but the pointer
        // must be volatile so the compiler does not cache shared-memory reads.
        volatile float* mysh = &shmem[threadIdx.x];
        *mysh += mysh[32];
        *mysh += mysh[16];
        *mysh += mysh[8];
        *mysh += mysh[4];
        *mysh += mysh[2];
        *mysh += mysh[1];
        if (threadIdx.x == 0) {
            target[blockIdx.x] = *mysh;
        }
    }
}

__global__ void kSetupCurand(curandState *state, unsigned long long seed) {
    const uint tidx = NUM_RND_THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;
    /* Each thread gets same seed, a different sequence number,
       no offset */
    curand_init(seed, tidx, 0, &state[tidx]);
}


/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _NVMATRIX_EXPORT
#define _NVMATRIX_EXPORT
#endif

#include <set>
#include <vector>
#include <assert.h>
#include <cublas_v2.h>
#include <cutil_inline.h>
#include <stdlib.h>
#include <stdio.h>
#include <fstream>
#include <iostream>
#include <algorithm>
#include <typeinfo>
#include <nvmatrix.cuh>
#include <nvmatrix_operators.cuh>
#include <map>

using namespace std;

/*
 * Device random number generator pointers.
 */
//map<int,curandGenerator_t> NVMatrix::rndGen;
map<int,curandState*> NVMatrix::rndDevStates;
pthread_mutex_t* NVMatrix::_rndMutex = makeMutex();

pthread_mutex_t* NVMatrix::makeMutex() {
    pthread_mutex_t* m = (pthread_mutex_t*) malloc(sizeof(pthread_mutex_t));
    pthread_mutex_init(m, NULL);
    return m;
}

NVMatrix::NVMatrix(const CudaNdarray * view,
        int numRows, int numCols, const char * msg)
{
    if (!CudaNdarray_is_c_contiguous(view))
    {
        printf("Non contiguous input: %s\n", msg);
        printf("Strides: "); // label fixed: the values printed below are strides
        for (int i = 0; i < view->nd; i++)
            printf("%d ", CudaNdarray_HOST_STRIDES(view)[i]);
        printf("\n");
        assert(false);
    }

    // Check that view actually contains numRows * numCols elements
    const int * dims = CudaNdarray_HOST_DIMS(view);
    int total = 1;
    for (int i = 0; i < view->nd; i++)
    {
        total *= dims[i];
    }
    if (total != numRows * numCols)
    {
        fprintf(stderr, "NVMatrix asked to make a view of a CudaNdarray with %d elements", total);
        fprintf(stderr, " but told to arrange these in a %d x %d rectangle (of total size %d).\n",
                numRows, numCols, numRows * numCols);
        fprintf(stderr, "CudaNdarray dims: ");
        for (int i = 0; i < view->nd; i++)
            fprintf(stderr, "%d ", dims[i]);
        fprintf(stderr, "\n");
        assert(false);
    }

    // Make the view
    _numRows = numRows;
    _numCols = numCols;
    _numElements = numRows * numCols;
    _ownsData = false;
    _isTrans = false;
    _devData = view->devdata;
    _stride = getLeadingDim();
}

void NVMatrix::_init(int numRows, int numCols, int stride, bool isTrans) {
    _numRows = numRows;
    _numCols = numCols;
    _numElements = numRows * numCols;
    _ownsData = true;

    _isTrans = isTrans;
    _devData = NULL;
    if (_numElements > 0) {
        cudaError_t err = cudaMalloc((void**) &_devData,
                                     _numElements * sizeof(float));
        if (cudaSuccess != err) {
            fprintf(stderr, "!!!! device memory allocation error\n");
            exit(EXIT_FAILURE);
        }
    }
    _stride = stride < 0 ? getLeadingDim() : stride;
}

NVMatrix::NVMatrix() {
    _init(0, 0, -1, false);
}

NVMatrix::NVMatrix(bool isTrans) {
    _init(0, 0, -1, isTrans);
}

NVMatrix::NVMatrix(int numRows, int numCols, bool isTrans) {
    _init(numRows, numCols, -1, isTrans);
}

/*
NVMatrix::NVMatrix(const Matrix& like, bool copy) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
    if (copy) {
        copyFromHost(like);
    }
}
*/

NVMatrix::NVMatrix(const NVMatrix& like, bool copy) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
    if (copy) {
        like.copy(*this);
    }
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
 */
NVMatrix::NVMatrix(const NVMatrix& like) {
    _init(like.getNumRows(), like.getNumCols(), -1, like.isTrans());
}

/*
 * Initializes NVMatrix with same dimensions as given matrix but
 * does not copy any data.
NVMatrix::NVMatrix(const Matrix& like) {
    _init(like.getNumRows(), like.getNumCols(), -1, false);
}
*/

NVMatrix::NVMatrix(float* devData, int numRows, int numCols, int stride, bool isTrans) :
    _numRows(numRows),
    _numCols(numCols),
    _numElements(numRows*numCols),
    _ownsData(false),
    _devData(devData),
    _isTrans(isTrans) {
    _stride = stride < 0 ? getLeadingDim() : stride;
}

NVMatrix::~NVMatrix() {
    if (_ownsData && _numElements > 0) {
        // This line was modified by Ian Goodfellow to use device_free
        // so that theano may keep track of device memory usage
        int status = device_free(_devData);
        if (status != 0) {
            fprintf(stderr, "!!!! memory free error\n");
            exit(EXIT_FAILURE);
        }
    }
}

/*
void NVMatrix::copyFromHost(const Matrix& hostMatrix, bool resizeDeviceMatrix) {
    if (resizeDeviceMatrix) {
        resize(hostMatrix);
    }
    copyFromHost(hostMatrix);
}

void NVMatrix::copyFromHost(const Matrix& hostMatrix) {
//    assert(getStride() == getLeadingDim());
    assert(isSameDims(hostMatrix));
    setTrans(hostMatrix.isTrans());

    if (getNumElements() > 0) {
        cublasStatus status = cublasSetMatrix(hostMatrix.getLeadingDim(), hostMatrix.getFollowingDim(), sizeof(float),
                                              hostMatrix.getData(), hostMatrix.getLeadingDim(), _devData, _stride);
        if (status != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "!!!! device access error (write)\n");
            exit(EXIT_FAILURE);
        }
    }
}

void NVMatrix::copyToHost(Matrix& hostMatrix) const {
//    assert(getStride() == getLeadingDim());
    assert(isSameDims(hostMatrix));
    hostMatrix.setTrans(_isTrans);
    if (getNumElements() > 0) {
//        printf("rows: %d, cols: %d, stride: %d\n", getNumRows(), getNumCols(), getStride());
        cublasStatus status = cublasGetMatrix(getLeadingDim(), getFollowingDim(), sizeof(float),
                                              _devData, getStride(), hostMatrix.getData(), hostMatrix.getLeadingDim());
        if (status != CUBLAS_STATUS_SUCCESS) {
            fprintf(stderr, "!!!! device access error (read)\n");
            exit(EXIT_FAILURE);
        }
    }
}

void NVMatrix::copyToHost(Matrix& hostMatrix, bool resizeTarget) const {
    if (resizeTarget) {
        hostMatrix.resize(_numRows, _numCols);
    }
    copyToHost(hostMatrix);
}
*/

void NVMatrix::copy(NVMatrix& dest) const {
    dest.resize(*this);
    copy(dest, 0, -1, 0, -1, 0, 0);
}

NVMatrix& NVMatrix::copy() const {
    NVMatrix* c = new NVMatrix();
    copy(*c);
    return *c;
}

void NVMatrix::rightMult(const NVMatrix &b, float scaleAB, NVMatrix &target) const {
    assert(isContiguous() && b.isContiguous() && target.isContiguous());
//    assert(&target != &b);
    assert(_numCols == b.getNumRows());
    if (&target != this) {
        target.resize(_numRows, b.getNumCols());
        target.setTrans(true);
    }
    assert(target.getNumRows() == _numRows);
    assert(target.getNumCols() == b.getNumCols());
    if (_numRows % 64 != 0 || _numCols % 64 != 0 || b.getNumCols() % 64 != 0) {
        WARN("Matrix dimensions not divisible by 64 -- cublasSgemm performance may suffer.");
    }
    cublasStatus_t err;
    float zero = 0;
    err = cublasSgemm(handle, getTransOp(), b.getTransOp(),
                      _numRows, b.getNumCols(), _numCols,
                      &scaleAB, _devData, getLeadingDim(), b.getDevData(),
                      b.getLeadingDim(),
                      &zero, target.getDevData(), getNumRows());
    checkCublasError(err, "cublasSgemm failed");
//    cudaThreadSynchronize();
}

void NVMatrix::rightMult(const NVMatrix &b, float scaleAB) {
    rightMult(b, scaleAB, *this);
}

void NVMatrix::rightMult(const NVMatrix &b, NVMatrix& target) const {
    rightMult(b, 1, target);
}
364
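/*
 * Usage sketch (illustration only, not part of the original file): computes
 * C = 2 * A * B for two 128x128 matrices.  Assumes the cuBLAS handle used by
 * rightMult and the curand state (for randomizeUniform) were set up elsewhere.
 */
#if 0
NVMatrix A(128, 128, true), B(128, 128, true), C;
A.randomizeUniform();
B.randomizeUniform();
A.rightMult(B, 2.0f, C); // C is resized to 128x128 and marked column-major
#endif
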
/*
 * This will only work if this matrix is in column-major order! In other words,
 * if isTrans() returns true.
 */
void NVMatrix::addProduct(const NVMatrix& a, const NVMatrix &b, float scaleThis, float scaleAB) {
    if (scaleThis == 0) {
        a.rightMult(b, scaleAB, *this);
        return;
    }
    assert(isContiguous());
    assert(a.getNumCols() == b.getNumRows());
    assert(this->getNumRows() == a.getNumRows());
    assert(this->getNumCols() == b.getNumCols());
    assert(_isTrans);
    if (a.getNumRows() % 64 != 0 || a.getNumCols() % 64 != 0 || b.getNumCols() % 64 != 0) {
        WARN("Matrix dimensions not divisible by 64 -- cublasSgemm performance may suffer.");
    }
    cublasStatus_t err;
    err = cublasSgemm(handle, a.getTransOp(), b.getTransOp(),
                      a.getNumRows(), b.getNumCols(), a.getNumCols(),
                      &scaleAB, a.getDevData(), a.getLeadingDim(),
                      b.getDevData(), b.getLeadingDim(),
                      &scaleThis, _devData, getLeadingDim());
    checkCublasError(err, "cublasSgemm failed");
//    cudaThreadSynchronize();
}

void NVMatrix::addProduct(const NVMatrix& a, const NVMatrix &b) {
    addProduct(a, b, 1, 1);
}

template <class Randomizer>
void NVMatrix::_unaryRandomize(NVMatrix& target, Randomizer rnd) {
    assert(isRndInitialized());
    assert(isContiguous() && target.isContiguous());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kUnaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    cutilCheckMsg("kUnaryRandomize: Kernel execution failed");
}

template <class Randomizer>
void NVMatrix::_binaryRandomize(NVMatrix& data2, NVMatrix& target, Randomizer rnd) {
    assert(isRndInitialized());
    assert(isContiguous() && data2.isContiguous() && target.isContiguous());
    assert(isSameDims(data2));
    assert(isTrans() == data2.isTrans());
    if (!isSameDims(target)) {
        target.resize(*this);
    }
    assert(isTrans() == target.isTrans());
    kBinaryRandomize<<<NUM_RND_BLOCKS,NUM_RND_THREADS_PER_BLOCK>>>(getDevData(), data2.getDevData(), target.getDevData(), getCurandState(), getNumElements(), rnd);
    cutilCheckMsg("kBinaryRandomize: Kernel execution failed");
}

/* Function removed by Ian Goodfellow.
   We do not need this function in theano / pylearn2 and it uses cudaMalloc directly.
   If you need to enable it, modify it to use device_malloc instead.
   Otherwise, theano will not be able to keep track of how much memory is used on
   the device.
void NVMatrix::initRandom(unsigned long long seed) {
    assert(!isRndInitialized());
    pthread_mutex_lock(_rndMutex);
    int d = getDeviceID();
    rndDevStates[d] = NULL;
    CUDA_CALL(cudaMalloc((void **)&rndDevStates[d], NUM_RND_STREAMS * sizeof(curandState)));
    pthread_mutex_unlock(_rndMutex);
    kSetupCurand<<<NUM_RND_BLOCKS, NUM_RND_THREADS_PER_BLOCK>>>(getCurandState(), 1 + seed*2); // so there's no chance it'll be correlated with the other one
    cutilCheckMsg("initRandom: Kernel execution failed");
}

void NVMatrix::initRandom() {
    NVMatrix::initRandom(time(0));
}
*/

curandState* NVMatrix::getCurandState() {
    pthread_mutex_lock(_rndMutex);
    int d = getDeviceID();
    assert(rndDevStates.count(d) != 0);
    curandState* r = rndDevStates[d];
    pthread_mutex_unlock(_rndMutex);
    return r;
}

int NVMatrix::getDeviceID() {
    int d;
    cudaGetDevice(&d);
    return d;
}

bool NVMatrix::isRndInitialized() {
    pthread_mutex_lock(_rndMutex);
    bool b = rndDevStates.count(getDeviceID()) != 0;
    pthread_mutex_unlock(_rndMutex);
    return b;
}

/* Function removed by Ian Goodfellow due to not needing
   it and it using cudaFree instead of device_free
void NVMatrix::destroyRandom() {
    assert(isRndInitialized());
    int d = getDeviceID();

    pthread_mutex_lock(_rndMutex);
    CUDA_CALL(cudaFree(rndDevStates[d]));
    rndDevStates.erase(d);
    pthread_mutex_unlock(_rndMutex);
} */

void NVMatrix::binarizeProbs() {
    binarizeProbs(*this);
}

void NVMatrix::binarizeProbs(NVMatrix& target) {
    _unaryRandomize(target, BinarizeUnaryRandomizer());
}

void NVMatrix::randomizeUniform() {
    assert(isContiguous());
    assert(isRndInitialized());
//    CURAND_CALL(curandGenerateUniform(rndGen, _devData, getNumElements()));
    _unaryRandomize(*this, UniformUnaryRandomizer());
}

void NVMatrix::randomizeGaussian() {
    randomizeGaussian(1);
}

void NVMatrix::randomizeGaussian(float stdev) {
    randomizeGaussian(0, stdev);
}

void NVMatrix::randomizeGaussian(float mean, float stdev) {
    assert(isContiguous());
    assert(isRndInitialized());
//    CURAND_CALL(curandGenerateNormal(rndGen, _devData, getNumElements(), mean, stdev));
    _unaryRandomize(*this, GaussianUnaryRandomizer(mean, stdev));
}

/*
 * Kind of a hack since we don't actually need the contents of this matrix for it,
 * so we don't really need a binary randomizer.
 */
void NVMatrix::randomizeGaussian(NVMatrix& stdevs) {
    _binaryRandomize(stdevs, *this, GaussianBinaryRandomizer());
}

void NVMatrix::addGaussianNoise() {
    addGaussianNoise(1);
}

void NVMatrix::addGaussianNoise(float stdev) {
    addGaussianNoise(stdev, *this);
}

void NVMatrix::addGaussianNoise(float stdev, NVMatrix& target) {
    _unaryRandomize(target, AddGaussianUnaryRandomizer(stdev));
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var) {
    addGaussianNoise(stdevs, var, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs) {
    addGaussianNoise(stdevs, false, *this);
}

void NVMatrix::addGaussianNoise(NVMatrix& stdevs, bool var, NVMatrix& target) {
    if (var) {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<true>());
    } else {
        _binaryRandomize(stdevs, target, AddGaussianBinaryRandomizer<false>());
    }
}

void NVMatrix::biggerThan(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::BiggerThan(), b, target);
}

void NVMatrix::biggerThan(NVMatrix& b) {
    biggerThan(b, *this);
}

void NVMatrix::equals(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Equals(), b, target);
}

void NVMatrix::equals(NVMatrix& m) {
    equals(m, *this);
}

void NVMatrix::biggerThanVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::BiggerThan(), vec, target);
}

void NVMatrix::biggerThanVector(NVMatrix& vec) {
    biggerThanVector(vec, *this);
}

void NVMatrix::_checkBounds(int startRow, int endRow, int startCol, int endCol) const {
    assert(startRow >= 0 && startRow < _numRows);
    assert(endRow > startRow && endRow <= _numRows);
    assert(startCol >= 0 && startCol < _numCols);
    assert(endCol > startCol && endCol <= _numCols);
}

/*
 * The only place where stride is supported for now!
 * Will ALWAYS return a view of the original data, sometimes non-contiguous.
 */
NVMatrix& NVMatrix::slice(int startRow, int endRow, int startCol, int endCol) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);
    if (!isTrans()) {
        return *new NVMatrix(this->_devData + startRow * _stride + startCol, endRow - startRow, endCol - startCol, _stride, false);
    }
    return *new NVMatrix(this->_devData + startCol * _stride + startRow, endRow - startRow, endCol - startCol, _stride, true);
}

/* this will NEVER return a view */
void NVMatrix::slice(int startRow, int endRow, int startCol, int endCol, NVMatrix& target) const {
    endRow = endRow < 0 ? this->_numRows : endRow;
    endCol = endCol < 0 ? this->_numCols : endCol;
    _checkBounds(startRow, endRow, startCol, endCol);

    int sliceRows = endRow - startRow, sliceCols = endCol - startCol;
    if (target.getNumRows() != sliceRows || target.getNumCols() != sliceCols) {
        target.resize(sliceRows, sliceCols);
    }
    this->copy(target, startRow, endRow, startCol, endCol, 0, 0);
}

NVMatrix& NVMatrix::sliceRows(int startRow, int endRow) const {
    return slice(startRow, endRow, 0, -1);
}

void NVMatrix::sliceRows(int startRow, int endRow, NVMatrix& target) const {
    slice(startRow, endRow, 0, -1, target);
}

NVMatrix& NVMatrix::sliceCols(int startCol, int endCol) const {
    return slice(0, -1, startCol, endCol);
}

void NVMatrix::sliceCols(int startCol, int endCol, NVMatrix& target) const {
    slice(0, -1, startCol, endCol, target);
}

/*
 * Guaranteed to not change the data if the number of elements doesn't change.
 * So you can use this to "reshape" a matrix.
 */
bool NVMatrix::resize(int numRows, int numCols) {
    bool reallocated = false;
    if (numRows != _numRows || numCols != _numCols) {
        // this assertion was removed by Ian Goodfellow because it seems to come too early
        // assert(_ownsData);
        if (_numElements != numRows * numCols) {
            assert(_ownsData); // assert moved here by Ian Goodfellow
            if (_numElements > 0) { // free old memory
                // This line was modified by Ian Goodfellow to use device_free so theano may track device memory usage accurately
                int status = device_free(_devData);
                if (status != 0) {
                    fprintf(stderr, "!!!! memory free error: %X\n", status);
                    exit(EXIT_FAILURE);
                }
            }
            if (numRows * numCols > 0) { // allocate new memory
                cudaError_t status = cudaMalloc((void**) &_devData,
                                                numCols * numRows * sizeof(float));
                if (status != cudaSuccess) {
                    fprintf(stderr, "!!!! device memory allocation error\n");
                    exit(EXIT_FAILURE);
                }
            } else {
                _devData = NULL;
            }
            reallocated = true;
        }
        _numRows = numRows;
        _numCols = numCols;
        _numElements = numRows * numCols;
        _stride = getLeadingDim();
    }
    return reallocated;
}
656
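/*
 * Illustration only (not part of the original file): resize() keeps the
 * existing buffer when the element count is unchanged, so it doubles as a
 * cheap reshape; changing the element count frees and reallocates.
 */
#if 0
NVMatrix m(4, 8, false);
m.resize(8, 4);   // 32 elements either way: no reallocation, returns false
m.resize(16, 16); // element count changes: buffer reallocated, returns true
#endif
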
bool NVMatrix::resize(const NVMatrix& like) {
    setTrans(like.isTrans());
    return resize(like.getNumRows(), like.getNumCols());
}

/*
bool NVMatrix::resize(const Matrix& like) {
    setTrans(like.isTrans());
    return resize(like.getNumRows(), like.getNumCols());
}
*/

void NVMatrix::reshape(int numRows, int numCols) {
    assert(isContiguous());
    assert(_numElements == numRows*numCols);
    _numRows = numRows;
    _numCols = numCols;
    _stride = getLeadingDim();
}

NVMatrix& NVMatrix::reshaped(int numRows, int numCols) {
    assert(isContiguous());
    assert(_numElements == numRows*numCols);
    return *new NVMatrix(_devData, numRows, numCols, -1, _isTrans);
}

void NVMatrix::copy(NVMatrix &dest, int srcStartRow, int srcEndRow,
                    int srcStartCol, int srcEndCol,
                    int destStartRow, int destStartCol) const {
    srcEndRow = srcEndRow < 0 ? _numRows : srcEndRow;
    srcEndCol = srcEndCol < 0 ? _numCols : srcEndCol;
    NVMatrix* srcSlice = &slice(srcStartRow, srcEndRow, srcStartCol, srcEndCol);
    NVMatrix* destSlice = &dest.slice(destStartRow, destStartRow + srcEndRow - srcStartRow, destStartCol, destStartCol + srcEndCol - srcStartCol);
    srcSlice->apply(NVMatrixOps::Identity(), *destSlice);
    delete srcSlice;
    delete destSlice;
}

NVMatrix& NVMatrix::getTranspose() {
    return *new NVMatrix(_devData, _numCols, _numRows, _stride, !_isTrans);
}

void NVMatrix::transpose(NVMatrix& target) {
    flipTrans(target);
    target.setTrans(!target.isTrans());
    target.reshape(target.getNumCols(), target.getNumRows());
}

void NVMatrix::transpose() {
    int tmp = _numCols;
    _numCols = _numRows;
    _numRows = tmp;
    _isTrans = !_isTrans;
}

bool NVMatrix::transpose(bool trans) {
    bool oldTrans = _isTrans;
    if (oldTrans != trans) {
        transpose();
    }
    return oldTrans;
}

/*
 * Flips the ordering of the matrix from row-major to column-major and vice versa.
 * This creates temporary storage -- not a cheap operation.
 *
 * This is not equivalent to a "hard transpose". The resultant matrix still has
 * the same dimensions, its layout in memory just changes.
 */
NVMatrix& NVMatrix::flipTrans() {
    NVMatrix* meTrans = new NVMatrix(*this);
    flipTrans(*meTrans);
    return *meTrans;
}

void NVMatrix::flipTrans(NVMatrix& target) {
    assert(&target != this);
    target.resize(_numRows, _numCols);
    target.setTrans(!isTrans());
    apply(NVMatrixOps::Identity(), target);
}
740
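/*
 * Illustration only (not part of the original file): transpose() is O(1) --
 * it just swaps the dimensions and flips the _isTrans flag -- while
 * flipTrans() allocates a new buffer and physically rewrites the data in
 * the opposite storage order, keeping the logical contents the same.
 */
#if 0
NVMatrix m(2, 3, false);      // row-major 2x3
m.transpose();                // now reports 3x2; no data movement
NVMatrix& f = m.flipTrans();  // same logical contents, opposite storage order
delete &f;                    // flipTrans() heap-allocates its result
#endif
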
void NVMatrix::squaredDiff(NVMatrix& b) {
    squaredDiff(b, *this);
}

void NVMatrix::squaredDiff(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::SquaredDiff(), b, target);
}

void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB, NVMatrix& target) {
    if (scaleA == 0) {
        b.scale(scaleB, target);
        return;
    }
    if (scaleA == 1 && scaleB == 1) { // slight optimization
        applyBinary(NVMatrixBinaryOps::Add(), b, target);
    } else {
        applyBinary(NVMatrixBinaryOps::WeightedAdd(scaleA, scaleB), b, target);
    }
}

void NVMatrix::add(NVMatrix& b, float scaleB, NVMatrix& target) {
    add(b, 1, scaleB, target);
}

void NVMatrix::add(NVMatrix& b, NVMatrix& target) {
    add(b, 1, target);
}

void NVMatrix::add(NVMatrix& b, float scaleB) {
    add(b, scaleB, *this);
}

void NVMatrix::add(NVMatrix& b, float scaleA, float scaleB) {
    add(b, scaleA, scaleB, *this);
}

void NVMatrix::add(NVMatrix& b) {
    add(b, 1, *this);
}

void NVMatrix::subtract(NVMatrix& b, NVMatrix& target) {
    add(b, -1, target);
}

void NVMatrix::subtract(NVMatrix& b) {
    add(b, -1);
}

void NVMatrix::eltwiseMult(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Multiply(), b, target);
}

void NVMatrix::eltwiseMult(NVMatrix& b) {
    eltwiseMult(b, *this);
}

void NVMatrix::eltwiseDivide(NVMatrix& b, NVMatrix& target) {
    applyBinary(NVMatrixBinaryOps::Divide(), b, target);
}

void NVMatrix::eltwiseDivide(NVMatrix& b) {
    eltwiseDivide(b, *this);
}

void NVMatrix::tile(int timesY, int timesX, NVMatrix& target) {
    assert(isContiguous() && target.isContiguous());
    assert(timesX > 0 && timesY > 0);
    target.resize(_numRows*timesY, _numCols*timesX);
    target.setTrans(_isTrans);
    if (!isTrans()) {
        kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK>>>(_devData, target._devData, _numCols, _numRows, target._numCols, target._numRows);
    } else {
        kTile<<<NUM_TILE_BLOCKS,NUM_TILE_THREADS_PER_BLOCK>>>(_devData, target._devData, _numRows, _numCols, target._numRows, target._numCols);
    }
    cutilCheckMsg("Kernel execution failed");
}

void NVMatrix::addVector(NVMatrix& vec, float scaleVec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::WeightedAdd(1, scaleVec), vec, target);
}

void NVMatrix::addVector(NVMatrix& vec) {
    addVector(vec, 1, *this);
}

void NVMatrix::addVector(NVMatrix& vec, float scaleVec) {
    addVector(vec, scaleVec, *this);
}

void NVMatrix::addVector(NVMatrix& vec, NVMatrix& target) {
    addVector(vec, 1, target);
}

void NVMatrix::equalsVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Equals(), vec, target);
}

void NVMatrix::equalsVector(NVMatrix& vec) {
    equalsVector(vec, *this);
}

void NVMatrix::eltwiseMultByVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Multiply(), vec, target);
}

void NVMatrix::eltwiseMultByVector(NVMatrix& vec) {
    eltwiseMultByVector(vec, *this);
}

void NVMatrix::eltwiseDivideByVector(NVMatrix& vec) {
    eltwiseDivideByVector(vec, *this);
}

void NVMatrix::eltwiseDivideByVector(NVMatrix& vec, NVMatrix& target) {
    applyBinaryV(NVMatrixBinaryOps::Divide(), vec, target);
}

/*
 * num threads per block is ignored when summing rows (axis=1) because
 * it has to be a power of 2.
 *
 * TODO: this is a mess, fix it. it works pretty fast but it's too ugly.
 * TODO: this function is _really_ bad for very long aggregations of few columns.
 */
template<class Agg, class BinaryOp>
void NVMatrix::_aggregate(int axis, NVMatrix& target, Agg agg, BinaryOp op) {
    assert(axis == 0 || axis == 1);
    assert(isContiguous() && target.isContiguous());
    assert(&target != this);
    int width = _isTrans ? _numRows : _numCols;
    int height = _isTrans ? _numCols : _numRows;

    target.setTrans(_isTrans);
    assert(width > 0);
    assert(height > 0);
    if (axis == 0 && !_isTrans || axis == 1 && _isTrans) { // col sum
        target.resize(!_isTrans ? 1 : _numRows, !_isTrans ? _numCols : 1);
        int numBlocks = DIVUP(width, NUM_SUM_COLS_THREADS_PER_BLOCK);
        assert(numBlocks * NUM_SUM_COLS_THREADS_PER_BLOCK >= width);
        assert(numBlocks < NUM_BLOCKS_MAX);
        kDumbAggCols<Agg, BinaryOp><<<numBlocks,NUM_SUM_COLS_THREADS_PER_BLOCK>>>(_devData, target._devData, width, height, agg, op);
        cutilCheckMsg("kDumbAggCols: Kernel execution failed");
    } else { // row sum
        target.resize(_isTrans ? 1 : _numRows, _isTrans ? _numCols : 1);
        if (width > 1) {
            if (height >= 16384) { // linear aggregation
                int numBlocksX = 1;
                int numBlocksY = DIVUP(height, AGG_SHORT_ROWS_THREADS_Y*AGG_SHORT_ROWS_LOOPS_Y);
                int numThreadsX = width <= 4 ? 4 : width <= 8 ? 8 : width <= 12 ? 12 : width <= 16 ? 16 : AGG_SHORT_ROWS_THREADS_X;
                int numThreadsY = AGG_SHORT_ROWS_THREADS_Y;
                while (numBlocksY > NUM_BLOCKS_MAX) {
                    numBlocksY = DIVUP(numBlocksY,2);
                    numBlocksX *= 2;
                }
                dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
                if (width <= 16) {
                    if (width <= 4) {
                        kAggShortRows<Agg, BinaryOp, 1, 4><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else if (width <= 8) {
                        kAggShortRows<Agg, BinaryOp, 1, 8><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else if (width <= 12) {
                        kAggShortRows<Agg, BinaryOp, 1, 12><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    } else {
                        kAggShortRows<Agg, BinaryOp, 1, 16><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                    }
                } else if (width <= 32) {
                    kAggShortRows<Agg, BinaryOp, 2, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else if (width <= 48) {
                    kAggShortRows<Agg, BinaryOp, 3, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else if (width <= 64) {
                    kAggShortRows<Agg, BinaryOp, 4, AGG_SHORT_ROWS_THREADS_X><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                } else {
                    kAggShortRows2<Agg, BinaryOp><<<grid, threads>>>(_devData, target._devData, width, height, agg, op);
                }
            } else {
                if (width >= 512) {
                    dim3 threads(AWR_NUM_THREADS);
                    dim3 blocks(1, std::min(1024, height));
                    kAggRows_wholerow_nosync<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);
//                    dim3 threads(AWR_NUM_THREADS);
//                    dim3 blocks(1, std::min(1024, height));
//                    kAggRows_wholerow<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);

                } else {
//                    dim3 threads(AWR_NUM_THREADS);
//                    dim3 blocks(1, std::min(1024, height));
//                    kAggRows_wholerow<<<blocks, threads>>>(_devData, target._devData, width, height, agg, op);
                    NVMatrix *prevSum = this;
                    while (prevSum->getLeadingDim() > 1) {
                        int numThreadsX = width <= 64 ? 32 : (width <= 128 ? 64 : (width <= 256 ? 128 : (width <= 512 ? 256 : 512)));
                        int numThreadsY = 1;
                        int numBlocksX = DIVUP(width, 2*numThreadsX);
                        int numBlocksY = std::min(height, NUM_BLOCKS_MAX);
                        NVMatrix *nvSumAccum = target.getFollowingDim() == height && target.getLeadingDim() == numBlocksX ? &target : new NVMatrix(height, numBlocksX, false);

                        dim3 grid(numBlocksX, numBlocksY), threads(numThreadsX, numThreadsY);
                        assert(numBlocksX <= NUM_BLOCKS_MAX);
                        assert(numBlocksY <= NUM_BLOCKS_MAX);

                        if (width <= 64) {
                            kAggRows<Agg, BinaryOp, 32><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                           width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 128) {
                            kAggRows<Agg, BinaryOp, 64><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                           width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 256) {
                            kAggRows<Agg, BinaryOp, 128><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else if (width <= 512) {
                            kAggRows<Agg, BinaryOp, 256><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        } else {
                            kAggRows<Agg, BinaryOp, 512><<<grid, threads>>>(prevSum->_devData, nvSumAccum->_devData,
                                                                            width, height, nvSumAccum->getLeadingDim(), agg, op);
                        }
                        cutilCheckMsg("agg rows: Kernel execution failed");
                        cudaThreadSynchronize();
                        width = numBlocksX; // only true in reduction agg, but for linear agg this doesn't matter anyway

                        if (prevSum != this) {
                            delete prevSum;
                        }
                        prevSum = nvSumAccum;
                    }
                }
            }
        } else {
            copy(target);
        }
    }
}

void NVMatrix::inRangeInc(float lower, float upper) {
    inRangeInc(lower, upper, *this);
}

void NVMatrix::inRangeInc(float lower, float upper, NVMatrix& target) {
    apply(NVMatrixOps::InRange<false>(lower, upper), target);
}

void NVMatrix::inRangeExc(float lower, float upper) {
    inRangeExc(lower, upper, *this);
}

void NVMatrix::inRangeExc(float lower, float upper, NVMatrix& target) {
    apply(NVMatrixOps::InRange<true>(lower, upper), target);
}

void NVMatrix::biggerThanScalar(float scalar) {
    biggerThanScalar(scalar, *this);
}

void NVMatrix::biggerThanScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::BiggerThanScalar(scalar), target);
}

void NVMatrix::smallerThanScalar(float scalar) {
    smallerThanScalar(scalar, *this);
}

void NVMatrix::smallerThanScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::SmallerThanScalar(scalar), target);
}

void NVMatrix::addScalar(float scaleThis, float scalar, NVMatrix& target) {
    apply(NVMatrixOps::WeightedAddScalar(scaleThis, scalar), target);
}

void NVMatrix::addScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::AddScalar(scalar), target);
}

void NVMatrix::addScalar(float scalar) {
    addScalar(scalar, *this);
}

void NVMatrix::minWithScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::MinWithScalar(scalar), target);
}

void NVMatrix::minWithScalar(float scalar) {
    minWithScalar(scalar, *this);
}

void NVMatrix::maxWithScalar(float scalar, NVMatrix& target) {
    apply(NVMatrixOps::MaxWithScalar(scalar), target);
}

void NVMatrix::maxWithScalar(float scalar) {
    maxWithScalar(scalar, *this);
}

void NVMatrix::pow(float p, NVMatrix& target) {
    apply(NVMatrixOps::Pow(p), target);
}

void NVMatrix::pow(float p) {
    pow(p, *this);
}

void NVMatrix::scale(float _scale) {
    scale(_scale, *this);
}

void NVMatrix::scale(float _scale, NVMatrix& target) {
    if (_scale != 1 || &target != this) { // optimize away scale by 1
        apply(NVMatrixOps::MultByScalar(_scale), target);
    }
}

template<class Agg, class BinaryOp>
NVMatrix& NVMatrix::_aggregate(int axis, Agg agg, BinaryOp op) {
    NVMatrix *sumVec = new NVMatrix();
    _aggregate<Agg, BinaryOp>(axis, *sumVec, agg, op);
    return *sumVec;
}

void NVMatrix::max(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
}

void NVMatrix::addSum(NVMatrix& a, int axis, float scaleThis, float scaleSum) {
    if (scaleThis != 0) {
        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::WeightedAdd(scaleThis, scaleSum));
    } else {
        a._aggregate(axis, *this, NVMatrixAggs::Sum(), NVMatrixBinaryOps::SecondScaled(scaleSum));
    }
}

void NVMatrix::sum(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
}

/*
void NVMatrix::min(int axis, NVMatrix& target) {
    _aggregate(axis, target, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::max(int axis) {
    return _aggregate(axis, NVMatrixAggs::Max(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::sum(int axis) {
    return _aggregate(axis, NVMatrixAggs::Sum(), NVMatrixBinaryOps::Second());
}

NVMatrix& NVMatrix::min(int axis) {
    return _aggregate(axis, NVMatrixAggs::Min(), NVMatrixBinaryOps::Second());
}
*/
1088
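/*
 * Usage sketch (illustration only, not part of the original file): axis 0
 * aggregates down the rows (one result per column), axis 1 across the columns.
 */
#if 0
NVMatrix m(100, 10, false), colSums, rowMax;
m.sum(0, colSums); // colSums becomes 1x10
m.max(1, rowMax);  // rowMax becomes 100x1
#endif
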
void NVMatrix::_sum_setParams(int n, dim3* blocks, dim3* threads, int* numCols) {
    // Split the n elements into roughly n / log2(n) columns, so each thread
    // accumulates about log2(n) values before the block-level reduction.
    int logn = int(ceil(log(double(n)) / log(2.)));
    *numCols = DIVUP(n, logn);
    int numThreads = *numCols;
    *blocks = dim3(DIVUP(numThreads, DP_BLOCKSIZE));
    *threads = dim3(DP_BLOCKSIZE);
}

/*
float NVMatrix::mean() {
    return sum() / getNumElements();
}

float NVMatrix::sum() {
    return _totalAgg(NVMatrixAggs::Sum());
}

float NVMatrix::max() {
    return _totalAgg(NVMatrixAggs::Max());
}

float NVMatrix::min() {
    return _totalAgg(NVMatrixAggs::Min());
}

template<class Agg>
float NVMatrix::_totalAgg(Agg agg) {
    assert(isContiguous());
    dim3 blocks, threads;
    int numCols;
    // Sum most of it on GPU
    NVMatrix* src = this;
    for (NVMatrix* target = NULL; src->getNumElements() > CPUSUM_MAX; src = target) {
        _sum_setParams(src->getNumElements(), &blocks, &threads, &numCols);
        target = new NVMatrix(1, blocks.x);
        kTotalAgg<<<blocks, threads>>>(src->getDevData(), target->getDevData(), numCols, src->getNumElements(), agg);
        cutilCheckMsg("kTotalAgg: Kernel execution failed");
        cudaThreadSynchronize(); // not really necessary?
        delete (src == this ? NULL : src);
    }

    Matrix srcCPU(src->getNumRows(), src->getNumCols());
    src->copyToHost(srcCPU);
    if (src->getNumElements() > 1) { // Sum remainder on CPU
        delete (src == this ? NULL : src);
        if (typeid(Agg) == typeid(NVMatrixAggs::Sum)) {
            return srcCPU.sum();
        } else if (typeid(Agg) == typeid(NVMatrixAggs::Max)) {
            return srcCPU.max();
        } else if (typeid(Agg) == typeid(NVMatrixAggs::Min)) {
            return srcCPU.min();
        } else {
            assert(false);
        }
    }
    return srcCPU(0,0);
}
*/

/*
 * Fast dot product only for matrices with same transposedness.
float NVMatrix::dotProduct(NVMatrix& b) {
    assert(isContiguous() && b.isContiguous());
    assert(isSameDims(b));
    assert(isTrans() == b.isTrans()); // see?
    dim3 blocks, threads;
    int numCols;
    _sum_setParams(getNumElements(), &blocks, &threads, &numCols);
    NVMatrix target(1, blocks.x);
    kDotProduct_r<<<blocks, threads>>>(getDevData(), b.getDevData(), target.getDevData(), numCols, getNumElements());
    cutilCheckMsg("kDotProduct: Kernel execution failed");
    cudaThreadSynchronize();
    return target.sum();
}

float NVMatrix::norm2() {
    return dotProduct(*this);
}

float NVMatrix::norm() {
    return sqrt(norm2());
}
*/

/*
void NVMatrix::print(int startRow, int rows, int startCol, int cols) const {
    cudaThreadSynchronize();
    Matrix hm = Matrix(_numRows, _numCols);
    copyToHost(hm);
    hm.print(startRow, rows, startCol, cols);
}

void NVMatrix::print(int rows, int cols) const {
    print(0, rows, 0, cols);
}
*/

void NVMatrix::printShape(const char* name) const {
    printf("%s: %dx%d\n", name, _numRows, _numCols);
}

/*
 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef _CONV_UTIL_EXPORT
#define _CONV_UTIL_EXPORT
#endif

#include <iostream>
#include <assert.h>
#include <nvmatrix_kernels.cuh>
#include <nvmatrix.cuh>
#include <conv_util.cuh>

using namespace std;

__device__ inline float square(const float a) {
    return a * a;
}

/*
 * blockIdx.y determines module in batches of B_Y
 * blockIdx.x determines filter in batches of B_X * filtersPerThread
 *
 * weights: (numModules, numColors, filterPixels, numFilters)
 * Not fully coalesced if B_X < 32, so use cache.
 */
template <int B_Y, int B_X, int filtersPerThread>
__global__ void kNormalizeLCWeights(float* weights, const uint numFilters, const int numModules, const uint weightsPerFilter, const float norm) {
    const uint moduleIdx = B_Y * blockIdx.y + threadIdx.y;
    const uint filterIdx = B_X * blockIdx.x + threadIdx.x;

    float prod[filtersPerThread];
    #pragma unroll
    for (uint i = 0; i < filtersPerThread; ++i) {
        prod[i] = 0;
    }
    if (moduleIdx < numModules) {
        weights += moduleIdx * weightsPerFilter * numFilters + filterIdx;
        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                prod[i] += square(weights[p * numFilters + i * B_X]);
            }
        }

        #pragma unroll
        for (uint i = 0; i < filtersPerThread; ++i) {
            prod[i] = sqrtf(prod[i]);
            prod[i] = prod[i] > norm ? __fdividef(norm, prod[i]) : 1.0f;
        }

        for (uint p = 0; p < weightsPerFilter; ++p) {
            #pragma unroll
            for (uint i = 0; i < filtersPerThread; ++i) {
                weights[p * numFilters + i * B_X] *= prod[i];
            }
        }
    }
}

/*
 * weights: (numModules, numColors, filterPixels, numFilters)
 */
void normalizeLocalWeights(NVMatrix& weights, int numModules, float norm) {
    int numFilters = weights.getNumCols();
    int weightsPerFilter = weights.getNumRows() / numModules;
    assert(numModules * weightsPerFilter == weights.getNumRows());

    assert(!weights.isTrans());
    assert(weights.isContiguous());
    assert(numFilters % 16 == 0);

    int bx = numFilters % 32 == 0 ? 32 : 16;
    int by = bx == 32 ? 4 : 8;

    int filtersPerThread = numFilters % 128 == 0 ? 4 : numFilters % 64 == 0 ? 2 : 1;
    dim3 blocks(numFilters / (bx * filtersPerThread), DIVUP(numModules, by));
    dim3 threads(bx, by);
    if (filtersPerThread == 4) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 4>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 4><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else if (filtersPerThread == 2) {
        cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 2>, cudaFuncCachePreferL1);
        kNormalizeLCWeights<4, 32, 2><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
    } else {
        if (numFilters % 32 == 0) {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<4, 32, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<4, 32, 1><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        } else {
            cudaFuncSetCacheConfig(kNormalizeLCWeights<8, 16, 1>, cudaFuncCachePreferL1);
            kNormalizeLCWeights<8, 16, 1><<<blocks, threads>>>(weights.getDevData(), numFilters, numModules, weightsPerFilter, norm);
        }
    }
}
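
/*
 * Usage sketch (illustration only, not part of the original file): caps the
 * L2 norm of every local filter at 1.  As documented above, weights must be
 * laid out as (numModules * weightsPerFilter) x numFilters.
 */
#if 0
normalizeLocalWeights(weights, numModules, 1.0f);
#endif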

/*
 * Block size 4x32
 * blockIdx.x determines img idx in batches of 32*imgsPerThread
 * blockIdx.y determines channel idx, pixel idx in batches of 4
 *
 * threadIdx.x determines case idx
 * threadIdx.y determines pixel idx
 *
 * imgs: (numChannels, imgPixels, numImages) with given imgStride
 * target: (numChannels, tgtPixels, numImages)
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kCrop(float* imgs, float* target, const uint numImages, const int imgStride,
                      const uint imgSize, const uint tgtSize, const uint startY, const uint startX) {
    const uint imgPixels = imgSize * imgSize;
    const uint tgtPixels = tgtSize * tgtSize;
    const uint caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const uint blockChanIdx = blockIdx.y / DIVUP(tgtPixels, 4);
    const uint tgtPixelIdx = 4*(blockIdx.y % DIVUP(tgtPixels, 4)) + threadIdx.y;
    const uint tgtPxY = tgtPixelIdx / tgtSize;
    const uint tgtPxX = tgtPixelIdx % tgtSize;
    const uint srcPixelIdx = (startY + tgtPxY) * imgSize + startX + tgtPxX;

    if (tgtPixelIdx < tgtPixels) {
        imgs += (blockChanIdx * imgPixels + srcPixelIdx) * imgStride + caseIdx;
        target += (blockChanIdx * tgtPixels + tgtPixelIdx) * numImages + caseIdx;

        #pragma unroll
        for (uint i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || (caseIdx + 32 * i < numImages)) {
                target[i * 32] = imgs[i * 32];
            }
        }
    }
}

/*
 * Block size 4x32
 * blockIdx.y determines pixel idx in batches of 4
 * blockIdx.x determines case idx in batches of 32*imgsPerThread
 * threadIdx.y determines pixel idx
 * threadIdx.x determines case idx
 *
 * imgs: (3, imgPixels, numImages) with given imgStride
 * target: (3, imgPixels, numImages)
 *
 * Each thread produces (y,u,v) values for a particular (r,g,b) pixel
 *
 * The RGB --> YUV transform is (http://en.wikipedia.org/wiki/YUV):
 *
 * [Y]   [ 0.2126    0.7152   0.0722 ][R]
 * [U] = [-0.09991  -0.33609  0.436  ][G]
 * [V]   [ 0.615    -0.55861 -0.05639][B]
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kRGBToYUV(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;

    if (pxIdx < imgPixels) {
        const int imgChannelStride = imgPixels * imgStride;
        const int tgtChannelStride = imgPixels * numImages;
        imgs += pxIdx * imgStride + caseIdx;
        target += pxIdx * numImages + caseIdx;

        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
                const float R = imgs[0 * imgChannelStride + i * 32];
                const float G = imgs[1 * imgChannelStride + i * 32];
                const float B = imgs[2 * imgChannelStride + i * 32];
                target[0 * tgtChannelStride + i * 32] = 0.2126f * R + 0.7152f * G + 0.0722f * B;    // Y
                target[1 * tgtChannelStride + i * 32] = -0.09991f * R + -0.33609f * G + 0.436f * B; // U
                target[2 * tgtChannelStride + i * 32] = 0.615f * R + -0.55861f * G + -0.05639f * B; // V
            }
        }
    }
}
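
/*
 * Host-side reference (illustration only, not part of the original file):
 * the same per-pixel transform as kRGBToYUV, handy when checking kernel
 * output against a known value.
 */
#if 0
void rgbToYuvReference(float R, float G, float B, float* yuv) {
    yuv[0] =  0.2126f  * R + 0.7152f  * G + 0.0722f  * B; // Y
    yuv[1] = -0.09991f * R - 0.33609f * G + 0.436f   * B; // U
    yuv[2] =  0.615f   * R - 0.55861f * G - 0.05639f * B; // V
}
#endif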

__device__ inline float labf(const float x) {
    if (x > 0.0088564517f) {
        return __powf(x, 0.3333f);
    }
    return 7.787037f * x + 0.13793103f;
}

/*
 * Block size 4x32
 * blockIdx.y determines pixel idx in batches of 4
 * blockIdx.x determines case idx in batches of 32*imgsPerThread
 * threadIdx.y determines pixel idx
 * threadIdx.x determines case idx
 *
 * imgs: (3, imgPixels, numImages) with given imgStride
 * target: (3, imgPixels, numImages)
 *
 * This proceeds in two steps.
 *
 * - First, RGB values are linearly transformed to XYZ as per
 *   http://en.wikipedia.org/wiki/CIE_XYZ_color_space
 * - Second, XYZ values are nonlinearly transformed to L*a*b* as per
 *   http://en.wikipedia.org/wiki/Lab_color_space#The_forward_transformation
 *
 * Each thread produces (L*,a*,b*) values for a particular (r,g,b) pixel
 *
 * The RGB --> XYZ transform is:
 *
 * [X]               [0.49     0.31    0.2    ][R]
 * [Y] = 5.6506753 * [0.17697  0.8124  0.01063][G]
 * [Z]               [0        0.01    0.99   ][B]
 *
 * NOTE: The input should be in the range 0-1. Don't do mean-subtraction beforehand.
 *
 * Then X_max, Y_max, Z_max = 5.6506753.
 *
 * The range of the L* values is [0, 100].
 * If the center flag is given, the range will be [-50, 50].
 */
template <int imgsPerThread, bool checkCaseBounds, bool center>
__global__ void kRGBToLAB(float* imgs, float* target, const int imgPixels, const int numImages, const int imgStride) {
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int pxIdx = blockIdx.y * 4 + threadIdx.y;

    if (pxIdx < imgPixels) {
        const int imgChannelStride = imgPixels * imgStride;
        const int tgtChannelStride = imgPixels * numImages;
        imgs += pxIdx * imgStride + caseIdx;
        target += pxIdx * numImages + caseIdx;

        #pragma unroll
        for (int i = 0; i < imgsPerThread; ++i) {
            if (!checkCaseBounds || caseIdx + i * 32 < numImages) {
                const float R = imgs[0 * imgChannelStride + i * 32];
                const float G = imgs[1 * imgChannelStride + i * 32];
                const float B = imgs[2 * imgChannelStride + i * 32];

                const float X = (0.49f * R + 0.31f * G + 0.2f * B);
                const float Y = (0.17697f * R + 0.8124f * G + 0.01063f * B);
                const float Z = (0.01f * G + 0.99f * B);

                const float labX = labf(X);
                const float labY = labf(Y);
                const float labZ = labf(Z);

                target[0 * tgtChannelStride + i * 32] = 116.0f * labY - 16.0f - (center ? 50.0f : 0); // L*
                target[1 * tgtChannelStride + i * 32] = 500.0f * (labX - labY);                       // a*
                target[2 * tgtChannelStride + i * 32] = 200.0f * (labY - labZ);                       // b*
            }
        }
    }
}

/*
 * Block size 16x32.
 * Each block produces a 4x4 chunk of the output image.
 * threadIdx.y determines pixel idx in 4x4 chunk.
 * threadIdx.x determines case idx.
 * blockIdx.x determines case idx in batches of 32*imgsPerThread.
 * blockIdx.y determines 4x4 chunk idx, channel idx.
 *
 * imgs: (numChannels, imgPixels, numImages) with given imgStride
 * target: (numChannels, tgtPixels, numImages)
 *
 * imgSize = scale * tgtSize (roughly)
 *
 * This is a rather naive kernel that relies on cache for speed. But all it's doing
 * is basic texture manipulation, which is very local in nature, so it should be ok.
 * Also, it will in practice be a tiny fraction of the runtime of a large convnet.
 *
 * So that is my justification for being lazy here.
 */
template <int imgsPerThread, bool checkCaseBounds>
__global__ void kResizeBilinear(float* imgs, float* target, const int imgSize, const int tgtSize,
                                const int numImages, const int imgStride, const float scale,
                                const float centerScale) {
    const int numChunksX = DIVUP(tgtSize, 4);
    const int numChunks = numChunksX * numChunksX;
    const int channelIdx = blockIdx.y / numChunks;
    const int chunkIdx = blockIdx.y % numChunks;
    const int chunkIdxX = chunkIdx % numChunksX;
    const int chunkIdxY = chunkIdx / numChunksX;
    const int caseIdx = blockIdx.x * 32 * imgsPerThread + threadIdx.x;
    const int imgPixels = imgSize * imgSize;
    const int tgtPixels = tgtSize * tgtSize;

    const int pxX = 4 * chunkIdxX + threadIdx.y % 4;
    const int pxY = 4 * chunkIdxY + threadIdx.y / 4;

    if (pxY < tgtSize && pxX < tgtSize) {
        const int pxIdx = pxY * tgtSize + pxX;

        imgs += channelIdx * imgPixels * imgStride + caseIdx;
        target += channelIdx * tgtPixels * numImages + pxIdx * numImages + caseIdx;

        // This will cause slight distortions at the edges when upsampling in some cases.
        // But I think that's not a big deal.
        const float srcPxX = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxX) * scale + centerScale));
        const float srcPxY = fmaxf(0.0f, fminf(__int2float_rn(imgSize) - 1.01f, __int2float_rn(pxY) * scale + centerScale));

        const float u = floorf(srcPxX + 1) - srcPxX;
        const float w = srcPxY - floorf(srcPxY);

        // Consider doing max(0, min(imgSize, x)) here
        const int srcPx0 = (__float2int_rd(srcPxY) * imgSize + __float2int_rd(srcPxX)); // top-left
        const int srcPx1 = srcPx0 + 1;       // top-right
        const int srcPx2 = srcPx0 + imgSize; // bottom-left
        const int srcPx3 = srcPx2 + 1;       // bottom-right

        #pragma unroll
        for (int c = 0; c < imgsPerThread; ++c) {
            if (!checkCaseBounds || caseIdx + c * 32 < numImages) {
                const float val0 = imgs[srcPx0 * imgStride + c * 32];
                const float val1 = imgs[srcPx1 * imgStride + c * 32];
                const float val2 = imgs[srcPx2 * imgStride + c * 32];
                const float val3 = imgs[srcPx3 * imgStride + c * 32];

                const float c0 = u * (val0 - val1) + val1;
                const float c1 = u * (val2 - val3) + val3;

                target[32 * c] = w * (c1 - c0) + c0;
            }
        }
    }
}
1533
1534 /*
1535 * Block size B_YxB_X.
1536 * B_X*imgsPerThread*blockIdx.x + threadIdx.x determines img idx
1537 * B_Y*blockIdx.y + threadIdx.y determines img row (col if !horiz), channel idx
1538 *
1539 * imgs: (numChannels, imgPixels, numImages) with given imgStride
1540 * filter: (1, 2*radius + 1)
1541 * target: (numChannels, imgPixels, numImages)
1542 *
1543 * target can be the same matrix as imgs.
1544 * radius must be one of 3, 5, 7, 9.
1545 *
1546 * Tried imgsPerThread, slower.
1547 */
1548 template<int B_Y, int B_X, int radius>
1549 __global__ void kGaussianBlur(float* imgs, float* filter, float* target, const int imgSize,
1550 const int numImages, const int imgStride,
1551 const bool horiz,
1552 const float scaleTargets, const float scaleOutputs) {
1553 __shared__ float shFilter[2*radius]; // needs filterWidth - 1 = 2*radius entries; see the loads and indexing below
1554
1555 const int imgPixels = imgSize * imgSize;
1556 const int ty = B_Y * blockIdx.y + threadIdx.y;
1557 const int channelIdx = ty / imgSize;
1558 const int rowIdx = ty % imgSize;
1559 const int imgIdx = B_X*blockIdx.x + threadIdx.x;
1560 const int filterWidth = 2*radius+1;
1561 // const int tidx = B_Y * threadIdx.y + threadIdx.x;
1562 if (horiz) {
1563 imgs += channelIdx * imgPixels * imgStride + rowIdx * imgSize * imgStride + imgIdx;
1564 target += channelIdx * imgPixels * numImages + rowIdx * imgSize * numImages + imgIdx;
1565 } else {
1566 imgs += channelIdx * imgPixels * imgStride + rowIdx * imgStride + imgIdx;
1567 target += channelIdx * imgPixels * numImages + rowIdx * numImages + imgIdx;
1568 }
1569 float outputs[filterWidth-1];
1570 #pragma unroll
1571 for (int r = 0; r < filterWidth-1; r++) {
1572 outputs[r] = 0;
1573 }
1574 if (threadIdx.x < filterWidth-1) {
1575 shFilter[threadIdx.x] = filter[threadIdx.x];
1576 }
1577 __syncthreads();
1578
1579 if (imgIdx < numImages) {
1580 // This writes radius*2 = filterWidth - 1 values to outputs
1581 #pragma unroll
1582 for (int col = 0; col < radius; col++) {
1583 float px = imgs[0];
1584 #pragma unroll
1585 for (int r = 0; r < radius + 1 + col; r++) {
1586 outputs[r] += px * shFilter[radius + col - r];
1587 }
1588 imgs += horiz ? imgStride : imgStride * imgSize;
1589 }
1590
1591 // Unfortunately this has to be at this level of granularity
1592 if (scaleTargets != 0) {
1593 for (int col = radius; col < imgSize ; col++) { // loop over img columns
1594 float px = imgs[0];
1595 target[0] = scaleTargets * target[0] + scaleOutputs * (outputs[0] + px * shFilter[0]);
1596
1597 #pragma unroll
1598 for (int r = 1; r < radius*2; r++) {
1599 outputs[r-1] = outputs[r] + px * shFilter[r];
1600 }
1601 outputs[filterWidth - 2] = px * shFilter[0];
1602
1603 imgs += horiz ? imgStride : imgStride * imgSize;
1604 target += horiz ? numImages : numImages * imgSize;
1605 }
1606
1607 #pragma unroll
1608 for (int r = 0; r < radius; r++) {
1609 float* t = &target[0];
1610 t[0] = scaleTargets * t[0] + scaleOutputs * outputs[r];
1611 target += horiz ? numImages : numImages * imgSize;
1612 }
1613 } else {
1614 for (int col = radius; col < imgSize ; col++) { // loop over img columns
1615 float px = imgs[0];
1616 target[0] = scaleOutputs * (outputs[0] + px * shFilter[0]);
1617 #pragma unroll
1618 for (int r = 1; r < radius*2; r++) {
1619 outputs[r-1] = outputs[r] + px * shFilter[r];
1620 }
1621 outputs[filterWidth - 2] = px * shFilter[0];
1622
1623 imgs += horiz ? imgStride : imgStride * imgSize;
1624 target += horiz ? numImages : numImages * imgSize;
1625 }
1626
1627 #pragma unroll
1628 for (int r = 0; r < radius; r++) {
1629 target[0] = scaleOutputs * outputs[r];
1630 target += horiz ? numImages : numImages * imgSize;
1631 }
1632 }
1633 }
1634 }
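/*
 * How the sliding scheme above works: filterWidth - 1 partial sums live in registers,
 * and only 2*radius filter taps are cached because the filter is assumed symmetric
 * (the last tap reuses shFilter[0]). Reading pixel i completes the output at i - radius
 * (the "+ px * shFilter[0]" term), shifts each remaining partial sum down one slot
 * while adding pixel i's tap to it, and starts the output at i + radius with
 * px * shFilter[0]. Border outputs simply omit the out-of-range taps.
 */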
1635
1636 /*
1637 * Block size B_YxB_X
1638 * blockIdx.x determines output.x, image idx in batches of B_X*imgsPerThread
1639 * blockIdx.y determines output.y, channel idx in batches of B_Y*chansPerThread
1640 *
1641 * So each block does one output for some number of images/channels.
1642 *
1643 * threadIdx.x determines img idx
1644 * threadIdx.y determines channel idx
1645 *
1646 * imgs: (numChannels, imgPixels, numImages)
1647 * target: (numChannels, numOutputs, numImages)
1648 *
1649 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
1650 * numChannels must be divisible by chansPerThread
1651 */
1652
1653 template<int B_Y, int B_X, int imgsPerThread, int chansPerThread, bool checkCaseBounds>
1654 __global__ void kBedOfNails(float* imgs, float* target, const int imgSize, const int numChannels,
1655 const int numImages, const int startX, const int strideX, const int outputsX,
1656 const bool reverse, const float scaleTargets, const float scaleOutput) {
1657 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
1658 const int numChanBlocks = DIVUP(numChannels, B_Y*chansPerThread);
1659 const int outputIdxX = blockIdx.x / numImgBlocks;
1660 const int outputIdxY = blockIdx.y / numChanBlocks;
1661 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
1662 const int blockChanIdx = (blockIdx.y % numChanBlocks) * B_Y * chansPerThread;
1663 const int myChanIdx = (blockChanIdx + threadIdx.y*chansPerThread);
1664 if (myChanIdx >= numChannels) {
1665 return;
1666 }
1667 // if (blockIdx.x != 0 || blockIdx.y != 0) {
1668 // return;
1669 // }
1670 const int outputIdx = outputIdxY * outputsX + outputIdxX;
1671 const int numOutputs = outputsX * outputsX;
1672 const int imgPixels = imgSize * imgSize;
1673
1674 const int startImgPxX = startX + outputIdxX * strideX;
1675 const int startImgPxY = startX + outputIdxY * strideX;
1676 const int imgIdx = blockImgIdx + threadIdx.x;
1677 const int imgPx = startImgPxY * imgSize + startImgPxX;
1678
1679 imgs += myChanIdx * imgPixels * numImages + imgPx * numImages + imgIdx;
1680 target += (myChanIdx * numOutputs + outputIdx) * numImages + imgIdx;
1681
1682 if (scaleTargets != 0) {
1683 if (!reverse) {
1684 #pragma unroll
1685 for (int i = 0; i < imgsPerThread; i++) {
1686 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1687 #pragma unroll
1688 for (int c = 0; c < chansPerThread; c++) {
1689 target[c * numOutputs * numImages + i * B_X] = scaleTargets * target[c * numOutputs * numImages + i * B_X] + scaleOutput * imgs[c * imgPixels * numImages + i * B_X];
1690 }
1691 }
1692 }
1693 } else {
1694 #pragma unroll
1695 for (int i = 0; i < imgsPerThread; i++) {
1696 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1697 #pragma unroll
1698 for (int c = 0; c < chansPerThread; c++) {
1699 imgs[c * imgPixels * numImages + i * B_X] = scaleTargets * imgs[c * imgPixels * numImages + i * B_X] + scaleOutput * target[c * numOutputs * numImages + i * B_X];
1700 }
1701 }
1702 }
1703 }
1704 } else {
1705 if (!reverse) {
1706 #pragma unroll
1707 for (int i = 0; i < imgsPerThread; i++) {
1708 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1709 #pragma unroll
1710 for (int c = 0; c < chansPerThread; c++) {
1711 target[c * numOutputs * numImages + i * B_X] = scaleOutput * imgs[c * imgPixels * numImages + i * B_X];
1712 }
1713 }
1714 }
1715 } else {
1716 #pragma unroll
1717 for (int i = 0; i < imgsPerThread; i++) {
1718 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
1719 #pragma unroll
1720 for (int c = 0; c < chansPerThread; c++) {
1721 imgs[c * imgPixels * numImages + i * B_X] = scaleOutput * target[c * numOutputs * numImages + i * B_X];
1722 }
1723 }
1724 }
1725 }
1726 }
1727
1728 }
1729
1730 /*
1731 * imgs: (numChannels, imgPixels, numImages)
1732 * target: (numChannels, outputs, numImages)
1733 */
1734 void _convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX, int strideX,
1735 bool reverse, float scaleTargets, float scaleOutput) {
1736 int numImages = reverse ? target.getNumCols() : images.getNumCols();
1737 int imgPixels = imgSize * imgSize;
1738
1739 assert(!images.isTrans());
1740 assert(!target.isTrans());
1741 assert(images.isContiguous());
1742 assert(target.isContiguous());
1743 assert(strideX > 1);
1744
1745 int outputsX = DIVUP(imgSize, strideX);
1746 int outputs = outputsX * outputsX;
1747 if (reverse) {
1748 assert(target.getNumRows() == numChannels * outputs);
1749 } else {
1750 assert(images.getNumRows() == numChannels * imgPixels);
1751 }
1752
1753 if (scaleTargets == 0) {
1754 if (reverse) {
1755 images.resize(numChannels * imgPixels, numImages);
1756 images.apply(NVMatrixOps::Zero());
1757 } else {
1758 target.resize(numChannels*outputs, numImages);
1759 }
1760 } else {
1761 if (reverse) {
1762 assert(images.getNumRows() == numChannels * outputs);
1763 assert(images.getNumCols() == numImages);
1764 } else {
1765 assert(target.getNumRows() == numChannels * outputs);
1766 assert(target.getNumCols() == numImages);
1767 }
1768 }
1769
1770
1771 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
1772 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
1773 int chansPerThread = numChannels % 8 == 0 ? 2 : 1;
1774 dim3 threads(32, 4);
1775 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numChannels, 4 * chansPerThread) * outputsX);
1776
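/*
 * Worked example of the config above: numImages = 100 gives imgsPerThread = 1
 * (100 is a multiple of neither 64 nor 128) and checkCaseBounds = true (100 % 32 != 0);
 * numImages = 256 gives imgsPerThread = 4 and checkCaseBounds = false.
 * numChannels = 16 gives chansPerThread = 2.
 */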
1777 if (imgsPerThread == 4) {
1778 if (chansPerThread == 1) {
1779 if (checkCaseBounds) {
1780 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, true>, cudaFuncCachePreferL1);
1781 kBedOfNails<4, 32, 4, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1782 imgSize, numChannels, numImages, startX, strideX, outputsX,
1783 reverse, scaleTargets, scaleOutput);
1784 } else {
1785 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 1, false>, cudaFuncCachePreferL1);
1786 kBedOfNails<4, 32, 4, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1787 imgSize, numChannels, numImages, startX, strideX, outputsX,
1788 reverse, scaleTargets, scaleOutput);
1789 }
1790 } else {
1791 if (checkCaseBounds) {
1792 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
1793 kBedOfNails<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1794 imgSize, numChannels, numImages, startX, strideX, outputsX,
1795 reverse, scaleTargets, scaleOutput);
1796 } else {
1797 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
1798 kBedOfNails<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1799 imgSize, numChannels, numImages, startX, strideX, outputsX,
1800 reverse, scaleTargets, scaleOutput);
1801 }
1802 }
1803 } else if (imgsPerThread == 2) {
1804 if (chansPerThread == 1) {
1805 if (checkCaseBounds) {
1806 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, true>, cudaFuncCachePreferL1);
1807 kBedOfNails<4, 32, 2, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1808 imgSize, numChannels, numImages, startX, strideX, outputsX,
1809 reverse, scaleTargets, scaleOutput);
1810 } else {
1811 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 1, false>, cudaFuncCachePreferL1);
1812 kBedOfNails<4, 32, 2, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1813 imgSize, numChannels, numImages, startX, strideX, outputsX,
1814 reverse, scaleTargets, scaleOutput);
1815 }
1816 } else {
1817 if (checkCaseBounds) {
1818 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, true>, cudaFuncCachePreferL1);
1819 kBedOfNails<4, 32, 2, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1820 imgSize, numChannels, numImages, startX, strideX, outputsX,
1821 reverse, scaleTargets, scaleOutput);
1822 } else {
1823 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 2, 2, false>, cudaFuncCachePreferL1);
1824 kBedOfNails<4, 32, 2, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1825 imgSize, numChannels, numImages, startX, strideX, outputsX,
1826 reverse, scaleTargets, scaleOutput);
1827 }
1828 }
1829 } else {
1830 if (chansPerThread == 1) {
1831 if (checkCaseBounds) {
1832 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, true>, cudaFuncCachePreferL1);
1833 kBedOfNails<4, 32, 1, 1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1834 imgSize, numChannels, numImages, startX, strideX, outputsX,
1835 reverse, scaleTargets, scaleOutput);
1836 } else {
1837 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 1, false>, cudaFuncCachePreferL1);
1838 kBedOfNails<4, 32, 1, 1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1839 imgSize, numChannels, numImages, startX, strideX, outputsX,
1840 reverse, scaleTargets, scaleOutput);
1841 }
1842 } else {
1843 if (checkCaseBounds) {
1844 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, true>, cudaFuncCachePreferL1);
1845 kBedOfNails<4, 32, 1, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1846 imgSize, numChannels, numImages, startX, strideX, outputsX,
1847 reverse, scaleTargets, scaleOutput);
1848 } else {
1849 cudaFuncSetCacheConfig(kBedOfNails<4, 32, 1, 2, false>, cudaFuncCachePreferL1);
1850 kBedOfNails<4, 32, 1, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
1851 imgSize, numChannels, numImages, startX, strideX, outputsX,
1852 reverse, scaleTargets, scaleOutput);
1853 }
1854 }
1855 }
1856 }
1857
1858 void convBedOfNails(NVMatrix& images, NVMatrix& target, int numChannels, int imgSize, int startX,
1859 int strideX, float scaleTargets, float scaleOutput) {
1860 _convBedOfNails(images, target, numChannels, imgSize, startX, strideX, false, scaleTargets, scaleOutput);
1861 }
1862
1863 void convBedOfNailsUndo(NVMatrix& actsGrad, NVMatrix& target, int numChannels, int imgSize,
1864 int startX, int strideX, float scaleTargets, float scaleOutput) {
1865
1866 _convBedOfNails(target, actsGrad, numChannels, imgSize, startX, strideX, true, scaleTargets, scaleOutput);
1867 }
1868
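/*
 * Usage sketch for the pair above, under the shape conventions documented in this file
 * (illustrative only; outGrads would come from the layer above):
 */
static void bedOfNailsExample(NVMatrix& imgs, NVMatrix& out, NVMatrix& outGrads, NVMatrix& inGrads,
                              int numChannels, int imgSize) {
    // Forward: keep every 2nd pixel starting at (0,0); out gets DIVUP(imgSize,2)^2 pixels per channel.
    convBedOfNails(imgs, out, numChannels, imgSize, 0, 2, 0, 1);
    // Backward: scatter gradients back to the retained pixel locations (all others become zero).
    convBedOfNailsUndo(outGrads, inGrads, numChannels, imgSize, 0, 2, 0, 1);
}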
1869
1870 /*
1871 * imgs: (numChannels, imgPixels, numImages) with given imgStride
1872 * filter: (1, 2*radius + 1)
1873 * target: (numChannels, imgPixels, numImages)
1874 */
1875 void convGaussianBlur(NVMatrix& images, NVMatrix& filter, NVMatrix& target, bool horiz, int numChannels,
1876 float scaleTargets, float scaleOutputs) {
1877 int numImages = images.getNumCols();
1878 int radius = filter.getNumCols() / 2;
1879 int imgPixels = images.getNumRows() / numChannels;
1880 int imgSize = int(sqrt((double)imgPixels));
1881
1882 assert(imgPixels == imgSize * imgSize);
1883 assert(radius >= 1 && radius <= 4);
1884 assert(imgSize >= 2 * radius + 1);
1885 assert(filter.getNumRows() == 1);
1886 assert(images.getNumRows() == numChannels * imgPixels);
1887 assert(!images.isTrans());
1888 assert(!filter.isTrans());
1889 assert(!target.isTrans());
1890 assert(target.isContiguous());
1891 if (scaleTargets == 0) {
1892 target.resize(images);
1893 } else {
1894 assert(target.isSameDims(images));
1895 }
1896
1897 dim3 threads(32, 4);
1898 dim3 blocks(DIVUP(numImages, threads.x), DIVUP(numChannels*imgSize, threads.y));
1899
1900 if (radius == 1) {
1901 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 1>, cudaFuncCachePreferL1);
1902 kGaussianBlur<4, 32, 1><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1903 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1904
1905 } else if (radius == 2) {
1906 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 2>, cudaFuncCachePreferL1);
1907 kGaussianBlur<4, 32, 2><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1908 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1909
1910 } else if (radius == 3) {
1911 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 3>, cudaFuncCachePreferL1);
1912 kGaussianBlur<4, 32, 3><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1913 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1914 } else if (radius == 4) {
1915 cudaFuncSetCacheConfig(kGaussianBlur<4, 32, 4>, cudaFuncCachePreferL1);
1916 kGaussianBlur<4, 32, 4><<<blocks, threads>>>(images.getDevData(), filter.getDevData(), target.getDevData(),
1917 imgSize, numImages, images.getStride(), horiz, scaleTargets, scaleOutputs);
1918 }
1919 }
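/*
 * Usage sketch: a full 2-D blur is two separable passes with a normalized 1-D Gaussian
 * of shape (1, 2*radius + 1). Illustrative only; filter construction is up to the caller.
 */
static void gaussianBlur2D(NVMatrix& images, NVMatrix& filter, NVMatrix& tmp, NVMatrix& target,
                           int numChannels) {
    convGaussianBlur(images, filter, tmp, true, numChannels, 0, 1);   // horizontal pass
    convGaussianBlur(tmp, filter, target, false, numChannels, 0, 1);  // vertical pass
}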
1920
1921 /*
1922 * Block size 1x128
1923 * blockIdx.x determines pixel.x, image idx in batches of 128*imgsPerThread
1924 * blockIdx.y determines pixel.y
1925 *
1926 * So each block does one output for some number of images and all the filters.
1927 *
1928 * threadIdx.x determines img idx
1929 *
1930 * imgs: (numFilters, imgPixels, numImages)
1931 * meanDiffs: (numFilters, imgPixels, numImages)
1932 * denoms: (numFilters, imgPixels, numImages) (out)
1933 * target: (numFilters, imgPixels, numImages) (out)
1934 *
1935 * numImages must be divisible by 128*imgsPerThread if checkCaseBounds is false
1936 * numFilters is a template parameter here; each block covers all of them
1937 */
1938
1939 template<int imgsPerThread, int numFilters, bool checkCaseBounds>
1940 __global__ void kCNorm_fewfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
1941 const int numImages, const int sizeX, const float addScale, const float powScale) {
1942
1943 const int imgPixels = imgSize * imgSize;
1944 const int numImgBlocks = DIVUP(numImages, 128*imgsPerThread);
1945 const int pxIdxX = blockIdx.x / numImgBlocks;
1946 const int pxIdxY = blockIdx.y;
1947 const int blockImgIdx = (blockIdx.x % numImgBlocks) * 128 * imgsPerThread;
1948
1949 const int pxIdx = pxIdxY * imgSize + pxIdxX;
1950
1951 const int startPxX = -sizeX/2 + pxIdxX;
1952 const int startPxY = -sizeX/2 + pxIdxY;
1953 const int imgIdx = blockImgIdx + threadIdx.x;
1954
1955 imgs += pxIdx * numImages + imgIdx;
1956 denoms += pxIdx * numImages + imgIdx;
1957 meanDiffs += imgIdx;
1958 target += pxIdx * numImages + imgIdx;
1959
1960 float prod[numFilters][imgsPerThread];
1961 #pragma unroll
1962 for (int i = 0; i < imgsPerThread; i++) {
1963 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1964 #pragma unroll
1965 for (int f = 0; f < numFilters; f++) {
1966 prod[f][i] = 0;
1967 }
1968 }
1969 }
1970 const int loopStartY = MAX(0, startPxY);
1971 const int loopStartX = MAX(0, startPxX);
1972 const int loopEndY = MIN(imgSize, startPxY + sizeX);
1973 const int loopEndX = MIN(imgSize, startPxX + sizeX);
1974
1975 for (int y = loopStartY; y < loopEndY; y++) {
1976 for (int x = loopStartX; x < loopEndX; x++) {
1977 const int imgPx = y * imgSize + x;
1978 #pragma unroll
1979 for (int i = 0; i < imgsPerThread; i++) {
1980 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1981 #pragma unroll
1982 for (int f = 0; f < numFilters; f++) {
1983 prod[f][i] += square(meanDiffs[(f * imgPixels + imgPx) * numImages + i * 128]);
1984 }
1985 }
1986 }
1987 }
1988 }
1989
1990 #pragma unroll
1991 for (int i = 0; i < imgsPerThread; i++) {
1992 if (!checkCaseBounds || imgIdx + i * 128 < numImages) {
1993 #pragma unroll
1994 for (int f = 0; f < numFilters; f++) {
1995 prod[f][i] = 1 + addScale * prod[f][i];
1996 denoms[f * imgPixels * numImages + i * 128] = prod[f][i];
1997 target[f * imgPixels * numImages + i * 128] = imgs[f * imgPixels * numImages + i * 128] * __powf(prod[f][i], -powScale);
1998 }
1999 }
2000 }
2001 }
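/*
 * In scalar form, the kCNorm family computes, for each filter f and pixel p,
 *
 *     denom(f,p)  = 1 + addScale * sum_{q in sizeX x sizeX window around p} meanDiffs(f,q)^2
 *     target(f,p) = imgs(f,p) * denom(f,p)^(-powScale)
 *
 * i.e. standard local contrast/response normalization within a feature map; the variants
 * (kCNorm_fewfilter, kCNorm_manyfilter, kCNorm2) differ only in how the work is split
 * across blocks and threads.
 */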
2002
2003 /*
2004 * Block size B_YxB_X
2005 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2006 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2007 *
2008 * So each block does one pixel for some number of images/filters.
2009 *
2010 * threadIdx.x determines img idx
2011 * threadIdx.y determines filter idx
2012 *
2013 * imgs: (numFilters, imgPixels, numImages)
2014 * means: (numFilters, imgPixels, numImages)
2015 * denoms: (numFilters, imgPixels, numImages) (out)
2016 * target: (numFilters, imgPixels, numImages) (out)
2017 *
2018 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2019 * numFilters must be divisible by B_Y*filtersPerThread
2020 */
2021 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2022 __global__ void kCNorm_manyfilter(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2023 const int numFilters, const int numImages, const int sizeX,
2024 const float addScale, const float powScale) {
2025 const int imgPixels = imgSize * imgSize;
2026 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2027 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2028 const int pxIdxX = blockIdx.x / numImgBlocks;
2029 const int pxIdxY = blockIdx.y / numFilterBlocks;
2030 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2031 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2032
2033 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2034
2035 const int startPxX = -sizeX/2 + pxIdxX;
2036 const int startPxY = -sizeX/2 + pxIdxY;
2037 const int imgIdx = blockImgIdx + threadIdx.x;
2038
2039 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2040 meanDiffs += (blockFilterIdx + threadIdx.y) * imgPixels * numImages + imgIdx;
2041 denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2042 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2043
2044 float prod[filtersPerThread][imgsPerThread];
2045 #pragma unroll
2046 for (int i = 0; i < imgsPerThread; i++) {
2047 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2048 #pragma unroll
2049 for (int f = 0; f < filtersPerThread; f++) {
2050 prod[f][i] = 0;
2051 }
2052 }
2053 }
2054
2055 const int loopStartY = MAX(0, startPxY);
2056 const int loopStartX = MAX(0, startPxX);
2057 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2058 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2059
2060 for (int y = loopStartY; y < loopEndY; y++) {
2061 for (int x = loopStartX; x < loopEndX; x++) {
2062 const int imgPx = y * imgSize + x;
2063 #pragma unroll
2064 for (int i = 0; i < imgsPerThread; i++) {
2065 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2066 #pragma unroll
2067 for (int f = 0; f < filtersPerThread; f++) {
2068 prod[f][i] += square(meanDiffs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]);
2069 }
2070 }
2071 }
2072 }
2073 }
2074
2075 #pragma unroll
2076 for (int i = 0; i < imgsPerThread; i++) {
2077 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2078 #pragma unroll
2079 for (int f = 0; f < filtersPerThread; f++) {
2080 prod[f][i] = 1 + addScale * prod[f][i];
2081 denoms[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2082 target[f * B_Y * imgPixels * numImages + i * B_X] = imgs[f * B_Y * imgPixels * numImages + i * B_X] * __powf(prod[f][i], -powScale);
2083 }
2084 }
2085 }
2086 }
2087
2088
2089 /*
2090 * Block size 16xB_X
2091 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
2092 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
2093 *
2094 * So each block does 4x4 region of pixels for some number of images/filters.
2095 *
2096 * threadIdx.x determines img idx
2097 * threadIdx.y determines pixel idx
2098 *
2099 * imgs: (numFilters, imgPixels, numImages)
2100 * means: (numFilters, imgPixels, numImages)
2101 * denoms: (numFilters, imgPixels, numImages) (out)
2102 * target: (numFilters, imgPixels, numImages) (out)
2103 *
2104 * B_X one of 8, 16, 32
2105 * imgsPerThread one of 1, 2, 4, 8, 16
2106 *
2107 * B_X*imgsPerThread MUST be divisible by 32.
2108 * Number of filters MUST be divisible by filtersPerThread.
2109 *
2110 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2111 * numFilters must be divisible by filtersPerThread
2112 *
2113 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
2114 * reading than writing here, and the reading is all coalesced, so it should be OK.
2115 */
2116 template<int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2117 __global__ void kCNorm2(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2118 const int numFilters, const int numImages, const int sizeX, const float addScale, const float powScale) {
2119 __shared__ float shDiffs[filtersPerThread][B_X*imgsPerThread];
2120 const int imgPixels = imgSize * imgSize;
2121 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2122 const int numFilterBlocks = numFilters/(filtersPerThread);
2123 const int blockPxX = 4*(blockIdx.x / numImgBlocks);
2124 const int blockPxY = 4*(blockIdx.y / numFilterBlocks);
2125 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2126 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
2127
2128 const int tidx = threadIdx.y * B_X + threadIdx.x;
2129 const int loadY = tidx / 32, loadX = tidx % 32;
2130
2131 const int startPxX = MAX(0, -sizeX/2 + blockPxX);
2132 const int startPxY = MAX(0, -sizeX/2 + blockPxY);
2133 const int endPxX = MIN(imgSize, blockPxX + DIVUP(sizeX, 2) + 3);
2134 const int endPxY = MIN(imgSize, blockPxY + DIVUP(sizeX, 2) + 3);
2135
2136 const int myPxX = blockPxX + threadIdx.y % 4;
2137 const int myPxY = blockPxY + threadIdx.y / 4;
2138 const int myPxIdx = myPxY * imgSize + myPxX;
2139 // const bool doWork = myPxX < imgSize && myPxY < imgSize;
2140 const int myStartPxY = -sizeX/2 + myPxY;
2141 const int myStartPxX = -sizeX/2 + myPxX;
2142 const int myEndPxY = myPxY + DIVUP(sizeX, 2);
2143 const int myEndPxX = myPxX + DIVUP(sizeX, 2);
2144
2145 const int imgIdx = blockImgIdx + threadIdx.x;
2146
2147 imgs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2148 meanDiffs += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
2149 denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2150 target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
2151
2152 float prod[filtersPerThread][imgsPerThread];
2153 #pragma unroll
2154 for (int i = 0; i < imgsPerThread; i++) {
2155 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2156 #pragma unroll
2157 for (int f = 0; f < filtersPerThread; f++) {
2158 prod[f][i] = 0;
2159 }
2160 }
2161 }
2162
2163 for (int y = startPxY; y < endPxY; y++) {
2164 const bool isInY = y >= myStartPxY && y < myEndPxY;
2165 for (int x = startPxX; x < endPxX; x++) {
2166 const int px = y * imgSize + x;
2167 // All the threads load a pixel from memory
2168 #pragma unroll
2169 for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
2170 if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
2171 #pragma unroll
2172 for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
2173 if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
2174 shDiffs[ly + loadY][lx + loadX] = meanDiffs[(ly * imgPixels + px) * numImages + lx];
2175 }
2176 }
2177 }
2178 }
2179 __syncthreads();
2180
2181 // Each row of threads decides if it's interested in this pixel
2182 if (isInY && x >= myStartPxX && x < myEndPxX) {
2183 #pragma unroll
2184 for (int i = 0; i < imgsPerThread; i++) {
2185 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2186 #pragma unroll
2187 for (int f = 0; f < filtersPerThread; f++) {
2188 prod[f][i] += square(shDiffs[f][threadIdx.x + i * B_X]);
2189 }
2190 }
2191 }
2192 }
2193 __syncthreads();
2194 }
2195 }
2196 // imgs -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
2197 // imgs += threadIdx.x;
2198 if (myPxX < imgSize && myPxY < imgSize) {
2199 #pragma unroll
2200 for (int i = 0; i < imgsPerThread; i++) {
2201 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2202 #pragma unroll
2203 for (int f = 0; f < filtersPerThread; f++) {
2204 prod[f][i] = 1 + addScale * prod[f][i];
2205 denoms[f * imgPixels * numImages + i * B_X] = prod[f][i];
2206 target[f * imgPixels * numImages + i * B_X] = imgs[f * imgPixels * numImages + i * B_X] * __powf(prod[f][i], -powScale);
2207 }
2208 }
2209 }
2210 }
2211 }
2212
2213 /*
2214 * Block size B_YxB_X
2215 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2216 * blockIdx.y determines pixel.y, filter idx in batches of B_Y
2217 *
2218 * So each block does one pixel for some number of images/filters.
2219 *
2220 * threadIdx.x determines img idx
2221 * threadIdx.y determines filter idx
2222 *
2223 * imgs: (numFilters, imgPixels, numImages)
2224 * meanDiffs: (numFilters, imgPixels, numImages)
2225 * denoms: (numFilters, imgPixels, numImages) (out)
2226 * target: (numFilters, imgPixels, numImages) (out)
2227 *
2228 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2229 * numFilters must be divisible by B_Y
2230 */
2231 template<int B_Y, int B_X, int imgsPerThread, bool checkCaseBounds, bool blocked>
2232 __global__ void kFCNorm(float* imgs, float* meanDiffs, float* denoms, float* target, const int imgSize,
2233 const int numFilters, const int numImages, const int sizeF,
2234 const float addScale, const float powScale) {
2235 const int imgPixels = imgSize * imgSize;
2236 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2237 const int numFilterBlocks = numFilters/B_Y;
2238 const int pxIdxX = blockIdx.x / numImgBlocks;
2239 const int pxIdxY = blockIdx.y / numFilterBlocks;
2240 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2241 const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
2242
2243 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2244
2245
2246 const int imgIdx = blockImgIdx + threadIdx.x;
2247
2248 imgs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2249 meanDiffs += pxIdx * numImages + imgIdx;
2250 denoms += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2251 target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2252
2253 float prod[imgsPerThread];
2254 #pragma unroll
2255 for (int i = 0; i < imgsPerThread; i++) {
2256 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2257 prod[i] = 0;
2258 }
2259 }
2260
2261 const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF/2 + filterIdx;
2262 const int loopStartF = blocked ? startF : MAX(0, startF);
2263 const int loopEndF = MIN(numFilters, startF + sizeF);
2264
2265 for (int f = loopStartF; f < loopEndF; ++f) {
2266 #pragma unroll
2267 for (int i = 0; i < imgsPerThread; i++) {
2268 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2269 prod[i] += square(meanDiffs[f * imgPixels * numImages + i * B_X]);
2270 }
2271 }
2272 }
2273
2274 #pragma unroll
2275 for (int i = 0; i < imgsPerThread; i++) {
2276 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2277 prod[i] = 1 + addScale * prod[i];
2278 denoms[i * B_X] = prod[i];
2279 target[i * B_X] = imgs[i * B_X] * __powf(prod[i], -powScale);
2280 }
2281 }
2282 }
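/*
 * Worked example of the cross-map window above, with sizeF = 4: in sliding mode
 * (blocked = false), filterIdx = 5 gives startF = -2 + 5 = 3, so the window covers
 * filters [3, 7); in blocked mode, filterIdx = 5 gives startF = (5/4)*4 = 4, so every
 * filter in [4, 8) shares the window [4, 8). Both are clipped to [0, numFilters).
 */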
2283
2284 /*
2285 * Block size B_YxB_X
2286 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2287 * blockIdx.y determines pixel.y, filter idx in batches of B_Y
2288 *
2289 * So each block does one output pixel for some number of images/filters.
2290 *
2291 * threadIdx.x determines img idx
2292 * threadIdx.y determines filter idx
2293 *
2294 * outGrads: (numFilters, imgPixels, numImages)
2295 * denoms: (numFilters, imgPixels, numImages)
2296 * inputs: (numFilters, imgPixels, numImages)
2297 * acts: (numFilters, imgPixels, numImages)
2298 * target: (numFilters, imgPixels, numImages)
2299 *
2300 * numImages must be divisible by B_X*imgsPerThread
2301 * numFilters must be divisible by B_Y
2302 *
2303 * TODO: this isn't really ideal
2304 */
2305 template<int B_Y, int B_X, int imgsPerThread, bool add, bool checkCaseBounds, bool blocked>
2306 __global__ void kFRNormUndo(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
2307 const int numImages, const int sizeF, const float powScale, const float scaleTargets, const float scaleOutputs) {
2308 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2309 const int numFilterBlocks = numFilters/B_Y;
2310
2311 const int pxIdxX = blockIdx.x / numImgBlocks;
2312 const int pxIdxY = blockIdx.y / numFilterBlocks;
2313 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2314 const int filterIdx = (blockIdx.y % numFilterBlocks) * B_Y + threadIdx.y;
2315
2316 const int imgPixels = imgSize * imgSize;
2317 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2318 const int imgIdx = blockImgIdx + threadIdx.x;
2319
2320 acts += pxIdx * numImages + imgIdx;
2321 inputs += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2322 denoms += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2323 outGrads += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2324 target += ((filterIdx) * imgPixels + pxIdx) * numImages + imgIdx;
2325
2326 float prod[imgsPerThread];
2327 // if (imgIdx != 0 || pxIdx != 0 || filterIdx != 0) {
2328 // return;
2329 // }
2330 #pragma unroll
2331 for (int i = 0; i < imgsPerThread; i++) {
2332 prod[i] = 0;
2333 }
2334
2335 const int startF = blocked ? (filterIdx / sizeF) * sizeF : -sizeF + sizeF/2 + 1 + filterIdx;
2336 const int loopStartF = blocked ? startF : MAX(0, startF);
2337 const int loopEndF = MIN(numFilters, startF + sizeF);
2338
2339 for (int f = loopStartF; f < loopEndF; ++f) {
2340 #pragma unroll
2341 for (int i = 0; i < imgsPerThread; i++) {
2342 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2343 prod[i] += acts[f * imgPixels * numImages + i * B_X];
2344 }
2345 }
2346 }
2347 // printf("gpu f start: %d, end: %d\n", loopStartF, loopEndF);
2348
2349 if (!add) {
2350 #pragma unroll
2351 for (int i = 0; i < imgsPerThread; i++) {
2352 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2353 const float inp = inputs[i * B_X];
2354 const float out = outGrads[i * B_X];
2355 const float den = denoms[i * B_X];
2356 prod[i] = inp * prod[i] + out * __powf(den, -powScale);
2357 target[i * B_X] = prod[i];
2358 }
2359 }
2360 } else {
2361 #pragma unroll
2362 for (int i = 0; i < imgsPerThread; i++) {
2363 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2364 const float inp = inputs[i * B_X];
2365 const float out = outGrads[i * B_X];
2366 const float den = denoms[i * B_X];
2367 prod[i] = inp * prod[i] + out * __powf(den, -powScale);
2368 target[i * B_X] = scaleTargets * target[i * B_X] + scaleOutputs * prod[i];
2369 }
2370 }
2371 }
2372 }
2373
2374 /*
2375 * Block size B_YxB_X
2376 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2377 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2378 *
2379 * So each block does one pixel for some number of images/filters.
2380 *
2381 * threadIdx.x determines img idx
2382 * threadIdx.y determines filter idx
2383 *
2384 * imgs: (numFilters, imgPixels, numImages)
2385 * target: (numFilters, imgPixels, numImages)
2386 *
2387 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2388 * numFilters must be divisible by B_Y*filtersPerThread
2389 *
2390 * sizeX should be something like 3 or 5 for this function. Not much more.
2391 * TODO: write variant where each block does 4x4 region or so (this'll be based on kCNorm2).
2392 */
2393 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2394 __global__ void kTICA_manyfilter(float* imgs, float* target, const int imgSize,
2395 const int numFilters, const int numImages, const int sizeX,
2396 const float scaleTarget, const float scaleOutput) {
2397 const int imgPixels = imgSize * imgSize;
2398 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2399 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2400 const int pxIdxX = blockIdx.x / numImgBlocks;
2401 const int pxIdxY = blockIdx.y / numFilterBlocks;
2402 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2403 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2404
2405 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2406
2407 const int startPxX = -sizeX/2 + pxIdxX;
2408 const int startPxY = -sizeX/2 + pxIdxY;
2409 const int imgIdx = blockImgIdx + threadIdx.x;
2410
2411 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2412 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2413
2414 float prod[filtersPerThread][imgsPerThread];
2415 #pragma unroll
2416 for (int i = 0; i < imgsPerThread; i++) {
2417 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2418 #pragma unroll
2419 for (int f = 0; f < filtersPerThread; f++) {
2420 prod[f][i] = 0;
2421 }
2422 }
2423 }
2424 const int loopStartY = MAX(0, startPxY);
2425 const int loopStartX = MAX(0, startPxX);
2426 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2427 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2428
2429 for (int y = loopStartY; y < loopEndY; y++) {
2430 for (int x = loopStartX; x < loopEndX; x++) {
2431
2432 const int imgPx = y * imgSize + x;
2433 #pragma unroll
2434 for (int i = 0; i < imgsPerThread; i++) {
2435
2436 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2437 #pragma unroll
2438 for (int f = 0; f < filtersPerThread; f++) {
2439 prod[f][i] += square(imgs[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X]);
2440 }
2441 }
2442 }
2443 }
2444 }
2445 imgs += pxIdx * numImages;
2446 if (scaleTarget == 0) {
2447 #pragma unroll
2448 for (int i = 0; i < imgsPerThread; i++) {
2449 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2450 #pragma unroll
2451 for (int f = 0; f < filtersPerThread; f++) {
2452 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleOutput * __fdividef(1.0f, 0.001f + sqrtf(prod[f][i]));
2453 }
2454 }
2455 }
2456 } else {
2457 #pragma unroll
2458 for (int i = 0; i < imgsPerThread; i++) {
2459 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2460 #pragma unroll
2461 for (int f = 0; f < filtersPerThread; f++) {
2462 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTarget * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutput * __fdividef(1.0f, 0.001f + sqrtf(prod[f][i]));
2463 }
2464 }
2465 }
2466 }
2467 }
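/*
 * In scalar form, kTICA_manyfilter writes, per filter f and pixel p,
 *
 *     target(f,p) = scaleOutput / (0.001 + sqrt(sum of imgs^2 over the sizeX x sizeX window))
 *
 * i.e. the reciprocal pooled norms ("1/S" values) that kTICAGrad_manyfilter below
 * re-pools to form the TICA gradient -imgs * sum(1/S).
 */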
2468
2469 /*
2470 * Block size B_YxB_X
2471 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2472 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2473 *
2474 * So each block does one pixel for some number of images/filters.
2475 *
2476 * threadIdx.x determines img idx
2477 * threadIdx.y determines filter idx
2478 *
2479 * imgs: (numFilters, imgPixels, numImages)
2480 * ticas: (numFilters, imgPixels, numImages)
2481 * target: (numFilters, imgPixels, numImages)
2482 *
2483 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
2484 * numFilters must be divisible by B_Y*filtersPerThread
2485 *
2486 * sizeX should be something like 3 or 5 for this function. Not much more.
2487 * TODO: write variant where each block does 4x4 region or so (this'll be based on kCNorm2).
2488 */
2489 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2490 __global__ void kTICAGrad_manyfilter(float* imgs, float* ticas, float* target, const int imgSize,
2491 const int numFilters, const int numImages, const int sizeX,
2492 const float scaleTarget, const float scaleOutput) {
2493 const int imgPixels = imgSize * imgSize;
2494 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
2495 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2496 const int pxIdxX = blockIdx.x / numImgBlocks;
2497 const int pxIdxY = blockIdx.y / numFilterBlocks;
2498 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2499 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2500
2501 const int pxIdx = pxIdxY * imgSize + pxIdxX;
2502
2503 const int startPxX = -sizeX/2 + pxIdxX;
2504 const int startPxY = -sizeX/2 + pxIdxY;
2505 const int imgIdx = blockImgIdx + threadIdx.x;
2506
2507 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2508 ticas += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2509 target += ((blockFilterIdx + threadIdx.y) * imgPixels + pxIdx) * numImages + imgIdx;
2510
2511 float prod[filtersPerThread][imgsPerThread];
2512 #pragma unroll
2513 for (int i = 0; i < imgsPerThread; i++) {
2514 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2515 #pragma unroll
2516 for (int f = 0; f < filtersPerThread; f++) {
2517 prod[f][i] = 0;
2518 }
2519 }
2520 }
2521 const int loopStartY = MAX(0, startPxY);
2522 const int loopStartX = MAX(0, startPxX);
2523 const int loopEndY = MIN(imgSize, startPxY + sizeX);
2524 const int loopEndX = MIN(imgSize, startPxX + sizeX);
2525
2526 for (int y = loopStartY; y < loopEndY; y++) {
2527 for (int x = loopStartX; x < loopEndX; x++) {
2528
2529 const int imgPx = y * imgSize + x;
2530 #pragma unroll
2531 for (int i = 0; i < imgsPerThread; i++) {
2532
2533 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2534 #pragma unroll
2535 for (int f = 0; f < filtersPerThread; f++) {
2536 // adding 1/S values
2537 prod[f][i] += ticas[(f * B_Y * imgPixels + imgPx) * numImages + i * B_X];
2538 }
2539 }
2540 }
2541 }
2542 }
2543 if (scaleTarget == 0) {
2544 #pragma unroll
2545 for (int i = 0; i < imgsPerThread; i++) {
2546 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2547 #pragma unroll
2548 for (int f = 0; f < filtersPerThread; f++) {
2549 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleOutput * -imgs[f * B_Y * imgPixels * numImages + i * B_X] * prod[f][i];
2550 }
2551 }
2552 }
2553 } else {
2554 #pragma unroll
2555 for (int i = 0; i < imgsPerThread; i++) {
2556 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2557 #pragma unroll
2558 for (int f = 0; f < filtersPerThread; f++) {
2559 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTarget * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutput * -imgs[f * B_Y * imgPixels * numImages + i * B_X] * prod[f][i]; // prod already holds pooled 1/S values; match the scaleTarget == 0 branch
2560 }
2561 }
2562 }
2563 }
2564 }
2565
2566 /*
2567 * Block size B_YxB_X
2568 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2569 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2570 *
2571 * So each block does one output pixel for some number of images/filters.
2572 *
2573 * threadIdx.x determines img idx
2574 * threadIdx.y determines filter idx
2575 *
2576 * avgGrads: (numFilters, numOutputs, numImages)
2577 * target: (numFilters, imgPixels, numImages)
2578 *
2579 * Each input pixel accumulates gradient from every pooling region that covers it.
2580 *
2581 * numImages must be divisible by B_X*imgsPerThread
2582 * numFilters must be divisible by B_Y*filtersPerThread
2583 */
2584
2585 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2586 __global__ void kLocalAvgUndo(float* avgGrads, float* target, const int imgSize, const int numFilters,
2587 const int numImages, const int subsX, const int startX, const int strideX, const int outputsX,
2588 const float scaleTargets, const float scaleOutputs) {
2589 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2590 const int blockPxX = blockIdx.x / numImgBlocks;
2591 const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread));
2592
2593 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2594 const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread;
2595
2596 const int blockPx = blockPxY * imgSize + blockPxX;
2597 const int numOutputs = outputsX * outputsX;
2598 const int imgPixels = imgSize * imgSize;
2599
2600 const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
2601 const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
2602 const int startOutputX = blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
2603 const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
2604
2605 const int imgIdx = blockImgIdx + threadIdx.x;
2606
2607 avgGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages + imgIdx;
2608 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2609
2610 float prod[filtersPerThread][imgsPerThread];
2611 #pragma unroll
2612 for (int f = 0; f < filtersPerThread; f++) {
2613 #pragma unroll
2614 for (int i = 0; i < imgsPerThread; i++) {
2615 prod[f][i] = 0;
2616 }
2617 }
2618
2619 if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
2620 && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
2621
2622 for (int my = startOutputY; my < endOutputY; my++) {
2623 const float regionStartY = fmaxf(0, startX + my * strideX);
2624 const float regionEndY = fminf(imgSize, startX + my * strideX + subsX);
2625 const float regionSizeY = regionEndY - regionStartY;
2626 for (int mx = startOutputX; mx < endOutputX; mx++) {
2627 const int outputIdx = my * outputsX + mx;
2628 const float regionStartX = fmaxf(0, startX + mx * strideX);
2629 const float regionEndX = fminf(imgSize, startX + mx * strideX + subsX);
2630 const float regionSizeX = regionEndX - regionStartX;
2631 // It's important to do the division here, because pushing division into the below
2632 // loops makes the code 4x slower.
2633 const float regionSizeInv = 1.0f / (regionSizeX * regionSizeY);
2634 #pragma unroll
2635 for (int i = 0; i < imgsPerThread; i++) {
2636 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2637 #pragma unroll
2638 for (int f = 0; f < filtersPerThread; f++) {
2639 prod[f][i] += avgGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X] * regionSizeInv;
2640 }
2641 }
2642 }
2643 }
2644 }
2645 }
2646
2647 if (!add) {
2648 #pragma unroll
2649 for (int i = 0; i < imgsPerThread; i++) {
2650 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2651 #pragma unroll
2652 for (int f = 0; f < filtersPerThread; f++) {
2653 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2654 }
2655 }
2656 }
2657 } else {
2658 #pragma unroll
2659 for (int i = 0; i < imgsPerThread; i++) {
2660 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2661 #pragma unroll
2662 for (int f = 0; f < filtersPerThread; f++) {
2663 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
2664 }
2665 }
2666 }
2667 }
2668 }
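/*
 * In scalar form: average pooling computes out(o) = sum_{p in R_o} in(p) / |R_o|, so its
 * backward pass gives each input pixel p the sum of avgGrads(o) / |R_o| over every pooling
 * region R_o containing p -- exactly the accumulation above, with |R_o| = regionSizeX *
 * regionSizeY shrunk at the image border.
 */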
2669
2670 /*
2671 * Block size B_YxB_X
2672 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2673 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2674 *
2675 * So each block does one output pixel for some number of images/filters.
2676 *
2677 * threadIdx.x determines img idx
2678 * threadIdx.y determines filter idx
2679 *
2680 * imgs: (numFilters, imgPixels, numImages)
2681 * maxGrads: (numFilters, numOutputs, numImages)
2682 * maxActs: (numFilters, numOutputs, numImages)
2683 * target: (numFilters, imgPixels, numImages)
2684 *
2685 * numImages must be divisible by B_X*imgsPerThread
2686 * numFilters must be divisible by B_Y*filtersPerThread
2687 */
2688
2689 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2690 __global__ void kLocalMaxUndo(float* imgs, float* maxGrads, float* maxActs, float* target, const int imgSize, const int numFilters,
2691 const int numImages, const int subsX, const int startX, const int strideX, const int outputsX,
2692 const float scaleTargets, const float scaleOutputs) {
2693 __shared__ float shImgs[B_Y*filtersPerThread][B_X*imgsPerThread];
2694 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2695 const int blockPxX = blockIdx.x / numImgBlocks;
2696 const int blockPxY = blockIdx.y / (numFilters/(B_Y*filtersPerThread));
2697
2698 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2699 const int blockFilterIdx = (blockIdx.y % (numFilters/(B_Y*filtersPerThread))) * B_Y * filtersPerThread;
2700
2701 const int blockPx = blockPxY * imgSize + blockPxX;
2702 const int numOutputs = outputsX * outputsX;
2703 const int imgPixels = imgSize * imgSize;
2704
2705 const int startOutputY = blockPxY - startX < subsX ? 0 : 1 + (blockPxY - startX - subsX) / strideX;
2706 const int endOutputY = MIN(outputsX, 1 + (blockPxY - startX) / strideX);
2707 const int startOutputX = blockPxX - startX < subsX ? 0 : 1 + (blockPxX - startX - subsX) / strideX;
2708 const int endOutputX = MIN(outputsX, 1 + (blockPxX - startX) / strideX);
2709
2710 const int imgIdx = blockImgIdx + threadIdx.x;
2711
2712 imgs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2713 maxGrads += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages
2714 + imgIdx;
2715 maxActs += ((blockFilterIdx + threadIdx.y) * numOutputs) * numImages
2716 + imgIdx;
2717
2718 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2719
2720 float prod[filtersPerThread][imgsPerThread];
2721 #pragma unroll
2722 for (int f = 0; f < filtersPerThread; f++) {
2723 #pragma unroll
2724 for (int i = 0; i < imgsPerThread; i++) {
2725 prod[f][i] = 0;
2726 }
2727 }
2728
2729 if (blockPxX >= startX && blockPxX < startX + strideX * (outputsX-1) + subsX
2730 && blockPxY >= startX && blockPxY < startX + strideX * (outputsX-1) + subsX) {
2731 #pragma unroll
2732 for (int i = 0; i < imgsPerThread; i++) {
2733 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2734 #pragma unroll
2735 for (int f = 0; f < filtersPerThread; f++) {
2736 shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i] = imgs[f * B_Y * imgPixels * numImages + i * B_X];
2737 }
2738 }
2739 }
2740 for (int my = startOutputY; my < endOutputY; my++) {
2741 for (int mx = startOutputX; mx < endOutputX; mx++) {
2742 const int outputIdx = my * outputsX + mx;
2743 #pragma unroll
2744 for (int i = 0; i < imgsPerThread; i++) {
2745 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2746 #pragma unroll
2747 for (int f = 0; f < filtersPerThread; f++) {
2748 const float ma = maxActs[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
2749 const float mg = maxGrads[(f * B_Y * numOutputs + outputIdx) * numImages + i * B_X];
2750 const float img = shImgs[threadIdx.y + B_Y * f][threadIdx.x + B_X * i];
2751
2752 prod[f][i] += (img == ma) * mg;
2753 }
2754 }
2755 }
2756 }
2757 }
2758 }
2759 if (!add) {
2760 #pragma unroll
2761 for (int i = 0; i < imgsPerThread; i++) {
2762 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2763 #pragma unroll
2764 for (int f = 0; f < filtersPerThread; f++) {
2765 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2766 }
2767 }
2768 }
2769 } else {
2770 #pragma unroll
2771 for (int i = 0; i < imgsPerThread; i++) {
2772 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2773 #pragma unroll
2774 for (int f = 0; f < filtersPerThread; f++) {
2775 target[f * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
2776 }
2777 }
2778 }
2779 }
2780 }
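/*
 * The (img == ma) * mg term above routes each region's gradient only to the input
 * pixel(s) whose value equals the pooled maximum: d out(o) / d in(p) is 1 where in(p)
 * achieved the max of region R_o and 0 elsewhere. On the rare tie, every tied pixel
 * receives the full gradient.
 */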
2781
2782
2783
2784
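/*
 * Backward pass of probabilistic max pooling. This kernel has no header comment in the
 * original, so the shapes below are read off its indexing:
 *
 * maxout_h, hGrads, target_z: (numFilters, imgPixels, numImages)
 * maxout_p, pGrads, target_t: (numFilters, numOutputs, numImages)
 *
 * gp_iszero / gh_iszero are single-element device flags marking that pGrads / hGrads
 * is identically zero.
 */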
2785 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool checkCaseBounds>
2786 __global__ void kLocalProbMaxUndo(float* maxout_h, float* maxout_p, float* hGrads, float* pGrads, float* target_z, float* target_t, const int imgSize, const int numFilters, const int numImages, const int subsX, const int startX, const int strideX, const int outputsX, float * gp_iszero, float * gh_iszero) {
2787
2788 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2789 const int numFilterBlocks = DIVUP(numFilters, B_Y*filtersPerThread);
2790 const int outputIdxX = blockIdx.x / numImgBlocks;
2791 const int outputIdxY = blockIdx.y / numFilterBlocks;
2792 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2793 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2794 const int myFilterIdx = (blockFilterIdx + threadIdx.y*filtersPerThread);
2795 if (myFilterIdx >= numFilters) {
2796 return;
2797 }
2798
2799 const int outputIdx = outputIdxY * outputsX + outputIdxX;
2800 const int numOutputs = outputsX * outputsX;
2801 const int imgPixels = imgSize * imgSize;
2802
2803 const int startImgPxX = startX + outputIdxX * strideX;
2804 const int startImgPxY = startX + outputIdxY * strideX;
2805 const int imgIdx = blockImgIdx + threadIdx.x;
2806
2807 maxout_h += myFilterIdx * imgPixels * numImages + imgIdx;
2808 hGrads += myFilterIdx * imgPixels * numImages + imgIdx;
2809 target_z += myFilterIdx * imgPixels * numImages + imgIdx;
2810 maxout_p += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2811 pGrads += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2812 target_t += (myFilterIdx * numOutputs + outputIdx) * numImages + imgIdx;
2813
2814 float prod[filtersPerThread][imgsPerThread];
2815 for (int f = 0; f < filtersPerThread; f++) {
2816 for (int i = 0; i < imgsPerThread; i++) {
2817 prod[f][i] = 0;
2818 }
2819 }
2820
2821 const int loopStartY = MAX(0, startImgPxY);
2822 const int loopStartX = MAX(0, startImgPxX);
2823 const int loopEndY = MIN(imgSize, startImgPxY + subsX);
2824 const int loopEndX = MIN(imgSize, startImgPxX + subsX);
2825
2826
2827 for (int y = loopStartY; y < loopEndY; y++) {
2828 for (int x = loopStartX; x < loopEndX; x++) {
2829 const int imgPx = y * imgSize + x;
2830 for (int i = 0; i < imgsPerThread; i++) {
2831 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2832 for (int f = 0; f < filtersPerThread; f++) {
2833 const float ma = maxout_h[(f * imgPixels + imgPx) * numImages + i * B_X];
2834 const float mg = hGrads[(f * imgPixels + imgPx) * numImages + i * B_X];
2835 prod[f][i] += ma * mg;
2836 }
2837 }
2838 }
2839 }
2840 }
2841
2842
2843 for (int i = 0; i < imgsPerThread; i++) {
2844 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2845 for (int f = 0; f < filtersPerThread; f++) {
2846 prod[f][i] -= (1 - maxout_p[f*numOutputs*numImages + i * B_X]) * pGrads[f*numOutputs*numImages + i * B_X];
2847 }
2848 }
2849 }
2850
2851
2852 for (int y = loopStartY; y < loopEndY; y++) {
2853 for (int x = loopStartX; x < loopEndX; x++) {
2854 const int imgPx = y * imgSize + x;
2855 for (int i = 0; i < imgsPerThread; i++) {
2856 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2857 for (int f = 0; f < filtersPerThread; f++) {
2858 const float ma = maxout_h[(f * imgPixels + imgPx) * numImages + i * B_X];
2859 const float mg = hGrads[(f * imgPixels + imgPx) * numImages + i * B_X];
2860 target_z[(f*imgPixels + imgPx) * numImages + i * B_X] = ma * mg - (prod[f][i] * ma);
2861 }
2862 }
2863 }
2864 }
2865 }
2866
2867 // Special-case handling for when one of the incoming gradients is identically zero (admittedly weird):
2868 for (int i = 0; i < imgsPerThread; i++) {
2869 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2870 for (int f = 0; f < filtersPerThread; f++) {
2871 const float ma = maxout_p[f*numOutputs*numImages + i * B_X];
2872 float mg = pGrads[f*numOutputs*numImages + i * B_X];
2873 if (*gh_iszero == 1) {
2874 target_t[f*numOutputs*numImages + i * B_X] = - prod[f][i] * ma;
2875 } else if (*gp_iszero == 1) {
2876 target_t[f*numOutputs*numImages + i * B_X] = ma - prod[f][i] * ma;
2877 } else {
2878 target_t[f*numOutputs*numImages + i * B_X] = ma * mg - prod[f][i] * ma;
2879 }
2880 }
2881 }
2882 }
2883 }
2884
2885
2886 /*
2887 * acts := -2 x scale x acts x outGrads / denoms
2888 */
2889 template<int B_X, int eltsPerThread>
2890 __global__ void kRNormUndoPrelims(float* acts, float* denoms, float* outGrads,
2891 const uint numElements, const float scale) {
2892 const uint e = B_X * blockIdx.x * eltsPerThread + threadIdx.x;
2893 const uint numThreads = B_X * gridDim.x;
2894 for (uint i = e; i < numElements; i += numThreads*eltsPerThread) {
2895 #pragma unroll
2896 for (uint k = 0; k < eltsPerThread; k++) {
2897 if (i + k * B_X < numElements) {
2898 acts[i + k * B_X] = __fdividef(scale*outGrads[i + k * B_X] * acts[i + k * B_X], denoms[i + k * B_X]);
2899 }
2900 }
2901 }
2902 }
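/*
 * kRNormUndoPrelims is stage one of the response-normalization backward pass: it
 * overwrites acts elementwise with scale * acts * outGrads / denoms (per the comment
 * above, the caller is expected to fold the -2 factor into scale). kRNormUndo below
 * then sums those values over each pixel's neighborhood and forms
 * inputs * pooledSum + outGrads * denoms^(-powScale).
 */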
2903
2904 /*
2905 * Block size B_YxB_X
2906 * blockIdx.x determines pixel.x, image idx in batches of B_X*imgsPerThread
2907 * blockIdx.y determines pixel.y, filter idx in batches of B_Y*filtersPerThread
2908 *
2909 * So each block does one output pixel for some number of images/filters.
2910 *
2911 * threadIdx.x determines img idx
2912 * threadIdx.y determines filter idx
2913 *
2914 * outGrads: (numFilters, imgPixels, numImages)
2915 * denoms: (numFilters, imgPixels, numImages)
2916 * inputs: (numFilters, imgPixels, numImages)
2917 * acts: (numFilters, imgPixels, numImages)
2918 * target: (numFilters, imgPixels, numImages)
2919 *
2920 * numImages must be divisible by B_X*imgsPerThread
2921 * numFilters must be divisible by B_Y*filtersPerThread
2922 *
2923 * TODO: this isn't really ideal
2924 */
2925 template<int B_Y, int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
2926 __global__ void kRNormUndo(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
2927 const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) {
2928 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
2929 const int numFilterBlocks = numFilters/(B_Y*filtersPerThread);
2930
2931 const int blockPxX = blockIdx.x / numImgBlocks;
2932 const int blockPxY = blockIdx.y / numFilterBlocks;
2933
2934 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
2935 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * B_Y * filtersPerThread;
2936
2937 const int blockPx = blockPxY * imgSize + blockPxX;
2938 const int imgPixels = imgSize * imgSize;
2939
2940 const int startY = MAX(0, blockPxY + sizeX/2 - sizeX + 1);
2941 const int startX = MAX(0, blockPxX + sizeX/2 - sizeX + 1);
2942 const int endY = MIN(imgSize, blockPxY + sizeX/2 + 1);
2943 const int endX = MIN(imgSize, blockPxX + sizeX/2 + 1);
2944
2945 const int imgIdx = blockImgIdx + threadIdx.x;
2946
2947 acts += ((blockFilterIdx + threadIdx.y) * imgPixels) * numImages + imgIdx;
2948 inputs += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2949 denoms += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2950 outGrads += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2951 target += ((blockFilterIdx + threadIdx.y) * imgPixels + blockPx) * numImages + imgIdx;
2952
2953 float prod[filtersPerThread][imgsPerThread];
2954 #pragma unroll
2955 for (int f = 0; f < filtersPerThread; f++) {
2956 #pragma unroll
2957 for (int i = 0; i < imgsPerThread; i++) {
2958 prod[f][i] = 0;
2959 }
2960 }
2961
2962 for (int sy = startY; sy < endY; sy++) {
2963 for (int sx = startX; sx < endX; sx++) {
2964 const int outPx = sy * imgSize + sx;
2965
2966 #pragma unroll
2967 for (int i = 0; i < imgsPerThread; i++) {
2968 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2969 #pragma unroll
2970 for (int f = 0; f < filtersPerThread; f++) {
2971 prod[f][i] += acts[(f * B_Y * imgPixels + outPx) * numImages + i * B_X];
2972 }
2973 }
2974 }
2975 }
2976 }
2977 // outGrads += blockPx * numImages;
2978 if (!add) {
2979 #pragma unroll
2980 for (int i = 0; i < imgsPerThread; i++) {
2981 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2982 #pragma unroll
2983 for (int f = 0; f < filtersPerThread; f++) {
2984 const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
2985 const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
2986 const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
2987 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
2988 target[f * B_Y * imgPixels * numImages + i * B_X] = prod[f][i];
2989 }
2990 }
2991 }
2992 } else {
2993 #pragma unroll
2994 for (int i = 0; i < imgsPerThread; i++) {
2995 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
2996 #pragma unroll
2997 for (int f = 0; f < filtersPerThread; f++) {
2998 const float inp = inputs[(f * B_Y * imgPixels) * numImages + i * B_X];
2999 const float out = outGrads[(f * B_Y * imgPixels) * numImages + i * B_X];
3000 const float den = denoms[(f * B_Y * imgPixels) * numImages + i * B_X];
3001 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3002 target[f * B_Y * imgPixels * numImages + i * B_X] =
3003 scaleTargets * target[f * B_Y * imgPixels * numImages + i * B_X]
3004 + scaleOutputs * prod[f][i];
3005 }
3006 }
3007 }
3008 }
3009 }
3010
3011
3012 /*
3013 * Block size 16xB_X
3014 * blockIdx.x determines 4x4 pixel.x region, image idx in batches of B_X*imgsPerThread
3015 * blockIdx.y determines 4x4 pixel.y region, filter idx in batches of filtersPerThread
3016 *
3017 * So each block does 4x4 region for some number of images/filters.
3018 *
3019 * threadIdx.x determines img idx
3020 * threadIdx.y determines pixel idx
3021 *
3022 * outGrads: (numFilters, imgPixels, numImages)
3023 * denoms: (numFilters, imgPixels, numImages)
3024 * inputs: (numFilters, imgPixels, numImages)
3025 * acts: (numFilters, imgPixels, numImages)
3026 * target: (numFilters, imgPixels, numImages)
3027 *
3028 * B_X one of 8, 16, 32
3029 * imgsPerThread one of 1, 2, 4, 8, 16
3030 *
3031 * B_X * imgsPerThread MUST be divisible by 32.
3032 * Number of filters MUST be divisible by filtersPerThread.
3033 *
3034 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false
3035 * numFilters must be divisible by filtersPerThread
3036 *
3037 * Final write-out will not be fully coalesced unless B_X is 32. But there's a lot more
3038 * reading than writing here, and the reading is all coalesced, so it should be OK.
3039 */
3040 template<int B_X, int imgsPerThread, int filtersPerThread, bool add, bool checkCaseBounds>
3041 __global__ void kRNormUndo2(float* outGrads, float* denoms, float* inputs, float* acts, float* target, const int imgSize, const int numFilters,
3042 const int numImages, const int sizeX, const float powScale, const float scaleTargets, const float scaleOutputs) {
3043 __shared__ float shActs[filtersPerThread][B_X*imgsPerThread];
3044 const int imgPixels = imgSize * imgSize;
3045 const int numImgBlocks = DIVUP(numImages, B_X*imgsPerThread);
3046 const int numFilterBlocks = numFilters/(filtersPerThread);
3047 const int blockPxX = 4*(blockIdx.x / numImgBlocks);
3048 const int blockPxY = 4*(blockIdx.y / numFilterBlocks);
3049 const int blockImgIdx = (blockIdx.x % numImgBlocks) * B_X * imgsPerThread;
3050 const int blockFilterIdx = (blockIdx.y % numFilterBlocks) * filtersPerThread;
3051
3052 const int tidx = threadIdx.y * B_X + threadIdx.x;
3053 const int loadY = tidx / 32, loadX = tidx % 32;
3054
3055 const int startPxX = MAX(0, -DIVUP(sizeX,2) + blockPxX + 1);
3056 const int startPxY = MAX(0, -DIVUP(sizeX,2) + blockPxY + 1);
3057 const int endPxX = MIN(imgSize, blockPxX + sizeX/2 + 4);
3058 const int endPxY = MIN(imgSize, blockPxY + sizeX/2 + 4);
3059
3060 const int myPxX = blockPxX + threadIdx.y % 4;
3061 const int myPxY = blockPxY + threadIdx.y / 4;
3062 const int myPxIdx = myPxY * imgSize + myPxX;
3063 // const bool doWork = myPxX < imgSize && myPxY < imgSize;
3064 const int myStartPxY = -DIVUP(sizeX,2) + myPxY + 1;
3065 const int myStartPxX = -DIVUP(sizeX,2) + myPxX + 1;
3066 const int myEndPxY = myPxY + sizeX/2 + 1;
3067 const int myEndPxX = myPxX + sizeX/2 + 1;
3068
3069 const int imgIdx = blockImgIdx + threadIdx.x;
3070
3071 acts += (blockFilterIdx + loadY) * imgPixels * numImages + blockImgIdx + loadX;
3072 denoms += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3073 inputs += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3074 outGrads += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3075 target += (blockFilterIdx * imgPixels + myPxIdx) * numImages + imgIdx;
3076
3077 float prod[filtersPerThread][imgsPerThread];
3078 #pragma unroll
3079 for (int f = 0; f < filtersPerThread; f++) {
3080 #pragma unroll
3081 for (int i = 0; i < imgsPerThread; i++) {
3082 prod[f][i] = 0;
3083 }
3084 }
3085
3086 for (int y = startPxY; y < endPxY; y++) {
3087 const bool isInY = y >= myStartPxY && y < myEndPxY;
3088 for (int x = startPxX; x < endPxX; x++) {
3089 const int px = y * imgSize + x;
3090 // All the threads load a pixel from memory
3091 #pragma unroll
3092 for (int ly = 0; ly < filtersPerThread; ly += B_X/2) {
3093 if (filtersPerThread % (B_X/2) == 0 || ly + loadY < filtersPerThread) {
3094 #pragma unroll
3095 for (int lx = 0; lx < B_X*imgsPerThread; lx += 32) {
3096 if (!checkCaseBounds || lx + loadX + blockImgIdx < numImages) {
3097 shActs[ly + loadY][lx + loadX] = acts[(ly * imgPixels + px) * numImages + lx];
3098 }
3099 }
3100 }
3101 }
3102 __syncthreads();
3103
3104 // Each row of threads decides if it's interested in this pixel
3105 if (isInY && x >= myStartPxX && x < myEndPxX) {
3106 #pragma unroll
3107 for (int i = 0; i < imgsPerThread; i++) {
3108 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3109 #pragma unroll
3110 for (int f = 0; f < filtersPerThread; f++) {
3111 prod[f][i] += shActs[f][threadIdx.x + i * B_X];
3112 }
3113 }
3114 }
3115 }
3116 __syncthreads();
3117 }
3118 }
3119 acts -= (loadY * imgPixels - myPxIdx) * numImages + loadX;
3120 acts += threadIdx.x;
3121 if (myPxX < imgSize && myPxY < imgSize) {
3122 if (!add) {
3123 #pragma unroll
3124 for (int i = 0; i < imgsPerThread; i++) {
3125 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3126 #pragma unroll
3127 for (int f = 0; f < filtersPerThread; f++) {
3128 const float out = outGrads[f * imgPixels * numImages + i * B_X];
3129 const float den = denoms[f * imgPixels * numImages + i * B_X];
3130 const float inp = inputs[f * imgPixels * numImages + i * B_X];
3131 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3132 target[f * imgPixels * numImages + i * B_X] = prod[f][i];
3133 }
3134 }
3135 }
3136 } else {
3137 #pragma unroll
3138 for (int i = 0; i < imgsPerThread; i++) {
3139 if (!checkCaseBounds || imgIdx + i * B_X < numImages) {
3140 #pragma unroll
3141 for (int f = 0; f < filtersPerThread; f++) {
3142 const float out = outGrads[f * imgPixels * numImages + i * B_X];
3143 const float den = denoms[f * imgPixels * numImages + i * B_X];
3144 const float inp = inputs[f * imgPixels * numImages + i * B_X];
3145 prod[f][i] = inp * prod[f][i] + out * __powf(den, -powScale);
3146 target[f * imgPixels * numImages + i * B_X] = scaleTargets * target[f * imgPixels * numImages + i * B_X] + scaleOutputs * prod[f][i];
3147 }
3148 }
3149 }
3150 }
3151
3152 }
3153 }
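/*
 * Example of the B_X * imgsPerThread constraint above (a sketch, with values
 * taken from the dispatch in convResponseNormUndo below): B_X = 16 with
 * imgsPerThread = 8 covers 16 * 8 = 128 images per block row, which is
 * divisible by 32; that is the configuration chosen when numImages % 128 == 0.
 */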
3154
3155 void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
3156 int subsX, int startX, int strideX, int outputsX) {
3157 convLocalMaxUndo(images, maxGrads, maxActs, target, subsX, startX, strideX, outputsX, 0, 1);
3158 }
3159
3160 /*
3161 * imgs: (numFilters * imgPixels, numImages)
3162 * maxGrads: (numFilters * numOutputs, numImages)
3163 * maxActs: (numFilters * numOutputs, numImages)
3164 * target: (numFilters * imgPixels, numImages)
3165 */
3166 void convLocalMaxUndo(NVMatrix& images, NVMatrix& maxGrads, NVMatrix& maxActs, NVMatrix& target,
3167 int subsX, int startX, int strideX, int outputsX, float scaleTargets, float scaleOutput) {
3168 int outputs = outputsX * outputsX;
3169 int numImages = images.getNumCols();
3170 int numFilters = maxGrads.getNumRows() / outputs;
3171 int imgPixels = images.getNumRows() / numFilters;
3172 assert(images.getNumRows() == numFilters * imgPixels);
3173 int imgSize = int(sqrt((double)imgPixels));
3174
3175 assert(imgSize * imgSize == imgPixels);
3176 assert(maxGrads.getNumRows() == numFilters * outputs);
3177 assert(maxGrads.getNumCols() == numImages);
3178 assert(!images.isTrans());
3179 assert(!target.isTrans());
3180 assert(!maxGrads.isTrans());
3181 assert(!maxActs.isTrans());
3182 assert(images.isContiguous());
3183 assert(maxGrads.isContiguous());
3184 assert(maxActs.isContiguous());
3185 assert(maxGrads.isSameDims(maxActs));
3186 assert(numFilters % 16 == 0);
3187 // assert(numImages % 128 == 0);
3188
3189 assert(strideX <= subsX);
3190
3191 target.resize(images);
3192 assert(target.isContiguous());
3193 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3194 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
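    // Worked example of this dispatch (values are illustrative): numImages = 192
    // gives imgsPerThread = 2 (192 % 64 == 0) with no bounds check, since
    // 192 % (32*2) == 0; numImages = 100 falls back to imgsPerThread = 1 with
    // checkCaseBounds true (100 % 32 != 0).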
3195 dim3 threads(32, 4);
3196 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize);
3197
3198 if (imgsPerThread == 4) {
3199 if (checkCaseBounds) {
3200 if (scaleTargets == 0 && scaleOutput == 1) {
3201 kLocalMaxUndo<4, 32, 4, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3202 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3203 } else {
3204 kLocalMaxUndo<4, 32, 4, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3205 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3206 }
3207 } else {
3208 if (scaleTargets == 0 && scaleOutput == 1) {
3209 kLocalMaxUndo<4, 32, 4, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3210 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3211 } else {
3212 kLocalMaxUndo<4, 32, 4, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3213 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3214 }
3215 }
3216 } else if (imgsPerThread == 2) {
3217 if (checkCaseBounds) {
3218 if (scaleTargets == 0 && scaleOutput == 1) {
3219 kLocalMaxUndo<4, 32, 2, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3220 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3221 } else {
3222 kLocalMaxUndo<4, 32, 2, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3223 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3224 }
3225 } else {
3226 if (scaleTargets == 0 && scaleOutput == 1) {
3227 kLocalMaxUndo<4, 32, 2, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3228 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3229 } else {
3230 kLocalMaxUndo<4, 32, 2, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3231 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3232 }
3233 }
3234 } else {
3235 if (checkCaseBounds) {
3236 if (scaleTargets == 0 && scaleOutput == 1) {
3237 kLocalMaxUndo<4, 32, 1, 2, false, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3238 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3239 } else {
3240 kLocalMaxUndo<4, 32, 1, 2, true, true><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3241 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3242 }
3243 } else {
3244 if (scaleTargets == 0 && scaleOutput == 1) {
3245 kLocalMaxUndo<4, 32, 1, 2, false, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3246 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3247 } else {
3248 kLocalMaxUndo<4, 32, 1, 2, true, false><<<blocks, threads>>>(images.getDevData(), maxGrads.getDevData(), maxActs.getDevData(), target.getDevData(),
3249 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, scaleTargets, scaleOutput);
3250 }
3251 }
3252 }
3253
3254 cutilCheckMsg("convLocalMaxUndo: kernel execution failed");
3255 }
3256
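/*
 * Usage sketch for the wrapper above (hypothetical values, not from the
 * original source): undoing a non-overlapping 2x2 max pool over 24x24 maps:
 *
 *   convLocalMaxUndo(images, maxGrads, maxActs, target,
 *                    2,   // subsX: pooling window size
 *                    0,   // startX: offset of the first window
 *                    2,   // strideX: window stride (== subsX, so no overlap)
 *                    12); // outputsX: 24 / 2
 *
 * This satisfies the strideX <= subsX assertion checked below.
 */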
3257 void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target, int subsX, int startX, int strideX, int outputsX, int imgSize) {
3258 convLocalAvgUndo(avgGrads, target, subsX, startX, strideX, outputsX, imgSize, 0, 1);
3259 }
3260
3261 /*
3262 * avgGrads: (numFilters, numOutputs, numImages)
3263 * target: (numFilters, imgPixels, numImages)
3264 */
3265 void convLocalAvgUndo(NVMatrix& avgGrads, NVMatrix& target,
3266 int subsX, int startX, int strideX, int outputsX, int imgSize,
3267 float scaleTargets, float scaleOutput) {
3268 int numImages = avgGrads.getNumCols();
3269
3270 int outputs = outputsX * outputsX;
3271 int imgPixels = imgSize * imgSize;
3272 int numFilters = avgGrads.getNumRows() / outputs;
3273 assert(avgGrads.getNumRows() == numFilters * outputs);
3274
3275 assert(!target.isTrans());
3276 assert(!avgGrads.isTrans());
3277 assert(avgGrads.isContiguous());
3278 assert(numFilters % 16 == 0);
3279 // assert(numImages % 128 == 0);
3280
3281 assert(strideX <= subsX);
3282
3283 target.resize(numFilters * imgPixels, numImages);
3284 assert(target.isContiguous());
3285 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3286 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3287 dim3 threads(32, 4);
3288 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 4)) * imgSize);
3289
3290 if (imgsPerThread == 4) {
3291 if (checkCaseBounds) {
3292 if (scaleTargets == 0 && scaleOutput == 1) {
3293 kLocalAvgUndo<4, 32, 4, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3294 imgSize, numFilters, numImages, subsX, startX, strideX,
3295 outputsX, scaleTargets, scaleOutput);
3296 } else {
3297 kLocalAvgUndo<4, 32, 4, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3298 imgSize, numFilters, numImages, subsX, startX, strideX,
3299 outputsX, scaleTargets, scaleOutput);
3300 }
3301 } else {
3302 if (scaleTargets == 0 && scaleOutput == 1) {
3303 kLocalAvgUndo<4, 32, 4, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3304 imgSize, numFilters, numImages, subsX, startX, strideX,
3305 outputsX, scaleTargets, scaleOutput);
3306 } else {
3307 kLocalAvgUndo<4, 32, 4, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3308 imgSize, numFilters, numImages, subsX, startX, strideX,
3309 outputsX, scaleTargets, scaleOutput);
3310 }
3311 }
3312 } else if (imgsPerThread == 2) {
3313 if (checkCaseBounds) {
3314 if (scaleTargets == 0 && scaleOutput == 1) {
3315 kLocalAvgUndo<4, 32, 2, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3316 imgSize, numFilters, numImages, subsX, startX, strideX,
3317 outputsX, scaleTargets, scaleOutput);
3318 } else {
3319 kLocalAvgUndo<4, 32, 2, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3320 imgSize, numFilters, numImages, subsX, startX, strideX,
3321 outputsX, scaleTargets, scaleOutput);
3322 }
3323 } else {
3324 if (scaleTargets == 0 && scaleOutput == 1) {
3325 kLocalAvgUndo<4, 32, 2, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3326 imgSize, numFilters, numImages, subsX, startX, strideX,
3327 outputsX, scaleTargets, scaleOutput);
3328 } else {
3329 kLocalAvgUndo<4, 32, 2, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3330 imgSize, numFilters, numImages, subsX, startX, strideX,
3331 outputsX, scaleTargets, scaleOutput);
3332 }
3333 }
3334 } else {
3335 if (checkCaseBounds) {
3336 if (scaleTargets == 0 && scaleOutput == 1) {
3337 kLocalAvgUndo<4, 32, 1, 4, false, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3338 imgSize, numFilters, numImages, subsX, startX, strideX,
3339 outputsX, scaleTargets, scaleOutput);
3340 } else {
3341 kLocalAvgUndo<4, 32, 1, 4, true, true><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3342 imgSize, numFilters, numImages, subsX, startX, strideX,
3343 outputsX, scaleTargets, scaleOutput);
3344 }
3345 } else {
3346 if (scaleTargets == 0 && scaleOutput == 1) {
3347 kLocalAvgUndo<4, 32, 1, 4, false, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3348 imgSize, numFilters, numImages, subsX, startX, strideX,
3349 outputsX, scaleTargets, scaleOutput);
3350 } else {
3351 kLocalAvgUndo<4, 32, 1, 4, true, false><<<blocks, threads>>>(avgGrads.getDevData(), target.getDevData(),
3352 imgSize, numFilters, numImages, subsX, startX, strideX,
3353 outputsX, scaleTargets, scaleOutput);
3354 }
3355 }
3356 }
3357
3358 cutilCheckMsg("convLocalAvgUndo: kernel execution failed");
3359 }
3360
3361 /*
3362 prob max undo
3363
3364 */
3365
3366 void localProbMaxUndo(NVMatrix& maxout_h, NVMatrix& maxout_p, NVMatrix& hGrads, NVMatrix& pGrads, NVMatrix& target_z,
3367 NVMatrix& target_t, int subsX, int startX, int strideX, int outputsX, int imgSize, float * gp_iszero, float * gh_iszero) {
3368 int outputs = outputsX * outputsX;
3369 int imgPixels = imgSize * imgSize;
3370 int numImages = maxout_h.getNumCols();
3371 int numFilters = maxout_h.getNumRows() / imgPixels;
3372
3373 assert(maxout_h.getNumRows() / numFilters == imgPixels);
3374 assert(maxout_h.getNumRows() == numFilters * imgPixels);
3375 assert(imgSize * imgSize == imgPixels);
3376
3377 assert(hGrads.getNumRows() == numFilters * imgPixels);
3378 assert(hGrads.getNumCols() == numImages);
3379
3380 assert(target_z.getNumRows() == numFilters * imgPixels);
3381 assert(target_z.getNumCols() == numImages);
3382
3383 assert(maxout_p.getNumRows() == numFilters * outputs);
3384 assert(maxout_p.getNumCols() == numImages);
3385
3386 assert(pGrads.getNumRows() == numFilters * outputs);
3387 assert(pGrads.getNumCols() == numImages);
3388
3389 assert(target_t.getNumRows() == numFilters * outputs);
3390 assert(target_t.getNumCols() == numImages);
3391
3392 assert(!maxout_h.isTrans());
3393 assert(!maxout_p.isTrans());
3394 assert(!target_t.isTrans());
3395 assert(!target_z.isTrans());
3396 assert(!hGrads.isTrans());
3397 assert(!pGrads.isTrans());
3398 assert(maxout_h.isContiguous());
3399 assert(maxout_p.isContiguous());
3400 assert(hGrads.isContiguous());
3401 assert(pGrads.isContiguous());
3402 assert(target_z.isContiguous());
3403 assert(target_t.isContiguous());
3404
3405 assert(numFilters % 16 == 0);
3406 assert(strideX <= subsX);
3407
3408 target_z.resize(maxout_h);
3409 target_t.resize(maxout_p);
3410
3411 int filtersPerThread = numFilters % 8 == 0 ? 2 : 1;
3412 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3413 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3414 dim3 threads(32, 4);
3415 dim3 blocks(DIVUP(numImages,32*imgsPerThread) * outputsX, DIVUP(numFilters, 4 * filtersPerThread) * outputsX);
3416
3417 if (imgsPerThread == 4) {
3418 if (filtersPerThread == 1) {
3419 if (checkCaseBounds) {
3420 kLocalProbMaxUndo<4, 32, 4, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3421 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3422 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3423 } else {
3424 kLocalProbMaxUndo<4, 32, 4, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3425 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3426 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3427 }
3428 } else {
3429 if (checkCaseBounds) {
3430 kLocalProbMaxUndo<4, 32, 4, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3431 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3432 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3433 } else {
3434 kLocalProbMaxUndo<4, 32, 4, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3435 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3436 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3437 }
3438 }
3439 }
3440 else if (imgsPerThread == 2) {
3441 if (filtersPerThread == 1) {
3442 if (checkCaseBounds) {
3443 kLocalProbMaxUndo<4, 32, 2, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3444 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3445 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3446 } else {
3447 kLocalProbMaxUndo<4, 32, 2, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3448 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3449 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3450 }
3451 } else {
3452 if (checkCaseBounds) {
3453 kLocalProbMaxUndo<4, 32, 2, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3454 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3455 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3456 } else {
3457 kLocalProbMaxUndo<4, 32, 2, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3458 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3459 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3460 }
3461 }
3462 }
3463 else {
3464 if (filtersPerThread == 1) {
3465 if (checkCaseBounds) {
3466 kLocalProbMaxUndo<4, 32, 1, 1, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3467 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3468 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3469 } else {
3470 kLocalProbMaxUndo<4, 32, 1, 1, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3471 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3472 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3473 }
3474 } else {
3475 if (checkCaseBounds) {
3476 kLocalProbMaxUndo<4, 32, 1, 2, true><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3477 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3478 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3479 } else {
3480 kLocalProbMaxUndo<4, 32, 1, 2, false><<<blocks, threads>>>(maxout_h.getDevData(), maxout_p.getDevData(),
3481 hGrads.getDevData(), pGrads.getDevData(), target_z.getDevData(), target_t.getDevData(),
3482 imgSize, numFilters, numImages, subsX, startX, strideX, outputsX, gp_iszero, gh_iszero);
3483 }
3484 }
3485 }
3486
3487 cutilCheckMsg("localProbMaxUndo: kernel execution failed");
3488 }
3489
3490
3491
3492 void convResponseNorm(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale) {
3493 convContrastNorm(images, images, denoms, target, numFilters, sizeX, addScale, powScale);
3494 }
3495
3496 /*
3497 * images: (numFilters, imgPixels, numImages)
3498 * meanDiffs: (numFilters, imgPixels, numImages)
3499 * denoms: (numFilters, imgPixels, numImages) (out)
3500 * target: (numFilters, imgPixels, numImages) (out)
3501 */
3502 void convContrastNorm(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeX, float addScale, float powScale) {
3503 int numImages = images.getNumCols();
3504 int imgPixels = images.getNumRows() / numFilters;
3505 assert(images.getNumRows() == numFilters * imgPixels);
3506 int imgSize = int(sqrt((double)imgPixels));
3507 assert(imgSize * imgSize == imgPixels);
3508 assert(meanDiffs.isSameDims(images));
3509
3510 assert(!meanDiffs.isTrans());
3511 assert(!images.isTrans());
3512 assert(images.isContiguous());
3513 assert(meanDiffs.isContiguous());
3514 assert(numFilters % 16 == 0 || numFilters <= 8);
3515
3516 target.resize(images);
3517 denoms.resize(images);
3518 assert(target.isContiguous());
3519 if (sizeX >= 6 && numFilters % 4 == 0) {
3520 // This one is faster for large regions (my tests show regions >= 6...)
3521 int imgsPerThread = 8;
3522 int filtersPerThread = 4;
3523 int bx = 8;
3524 bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
3525 assert((imgsPerThread * bx) % 32 == 0);
3526 assert(numFilters % filtersPerThread == 0);
3527 dim3 threads(bx, 16);
3528 dim3 blocks(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread);
3529
3530 if (checkCaseBounds) {
3531 cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, true>, cudaFuncCachePreferL1); // L1 faster here
3532 kCNorm2<8, 8, 4, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3533 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3534 } else {
3535 cudaFuncSetCacheConfig(kCNorm2<8, 8, 4, false>, cudaFuncCachePreferL1); // L1 faster here
3536 kCNorm2<8, 8, 4, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3537 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3538 }
3539 } else {
3540 bool checkCaseBounds = numImages % 128 != 0;
3541 if (numFilters <= 8) {
3542 dim3 threads(128);
3543 dim3 blocks(DIVUP(numImages,128) * imgSize, imgSize);
3544 if (numFilters == 1) {
3545 if (checkCaseBounds) {
3546 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, true>, cudaFuncCachePreferL1);
3547 kCNorm_fewfilter<1, 1, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3548 imgSize, numImages, sizeX, addScale, powScale);
3549 } else {
3550 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 1, false>, cudaFuncCachePreferL1);
3551 kCNorm_fewfilter<1, 1, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3552 imgSize, numImages, sizeX, addScale, powScale);
3553 }
3554 } else if (numFilters == 2) {
3555 if (checkCaseBounds) {
3556 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, true>, cudaFuncCachePreferL1);
3557 kCNorm_fewfilter<1, 2, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3558 imgSize, numImages, sizeX, addScale, powScale);
3559 } else {
3560 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 2, false>, cudaFuncCachePreferL1);
3561 kCNorm_fewfilter<1, 2, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3562 imgSize, numImages, sizeX, addScale, powScale);
3563 }
3564 } else if (numFilters == 3) {
3565 if (checkCaseBounds) {
3566 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, true>, cudaFuncCachePreferL1);
3567 kCNorm_fewfilter<1, 3, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3568 imgSize, numImages, sizeX, addScale, powScale);
3569 } else {
3570 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 3, false>, cudaFuncCachePreferL1);
3571 kCNorm_fewfilter<1, 3, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3572 imgSize, numImages, sizeX, addScale, powScale);
3573 }
3574 } else if (numFilters == 4) {
3575 if (checkCaseBounds) {
3576 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, true>, cudaFuncCachePreferL1);
3577 kCNorm_fewfilter<1, 4, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3578 imgSize, numImages, sizeX, addScale, powScale);
3579 } else {
3580 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 4, false>, cudaFuncCachePreferL1);
3581 kCNorm_fewfilter<1, 4, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3582 imgSize, numImages, sizeX, addScale, powScale);
3583 }
3584 } else if (numFilters == 5) {
3585 if (checkCaseBounds) {
3586 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, true>, cudaFuncCachePreferL1);
3587 kCNorm_fewfilter<1, 5, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3588 imgSize, numImages, sizeX, addScale, powScale);
3589 } else {
3590 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 5, false>, cudaFuncCachePreferL1);
3591 kCNorm_fewfilter<1, 5, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3592 imgSize, numImages, sizeX, addScale, powScale);
3593 }
3594 } else if (numFilters == 6) {
3595 if (checkCaseBounds) {
3596 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, true>, cudaFuncCachePreferL1);
3597 kCNorm_fewfilter<1, 6, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3598 imgSize, numImages, sizeX, addScale, powScale);
3599 } else {
3600 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 6, false>, cudaFuncCachePreferL1);
3601 kCNorm_fewfilter<1, 6, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3602 imgSize, numImages, sizeX, addScale, powScale);
3603 }
3604 } else if (numFilters == 7) {
3605 if (checkCaseBounds) {
3606 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, true>, cudaFuncCachePreferL1);
3607 kCNorm_fewfilter<1, 7, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3608 imgSize, numImages, sizeX, addScale, powScale);
3609 } else {
3610 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 7, false>, cudaFuncCachePreferL1);
3611 kCNorm_fewfilter<1, 7, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3612 imgSize, numImages, sizeX, addScale, powScale);
3613 }
3614 } else if (numFilters == 8) {
3615 if (checkCaseBounds) {
3616 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, true>, cudaFuncCachePreferL1);
3617 kCNorm_fewfilter<1, 8, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3618 imgSize, numImages, sizeX, addScale, powScale);
3619 } else {
3620 cudaFuncSetCacheConfig(kCNorm_fewfilter<1, 8, false>, cudaFuncCachePreferL1);
3621 kCNorm_fewfilter<1, 8, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3622 imgSize, numImages, sizeX, addScale, powScale);
3623 }
3624 }
3625 } else {
3626 dim3 threads(32, 4);
3627 dim3 blocks(DIVUP(numImages,32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
3628 if (checkCaseBounds) {
3629 cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
3630 kCNorm_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3631 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3632 } else {
3633 cudaFuncSetCacheConfig(kCNorm_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
3634 kCNorm_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
3635 imgSize, numFilters, numImages, sizeX, addScale, powScale);
3636 }
3637 }
3638 }
3639 cutilCheckMsg("convResponseNorm: kernel execution failed");
3640 }
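/*
 * For reference, a sketch of the forward pass this pairs with (inferred from
 * the -2 * addScale * powScale factor in convResponseNormUndo below, so treat
 * it as an assumption rather than a statement of the kernels' code):
 *   denoms = 1 + addScale * sum_{window}(meanDiffs^2)
 *   target = images * denoms^(-powScale)
 * with meanDiffs == images in the plain response-norm case.
 */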
3641
3642 void convContrastNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& meanDiffs, NVMatrix& acts, NVMatrix& target, int numFilters,
3643 int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) {
3644 convResponseNormUndo(outGrads, denoms, meanDiffs, acts, target, numFilters, sizeX, addScale, powScale, scaleTargets, scaleOutput);
3645 }
3646
3647 /*
3648 * outGrads: (numFilters, imgPixels, numImages)
3649 * denoms: (numFilters, imgPixels, numImages)
3650 * inputs: (numFilters, imgPixels, numImages)
3651 * acts: (numFilters, imgPixels, numImages)
3652 * target: (numFilters, imgPixels, numImages)
3653 *
3654 * THIS WILL OVERWRITE THE ACTS MATRIX.
3655 */
3656 void convResponseNormUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
3657 int sizeX, float addScale, float powScale, float scaleTargets, float scaleOutput) {
3658 int numImages = outGrads.getNumCols();
3659 int imgPixels = outGrads.getNumRows() / numFilters;
3660
3661 int imgSize = int(sqrt((double)imgPixels));
3662 assert(imgSize * imgSize == imgPixels);
3663
3664 assert(outGrads.getNumRows() == numFilters * imgPixels);
3665
3666 assert(denoms.isSameDims(outGrads));
3667 assert(acts.isSameDims(denoms));
3668 assert(!denoms.isTrans());
3669 assert(!outGrads.isTrans());
3670 assert(!acts.isTrans());
3671 assert(!target.isTrans());
3672 assert(outGrads.isContiguous());
3673
3674 assert(numFilters % 16 == 0);
3675
3676 target.resize(outGrads);
3677 assert(target.isContiguous());
3678 // First do acts := -2 x scale x acts x outGrads / denoms
3679 // so that the main routine only has to do an addition in its inner loop.
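    // Sketch of the chain rule being implemented, assuming the forward pass
    //   y_i = x_i * d_i^(-powScale), d_i = 1 + addScale * sum_{j in window(i)} x_j^2:
    //   dL/dx_i = g_i * d_i^(-powScale)
    //           - 2 * addScale * powScale * x_i * sum_{j : i in window(j)} g_j * y_j / d_j.
    // The prelim kernel folds everything after the minus sign, except x_i and the
    // window sum, into acts; the main kernel then accumulates acts over the window,
    // multiplies by the input, and adds g_i * d_i^(-powScale).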
3680 int prelimEltsPerThread = 4;
3681 dim3 threads(128);
3682 dim3 blocks(MIN(512, DIVUP(outGrads.getNumElements(),(threads.x * prelimEltsPerThread))));
3683 kRNormUndoPrelims<128, 4><<<blocks, threads>>>(acts.getDevData(), denoms.getDevData(), outGrads.getDevData(), outGrads.getNumElements(), -2*addScale*powScale);
3684
3685 // Now the main routine
3686 if (sizeX >= 6 && numFilters % 4 == 0) {
3687 // This one is faster for large regions (my tests show regions >= 6...)
3688 int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
3689 int filtersPerThread = 4;
3690 int bx = 16;
3691 bool checkCaseBounds = numImages % (bx*imgsPerThread) != 0;
3692 assert((imgsPerThread * bx) % 32 == 0);
3693
3694 threads = dim3(bx, 16);
3695 blocks = dim3(DIVUP(imgSize, 4) * DIVUP(numImages, bx*imgsPerThread), DIVUP(imgSize, 4) * numFilters / filtersPerThread);
3696 if (imgsPerThread == 8) {
3697 if (checkCaseBounds) {
3698 if (scaleTargets == 0 && scaleOutput == 1) {
3699 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, true>, cudaFuncCachePreferL1);
3700 kRNormUndo2<16, 8, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3701 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3702 scaleTargets, scaleOutput);
3703 } else {
3704 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, true>, cudaFuncCachePreferL1);
3705 kRNormUndo2<16, 8, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3706 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3707 scaleTargets, scaleOutput);
3708 }
3709 } else {
3710 if (scaleTargets == 0 && scaleOutput == 1) {
3711 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, false, false>, cudaFuncCachePreferL1);
3712 kRNormUndo2<16, 8, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3713 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3714 scaleTargets, scaleOutput);
3715 } else {
3716 cudaFuncSetCacheConfig(kRNormUndo2<16, 8, 4, true, false>, cudaFuncCachePreferL1);
3717 kRNormUndo2<16, 8, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3718 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3719 scaleTargets, scaleOutput);
3720 }
3721 }
3722 } else if (imgsPerThread == 4) {
3723 if (checkCaseBounds) {
3724 if (scaleTargets == 0 && scaleOutput == 1) {
3725 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, true>, cudaFuncCachePreferL1);
3726 kRNormUndo2<16, 4, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3727 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3728 scaleTargets, scaleOutput);
3729 } else {
3730 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, true>, cudaFuncCachePreferL1);
3731 kRNormUndo2<16, 4, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3732 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3733 scaleTargets, scaleOutput);
3734 }
3735 } else {
3736 if (scaleTargets == 0 && scaleOutput == 1) {
3737 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, false, false>, cudaFuncCachePreferL1);
3738 kRNormUndo2<16, 4, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3739 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3740 scaleTargets, scaleOutput);
3741 } else {
3742 cudaFuncSetCacheConfig(kRNormUndo2<16, 4, 4, true, false>, cudaFuncCachePreferL1);
3743 kRNormUndo2<16, 4, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3744 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3745 scaleTargets, scaleOutput);
3746 }
3747 }
3748 } else {
3749 if (checkCaseBounds) {
3750 if (scaleTargets == 0 && scaleOutput == 1) {
3751 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, true>, cudaFuncCachePreferL1);
3752 kRNormUndo2<16, 2, 4, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3753 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3754 scaleTargets, scaleOutput);
3755 } else {
3756 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, true>, cudaFuncCachePreferL1);
3757 kRNormUndo2<16, 2, 4, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3758 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3759 scaleTargets, scaleOutput);
3760 }
3761 } else {
3762 if (scaleTargets == 0 && scaleOutput == 1) {
3763 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, false, false>, cudaFuncCachePreferL1);
3764 kRNormUndo2<16, 2, 4, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3765 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3766 scaleTargets, scaleOutput);
3767 } else {
3768 cudaFuncSetCacheConfig(kRNormUndo2<16, 2, 4, true, false>, cudaFuncCachePreferL1);
3769 kRNormUndo2<16, 2, 4, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3770 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3771 scaleTargets, scaleOutput);
3772 }
3773 }
3774 }
3775 } else {
3776 int imgsPerThread = numImages % 64 == 0 ? 2 : 1;
3777 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3778 threads = dim3(32, 4);
3779 blocks = dim3(DIVUP(numImages,32*imgsPerThread) * imgSize, (numFilters / (4 * 2)) * imgSize);
3780
3781 if (imgsPerThread == 2) {
3782 if (checkCaseBounds) {
3783 if (scaleTargets == 0 && scaleOutput == 1) {
3784 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, false, true>, cudaFuncCachePreferL1);
3785 kRNormUndo<4, 32, 2, 2, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3786 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3787 scaleTargets, scaleOutput);
3788 } else {
3789 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, true, true>, cudaFuncCachePreferL1);
3790 kRNormUndo<4, 32, 2, 2, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3791 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3792 scaleTargets, scaleOutput);
3793 }
3794 } else {
3795 if (scaleTargets == 0 && scaleOutput == 1) {
3796 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, false, false>, cudaFuncCachePreferL1);
3797 kRNormUndo<4, 32, 2, 2, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3798 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3799 scaleTargets, scaleOutput);
3800 } else {
3801 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 2, 2, true, false>, cudaFuncCachePreferL1);
3802 kRNormUndo<4, 32, 2, 2, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3803 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3804 scaleTargets, scaleOutput);
3805 }
3806 }
3807 } else {
3808 if (checkCaseBounds) {
3809 if (scaleTargets == 0 && scaleOutput == 1) {
3810 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, false, true>, cudaFuncCachePreferL1);
3811 kRNormUndo<4, 32, 1, 2, false, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3812 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3813 scaleTargets, scaleOutput);
3814 } else {
3815 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, true, true>, cudaFuncCachePreferL1);
3816 kRNormUndo<4, 32, 1, 2, true, true><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3817 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3818 scaleTargets, scaleOutput);
3819 }
3820 } else {
3821 if (scaleTargets == 0 && scaleOutput == 1) {
3822 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, false, false>, cudaFuncCachePreferL1);
3823 kRNormUndo<4, 32, 1, 2, false, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3824 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3825 scaleTargets, scaleOutput);
3826 } else {
3827 cudaFuncSetCacheConfig(kRNormUndo<4, 32, 1, 2, true, false>, cudaFuncCachePreferL1);
3828 kRNormUndo<4, 32, 1, 2, true, false><<<blocks, threads>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
3829 target.getDevData(), imgSize, numFilters, numImages, sizeX, powScale,
3830 scaleTargets, scaleOutput);
3831 }
3832 }
3833 }
3834 }
3835 cutilCheckMsg("kRNormUndo: kernel execution failed");
3836 }
3837
3838 /*
3839 * imgs: (numChannels, imgPixels, numImages) with given imgStride
3840 * target: (numChannels, tgtPixels, numImages)
3841 *
3842 * imgSize = scale * tgtSize
3843 */
3844 void convResizeBilinear(NVMatrix& images, NVMatrix& target, int imgSize, int tgtSize, float scale) {
3845 assert(!images.isTrans());
3846 assert(!target.isTrans());
3847 int imgPixels = imgSize * imgSize;
3848 int tgtPixels = tgtSize * tgtSize;
3849 int numChannels = images.getNumRows() / imgPixels;
3850 int numImages = images.getNumCols();
3851 assert(images.getNumRows() == numChannels * imgPixels);
3852
3853 target.resize(numChannels * tgtPixels, numImages);
3854 assert(target.isContiguous());
3855 int numChunksX = DIVUP(tgtSize, 4);
3856 int numChunks = numChunksX * numChunksX;
3857 double imgCenter = imgSize * 0.5;
3858 double tgtCenter = tgtSize * 0.5;
3859 double centerScale = imgCenter - tgtCenter * scale;
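    // The kernel is assumed to map target pixel t to source coordinate
    // t * scale + centerScale, which aligns the image centers: e.g.
    // imgSize = 256, tgtSize = 64, scale = 4 gives centerScale = 128 - 32*4 = 0.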
3860
3861 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3862 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3863
3864 dim3 threads(32, 16);
3865 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), numChannels * numChunks);
3866 if (imgsPerThread == 4) {
3867 if (checkCaseBounds) {
3868 cudaFuncSetCacheConfig(kResizeBilinear<4, true>, cudaFuncCachePreferL1);
3869 kResizeBilinear<4, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3870 } else {
3871 cudaFuncSetCacheConfig(kResizeBilinear<4, false>, cudaFuncCachePreferL1);
3872 kResizeBilinear<4, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3873 }
3874 } else if (imgsPerThread == 2) {
3875 if (checkCaseBounds) {
3876 cudaFuncSetCacheConfig(kResizeBilinear<2, true>, cudaFuncCachePreferL1);
3877 kResizeBilinear<2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3878 } else {
3879 cudaFuncSetCacheConfig(kResizeBilinear<2, false>, cudaFuncCachePreferL1);
3880 kResizeBilinear<2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3881 }
3882 } else {
3883 if (checkCaseBounds) {
3884 cudaFuncSetCacheConfig(kResizeBilinear<1, true>, cudaFuncCachePreferL1);
3885 kResizeBilinear<1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3886 } else {
3887 cudaFuncSetCacheConfig(kResizeBilinear<1, false>, cudaFuncCachePreferL1);
3888 kResizeBilinear<1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgSize, tgtSize, numImages, images.getStride(), scale, centerScale);
3889 }
3890 }
3891 cutilCheckMsg("convResizeBilinear: kernel execution failed");
3892 }
3893
3894 /*
3895 * imgs: (3, imgPixels, numImages) with given imgStride
3896 * target: (3, imgPixels, numImages)
3897 */
3898 void convRGBToYUV(NVMatrix& images, NVMatrix& target) {
3899 assert(!images.isTrans());
3900 assert(!target.isTrans());
3901 int imgPixels = images.getNumRows() / 3;
3902 int numImages = images.getNumCols();
3903 assert(images.getNumRows() == 3 * imgPixels);
3904
3905 target.resize(3 * imgPixels, numImages);
3906 assert(target.isContiguous());
3907 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3908 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3909
3910 dim3 threads(32, 4);
3911 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
3912 if (imgsPerThread == 4) {
3913 if (checkCaseBounds) {
3914 cudaFuncSetCacheConfig(kRGBToYUV<4, true>, cudaFuncCachePreferL1);
3915 kRGBToYUV<4, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3916 } else {
3917 cudaFuncSetCacheConfig(kRGBToYUV<4, false>, cudaFuncCachePreferL1);
3918 kRGBToYUV<4, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3919 }
3920 } else if (imgsPerThread == 2) {
3921 if (checkCaseBounds) {
3922 cudaFuncSetCacheConfig(kRGBToYUV<2, true>, cudaFuncCachePreferL1);
3923 kRGBToYUV<2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3924 } else {
3925 cudaFuncSetCacheConfig(kRGBToYUV<2, false>, cudaFuncCachePreferL1);
3926 kRGBToYUV<2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3927 }
3928 } else {
3929 if (checkCaseBounds) {
3930 cudaFuncSetCacheConfig(kRGBToYUV<1, true>, cudaFuncCachePreferL1);
3931 kRGBToYUV<1, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3932 } else {
3933 cudaFuncSetCacheConfig(kRGBToYUV<1, false>, cudaFuncCachePreferL1);
3934 kRGBToYUV<1, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3935 }
3936 }
3937 cutilCheckMsg("convRGBToYUV: kernel execution failed");
3938 }
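/*
 * kRGBToYUV is defined elsewhere; if it follows the usual BT.601 convention
 * (an assumption, not verified against that kernel), the per-pixel transform is
 *   Y =  0.299 R + 0.587 G + 0.114 B
 *   U = -0.147 R - 0.289 G + 0.436 B
 *   V =  0.615 R - 0.515 G - 0.100 B
 */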
3939
3940 /*
3941 * imgs: (3, imgPixels, numImages) with given imgStride
3942 * target: (3, imgPixels, numImages)
3943 */
3944 void convRGBToLAB(NVMatrix& images, NVMatrix& target, bool center) {
3945 assert(!images.isTrans());
3946 assert(!target.isTrans());
3947 int imgPixels = images.getNumRows() / 3;
3948 int numImages = images.getNumCols();
3949 assert(images.getNumRows() == 3 * imgPixels);
3950
3951 target.resize(3 * imgPixels, numImages);
3952 assert(target.isContiguous());
3953
3954 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
3955 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
3956 dim3 threads(32, 4);
3957 dim3 blocks(DIVUP(numImages, imgsPerThread * 32), DIVUP(imgPixels, 4));
3958
3959 if (imgsPerThread == 4) {
3960 if (center) {
3961 if (checkCaseBounds) {
3962 cudaFuncSetCacheConfig(kRGBToLAB<4, true, true>, cudaFuncCachePreferL1);
3963 kRGBToLAB<4, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3964 } else {
3965 cudaFuncSetCacheConfig(kRGBToLAB<4, false, true>, cudaFuncCachePreferL1);
3966 kRGBToLAB<4, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3967 }
3968 } else {
3969 if (checkCaseBounds) {
3970 cudaFuncSetCacheConfig(kRGBToLAB<4, true, false>, cudaFuncCachePreferL1);
3971 kRGBToLAB<4, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3972 } else {
3973 cudaFuncSetCacheConfig(kRGBToLAB<4, false, false>, cudaFuncCachePreferL1);
3974 kRGBToLAB<4, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3975 }
3976 }
3977 } else if (imgsPerThread == 2) {
3978 if (center) {
3979 if (checkCaseBounds) {
3980 cudaFuncSetCacheConfig(kRGBToLAB<2, true, true>, cudaFuncCachePreferL1);
3981 kRGBToLAB<2, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3982 } else {
3983 cudaFuncSetCacheConfig(kRGBToLAB<2, false, true>, cudaFuncCachePreferL1);
3984 kRGBToLAB<2, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3985 }
3986 } else {
3987 if (checkCaseBounds) {
3988 cudaFuncSetCacheConfig(kRGBToLAB<2, true, false>, cudaFuncCachePreferL1);
3989 kRGBToLAB<2, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3990 } else {
3991 cudaFuncSetCacheConfig(kRGBToLAB<2, false, false>, cudaFuncCachePreferL1);
3992 kRGBToLAB<2, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
3993 }
3994 }
3995 } else {
3996 if (center) {
3997 if (checkCaseBounds) {
3998 cudaFuncSetCacheConfig(kRGBToLAB<1, true, true>, cudaFuncCachePreferL1);
3999 kRGBToLAB<1, true, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4000 } else {
4001 cudaFuncSetCacheConfig(kRGBToLAB<1, false, true>, cudaFuncCachePreferL1);
4002 kRGBToLAB<1, false, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4003 }
4004 } else {
4005 if (checkCaseBounds) {
4006 cudaFuncSetCacheConfig(kRGBToLAB<1, true, false>, cudaFuncCachePreferL1);
4007 kRGBToLAB<1, true, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4008 } else {
4009 cudaFuncSetCacheConfig(kRGBToLAB<1, false, false>, cudaFuncCachePreferL1);
4010 kRGBToLAB<1, false, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(), imgPixels, numImages, images.getStride());
4011 }
4012 }
4013 }
4014 cutilCheckMsg("convRGBToLAB: kernel execution failed");
4015 }
4016
4017 /*
4018 * imgs: (numChannels, imgPixels, numImages) with given imgStride
4019 * target: (numChannels, tgtPixels, numImages)
4020 */
4021 void convCrop(NVMatrix& imgs, NVMatrix& target, int imgSize, int tgtSize, int startY, int startX) {
4022 int numImages = imgs.getNumCols();
4023 int imgPixels = imgSize * imgSize;
4024 int tgtPixels = tgtSize * tgtSize;
4025
4026 int numChannels = imgs.getNumRows() / imgPixels;
4027 assert(imgs.getNumRows() == imgPixels * numChannels);
4028 assert(imgPixels == imgSize * imgSize);
4029 assert(imgSize - startY >= tgtSize);
4030 assert(imgSize - startX >= tgtSize);
4031 assert(startY >= 0);
4032 assert(startX >= 0);
4033 target.resize(numChannels * tgtPixels, numImages);
4034 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
4035 bool checkCaseBounds = numImages % (32*imgsPerThread) != 0;
4036 dim3 blocks(DIVUP(numImages, 32 * imgsPerThread), numChannels * DIVUP(tgtPixels, 4));
4037 dim3 threads(32, 4);
4038 if (imgsPerThread == 4) {
4039 if (checkCaseBounds) {
4040 kCrop<4, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4041 } else {
4042 kCrop<4, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4043 }
4044 } else if (imgsPerThread == 2) {
4045 if (checkCaseBounds) {
4046 kCrop<2, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4047 } else {
4048 kCrop<2, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4049 }
4050 } else {
4051 if (checkCaseBounds) {
4052 kCrop<1, true><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4053 } else {
4054 kCrop<1, false><<<blocks, threads>>>(imgs.getDevData(), target.getDevData(), numImages, imgs.getStride(), imgSize, tgtSize, startY, startX);
4055 }
4056 }
4057 cutilCheckMsg("convCrop: kernel execution failed");
4058 }
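/*
 * Usage sketch (hypothetical values): a centered crop from 32x32 down to 24x24
 * uses startY = startX = (32 - 24) / 2 = 4, which satisfies the
 * imgSize - start >= tgtSize assertions above.
 */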
4059
4060 /*
4061 * images: (numFilters, imgPixels, numImages)
4062 * ticas: (numFilters, imgPixels, numImages)
4063 * target: (numFilters, imgPixels, numImages) (out)
4064 *
4065 * Computes TICA-style gradient for given feature maps
4066 * f(x) = exp(-(sum_i{x_i^2}^(1/2)))
4067 * dlogf(x)/dx_i = -x_i / (sum_i{x_i^2}^(1/2) + eps)
4068 *
4069 * eps added for numerical stability
4070 */
4071 void convTICAGrad(NVMatrix& images, NVMatrix& ticas, NVMatrix& target, int numFilters, int sizeX, float scaleTarget, float scaleOutput) {
4072 int numImages = images.getNumCols();
4073 int imgPixels = images.getNumRows() / numFilters;
4074 assert(images.getNumRows() == numFilters * imgPixels);
4075 int imgSize = int(sqrt((double)imgPixels));
4076 assert(imgSize * imgSize == imgPixels);
4077
4078 assert(!images.isTrans());
4079 assert(images.isContiguous());
4080 assert(numFilters % 16 == 0 || numFilters <= 8);
4081
4082 assert(ticas.isSameDims(images));
4083 assert(ticas.isContiguous());
4084
4085 if (scaleTarget == 0) {
4086 target.resize(images);
4087 } else {
4088 assert(target.isSameDims(images));
4089 }
4090 assert(target.isContiguous());
4091
4092 // TEMPORARY
4093 assert(numFilters > 8);
4094 assert(sizeX < 6);
4095
4096 dim3 threads(32, 4);
4097 dim3 blocks(DIVUP(numImages, 32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
4098 bool checkCaseBounds = (numImages % 128) != 0;
4099 if (checkCaseBounds) {
4100 cudaFuncSetCacheConfig(kTICAGrad_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
4101 kTICAGrad_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), ticas.getDevData(), target.getDevData(),
4102 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4103 } else {
4104 cudaFuncSetCacheConfig(kTICAGrad_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
4105 kTICAGrad_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), ticas.getDevData(), target.getDevData(),
4106 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4107 }
4108
4109 cutilCheckMsg("convTICAGrad: kernel execution failed");
4110 }
4111
4112 /*
4113 * images: (numFilters, imgPixels, numImages)
4114 * target: (numFilters, imgPixels, numImages) (out)
4115 *
4116 * Computes TICA-style gradient for given feature maps
4117 * f(x) = exp(-(sum_i{x_i^2}^(1/2)))
4118  * dlogf(x)/dx_i = -x_i / (sum_i{x_i^2}^(1/2) + eps)
4119 *
4120 * eps added for numerical stability
4121 */
4122 void convTICA(NVMatrix& images, NVMatrix& target, int numFilters, int sizeX, float scaleTarget, float scaleOutput) {
4123 int numImages = images.getNumCols();
4124 int imgPixels = images.getNumRows() / numFilters;
4125 assert(images.getNumRows() == numFilters * imgPixels);
4126 int imgSize = int(sqrt((double)imgPixels));
4127 assert(imgSize * imgSize == imgPixels);
4128
4129 assert(!images.isTrans());
4130 assert(images.isContiguous());
4131 assert(numFilters % 16 == 0 || numFilters <= 8);
4132
4133 if (scaleTarget == 0) {
4134 target.resize(images);
4135 } else {
4136 assert(target.isSameDims(images));
4137 }
4138 assert(target.isContiguous());
4139
4140 // TEMPORARY
4141 assert(numFilters > 8);
4142 assert(sizeX < 6);
4143
4144 dim3 threads(32, 4);
4145 dim3 blocks(DIVUP(numImages, 32*4) * imgSize, (numFilters / (4 * 2)) * imgSize);
4146 bool checkCaseBounds = (numImages % 128) != 0;
4147 if (checkCaseBounds) {
4148 cudaFuncSetCacheConfig(kTICA_manyfilter<4, 32, 4, 2, true>, cudaFuncCachePreferL1);
4149 kTICA_manyfilter<4, 32, 4, 2, true><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
4150 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4151 } else {
4152 cudaFuncSetCacheConfig(kTICA_manyfilter<4, 32, 4, 2, false>, cudaFuncCachePreferL1);
4153 kTICA_manyfilter<4, 32, 4, 2, false><<<blocks, threads>>>(images.getDevData(), target.getDevData(),
4154 imgSize, numFilters, numImages, sizeX, scaleTarget, scaleOutput);
4155 }
4156
4157 cutilCheckMsg("convTICA: kernel execution failed");
4158 }
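/*
 * Launch-shape note for convTICAGrad/convTICA above (added commentary): with
 * threads = (32, 4) and 4 * 2 = 8 filters handled per block, the grid is
 *
 *     blocks = (DIVUP(numImages, 128) * imgSize, (numFilters / 8) * imgSize),
 *
 * e.g. numImages = 128, numFilters = 16, imgSize = 21 gives a 21 x 42 grid.
 * The TEMPORARY assert numFilters > 8, combined with the earlier
 * numFilters % 16 == 0 || numFilters <= 8 check, forces numFilters to be a
 * multiple of 16, so the division by 8 is exact.
 */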
4159
4160
4161 /*
4162 * images: (numFilters, imgPixels, numImages)
4163 * meanDiffs: (numFilters, imgPixels, numImages)
4164 * denoms: (numFilters, imgPixels, numImages) (out)
4165 * target: (numFilters, imgPixels, numImages) (out)
4166
4167 * Note: at present, I have no code to compute the meanDiffs. So it should be set
4168 * to be equal to images. In other words, this isn't really doing contrast normalization,
4169 * just response normalization.
4170 */
4171 void convContrastNormCrossMap(NVMatrix& images, NVMatrix& meanDiffs, NVMatrix& denoms, NVMatrix& target,
4172 int numFilters, int sizeF, float addScale, float powScale, bool blocked) {
4173 int numImages = images.getNumCols();
4174 int imgPixels = images.getNumRows() / numFilters;
4175 assert(images.getNumRows() == numFilters * imgPixels);
4176 int imgSize = int(sqrt((double)imgPixels));
4177 assert(imgSize * imgSize == imgPixels);
4178 assert(meanDiffs.isSameDims(images));
4179 assert(sizeF > 0 && sizeF <= numFilters);
4180
4181 assert(!meanDiffs.isTrans());
4182 assert(!images.isTrans());
4183 assert(images.isContiguous());
4184 assert(meanDiffs.isContiguous());
4185 assert(numFilters % 16 == 0);
4186
4187 target.resize(images);
4188 denoms.resize(images);
4189 assert(target.isContiguous());
4190
4191 bool checkCaseBounds = numImages % 128 != 0;
4192
4193 dim3 threads(32, 4);
4194 dim3 blocks(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize);
4195 if (blocked) {
4196 if (checkCaseBounds) {
4197 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, true>, cudaFuncCachePreferL1);
4198 kFCNorm<4, 32, 4, true, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4199 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4200 } else {
4201 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, true>, cudaFuncCachePreferL1);
4202 kFCNorm<4, 32, 4, false, true><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4203 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4204 }
4205 } else {
4206 if (checkCaseBounds) {
4207 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, true, false>, cudaFuncCachePreferL1);
4208 kFCNorm<4, 32, 4, true, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4209 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4210 } else {
4211 cudaFuncSetCacheConfig(kFCNorm<4, 32, 4, false, false>, cudaFuncCachePreferL1);
4212 kFCNorm<4, 32, 4, false, false><<<blocks, threads>>>(images.getDevData(), meanDiffs.getDevData(), denoms.getDevData(), target.getDevData(),
4213 imgSize, numFilters, numImages, sizeF, addScale, powScale);
4214 }
4215 }
4216
4217 cutilCheckMsg("convContrastNormCrossMap: kernel execution failed");
4218 }
4219
4220 /*
4221 * outGrads: (numFilters, imgPixels, numImages)
4222 * denoms: (numFilters, imgPixels, numImages)
4223 * inputs: (numFilters, imgPixels, numImages)
4224 * acts: (numFilters, imgPixels, numImages)
4225 * target: (numFilters, imgPixels, numImages)
4226 *
4227 * THIS WILL OVERWRITE THE ACTS MATRIX.
4228 */
4229 void convResponseNormCrossMapUndo(NVMatrix& outGrads, NVMatrix& denoms, NVMatrix& inputs, NVMatrix& acts, NVMatrix& target, int numFilters,
4230 int sizeF, float addScale, float powScale, bool blocked, float scaleTargets, float scaleOutput) {
4231 int numImages = outGrads.getNumCols();
4232 int imgPixels = outGrads.getNumRows() / numFilters;
4233
4234 int imgSize = int(sqrt((double)imgPixels));
4235 assert(imgSize * imgSize == imgPixels);
4236 assert(sizeF > 0 && sizeF <= numFilters);
4237 assert(outGrads.getNumRows() == numFilters * imgPixels);
4238
4239 assert(denoms.isSameDims(outGrads));
4240 assert(acts.isSameDims(denoms));
4241 assert(!denoms.isTrans());
4242 assert(!outGrads.isTrans());
4243 assert(!acts.isTrans());
4244 assert(!target.isTrans());
4245 assert(outGrads.isContiguous());
4246
4247 assert(numFilters % 16 == 0);
4248
4249 target.resize(outGrads);
4250 assert(target.isContiguous());
4251     // First do acts := -2 * addScale * powScale * acts * outGrads / denoms
4252 // so that the main routine only has to do an addition in its inner loop.
4253 int prelimEltsPerThread = 4;
4254 dim3 threads(128);
4255 dim3 blocks(MIN(512, DIVUP(outGrads.getNumElements(),(threads.x * prelimEltsPerThread))));
4256 kRNormUndoPrelims<128, 4><<<blocks, threads>>>(acts.getDevData(), denoms.getDevData(), outGrads.getDevData(), outGrads.getNumElements(), -2*addScale*powScale);
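    /*
     * Commentary on the prelim pass above (added; assumes the standard
     * cross-map response norm with f_j = x_j * d_j^(-p), where
     * d_j = 1 + addScale * sum_{k in N(j)} x_k^2 and p = powScale):
     *
     *     dE/dx_i = g_i * d_i^(-p)
     *             - 2 * addScale * p * x_i * sum_{j : i in N(j)} g_j * x_j * d_j^(-p-1)
     *
     * Since acts holds f_j on entry, the prelim rewrite
     * acts_j := -2 * addScale * p * g_j * acts_j / d_j yields exactly
     * -2 * addScale * p * g_j * x_j * d_j^(-p-1), so the main routine below
     * only has to sum acts over the neighborhood, multiply by x_i, and add
     * the g_i * d_i^(-p) term.
     */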
4257
4258 // Now the main routine
4259
4260 dim3 threads2 = dim3(32, 4);
4261 dim3 blocks2 = dim3(DIVUP(numImages,32*4) * imgSize, (numFilters / 4) * imgSize);
4262 bool checkCaseBounds = (numImages % 128) != 0;
4263 if (blocked) {
4264 if (scaleTargets == 0 && scaleOutput == 1) {
4265 if (checkCaseBounds) {
4266 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, true, true>, cudaFuncCachePreferL1);
4267 kFRNormUndo<4, 32, 4, false, true, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4268 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4269 scaleTargets, scaleOutput);
4270 } else {
4271 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, false, true>, cudaFuncCachePreferL1);
4272 kFRNormUndo<4, 32, 4, false, false, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4273 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4274 scaleTargets, scaleOutput);
4275 }
4276 } else {
4277 if (checkCaseBounds) {
4278 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, true, true>, cudaFuncCachePreferL1);
4279 kFRNormUndo<4, 32, 4, true, true, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4280 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4281 scaleTargets, scaleOutput);
4282 } else {
4283 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, false, true>, cudaFuncCachePreferL1);
4284 kFRNormUndo<4, 32, 4, true, false, true><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4285 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4286 scaleTargets, scaleOutput);
4287 }
4288 }
4289 } else {
4290 if (scaleTargets == 0 && scaleOutput == 1) {
4291 if (checkCaseBounds) {
4292 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, true, false>, cudaFuncCachePreferL1);
4293 kFRNormUndo<4, 32, 4, false, true, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4294 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4295 scaleTargets, scaleOutput);
4296 } else {
4297 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, false, false, false>, cudaFuncCachePreferL1);
4298 kFRNormUndo<4, 32, 4, false, false, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4299 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4300 scaleTargets, scaleOutput);
4301 }
4302 } else {
4303 if (checkCaseBounds) {
4304 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, true, false>, cudaFuncCachePreferL1);
4305 kFRNormUndo<4, 32, 4, true, true, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4306 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4307 scaleTargets, scaleOutput);
4308 } else {
4309 cudaFuncSetCacheConfig(kFRNormUndo<4, 32, 4, true, false, false>, cudaFuncCachePreferL1);
4310 kFRNormUndo<4, 32, 4, true, false, false><<<blocks2, threads2>>>(outGrads.getDevData(), denoms.getDevData(), inputs.getDevData(), acts.getDevData(),
4311 target.getDevData(), imgSize, numFilters, numImages, sizeF, powScale,
4312 scaleTargets, scaleOutput);
4313 }
4314 }
4315 }
4316
4317 cutilCheckMsg("convResponseNormCrossMapUndo: kernel execution failed");
4318 }
4319
4320 void convResponseNormCrossMap(NVMatrix& images, NVMatrix& denoms, NVMatrix& target, int numFilters, int sizeF, float addScale, float powScale, bool blocked) {
4321 convContrastNormCrossMap(images, images, denoms, target, numFilters, sizeF, addScale, powScale, blocked);
4322 }
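/*
 * Illustrative forward/backward pairing (added commentary, not in the original
 * file; matrices and sizes here are hypothetical, numFilters = 64, sizeF = 5):
 *
 *     convResponseNormCrossMap(images, denoms, acts, 64, 5, 1e-4f, 0.75f, true);
 *     // ... forward the acts, obtain outGrads from the layer above ...
 *     convResponseNormCrossMapUndo(outGrads, denoms, images, acts, inputGrads,
 *                                  64, 5, 1e-4f, 0.75f, true, 0, 1);
 *     // NB: as documented above, the undo overwrites acts.
 */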
4323
4324 /*
4325 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
4326 * All rights reserved.
4327 *
4328 * Redistribution and use in source and binary forms, with or without modification,
4329 * are permitted provided that the following conditions are met:
4330 *
4331 * - Redistributions of source code must retain the above copyright notice,
4332 * this list of conditions and the following disclaimer.
4333 *
4334 * - Redistributions in binary form must reproduce the above copyright notice,
4335 * this list of conditions and the following disclaimer in the documentation
4336 * and/or other materials provided with the distribution.
4337 *
4338 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
4339 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
4340 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
4341 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
4342 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
4343 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
4344 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
4345 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
4346 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
4347 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
4348 */
4349
4350 #ifndef _CUDACONV2_EXPORT
4351 #define _CUDACONV2_EXPORT
4352 #endif
4353
4354 #include <cutil_inline.h>
4355 #include <nvmatrix.cuh>
4356 #include <cudaconv2.cuh>
4357
4358 /*
4359 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4360 * threadIdx.x determines image
4361 * threadIdx.y determines filter
4362 *
4363 * blockIdx.x determines image batch of B_X * imgsPerThread
4364  * blockIdx.y determines both the module and the filter batch of B_Y * filtersPerThread
4365 *
4366 * images: (numColors, imgSizeY, imgSizeX, numImages) with stride given
4367 * filters: (numColors, filterPixels, numFilters) if conv
4368 * (numModules, numColors, filterPixels, numFilters) otherwise
4369 *
4370 * targets: (numFilters, numModulesY, numModulesX, numImages)
4371 *
4372 * B_Y one of 4, 8, 16
4373 * B_X one of 16, 32
4374 * imgsPerThread one of 1, 2, 4
4375 * filtersPerThread one of 1, 2, 4, 8
4376 *
4377 * Number of filters per module should be divisible by B_Y * filtersPerThread
4378 * checkImgBounds indicates whether number of images is divisible by B_X * imgsPerThread
4379 *
4380 * The imgSize here is the size of the actual image without the padding.
4381 *
4382 */
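/*
 * Worked example of the decomposition above (added commentary): with
 * B_Y = 4, B_X = 32, filtersPerThread = 8, imgsPerThread = 4, numFilters = 32
 * and numImages = 128, each block covers 4 * 8 = 32 filters and 32 * 4 = 128
 * images, so blocksPerModule = 32 / 32 = 1 and the grid is
 * (DIVUP(128, 128), numModules * 1) = (1, numModules); the kernel then
 * recovers moduleIdx = blockIdx.y / blocksPerModule and the filter batch
 * from blockIdx.y % blocksPerModule.
 */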
4383 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int numColors,
4384 bool scale, bool checkImgBounds>
4385 __global__ void filterActs_YxX_color(float* images, float* filters, float* targets,
4386 const int numImages, const int numFilters,
4387 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4388 const int moduleStride,
4389 const int numModulesY, const int numModulesX, const int imgStride,
4390 const float scaleTargets, const float scaleOutputs,
4391 const bool conv) {
4392 __shared__ float shFilters[B_Y*numColors][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4393 __shared__ float shImages[B_Y*numColors][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4394 const int imgPixels = imgSizeY * imgSizeX;
4395 const int filterPixels = filterSize * filterSize;
4396
4397 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4398 const int moduleIdx = blockIdx.y / blocksPerModule;
4399 const int blockFilterIdx = blockIdx.y % blocksPerModule;
4400
4401 const int tidx = threadIdx.y * B_X + threadIdx.x;
4402
4403 const int imgLoadModPosY = (moduleIdx / numModulesX) * moduleStride;
4404 const int imgLoadModPosX = (moduleIdx % numModulesX) * moduleStride;
4405
4406 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4407 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4408 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4409 images += myImgIdx;
4410 filters += filtersPerThread * B_Y * blockFilterIdx
4411 + shFilterLoadY * numFilters + shFilterLoadX;
4412 if (!conv) {
4413 filters += moduleIdx * numColors * filterPixels * numFilters;
4414 }
4415
4416 targets += moduleIdx * numImages
4417 + (blockFilterIdx * B_Y * filtersPerThread + threadIdx.y) * numImages * numModulesY * numModulesX
4418 + myImgIdx;
4419
4420
4421 float prod[filtersPerThread][imgsPerThread];
4422 #pragma unroll
4423 for(int f = 0; f < filtersPerThread; f++) {
4424 #pragma unroll
4425 for(int g = 0; g < imgsPerThread; g++) {
4426 prod[f][g] = 0;
4427 }
4428 }
4429
4430 for (int p = 0; p < filterPixels; p += B_Y) {
4431 /*
4432 * Load B_Y pixels from B_Y*filtersPerThread filters
4433 */
4434 if (shFilterLoadY < B_Y) {
4435 #pragma unroll
4436 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4437 if (p + p2 + shFilterLoadY < filterPixels) {
4438 #pragma unroll
4439 for (int c = 0; c < numColors; c++) {
4440 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[(c * filterPixels + p + p2) * numFilters];
4441 }
4442 } else {
4443 #pragma unroll
4444 for (int c = 0; c < numColors; c++) {
4445 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4446 }
4447 }
4448 }
4449 }
4450
4451 /*
4452 * Load B_Y pixels from B_X*imgsPerThread images
4453 */
4454 const int pixIdx = p + threadIdx.y;
4455 if (pixIdx < filterPixels) {
4456 const int x = paddingStart + imgLoadModPosX + pixIdx % filterSize;
4457 const int y = paddingStart + imgLoadModPosY + pixIdx / filterSize;
4458             if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4459 #pragma unroll
4460 for (int i = 0; i < imgsPerThread; i++) {
4461 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4462 #pragma unroll
4463 for (int c = 0; c < numColors; c++) {
4464 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = images[imgStride * (c * imgPixels + y * imgSizeX + x) + i * B_X];
4465 }
4466 } else {
4467 #pragma unroll
4468 for (int c = 0; c < numColors; c++) {
4469 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4470 }
4471 }
4472 }
4473 } else { // Padding
4474 #pragma unroll
4475 for (int i = 0; i < imgsPerThread; i++) {
4476 #pragma unroll
4477 for (int c = 0; c < numColors; c++) {
4478 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4479 }
4480 }
4481 }
4482 }
4483 __syncthreads();
4484 #pragma unroll
4485 for (int i = 0; i < B_Y*numColors; i++) {
4486 #pragma unroll
4487 for(int f = 0; f < filtersPerThread; f++) {
4488 #pragma unroll
4489 for(int g = 0; g < imgsPerThread; g++) {
4490 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4491 }
4492 }
4493
4494 }
4495 __syncthreads();
4496 }
4497
4498 if (scale) {
4499 #pragma unroll
4500 for (int g = 0; g < imgsPerThread; g++) {
4501 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4502 #pragma unroll
4503 for (int f = 0; f < filtersPerThread; f++) {
4504 targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] + scaleOutputs * prod[f][g];
4505 }
4506 }
4507 }
4508 } else {
4509 #pragma unroll
4510 for (int g = 0; g < imgsPerThread; g++) {
4511 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4512 #pragma unroll
4513 for (int f = 0; f < filtersPerThread; f++) {
4514 targets[g * B_X + f * B_Y * numImages * numModulesY * numModulesX] = scaleOutputs * prod[f][g];
4515 }
4516 }
4517 }
4518 }
4519 }
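/*
 * Output indexing note (added commentary): targets is laid out as
 * (numFilters, numModulesY, numModulesX, numImages), so element
 * (filter f, module m, image i) lives at linear offset
 * (f * numModulesY * numModulesX + m) * numImages + i. The pointer arithmetic
 * near the top of the kernel bakes in m = moduleIdx plus the block's base
 * filter and image, leaving only the f * B_Y and g * B_X strides to the
 * write loops above.
 */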
4520
4521 /*
4522 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4523 * threadIdx.x determines image
4524 * threadIdx.y determines filter
4525 *
4526 * blockIdx.x determines image batch of B_X * imgsPerThread
4527 * blockIdx.y determines filter batch of B_Y * filtersPerThread
4528 *
4529 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4530 * filters: (numFilterColors, filterPixels, numFilters) if conv
4531 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4532 *
4533 * targets: (numFilters, numModulesY, numModulesX, numImages)
4534 *
4535 * B_Y one of 4, 8, 16
4536 * B_X one of 16, 32
4537 * imgsPerThread one of 1, 2, 4
4538 * filtersPerThread one of 1, 2, 4, 8
4539 * colorCache: how many colors to put into shmem
4540 *
4541 * numFilters should be divisible by B_Y * filtersPerThread
4542  * numImages should be divisible by B_X * imgsPerThread
4543 * numFilterColors should be divisible by colorCache.
4544 * numImgColors must be even.
4545 * numFilters must be divisible by numGroups.
4546 *
4547 * The imgSize here is the size of the actual image without the padding.
4548 *
4549 */
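/*
 * Group arithmetic example (added commentary): with numImgColors = 64 and
 * numGroups = 2, each filter sees numFilterColors = 64 / 2 = 32 input
 * channels. A block's filter batch determines its group (blockGroupIdx), and
 * therefore which contiguous 32-channel slice of the input it reads
 * (blockColorIdx = numFilterColors * blockGroupIdx below); colorCache channels
 * of that slice are staged through shared memory per outer-loop iteration.
 */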
4550 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int colorCache,
4551 bool scale, bool checkImgBounds>
4552 __global__ void filterActs_YxX_sparse(float* images, float* filters, float* targets,
4553 const int numImages, const int numFilters,
4554 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4555 const int moduleStride,
4556 const int numModulesY, const int numModulesX, const int imgStride, const int numImgColors,
4557 const int numGroups,
4558 const float scaleTargets, const float scaleOutputs,
4559 const bool conv) {
4560 __shared__ float shFilters[B_Y*colorCache][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4561 __shared__ float shImages[B_Y*colorCache][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4562 const int imgPixels = imgSizeY * imgSizeX;
4563 const int filterPixels = filterSize * filterSize;
4564 const int numFilterColors = numImgColors / numGroups;
4565 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4566 const int moduleIdx = blockIdx.y / blocksPerModule;
4567 const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
4568 const int numFiltersPerGroup = numFilters / numGroups;
4569 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
4570
4571 const int numModules = numModulesX * numModulesY;
4572 const int blockColorIdx = numFilterColors * blockGroupIdx;
4573
4574 const int tidx = threadIdx.y * B_X + threadIdx.x;
4575
4576 const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
4577 const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
4578
4579 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4580 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4581 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4582
4583 images += blockColorIdx * imgPixels * imgStride + myImgIdx;
4584     filters += blockFilterIdx
4585 + shFilterLoadY * numFilters + shFilterLoadX;
4586 if (!conv) {
4587 filters += moduleIdx * numFilterColors * filterPixels * numFilters;
4588 }
4589
4590 targets += moduleIdx * numImages
4591 + (blockFilterIdx + threadIdx.y) * numImages * numModules
4592 + myImgIdx;
4593
4594 float prod[filtersPerThread][imgsPerThread];
4595 #pragma unroll
4596 for(int f = 0; f < filtersPerThread; f++) {
4597 #pragma unroll
4598 for(int g = 0; g < imgsPerThread; g++) {
4599 prod[f][g] = 0;
4600 }
4601 }
4602 // __shared__ int imgPos[]
4603 for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop)
4604 for (int p = 0; p < filterPixels; p += B_Y) {
4605 /*
4606 * Load B_Y pixels from B_Y*filtersPerThread filters
4607 */
4608 if (shFilterLoadY < B_Y) {
4609 #pragma unroll
4610 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4611 if (p + p2 + shFilterLoadY < filterPixels) {
4612 #pragma unroll
4613 for (int c = 0; c < colorCache; c++) {
4614 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[((oc+c) * filterPixels + p + p2) * numFilters];
4615 }
4616 } else {
4617 #pragma unroll
4618 for (int c = 0; c < colorCache; c++) {
4619 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4620 }
4621 }
4622 }
4623 }
4624
4625 /*
4626 * Load B_Y pixels from B_X*imgsPerThread images
4627 */
4628 const int pixIdx = p + threadIdx.y;
4629 if (pixIdx < filterPixels) {
4630 const int x = imgLoadModPosX + pixIdx % filterSize;
4631 const int y = imgLoadModPosY + pixIdx / filterSize;
4632 if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4633 float* m = &images[imgStride * (oc * imgPixels + y * imgSizeX + x)];
4634 #pragma unroll
4635 for (int i = 0; i < imgsPerThread; i++) {
4636 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4637 #pragma unroll
4638 for (int c = 0; c < colorCache; c++) {
4639 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = m[c * imgStride * imgPixels + i * B_X];
4640 }
4641 } else {
4642 #pragma unroll
4643 for (int c = 0; c < colorCache; c++) {
4644 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4645 }
4646 }
4647 }
4648 } else { // Padding
4649 #pragma unroll
4650 for (int i = 0; i < imgsPerThread; i++) {
4651 #pragma unroll
4652 for (int c = 0; c < colorCache; c++) {
4653 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4654 }
4655 }
4656 }
4657 }
4658 __syncthreads();
4659 #pragma unroll
4660 for (int i = 0; i < B_Y*colorCache; i++) {
4661 #pragma unroll
4662 for(int f = 0; f < filtersPerThread; f++) {
4663 #pragma unroll
4664 for(int g = 0; g < imgsPerThread; g++) {
4665 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4666 }
4667 }
4668
4669 }
4670 __syncthreads();
4671 }
4672 }
4673
4674 if (scale) {
4675 #pragma unroll
4676 for (int g = 0; g < imgsPerThread; g++) {
4677 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4678 #pragma unroll
4679 for (int f = 0; f < filtersPerThread; f++) {
4680 targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g];
4681 }
4682 }
4683 }
4684 } else {
4685 #pragma unroll
4686 for (int g = 0; g < imgsPerThread; g++) {
4687 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4688 #pragma unroll
4689 for (int f = 0; f < filtersPerThread; f++) {
4690 targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g];
4691 }
4692 }
4693 }
4694 }
4695 }
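/*
 * Tiling note for the kernel above (added commentary): it runs a two-level
 * loop, oc over channel tiles of width colorCache and p over filter-pixel
 * tiles of height B_Y. Each (oc, p) iteration stages a
 * (B_Y * colorCache) x (B_Y * filtersPerThread) filter tile and a matching
 * (B_Y * colorCache) x (B_X * imgsPerThread) image tile in shared memory,
 * then accumulates the filtersPerThread x imgsPerThread partial dot products
 * in registers (prod).
 */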
4696
4697
4698 /*
4699 * Block size B_YxB_X. Each block applies B_Y * filtersPerThread filters to B_X * imgsPerThread images.
4700 * threadIdx.x determines image
4701 * threadIdx.y determines filter
4702 *
4703 * blockIdx.x determines image batch of B_X * imgsPerThread
4704 * blockIdx.y determines filter batch of B_Y * filtersPerThread
4705 *
4706 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4707 * filters: (numFilterColors, filterPixels, numFilters) if conv
4708 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4709 *
4710 * targets: (numFilters, numModulesY, numModulesX, numImages)
4711  * colorIndices: (numGroups, numFilterColors)
4712 *
4713 * B_Y one of 4, 8, 16
4714 * B_X one of 16, 32
4715 * imgsPerThread one of 1, 2, 4
4716 * filtersPerThread one of 1, 2, 4, 8
4717 * colorCache: how many colors to put into shmem
4718 *
4719 * numFilters should be divisible by B_Y * filtersPerThread
4720  * numImages should be divisible by B_X * imgsPerThread
4721 * numFilterColors should be divisible by colorCache.
4722 * numImgColors must be even.
4723 * numFilters must be divisible by numGroups.
4724 *
4725 * The imgSize here is the size of the actual image without the padding.
4726 */
4727 template <int B_Y, int B_X, int imgsPerThread, int filtersPerThread, int colorCache, bool scale, bool checkImgBounds>
4728 __global__ void filterActs_YxX_sparse_random(float* images, float* filters, float* targets, int* colorIndices,
4729 const int numImages, const int numFilters,
4730 const int imgSizeY, const int imgSizeX, const int filterSize, const int paddingStart,
4731 const int moduleStride,
4732 const int numModulesY, const int numModulesX, const int imgStride,
4733 /*const int numImgColors,*/ const int numFilterColors, const int numGroups,
4734 const float scaleTargets, const float scaleOutputs,
4735 const bool conv) {
4736 __shared__ float shFilters[B_Y*colorCache][B_Y * filtersPerThread]; // pre-load B_Y pixels from B_Y*filtersPerThread filters
4737 __shared__ float shImages[B_Y*colorCache][B_X * imgsPerThread]; // pre-load B_Y pixels from B_X*imgsPerThread images
4738 __shared__ int shColors[colorCache];
4739 const int imgPixels = imgSizeY * imgSizeX;
4740 const int filterPixels = filterSize * filterSize;
4741 // const int numFilterColors = numImgColors / numGroups;
4742 const int blocksPerModule = numFilters / (B_Y*filtersPerThread);
4743 const int moduleIdx = blockIdx.y / blocksPerModule;
4744 const int blockFilterIdx = filtersPerThread * B_Y * (blockIdx.y % blocksPerModule);
4745 const int numFiltersPerGroup = numFilters / numGroups;
4746 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
4747
4748 const int numModules = numModulesY * numModulesX;
4749
4750 const int tidx = threadIdx.y * B_X + threadIdx.x;
4751
4752 const int imgLoadModPosY = paddingStart + (moduleIdx / numModulesX) * moduleStride;
4753 const int imgLoadModPosX = paddingStart + (moduleIdx % numModulesX) * moduleStride;
4754
4755 const int shFilterLoadY = tidx / (B_Y * filtersPerThread);
4756 const int shFilterLoadX = tidx % (B_Y * filtersPerThread);
4757 const int myImgIdx = blockIdx.x * B_X * imgsPerThread + threadIdx.x;
4758
4759 images += myImgIdx;
4760     filters += blockFilterIdx
4761 + shFilterLoadY * numFilters + shFilterLoadX;
4762 if (!conv) {
4763 filters += moduleIdx * numFilterColors * filterPixels * numFilters;
4764 }
4765
4766 targets += moduleIdx * numImages
4767 + (blockFilterIdx + threadIdx.y) * numImages * numModules
4768 + myImgIdx;
4769 colorIndices += blockGroupIdx * numFilterColors;
4770
4771 float prod[filtersPerThread][imgsPerThread];
4772 #pragma unroll
4773 for(int f = 0; f < filtersPerThread; f++) {
4774 #pragma unroll
4775 for(int g = 0; g < imgsPerThread; g++) {
4776 prod[f][g] = 0;
4777 }
4778 }
4779 // __shared__ int imgPos[]
4780 for (int oc = 0; oc < numFilterColors; oc += colorCache) { // oc stands for outer color (loop)
4781
4782 // Kinda wasteful here but...shouldn't matter
4783 if (tidx < colorCache) {
4784 shColors[tidx] = colorIndices[oc + tidx] * imgStride * imgPixels;
4785 }
4786 __syncthreads();
4787 for (int p = 0; p < filterPixels; p += B_Y) {
4788 /*
4789 * Load B_Y pixels from B_Y*filtersPerThread filters
4790 */
4791 if (shFilterLoadY < B_Y) {
4792 #pragma unroll
4793 for (int p2 = 0; p2 < B_Y; p2 += B_X/filtersPerThread) {
4794 if (p + p2 + shFilterLoadY < filterPixels) {
4795 #pragma unroll
4796 for (int c = 0; c < colorCache; c++) {
4797 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = filters[((oc+c) * filterPixels + p + p2) * numFilters];
4798 }
4799 } else {
4800 #pragma unroll
4801 for (int c = 0; c < colorCache; c++) {
4802 shFilters[shFilterLoadY + p2 + c * B_Y][shFilterLoadX] = 0;
4803 }
4804 }
4805 }
4806 }
4807
4808 /*
4809 * Load B_Y pixels from B_X*imgsPerThread images
4810 */
4811 const int pixIdx = p + threadIdx.y;
4812 if (pixIdx < filterPixels) {
4813 const int x = imgLoadModPosX + pixIdx % filterSize;
4814 const int y = imgLoadModPosY + pixIdx / filterSize;
4815 if (y >= 0 && y < imgSizeY && x >= 0 && x < imgSizeX) {
4816 float* m = &images[imgStride * (y * imgSizeX + x)];
4817 #pragma unroll
4818 for (int i = 0; i < imgsPerThread; i++) {
4819 if (!checkImgBounds || myImgIdx + i * B_X < numImages) {
4820 #pragma unroll
4821 for (int c = 0; c < colorCache; c++) {
4822 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = m[shColors[c] + i * B_X];
4823 }
4824 } else {
4825 #pragma unroll
4826 for (int c = 0; c < colorCache; c++) {
4827 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4828 }
4829 }
4830 }
4831 } else { // Padding
4832 #pragma unroll
4833 for (int i = 0; i < imgsPerThread; i++) {
4834 #pragma unroll
4835 for (int c = 0; c < colorCache; c++) {
4836 shImages[threadIdx.y + c * B_Y][threadIdx.x + i * B_X] = 0;
4837 }
4838 }
4839 }
4840 }
4841 __syncthreads();
4842 #pragma unroll
4843 for (int i = 0; i < B_Y*colorCache; i++) {
4844 #pragma unroll
4845 for(int f = 0; f < filtersPerThread; f++) {
4846 #pragma unroll
4847 for(int g = 0; g < imgsPerThread; g++) {
4848 prod[f][g] += shImages[i][g * B_X + threadIdx.x] * shFilters[i][threadIdx.y + f * B_Y];
4849 }
4850 }
4851
4852 }
4853 __syncthreads();
4854 }
4855 }
4856
4857 if (scale) {
4858 #pragma unroll
4859 for (int g = 0; g < imgsPerThread; g++) {
4860 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4861 #pragma unroll
4862 for (int f = 0; f < filtersPerThread; f++) {
4863 targets[g * B_X + f * B_Y * numImages * numModules] = scaleTargets * targets[g * B_X + f * B_Y * numImages * numModules] + scaleOutputs * prod[f][g];
4864 }
4865 }
4866 }
4867 } else {
4868 #pragma unroll
4869 for (int g = 0; g < imgsPerThread; g++) {
4870 if (!checkImgBounds || myImgIdx + g * B_X < numImages) {
4871 #pragma unroll
4872 for (int f = 0; f < filtersPerThread; f++) {
4873 targets[g * B_X + f * B_Y * numImages * numModules] = scaleOutputs * prod[f][g];
4874 }
4875 }
4876 }
4877 }
4878 }
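/*
 * Indirection note for the _random variant above (added commentary): instead
 * of a contiguous channel slice per group, colorIndices names numFilterColors
 * arbitrary input channels for each group. Every outer-color iteration loads
 * the next colorCache of those indices into shColors, pre-multiplied by
 * imgStride * imgPixels, so that shColors[c] is a ready-made base offset into
 * images; otherwise the kernel is identical to filterActs_YxX_sparse.
 */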
4879
4880 /*
4881 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
4882 * filters: (numFilterColors, filterPixels, numFilters) if conv
4883 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
4884 *
4885 * targets: (numFilters, numModules, numImages)
4886 *
4887 * Note: all of these convolution routines are optimized for the case when
4888 * the number of images (i.e. the minibatch size) is a multiple of 128.
4889  * Other batch sizes will work, but I made no attempt whatsoever
4890 * to make them work fast.
4891 */
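/*
 * Illustrative call (added commentary, not in the original file; sizes are
 * hypothetical): a 32x32 RGB batch convolved with 64 filters of size 5x5 at
 * stride 1 with 2 pixels of padding, so numModulesY = numModulesX = 32,
 * images is (3 * 1024, numImages) and filters is (3 * 25, 64):
 *
 *     _filterActs(images, filters, targets,
 *                 32,       // imgSizeY
 *                 32, 32,   // numModulesY, numModulesX
 *                 -2, 1,    // paddingStart (must be <= 0), moduleStride
 *                 3, 1,     // numImgColors, numGroups
 *                 0, 1,     // scaleTargets == 0 resizes targets; scaleOutput
 *                 true);    // conv: filters shared across modules
 */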
4892 void _filterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
4893 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
4894 int numImgColors, int numGroups,
4895 float scaleTargets, float scaleOutput, bool conv) {
4896 int numFilterColors = numImgColors / numGroups;
4897 int numFilters = filters.getNumCols();
4898 int numModules = numModulesY * numModulesX;
4899 int numImages = images.getNumCols();
4900 int imgPixels = images.getNumRows()/numImgColors;
4901 int imgSizeX = imgPixels / imgSizeY;
4902 int filterModuleMult = conv ? 1 : numModules;
4903
4904 assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
4905 assert(numGroups == 1 || numFilterColors % 2 == 0);
4906 assert(numFilters % (16 * numGroups) == 0);
4907 assert(numImgColors % numGroups == 0);
4908 assert(images.getNumRows() == imgPixels * numImgColors);
4909 assert(imgSizeY * imgSizeX == imgPixels);
4910 int numFiltersPerGroup = numFilters / numGroups;
4911
4912 int imgStride = images.getStride(); // images does not need to be a contiguous matrix
4913
4914 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
4915 int filterSize = int(sqrt((double)filterPixels));
4916 assert(filterSize * filterSize == filterPixels);
4917 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
4918
4919 // These routines don't handle the case when only part of the image is visited in the convolution
4920 assert(paddingStart <= 0);
4921 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
4922 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
4923 if (moduleStride > filterSize)
4924 {
4925 printf("moduleStride: %d\n", moduleStride);
4926 printf("filterSize: %d\n", filterSize);
4927 assert(false);
4928 }
4929
4930 assert(!images.isTrans());
4931 assert(!filters.isTrans());
4932 assert(!targets.isTrans());
4933
4934 assert(filters.isContiguous());
4935 assert(targets.isContiguous());
4936 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
4937 dim3 blocks = numFiltersPerGroup % 32 == 0 ? dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 8))
4938 : dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 4));
4939 dim3 threads(32, 4);
4940 bool checkImgBounds = numImages % (32*imgsPerThread) != 0;
4941 if (scaleTargets == 0) {
4942 targets.resize(numFilters * numModules, numImages);
4943 } else {
4944 assert(targets.getNumRows() == numFilters * numModules);
4945 assert(targets.getNumCols() == numImages);
4946 }
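    /*
     * Dispatch note (added commentary): everything below is a compile-time
     * dispatch table. imgsPerThread, filtersPerThread, numColors/colorCache,
     * scale and checkImgBounds are template parameters, so each runtime
     * configuration selects its own fully specialized kernel instantiation,
     * and each cudaFuncSetCacheConfig call must name exactly the
     * instantiation that is launched on the next line.
     */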
4947
4948 if (imgsPerThread == 4) {
4949 if (numImgColors <= 3) {
4950             assert(numGroups == 1); // It must be, given the asserts above, but just to be sure.
4951 if (scaleTargets == 0) { // don't scale
4952 if (numImgColors == 1) {
4953 if (checkImgBounds) {
4954 if (numFilters % 32 == 0) {
4955 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, false, true >, cudaFuncCachePreferShared);
4956 filterActs_YxX_color < 4, 32, 4, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4957 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4958 } else {
4959 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, false, true >, cudaFuncCachePreferShared);
4960 filterActs_YxX_color < 4, 32, 4, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4961 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4962 }
4963 } else {
4964 if (numFilters % 32 == 0) {
4965 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, false, false >, cudaFuncCachePreferShared);
4966 filterActs_YxX_color < 4, 32, 4, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4967 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4968 } else {
4969 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, false, false >, cudaFuncCachePreferShared);
4970 filterActs_YxX_color < 4, 32, 4, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4971 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4972 }
4973 }
4974 } else if (numImgColors == 2) {
4975 if (checkImgBounds) {
4976 if (numFilters % 32 == 0) {
4977 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
4978 filterActs_YxX_color < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4979 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4980 } else {
4981 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
4982 filterActs_YxX_color < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4983 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4984 }
4985 } else {
4986 if (numFilters % 32 == 0) {
4987 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
4988 filterActs_YxX_color < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4989 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4990 } else {
4991 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
4992 filterActs_YxX_color < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
4993 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
4994 }
4995 }
4996 } else if (numImgColors == 3) {
4997 if (checkImgBounds) {
4998 if (numFilters % 32 == 0) {
4999 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, false, true >, cudaFuncCachePreferShared);
5000 filterActs_YxX_color < 4, 32, 4, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5001 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5002 } else {
5003 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, false, true >, cudaFuncCachePreferShared);
5004 filterActs_YxX_color < 4, 32, 4, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5005 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5006 }
5007 } else {
5008 if (numFilters % 32 == 0) {
5009 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, false, false >, cudaFuncCachePreferShared);
5010 filterActs_YxX_color < 4, 32, 4, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5011 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5012 } else {
5013 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, false, false >, cudaFuncCachePreferShared);
5014 filterActs_YxX_color < 4, 32, 4, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5015 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5016 }
5017 }
5018 }
5019 } else { // do scale
5020 if (numImgColors == 1) {
5021 if (checkImgBounds) {
5022 if (numFilters % 32 == 0) {
5023 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, true, true >, cudaFuncCachePreferShared);
5024 filterActs_YxX_color < 4, 32, 4, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5025 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5026 } else {
5027 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, true, true >, cudaFuncCachePreferShared);
5028 filterActs_YxX_color < 4, 32, 4, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5029 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5030 }
5031 } else {
5032 if (numFilters % 32 == 0) {
5033 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 1, true, false >, cudaFuncCachePreferShared);
5034 filterActs_YxX_color < 4, 32, 4, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5035 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5036 } else {
5037 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 1, true, false >, cudaFuncCachePreferShared);
5038 filterActs_YxX_color < 4, 32, 4, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5039 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5040 }
5041 }
5042 } else if (numImgColors == 2) {
5043 if (checkImgBounds) {
5044 if (numFilters % 32 == 0) {
5045 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5046 filterActs_YxX_color < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5047 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5048 } else {
5049 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5050 filterActs_YxX_color < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5051 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5052 }
5053 } else {
5054 if (numFilters % 32 == 0) {
5055 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5056 filterActs_YxX_color < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5057 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5058 } else {
5059 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5060 filterActs_YxX_color < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5061 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5062 }
5063 }
5064 } else if (numImgColors == 3) {
5065 if (checkImgBounds) {
5066 if (numFilters % 32 == 0) {
5067 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, true, true >, cudaFuncCachePreferShared);
5068 filterActs_YxX_color < 4, 32, 4, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5069 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5070 } else {
5071 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, true, true >, cudaFuncCachePreferShared);
5072 filterActs_YxX_color < 4, 32, 4, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5073 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5074 }
5075 } else {
5076 if (numFilters % 32 == 0) {
5077 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 8, 3, true, false >, cudaFuncCachePreferShared);
5078 filterActs_YxX_color < 4, 32, 4, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5079 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5080 } else {
5081 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 4, 4, 3, true, false >, cudaFuncCachePreferShared);
5082 filterActs_YxX_color < 4, 32, 4, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5083 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5084 }
5085 }
5086 }
5087 }
5088 } else {
5089 if (scaleTargets == 0) { // don't scale
5090 if (checkImgBounds) {
5091 if (numFiltersPerGroup % 32 == 0) {
5092 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
5093 filterActs_YxX_sparse < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5094 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5095 } else {
5096 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
5097 filterActs_YxX_sparse < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5098 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5099 }
5100 } else {
5101 if (numFiltersPerGroup % 32 == 0) {
5102 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
5103 filterActs_YxX_sparse < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5104 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5105 } else {
5106 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
5107 filterActs_YxX_sparse < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5108 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5109 }
5110 }
5111 } else { // do scale
5112 if (checkImgBounds) {
5113 if (numFiltersPerGroup % 32 == 0) {
5114                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5115 filterActs_YxX_sparse < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5116 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5117 } else {
5118                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5119 filterActs_YxX_sparse < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5120 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5121 }
5122 } else {
5123 if (numFiltersPerGroup % 32 == 0) {
5124                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5125 filterActs_YxX_sparse < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5126 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5127 } else {
5128                     cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5129 filterActs_YxX_sparse < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5130 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5131 }
5132 }
5133 }
5134 }
5135 } else if (imgsPerThread == 2) {
5136 if (numImgColors <= 3) {
5137             assert(numGroups == 1); // It must be, given the asserts above, but just to be sure.
5138 if (scaleTargets == 0) { // don't scale
5139 if (numImgColors == 1) {
5140 if (checkImgBounds) {
5141 if (numFilters % 32 == 0) {
5142 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, false, true >, cudaFuncCachePreferShared);
5143 filterActs_YxX_color < 4, 32, 2, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5144 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5145 } else {
5146 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, false, true >, cudaFuncCachePreferShared);
5147 filterActs_YxX_color < 4, 32, 2, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5148 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5149 }
5150 } else {
5151 if (numFilters % 32 == 0) {
5152 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, false, false >, cudaFuncCachePreferShared);
5153 filterActs_YxX_color < 4, 32, 2, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5154 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5155 } else {
5156 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, false, false >, cudaFuncCachePreferShared);
5157 filterActs_YxX_color < 4, 32, 2, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5158 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5159 }
5160 }
5161 } else if (numImgColors == 2) {
5162 if (checkImgBounds) {
5163 if (numFilters % 32 == 0) {
5164 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5165 filterActs_YxX_color < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5166 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5167 } else {
5168 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5169 filterActs_YxX_color < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5170 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5171 }
5172 } else {
5173 if (numFilters % 32 == 0) {
5174 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5175 filterActs_YxX_color < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5176 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5177 } else {
5178 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5179 filterActs_YxX_color < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5180 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5181 }
5182 }
5183 } else if (numImgColors == 3) {
5184 if (checkImgBounds) {
5185 if (numFilters % 32 == 0) {
5186 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, false, true >, cudaFuncCachePreferShared);
5187 filterActs_YxX_color < 4, 32, 2, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5188 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5189 } else {
5190 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, false, true >, cudaFuncCachePreferShared);
5191 filterActs_YxX_color < 4, 32, 2, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5192 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5193 }
5194 } else {
5195 if (numFilters % 32 == 0) {
5196 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, false, false >, cudaFuncCachePreferShared);
5197 filterActs_YxX_color < 4, 32, 2, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5198 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5199 } else {
5200 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, false, false >, cudaFuncCachePreferShared);
5201 filterActs_YxX_color < 4, 32, 2, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5202 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5203 }
5204 }
5205 }
5206 } else { // do scale
5207 if (numImgColors == 1) {
5208 if (checkImgBounds) {
5209 if (numFilters % 32 == 0) {
5210 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, true, true >, cudaFuncCachePreferShared);
5211 filterActs_YxX_color < 4, 32, 2, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5212 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5213 } else {
5214 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, true, true >, cudaFuncCachePreferShared);
5215 filterActs_YxX_color < 4, 32, 2, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5216 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5217 }
5218 } else {
5219 if (numFilters % 32 == 0) {
5220 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 1, true, false >, cudaFuncCachePreferShared);
5221 filterActs_YxX_color < 4, 32, 2, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5222 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5223 } else {
5224 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 1, true, false >, cudaFuncCachePreferShared);
5225 filterActs_YxX_color < 4, 32, 2, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5226 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5227 }
5228 }
5229 } else if (numImgColors == 2) {
5230 if (checkImgBounds) {
5231 if (numFilters % 32 == 0) {
5232 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5233 filterActs_YxX_color < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5234 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5235 } else {
5236 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5237 filterActs_YxX_color < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5238 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5239 }
5240 } else {
5241 if (numFilters % 32 == 0) {
5242 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5243 filterActs_YxX_color < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5244 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5245 } else {
5246 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5247 filterActs_YxX_color < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5248 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5249 }
5250 }
5251 } else if (numImgColors == 3) {
5252 if (checkImgBounds) {
5253 if (numFilters % 32 == 0) {
5254 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, true, true >, cudaFuncCachePreferShared);
5255 filterActs_YxX_color < 4, 32, 2, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5256 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5257 } else {
5258 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, true, true >, cudaFuncCachePreferShared);
5259 filterActs_YxX_color < 4, 32, 2, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5260 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5261 }
5262 } else {
5263 if (numFilters % 32 == 0) {
5264 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 8, 3, true, false >, cudaFuncCachePreferShared);
5265 filterActs_YxX_color < 4, 32, 2, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5266 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5267 } else {
5268 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 2, 4, 3, true, false >, cudaFuncCachePreferShared);
5269 filterActs_YxX_color < 4, 32, 2, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5270 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5271 }
5272 }
5273 }
5274 }
5275 } else {
5276 if (scaleTargets == 0) { // don't scale
5277 if (checkImgBounds) {
5278 if (numFiltersPerGroup % 32 == 0) {
5279 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5280 filterActs_YxX_sparse < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5281 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5282 } else {
5283 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5284 filterActs_YxX_sparse < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5285 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5286 }
5287 } else {
5288 if (numFiltersPerGroup % 32 == 0) {
5289 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5290 filterActs_YxX_sparse < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5291 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5292 } else {
5293 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5294 filterActs_YxX_sparse < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5295 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5296 }
5297 }
5298 } else { // do scale
5299 if (checkImgBounds) {
5300 if (numFiltersPerGroup % 32 == 0) {
5301 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5302 filterActs_YxX_sparse < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5303 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5304 } else {
5305 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5306 filterActs_YxX_sparse < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5307 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5308 }
5309 } else {
5310 if (numFiltersPerGroup % 32 == 0) {
5311 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5312 filterActs_YxX_sparse < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5313 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5314 } else {
5315 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5316 filterActs_YxX_sparse < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5317 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5318 }
5319 }
5320 }
5321 }
5322 } else {
5323 if (numImgColors <= 3) {
5324 assert(numGroups == 1); // It has to be 1 based on the definitions above, but just to be sure.
5325 if (scaleTargets == 0) { // don't scale
5326 if (numImgColors == 1) {
5327 if (checkImgBounds) {
5328 if (numFilters % 32 == 0) {
5329 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, false, true >, cudaFuncCachePreferShared);
5330 filterActs_YxX_color < 4, 32, 1, 8, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5331 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5332 } else {
5333 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, false, true >, cudaFuncCachePreferShared);
5334 filterActs_YxX_color < 4, 32, 1, 4, 1, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5335 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5336 }
5337 } else {
5338 if (numFilters % 32 == 0) {
5339 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, false, false >, cudaFuncCachePreferShared);
5340 filterActs_YxX_color < 4, 32, 1, 8, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5341 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5342 } else {
5343 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, false, false >, cudaFuncCachePreferShared);
5344 filterActs_YxX_color < 4, 32, 1, 4, 1, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5345 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5346 }
5347 }
5348 } else if (numImgColors == 2) {
5349 if (checkImgBounds) {
5350 if (numFilters % 32 == 0) {
5351 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5352 filterActs_YxX_color < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5353 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5354 } else {
5355 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5356 filterActs_YxX_color < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5357 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5358 }
5359 } else {
5360 if (numFilters % 32 == 0) {
5361 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5362 filterActs_YxX_color < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5363 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5364 } else {
5365 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5366 filterActs_YxX_color < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5367 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5368 }
5369 }
5370 } else if (numImgColors == 3) {
5371 if (checkImgBounds) {
5372 if (numFilters % 32 == 0) {
5373 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, false, true >, cudaFuncCachePreferShared);
5374 filterActs_YxX_color < 4, 32, 1, 8, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5375 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5376 } else {
5377 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, false, true >, cudaFuncCachePreferShared);
5378 filterActs_YxX_color < 4, 32, 1, 4, 3, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5379 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5380 }
5381 } else {
5382 if (numFilters % 32 == 0) {
5383 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, false, false >, cudaFuncCachePreferShared);
5384 filterActs_YxX_color < 4, 32, 1, 8, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5385 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5386 } else {
5387 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, false, false >, cudaFuncCachePreferShared);
5388 filterActs_YxX_color < 4, 32, 1, 4, 3, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5389 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5390 }
5391 }
5392 }
5393 } else { // do scale
5394 if (numImgColors == 1) {
5395 if (checkImgBounds) {
5396 if (numFilters % 32 == 0) {
5397 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, true, true >, cudaFuncCachePreferShared);
5398 filterActs_YxX_color < 4, 32, 1, 8, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5399 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5400 } else {
5401 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, true, true >, cudaFuncCachePreferShared);
5402 filterActs_YxX_color < 4, 32, 1, 4, 1, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5403 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5404 }
5405 } else {
5406 if (numFilters % 32 == 0) {
5407 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 1, true, false >, cudaFuncCachePreferShared);
5408 filterActs_YxX_color < 4, 32, 1, 8, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5409 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5410 } else {
5411 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 1, true, false >, cudaFuncCachePreferShared);
5412 filterActs_YxX_color < 4, 32, 1, 4, 1, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5413 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5414 }
5415 }
5416 } else if (numImgColors == 2) {
5417 if (checkImgBounds) {
5418 if (numFilters % 32 == 0) {
5419 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5420 filterActs_YxX_color < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5421 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5422 } else {
5423 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5424 filterActs_YxX_color < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5425 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5426 }
5427 } else {
5428 if (numFilters % 32 == 0) {
5429 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5430 filterActs_YxX_color < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5431 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5432 } else {
5433 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5434 filterActs_YxX_color < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5435 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5436 }
5437 }
5438 } else if (numImgColors == 3) {
5439 if (checkImgBounds) {
5440 if (numFilters % 32 == 0) {
5441 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, true, true >, cudaFuncCachePreferShared);
5442 filterActs_YxX_color < 4, 32, 1, 8, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5443 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5444 } else {
5445 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, true, true >, cudaFuncCachePreferShared);
5446 filterActs_YxX_color < 4, 32, 1, 4, 3, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5447 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5448 }
5449 } else {
5450 if (numFilters % 32 == 0) {
5451 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 8, 3, true, false >, cudaFuncCachePreferShared);
5452 filterActs_YxX_color < 4, 32, 1, 8, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5453 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5454 } else {
5455 cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, 1, 4, 3, true, false >, cudaFuncCachePreferShared);
5456 filterActs_YxX_color < 4, 32, 1, 4, 3, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5457 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, scaleTargets, scaleOutput, conv);
5458 }
5459 }
5460 }
5461 }
5462 } else {
5463 if (scaleTargets == 0) { // don't scale
5464 if (checkImgBounds) {
5465 if (numFiltersPerGroup % 32 == 0) {
5466 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5467 filterActs_YxX_sparse < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5468 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5469 } else {
5470 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5471 filterActs_YxX_sparse < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5472 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5473 }
5474 } else {
5475 if (numFiltersPerGroup % 32 == 0) {
5476 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5477 filterActs_YxX_sparse < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5478 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5479 } else {
5480 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5481 filterActs_YxX_sparse < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5482 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5483 }
5484 }
5485 } else { // do scale
5486 if (checkImgBounds) {
5487 if (numFiltersPerGroup % 32 == 0) {
5488 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5489 filterActs_YxX_sparse < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5490 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5491 } else {
5492 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5493 filterActs_YxX_sparse < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5494 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5495 }
5496 } else {
5497 if (numFiltersPerGroup % 32 == 0) {
5498 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5499 filterActs_YxX_sparse < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5500 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5501 } else {
5502 cudaFuncSetCacheConfig(filterActs_YxX_sparse< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5503 filterActs_YxX_sparse < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(),
5504 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numImgColors, numGroups, scaleTargets, scaleOutput, conv);
5505 }
5506 }
5507 }
5508 }
5509 }
5510
5511 cutilCheckMsg("filterActs: kernel execution failed");
5512 }
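/*
 * A hedged refactoring sketch (not part of the original code): every branch of the
 * dispatch tree above repeats the same cudaFuncSetCacheConfig + launch pair, varying
 * only the template arguments. Folding the pair into one macro, as below, would shrink
 * the tree and make it impossible for the cache-config template arguments to drift out
 * of sync with the launched kernel. To actually be usable it would have to be defined
 * near the top of _filterActs, where blocks, threads and the kernel arguments are in
 * scope; a branch would then read LAUNCH_FILTER_ACTS_COLOR(2, 8, 3, false, true);
 */
#define LAUNCH_FILTER_ACTS_COLOR(IPT, FPT, C, SCALE, CHECK)                                \
    do {                                                                                   \
        cudaFuncSetCacheConfig(filterActs_YxX_color< 4, 32, IPT, FPT, C, SCALE, CHECK >,   \
                               cudaFuncCachePreferShared);                                 \
        filterActs_YxX_color < 4, 32, IPT, FPT, C, SCALE, CHECK > <<<blocks, threads>>>(   \
            images.getDevData(), filters.getDevData(), targets.getDevData(),               \
            numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart,           \
            moduleStride, numModulesY, numModulesX, imgStride,                             \
            scaleTargets, scaleOutput, conv);                                              \
    } while (0)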
5513
5514 void convFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5515 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5516 int numImgColors, int numGroups) {
5517 convFilterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1);
5518 }
5519
5520 void convFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5521 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5522 int numImgColors, int numGroups,
5523 float scaleTargets, float scaleOutput) {
5524 _filterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true);
5525 }
5526
5527 void localFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5528 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5529 int numImgColors, int numGroups) {
5530 localFilterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, 0, 1);
5531 }
5532
5533 void localFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
5534 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5535 int numImgColors, int numGroups,
5536 float scaleTargets, float scaleOutput) {
5537 _filterActs(images, filters, targets, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false);
5538 }
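/*
 * A minimal usage sketch for convFilterActs (illustrative only; the sizes below are
 * hypothetical and not taken from this file). For a 3-color 32x32 input and 64 filters
 * of size 5x5 with stride 1 and no padding, the module grid is 28x28: images is
 * (3*32*32, numImages), filters is (3*5*5, 64), and targets gets resized to
 * (64*28*28, numImages) because the no-scale overload passes scaleTargets == 0.
 */
static void exampleConvFilterActs(NVMatrix& images, NVMatrix& filters, NVMatrix& targets) {
    // imgSizeY = 32, numModulesY = numModulesX = 28, paddingStart = 0,
    // moduleStride = 1, numImgColors = 3, numGroups = 1
    convFilterActs(images, filters, targets, 32, 28, 28, 0, 1, 3, 1);
}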
5539
5540 /*
5541 * images: (numImgColors, imgSizeY, imgSizeX, numImages) with stride given
5542 * filters: (numFilterColors, filterPixels, numFilters) if conv
5543 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
5544 *
5545 * targets: (numFilters, numModulesY, numModulesX, numImages)
5546 * colorIndices: (numGroups, numFilterColors)
5547 *
5548 * Note: all of these convolution routines are optimized for the case when
5549 * the number of images (i.e. the minibatch size) is a multiple of 128.
5550 * Other batch sizes will work, but I made no attempt whatsoever
5551 * to make them work fast.
5552 */
5553 void _filterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5554 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5555 int numImgColors, int numFilterColors, int numGroups,
5556 float scaleTargets, float scaleOutput, bool conv) {
5557 int numFilters = filters.getNumCols();
5558 int numModules = numModulesY * numModulesX;
5559 int numImages = images.getNumCols();
5560 int imgPixels = images.getNumRows() / numImgColors;
5561 int imgSizeX = imgPixels / imgSizeY;
5562 int filterModuleMult = conv ? 1 : numModules;
5563
5564 assert(numGroups > 1);
5565 assert(numImgColors % numFilterColors == 0);
5566 assert((numFilterColors * numGroups) % numImgColors == 0);
5567 assert(numFilters % (16 * numGroups) == 0);
5568 assert(numFilterColors % 2 == 0);
5569
5570 assert(imgSizeY * imgSizeX == imgPixels);
5571 assert(images.getNumRows() == imgPixels * numImgColors);
5572 int numFiltersPerGroup = numFilters / numGroups;
5573
5574 int imgStride = images.getStride(); // images does not need to be a contiguous matrix
5575
5576 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
5577 int filterSize = int(sqrt((double)filterPixels));
5578 assert(filterSize * filterSize == filterPixels);
5579 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
5580
5581 // These routines don't handle the case when only part of the image is visited in the convolution
5582 assert(paddingStart <= 0);
5583 assert(paddingStart + (numModulesX-1) * moduleStride + filterSize >= imgSizeX);
5584 assert(paddingStart + (numModulesY-1) * moduleStride + filterSize >= imgSizeY);
5585 assert(moduleStride <= filterSize);
5586
5587 assert(!images.isTrans());
5588 assert(!filters.isTrans());
5589 assert(!targets.isTrans());
5590
5591 assert(filters.isContiguous());
5592 assert(targets.isContiguous());
5593 int imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
5594 dim3 blocks = numFiltersPerGroup % 32 == 0 ? dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 8))
5595 : dim3(DIVUP(numImages, 32 * imgsPerThread), (numModules * numFilters) / (4 * 4));
5596 dim3 threads(32, 4);
5597 bool checkImgBounds = numImages % (32*imgsPerThread) != 0;
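    // Worked example (illustrative): numImages == 256 selects imgsPerThread == 4 with
    // checkImgBounds == false, since 256 is a multiple of 32*4; numImages == 100 falls
    // through to imgsPerThread == 1 with checkImgBounds == true (100 % 32 != 0), so the
    // slower bounds-checked kernel instantiations below are chosen.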
5598 if (scaleTargets == 0) {
5599 targets.resize(numFilters * numModules, numImages);
5600 } else {
5601 assert(targets.getNumRows() == numFilters * numModules);
5602 assert(targets.getNumCols() == numImages);
5603 }
5604
5605 if (imgsPerThread == 4) {
5606 if (scaleTargets == 0) { // don't scale
5607 if (checkImgBounds) {
5608 if (numFiltersPerGroup % 32 == 0) {
5609 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, false, true >, cudaFuncCachePreferShared);
5610 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5611 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5612 } else {
5613 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, false, true >, cudaFuncCachePreferShared);
5614 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5615 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5616 }
5617 } else {
5618 if (numFiltersPerGroup % 32 == 0) {
5619 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, false, false >, cudaFuncCachePreferShared);
5620 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5621 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5622 } else {
5623 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, false, false >, cudaFuncCachePreferShared);
5624 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5625 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5626 }
5627 }
5628 } else { // do scale
5629 if (checkImgBounds) {
5630 if (numFiltersPerGroup % 32 == 0) {
5631 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, true, true >, cudaFuncCachePreferShared);
5632 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5633 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5634 } else {
5635 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, true, true >, cudaFuncCachePreferShared);
5636 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5637 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5638 }
5639 } else {
5640 if (numFiltersPerGroup % 32 == 0) {
5641 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 8, 2, true, false >, cudaFuncCachePreferShared);
5642 filterActs_YxX_sparse_random < 4, 32, 4, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5643 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5644 } else {
5645 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 4, 4, 2, true, false >, cudaFuncCachePreferShared);
5646 filterActs_YxX_sparse_random < 4, 32, 4, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5647 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5648 }
5649 }
5650 }
5651 } else if (imgsPerThread == 2) {
5652 if (scaleTargets == 0) { // don't scale
5653 if (checkImgBounds) {
5654 if (numFiltersPerGroup % 32 == 0) {
5655 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, false, true >, cudaFuncCachePreferShared);
5656 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5657 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5658 } else {
5659 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, false, true >, cudaFuncCachePreferShared);
5660 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5661 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5662 }
5663 } else {
5664 if (numFiltersPerGroup % 32 == 0) {
5665 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, false, false >, cudaFuncCachePreferShared);
5666 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5667 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5668 } else {
5669 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, false, false >, cudaFuncCachePreferShared);
5670 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5671 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5672 }
5673 }
5674 } else { // do scale
5675 if (checkImgBounds) {
5676 if (numFiltersPerGroup % 32 == 0) {
5677 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, true, true >, cudaFuncCachePreferShared);
5678 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5679 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5680 } else {
5681 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, true, true >, cudaFuncCachePreferShared);
5682 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5683 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5684 }
5685 } else {
5686 if (numFiltersPerGroup % 32 == 0) {
5687 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 8, 2, true, false >, cudaFuncCachePreferShared);
5688 filterActs_YxX_sparse_random < 4, 32, 2, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5689 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5690 } else {
5691 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 2, 4, 2, true, false >, cudaFuncCachePreferShared);
5692 filterActs_YxX_sparse_random < 4, 32, 2, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5693 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5694 }
5695 }
5696 }
5697 } else {
5698 if (scaleTargets == 0) { // don't scale
5699 if (checkImgBounds) {
5700 if (numFiltersPerGroup % 32 == 0) {
5701 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, false, true >, cudaFuncCachePreferShared);
5702 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5703 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5704 } else {
5705 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, false, true >, cudaFuncCachePreferShared);
5706 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, false, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5707 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5708 }
5709 } else {
5710 if (numFiltersPerGroup % 32 == 0) {
5711 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, false, false >, cudaFuncCachePreferShared);
5712 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5713 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5714 } else {
5715 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, false, false >, cudaFuncCachePreferShared);
5716 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, false, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5717 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5718 }
5719 }
5720 } else { // do scale
5721 if (checkImgBounds) {
5722 if (numFiltersPerGroup % 32 == 0) {
5723 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, true, true >, cudaFuncCachePreferShared);
5724 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5725 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5726 } else {
5727 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, true, true >, cudaFuncCachePreferShared);
5728 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, true, true > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5729 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5730 }
5731 } else {
5732 if (numFiltersPerGroup % 32 == 0) {
5733 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 8, 2, true, false >, cudaFuncCachePreferShared);
5734 filterActs_YxX_sparse_random < 4, 32, 1, 8, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5735 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5736 } else {
5737 cudaFuncSetCacheConfig(filterActs_YxX_sparse_random< 4, 32, 1, 4, 2, true, false >, cudaFuncCachePreferShared);
5738 filterActs_YxX_sparse_random < 4, 32, 1, 4, 2, true, false > <<<blocks, threads>>>(images.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
5739 numImages, numFilters, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, numModulesY, numModulesX, imgStride, numFilterColors, numGroups, scaleTargets, scaleOutput, conv);
5740 }
5741 }
5742 }
5743 }
5744
5745 cutilCheckMsg("filterActsSparse: kernel execution failed");
5746 }
5747
5748 void convFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5749 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5750 int numImgColors, int numFilterColors, int numGroups,
5751 float scaleTargets, float scaleOutput) {
5752 _filterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride,
5753 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, true);
5754 }
5755
5756 void convFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5757 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5758 int numImgColors, int numFilterColors, int numGroups) {
5759 convFilterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart,
5760 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1);
5761 }
5762
5763 void localFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5764 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5765 int numImgColors, int numFilterColors, int numGroups,
5766 float scaleTargets, float scaleOutput) {
5767 _filterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart, moduleStride,
5768 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, false);
5769 }
5770
5771 void localFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
5772 int imgSizeY, int numModulesY, int numModulesX, int paddingStart, int moduleStride,
5773 int numImgColors, int numFilterColors, int numGroups) {
5774 localFilterActsSparse(images, filters, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, paddingStart,
5775 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1);
5776 }
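/*
 * A minimal usage sketch for convFilterActsSparse (illustrative only; all sizes and the
 * index buffer are hypothetical). With numImgColors == 8, numGroups == 2 and
 * numFilterColors == 4, dColorIndices is a device array of numGroups * numFilterColors
 * == 8 ints selecting which image colors each group reads. For a 16x16 input, 4x4
 * filters, stride 2 and no padding, the module grid is 7x7: images is (8*16*16,
 * numImages), filters is (4*4*4, 32), and targets gets resized to (32*7*7, numImages).
 * These sizes satisfy the asserts in _filterActsSparse: numGroups > 1,
 * numFilters % (16 * numGroups) == 0, and numFilterColors % 2 == 0.
 */
static void exampleConvFilterActsSparse(NVMatrix& images, NVMatrix& filters, NVMatrix& targets,
                                        int* dColorIndices) {
    // imgSizeY = 16, numModulesY = numModulesX = 7, paddingStart = 0, moduleStride = 2,
    // numImgColors = 8, numFilterColors = 4, numGroups = 2
    convFilterActsSparse(images, filters, targets, dColorIndices, 16, 7, 7, 0, 2, 8, 4, 2);
}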
5777
5778 /*
5779 * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
5780 * All rights reserved.
5781 *
5782 * Redistribution and use in source and binary forms, with or without modification,
5783 * are permitted provided that the following conditions are met:
5784 *
5785 * - Redistributions of source code must retain the above copyright notice,
5786 * this list of conditions and the following disclaimer.
5787 *
5788 * - Redistributions in binary form must reproduce the above copyright notice,
5789 * this list of conditions and the following disclaimer in the documentation
5790 * and/or other materials provided with the distribution.
5791 *
5792 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
5793 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
5794 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
5795 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
5796 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
5797 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
5798 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
5799 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
5800 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
5801 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
5802 */
5803
5804 #ifndef _CUDACONV2_EXPORT
5805 #define _CUDACONV2_EXPORT
5806 #endif
5807
5808 #include <cudaconv2.cuh>
5809
5810 /*
5811 * Block size: 16x16.
5812 * blockIdx.x determines case in batches of 16*imgsPerThread.
5813 * blockIdx.y determines 4x4 image region in target image.
5814 *
5815 * threadIdx.x determines case.
5816 * threadIdx.y determines pixel.
5817 *
5818 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
5819 * filters: (numColors, filterPixels, numFilters) if conv
5820 * (numModulesY, numModulesX, numColors, filterPixels, numFilters) otherwise
5821 * targets: (numColors, imgSizeY, imgSizeX, numImages)
5822 *
5823 * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
5824 *
5825 * Number of filters must be divisible by 16.
5826 * Number of images must be divisible by 16*imgsPerThread if checkCaseBounds is false.
5827 * 16 * imgsPerThread must be divisible by 32.
5828 *
5829 * This version loads 32 cases at a time, so it gets full coalescing on that load.
5830 * It only loads 16 weights at a time, so those aren't fully coalesced.
5831 * This version conserves shared memory by loading 16 filters at a time rather than 32.
5832 */
5833 template <int imgsPerThread, int numColors, bool scale, bool checkCaseBounds, bool conv>
5834 __global__ void img_acts_color(const float* hidActs, const float* filters, float* targets,
5835 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
5836 const int filterSize, const int imgSizeY, const int imgSizeX,
5837 const int paddingStart, const int moduleStride,
5838 const float scaleTargets, const float scaleOutputs) {
5839 __shared__ float shFilters[numColors*16][16 + 1];
5840 __shared__ float shHidActs[16][16*imgsPerThread];
5841
5842 const int blockCaseIdx = blockIdx.x * 16*imgsPerThread;
5843 const int numRegionsX = DIVUP(imgSizeX, 4);
5844 const int blockRegionIdx = blockIdx.y;
5845 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
5846 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
5847 const int blockRegionLeft = blockRegionIdxX * 4;
5848 const int blockRegionTop = blockRegionIdxY * 4;
5849 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
5850 const int pxY = blockRegionTop + pxYInRegion;
5851 const int pxX = blockRegionLeft + pxXInRegion;
5852 const int pxIdx = pxY * imgSizeX + pxX;
5853 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
5854 const int numModules = numModulesY * numModulesX;
5855 const int filterPixels = filterSize * filterSize;
5856 const int imgPixels = imgSizeX * imgSizeY;
5857 const int tidx = threadIdx.y * 16 + threadIdx.x;
5858 const int loadY = tidx / 32, loadX = tidx % 32;
5859
5860 hidActs += blockCaseIdx + loadY * numImages * numModules + loadX;
5861 filters += threadIdx.x;
5862 targets += pxIdx * numImages + blockCaseIdx + threadIdx.x;
5863
5864
5865 float prod[numColors][imgsPerThread];
5866 #pragma unroll
5867 for (int c = 0; c < numColors; c++) {
5868 #pragma unroll
5869 for (int i = 0; i < imgsPerThread; i++) {
5870 prod[c][i] = 0;
5871 }
5872 }
5873 const int startY = blockRegionTop - paddingStart < filterSize ? 0
5874 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
5875 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
5876 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
5877 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
5878 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
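    // Worked example (illustrative): with paddingStart == 0, moduleStride == 1 and
    // filterSize == 5, a region with blockRegionTop == 8 covers image rows 8..11.
    // Module my covers rows my..my+4, so the modules that overlap the region are
    // 4 <= my <= 11: startY == 1 + (8 - 0 - 5) / 1 == 4 and endY == MIN(numModulesY, 12).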
5879
5880 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
5881 float* shHidActLoad = &shHidActs[loadY][loadX];
5882
5883 for (int my = startY; my < endY; my++) {
5884 const int moduleTop = paddingStart + my * moduleStride;
5885 const int pxInModuleY = pxY - moduleTop;
5886
5887 for (int mx = startX; mx < endX; mx++) {
5888 const int moduleIdx = my * numModulesX + mx;
5889 const int moduleLeft = paddingStart + mx * moduleStride;
5890 const int pxInModuleX = pxX - moduleLeft;
5891
5892 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
5893 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
5894
5895 for (int f = 0; f < numFilters; f += 16) { // multiply with 16 filters at a time
5896 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
5897 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
5898 #pragma unroll
5899 for (int i = 0; i < imgsPerThread * 16; i += 32) {
5900 if (!checkCaseBounds || blockCaseIdx + i + loadX < numImages) {
5901 #pragma unroll
5902 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
5903 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
5904 }
5905 } else {
5906 #pragma unroll
5907 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
5908 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
5909 }
5910 }
5911 }
5912
5913 if (isPxInImg && isPxInModule) {
5914 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
5915 // Not fully coalesced read :(
5916 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
5917 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
5918 : &filters[(moduleIdx * numColors * filterPixels + pxIdxInModule) * numFilters + f];
5919 #pragma unroll
5920 for (int c = 0; c < numColors; c++) {
5921 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
5922 }
5923
5924
5925 }
5926
5927 __syncthreads();
5928 // Do some actual computation
5929 if (isPxInImg && isPxInModule) {
5930 #pragma unroll
5931 for (int c = 0; c < numColors; c++) {
5932 #pragma unroll
5933 for (int w = 0; w < 16; w++) {
5934 #pragma unroll
5935 for (int i = 0; i < imgsPerThread; i++) {
5936 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
5937 }
5938 }
5939 }
5940 }
5941 __syncthreads();
5942 }
5943 }
5944 }
5945 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
5946 if (isPxInImg) {
5947 if (scale) {
5948 #pragma unroll
5949 for (int i = 0; i < imgsPerThread; i++) {
5950 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
5951 #pragma unroll
5952 for (int c = 0; c < numColors; c++) {
5953 targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i];
5954 }
5955 }
5956 }
5957 } else {
5958 #pragma unroll
5959 for (int i = 0; i < imgsPerThread; i++) {
5960 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
5961 #pragma unroll
5962 for (int c = 0; c < numColors; c++) {
5963 targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i];
5964 }
5965 }
5966 }
5967 }
5968 }
5969 }
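/*
 * Grid-sizing sketch for img_acts_color (illustrative, assuming a host-side dispatch
 * that follows the mapping in the comment above the kernel): with imgSizeX == imgSizeY
 * == 7, numRegionsX == DIVUP(7, 4) == 2, so the image is tiled by 2x2 == 4 regions of
 * 4x4 pixels and gridDim.y == 4; pixels of the right and bottom regions that fall
 * outside the 7x7 image are masked by isPxInImg. With numImages == 128 and
 * imgsPerThread == 2, gridDim.x == 128 / (16*2) == 4, and each 16x16 block assigns
 * threadIdx.y to a pixel of its region and threadIdx.x to a case.
 */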
5970
5971 /*
5972 * Block size: 16x16.
5973 * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread.
5974 * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread)
5975 * blockIdx.x.y = 1..numImgColors/colorsPerThread
5976 * blockIdx.y determines 4x4 image region in target image.
5977 *
5978 * threadIdx.x determines case.
5979 * threadIdx.y determines pixel.
5980 *
5981 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
5982 * filters: (numFilterColors, filterPixels, numFilters) if conv
5983 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
5984 * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
5985 *
5986 * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
5987 *
5988 * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false.
5989 * 16 * imgsPerThread must be divisible by 32.
5990 * numImageColors/numGroups must be divisible by colorsPerThread.
5991 *
5992 * This version loads 32 cases at a time, so it gets full coalescing on that load.
5993 * It only loads 16 weights at a time, so those aren't fully coalesced.
5994 * This version conserves shared memory by loading 16 filters at a time rather than 32.
5995 *
5996 * To be used when there are 4-16 color channels.
5997 */
5998 template <int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
5999 __global__ void img_acts_mediumcolor(const float* hidActs, const float* filters, float* targets,
6000 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6001 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart,
6002 const int moduleStride, const int numImgColors, const int numGroups,
6003 const float scaleTargets, const float scaleOutputs) {
6004 __shared__ float shFilters[colorsPerThread*16][16 + 1];
6005 __shared__ float shHidActs[16][16*imgsPerThread];
6006
6007 const int numImgBlocks = DIVUP(numImages,16*imgsPerThread);
6008 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread;
6009
6010 const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
6011 const int numFilterColors = numImgColors / numGroups;
6012 const int blockGroupIdx = imgColorIdx / numFilterColors;
6013 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6014 const int numFiltersPerGroup = numFilters / numGroups;
6015 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
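    // Worked example (illustrative): numImgColors == 8, numGroups == 2 and
    // colorsPerThread == 2 give numFilterColors == 4 and four color blocks per image
    // block; imgColorIdx takes the values 0, 2, 4, 6, so blockGroupIdx is 0, 0, 1, 1
    // and filterColorIdx (the color offset within the group) is 0, 2, 0, 2.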
6016
6017 const int numRegionsX = DIVUP(imgSizeX, 4);
6018 const int blockRegionIdx = blockIdx.y;
6019 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
6020 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
6021 const int blockRegionLeft = blockRegionIdxX * 4;
6022 const int blockRegionTop = blockRegionIdxY * 4;
6023 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
6024 const int pxY = blockRegionTop + pxYInRegion;
6025 const int pxX = blockRegionLeft + pxXInRegion;
6026 const int pxIdx = pxY * imgSizeX + pxX;
6027 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
6028 const uint numModules = numModulesY * numModulesX;
6029 const int filterPixels = filterSize * filterSize;
6030 const int imgPixels = imgSizeY * imgSizeX;
6031 const int tidx = threadIdx.y * 16 + threadIdx.x;
6032 const int loadY = tidx / 32, loadX = tidx % 32;
6033
6034 hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
6035 filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
6036 targets += imgColorIdx * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x;
6037
6038 float prod[colorsPerThread][imgsPerThread];
6039 #pragma unroll
6040 for (int c = 0; c < colorsPerThread; c++) {
6041 #pragma unroll
6042 for (int i = 0; i < imgsPerThread; i++) {
6043 prod[c][i] = 0;
6044 }
6045 }
6046 const int startY = blockRegionTop - paddingStart < filterSize ? 0
6047 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
6048 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
6049 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
6050 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
6051 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
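    /*
     * Derivation of the module range above (added annotation): module my covers
     * image rows [paddingStart + my*moduleStride, paddingStart + my*moduleStride + filterSize - 1].
     * The first module touching row blockRegionTop needs
     *     paddingStart + my*moduleStride + filterSize - 1 >= blockRegionTop,
     * i.e. my >= ceil((blockRegionTop - paddingStart - filterSize + 1) / moduleStride),
     * which for a positive numerator equals 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride.
     * The last module touching the region's bottom row blockRegionTop + 3 needs
     * paddingStart + my*moduleStride <= blockRegionTop + 3, giving the exclusive
     * bound endY (clamped to numModulesY). startX/endX are analogous in x.
     */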
6052
6053 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
6054 float* shHidActLoad = &shHidActs[loadY][loadX];
6055
6056 for (int my = startY; my < endY; my++) {
6057 const int moduleTop = paddingStart + my * moduleStride;
6058 const int pxInModuleY = pxY - moduleTop;
6059
6060 for (int mx = startX; mx < endX; mx++) {
6061 const int moduleIdx = my * numModulesX + mx;
6062 const int moduleLeft = paddingStart + mx * moduleStride;
6063 const int pxInModuleX = pxX - moduleLeft;
6064
6065 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
6066 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
6067
6068                for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6069 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
6070 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6071 #pragma unroll
6072 for (int i = 0; i < imgsPerThread * 16; i += 32) {
6073 if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) {
6074 #pragma unroll
6075 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6076 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6077 }
6078 } else {
6079 #pragma unroll
6080 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6081 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
6082 }
6083 }
6084 }
6085
6086 if (isPxInImg && isPxInModule) {
6087 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
6088
6089 // Not fully coalesced read :(
6090 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
6091 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
6092 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInModule * numFilters + f];
6093 #pragma unroll
6094 for (int c = 0; c < colorsPerThread; c++) {
6095 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
6096 }
6097 }
6098
6099 __syncthreads();
6100 // Do some actual computation
6101 if (isPxInImg && isPxInModule) {
6102 #pragma unroll
6103 for (int c = 0; c < colorsPerThread; c++) {
6104 #pragma unroll
6105 for (int w = 0; w < 16; w++) {
6106 #pragma unroll
6107 for (int i = 0; i < imgsPerThread; i++) {
6108 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
6109 }
6110 }
6111 }
6112 }
6113 __syncthreads();
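                /* Note (added): the __syncthreads() before the multiply guarantees
                 * that the shHidActs/shFilters tiles are fully loaded before any
                 * thread reads them; the one just above keeps the next f-iteration
                 * from overwriting the tiles while they are still being consumed. */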
6114 }
6115 }
6116 }
6117 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
6118 if (isPxInImg) {
6119 if (scale) {
6120 #pragma unroll
6121 for (int i = 0; i < imgsPerThread; i++) {
6122 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6123 #pragma unroll
6124 for (int c = 0; c < colorsPerThread; c++) {
6125 targets[c * imgPixels * numImages + i * 16] = scaleTargets * targets[c * imgPixels * numImages + i * 16] + scaleOutputs * prod[c][i];
6126 }
6127 }
6128 }
6129 } else {
6130 #pragma unroll
6131 for (int i = 0; i < imgsPerThread; i++) {
6132 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6133 #pragma unroll
6134 for (int c = 0; c < colorsPerThread; c++) {
6135 targets[c * imgPixels * numImages + i * 16] = scaleOutputs * prod[c][i];
6136 }
6137 }
6138 }
6139 }
6140 }
6141 }
6142
6143 /*
6144 * Block size: B_YxB_X.
6145 * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread.
6146 * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread)
6147 * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread)
6148 * blockIdx.y determines image pixel in target image.
6149 *
6150 * threadIdx.x determines case.
6151 * threadIdx.y determines color.
6152 *
6153 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6154 * filters: (numFilterColors, filterPixels, numFilters) if conv
6155 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6156 * targets: (numImageColors, imgSizeY, imgSizeX, numImages)
6157 *
6158  * Each block reconstructs B_Y*colorsPerThread colors of one pixel for B_X*imgsPerThread cases.
6159 *
6160 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
6161 * numFiltersPerGroup must be divisible by 16.
6162 *
6163 * B_X * imgsPerThread must be divisible by 32.
6164 * numFilterColors must be divisible by B_Y*colorsPerThread.
6165 * B_X*B_Y must be divisible by 32.
6166 *
6167 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6168 * It only loads 16 weights at a time, so those aren't fully coalesced.
6169 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6170 *
6171 * To be used when there are >= 16 color channels.
6172 */
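/*
 * Illustrative example (added annotation, sizes hypothetical): with B_Y = 4,
 * B_X = 32, imgsPerThread = 4, colorsPerThread = 4, numImages = 256,
 * numImgColors = 32, and a 32x32 target image:
 *     gridDim.x = (256 / (32*4)) * (32 / (4*4)) = 4  (2 case blocks x 2 color blocks)
 *     gridDim.y = 32 * 32 = 1024                     (one block column per pixel)
 * matching the dim3 computed for this kernel in _imgActs below.
 */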
6173 template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6174 __global__ void conv_img_acts_manycolor(const float* hidActs, const float* filters, float* targets,
6175 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6176 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6177 const int numImgColors, const int numGroups,
6178 const float scaleTargets, const float scaleOutputs) {
6179 __shared__ float shFilters[colorsPerThread*B_Y][16 + 1]; // TODO: perhaps reconsider this 16
6180 __shared__ float shHidActs[16][B_X*imgsPerThread];
6181
6182 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
6183 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;
6184
6185 const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y*colorsPerThread; // color idx globally
6186 const int numFilterColors = numImgColors / numGroups;
6187 const int blockGroupIdx = imgColorIdx / numFilterColors;
6188 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6189 const int numFiltersPerGroup = numFilters / numGroups;
6190 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6191
6192 const int blockPixelIdx = blockIdx.y;
6193 const int blockPixelIdxX = blockPixelIdx % imgSizeX;
6194 const int blockPixelIdxY = blockPixelIdx / imgSizeX;
6195
6196 const int filterPixels = filterSize * filterSize;
6197 const int imgPixels = imgSizeY * imgSizeX;
6198 const int tidx = threadIdx.y * B_X + threadIdx.x;
6199 const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
6200 const int filtersLoadY = tidx / 16, filtersLoadX = tidx % 16;
6201 const int numModules = numModulesY * numModulesX;
6202
6203 hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
6204 filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
6205 targets += (imgColorIdx + threadIdx.y) * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
6206
6207 float prod[colorsPerThread][imgsPerThread];
6208 #pragma unroll
6209 for (int c = 0; c < colorsPerThread; c++) {
6210 #pragma unroll
6211 for (int i = 0; i < imgsPerThread; i++) {
6212 prod[c][i] = 0;
6213 }
6214 }
6215
6216 const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
6217 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
6218 const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
6219 const int startX = blockPixelIdxX - paddingStart < filterSize ? 0
6220 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
6221 const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
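    /* Note (added): same module-range derivation as in img_acts_mediumcolor,
     * with the single pixel (blockPixelIdxY, blockPixelIdxX) playing the role of
     * both the region's top-left and bottom-right corners. */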
6222
6223 float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
6224 float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
6225
6226 for (int my = startY; my < endY; my++) {
6227 const int moduleTop = paddingStart + my * moduleStride;
6228 const int pxInFilterY = blockPixelIdxY - moduleTop;
6229
6230 for (int mx = startX; mx < endX; mx++) {
6231 const int moduleIdx = my * numModulesX + mx;
6232 const int moduleLeft = paddingStart + mx * moduleStride;
6233 const int pxInFilterX = blockPixelIdxX - moduleLeft;
6234
6235 const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
6236
6237 for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6238 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6239 #pragma unroll
6240 for (int i = 0; i < imgsPerThread * B_X; i += 32) {
6241 if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) {
6242 #pragma unroll
6243                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6244 shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6245 }
6246 } else {
6247 #pragma unroll
6248                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6249 shHidActLoad[j * B_X * imgsPerThread + i] = 0;
6250 }
6251 }
6252 }
6253 const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f]
6254 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f];
6255 #pragma unroll
6256 for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/16) {
6257 if ((colorsPerThread*B_Y) % (B_X*B_Y/16) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
6258 shFilterLoad[i * (16 + 1)] = fLoad[i * filterPixels * numFilters];
6259 }
6260 }
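                /* Note (added): the divisibility test above involves only template
                 * parameters, so it is folded at compile time; when it holds, the
                 * per-iteration bounds check disappears from the unrolled loop. */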
6261
6262 __syncthreads();
6263 // Do some actual computation
6264 #pragma unroll
6265 for (int c = 0; c < colorsPerThread; c++) {
6266 #pragma unroll
6267 for (int w = 0; w < 16; w++) {
6268 #pragma unroll
6269 for (int i = 0; i < imgsPerThread; i++) {
6270 prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X];
6271 }
6272 }
6273 }
6274 __syncthreads();
6275 }
6276 }
6277 }
6278 if (scale) {
6279 #pragma unroll
6280 for (int i = 0; i < imgsPerThread; i++) {
6281 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6282 #pragma unroll
6283 for (int c = 0; c < colorsPerThread; c++) {
6284 targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleTargets * targets[c * B_Y * imgPixels * numImages + i * B_X] + scaleOutputs * prod[c][i];
6285 }
6286 }
6287 }
6288 } else {
6289 #pragma unroll
6290 for (int i = 0; i < imgsPerThread; i++) {
6291 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6292 #pragma unroll
6293 for (int c = 0; c < colorsPerThread; c++) {
6294 targets[c * B_Y * imgPixels * numImages + i * B_X] = scaleOutputs * prod[c][i];
6295 }
6296 }
6297 }
6298 }
6299 }
6300
6301
6302 /*
6303 * Block size: 16x16.
6304 * blockIdx.x determines case in batches of 16*imgsPerThread, also color in batches of colorsPerThread.
6305 * In essence, blockIdx.x.x = 1..numImages/(16*imgsPerThread)
6306 * blockIdx.x.y = 1..numImgColors/colorsPerThread
6307 * blockIdx.y determines 4x4 image region in target image, also sample
6308 * In essence, blockIdx.y.x = 1..numRegions
6309 * blockIdx.y.y = 1..overSample
6310 *
6311 * threadIdx.x determines case.
6312 * threadIdx.y determines pixel.
6313 *
6314 * overSample := numFilterColors*numGroups/numImgColors
6315 * ^ this is the number of groups that each color channel is connected to
6316 *
6317 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6318 * filters: (numFilterColors, filterPixels, numFilters) if conv
6319 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6320 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
6321 *
6322 * colorIndices: (numGroups, numFilterColors)
6323 *
6324  * Each block reconstructs one 4x4 pixel region from 16*imgsPerThread cases.
6325 *
6326 * numImages must be divisible by 16*imgsPerThread if checkCaseBounds is false.
6327 * 16 * imgsPerThread must be divisible by 32.
6328 * numFilterColors must be divisible by colorsPerThread.
6329 *
6330 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6331 * It only loads 16 weights at a time, so those aren't fully coalesced.
6332 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6333 *
6334 * To be used when there are 4-16 color channels.
6335 */
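/*
 * Illustrative example (added annotation, sizes hypothetical): with
 * numImgColors = 8, numFilterColors = 4, and numGroups = 4,
 *     overSample = 4*4/8 = 2 and groupsPerSample = 4/2 = 2,
 * i.e. each image channel feeds two groups, gridDim.y = overSample * numRegions,
 * and blockSample selects which of the two per-channel reconstructions this
 * block writes into the leading overSample dimension of targets.
 */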
6336 template <int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6337 __global__ void img_acts_mediumcolor_sparse_rand(const float* hidActs, const float* filters, float* targets, int* colorIndices,
6338 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6339 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6340 const int numImgColors, const int numFilterColors, const int numGroups,
6341 const float scaleTargets, const float scaleOutputs) {
6342 __shared__ float shFilters[colorsPerThread*16][16 + 1];
6343 __shared__ float shHidActs[16][16*imgsPerThread];
6344 __shared__ int shColors[colorsPerThread]; // not really necessary -- can repurpose the other shmems
6345
6346 const int numImgBlocks = DIVUP(numImages,16*imgsPerThread);
6347 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * 16*imgsPerThread;
6348
6349 const int numRegionsX = DIVUP(imgSizeX, 4);
6350 const int numRegions = numRegionsX * numRegionsX;
6351 const int imgColorIdx = (blockIdx.x / numImgBlocks) * colorsPerThread; // color idx globally
6352
6353 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6354 const int numFiltersPerGroup = numFilters / numGroups;
6355
6356 const int overSample = gridDim.y / numRegions;
6357 const int blockSample = blockIdx.y / numRegions;
6358 const int groupsPerSample = numGroups / overSample;
6359 const int blockGroupIdx = imgColorIdx / numFilterColors + blockSample * groupsPerSample;
6360 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6361
6362 const int blockRegionIdx = blockIdx.y % numRegions;
6363 const int blockRegionIdxX = blockRegionIdx % numRegionsX;
6364 const int blockRegionIdxY = blockRegionIdx / numRegionsX;
6365 const int blockRegionLeft = blockRegionIdxX * 4;
6366 const int blockRegionTop = blockRegionIdxY * 4;
6367 const int pxYInRegion = threadIdx.y / 4, pxXInRegion = threadIdx.y % 4;
6368 const int pxY = blockRegionTop + pxYInRegion;
6369 const int pxX = blockRegionLeft + pxXInRegion;
6370 const int pxIdx = pxY * imgSizeX + pxX;
6371 const bool isPxInImg = pxY < imgSizeY && pxX < imgSizeX;
6372 const uint numModules = numModulesY * numModulesX;
6373 const int filterPixels = filterSize * filterSize;
6374 const int imgPixels = imgSizeY * imgSizeX;
6375 const int tidx = threadIdx.y * 16 + threadIdx.x;
6376 const int loadY = tidx / 32, loadX = tidx % 32;
6377
6378 hidActs += blockCaseIdx + (blockFilterIdx + loadY) * numImages * numModules + loadX;
6379 filters += blockFilterIdx + filterColorIdx * filterPixels * numFilters + threadIdx.x;
6380 targets += blockSample * numImgColors * imgPixels * numImages + pxIdx * numImages + blockCaseIdx + threadIdx.x;
6381
6382 float prod[colorsPerThread][imgsPerThread];
6383 #pragma unroll
6384 for (int c = 0; c < colorsPerThread; c++) {
6385 #pragma unroll
6386 for (int i = 0; i < imgsPerThread; i++) {
6387 prod[c][i] = 0;
6388 }
6389 }
6390 const int startY = blockRegionTop - paddingStart < filterSize ? 0
6391 : 1 + (blockRegionTop - paddingStart - filterSize) / moduleStride;
6392 const int endY = MIN(numModulesY, 1 + (blockRegionTop + 3 - paddingStart) / moduleStride);
6393 const int startX = blockRegionLeft - paddingStart < filterSize ? 0
6394 : 1 + (blockRegionLeft - paddingStart - filterSize) / moduleStride;
6395 const int endX = MIN(numModulesX, 1 + (blockRegionLeft + 3 - paddingStart) / moduleStride);
6396
6397 float* shFilterLoad = &shFilters[threadIdx.y][threadIdx.x];
6398 float* shHidActLoad = &shHidActs[loadY][loadX];
6399
6400 if (tidx < colorsPerThread) {
6401 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * numImages;
6402 }
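    /* Note (added): only the first colorsPerThread threads populate shColors; each
     * entry caches the precomputed targets offset of one gathered image channel,
     * colorIndices[...] * imgPixels * numImages, reused by every write at the end. */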
6403
6404 for (int my = startY; my < endY; my++) {
6405 const int moduleTop = paddingStart + my * moduleStride;
6406 const int pxInModuleY = pxY - moduleTop;
6407
6408 for (int mx = startX; mx < endX; mx++) {
6409 const int moduleIdx = my * numModulesX + mx;
6410 const int moduleLeft = paddingStart + mx * moduleStride;
6411 const int pxInModuleX = pxX - moduleLeft;
6412
6413 const bool isPxInModule = pxInModuleY >= 0 && pxInModuleY < filterSize && pxInModuleX >= 0 && pxInModuleX < filterSize;
6414 const int pxIdxInModule = pxInModuleY * filterSize + pxInModuleX;
6415
6416             for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6417 // Now the threads split up into half-warps, and each half-warp decides if it's interested.
6418 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6419 #pragma unroll
6420 for (int i = 0; i < imgsPerThread * 16; i += 32) {
6421 if (!checkCaseBounds || blockCaseIdx + loadX + i < numImages) {
6422 #pragma unroll
6423 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6424 shHidActLoad[j * 16 * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6425 }
6426 } else {
6427 #pragma unroll
6428 for (int j = 0; j < 16; j += 8) { // load 16 rows of imgsPerThread*16 cols, 8 * 32 elements at a time.
6429 shHidActLoad[j * 16 * imgsPerThread + i] = 0;
6430 }
6431 }
6432 }
6433
6434 if (isPxInImg && isPxInModule) {
6435 // This half-warp is interested, so it's going to load the weights from this module to its pixel.
6436
6437 // Not fully coalesced read :(
6438 // But taking out this read entirely only reduces the runtime by ~2.8%, so it isn't costing me much.
6439 const float* fLoad = conv ? &filters[pxIdxInModule * numFilters + f]
6440 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInModule * numFilters + f];
6441 #pragma unroll
6442 for (int c = 0; c < colorsPerThread; c++) {
6443 shFilterLoad[c * 16 * (16 + 1)] = fLoad[c * filterPixels * numFilters];
6444 }
6445 }
6446
6447 __syncthreads();
6448 // Do some actual computation
6449 if (isPxInImg && isPxInModule) {
6450 #pragma unroll
6451 for (int c = 0; c < colorsPerThread; c++) {
6452 #pragma unroll
6453 for (int w = 0; w < 16; w++) {
6454 #pragma unroll
6455 for (int i = 0; i < imgsPerThread; i++) {
6456 prod[c][i] += shFilters[threadIdx.y + c * 16][w] * shHidActs[w][threadIdx.x + i * 16];
6457 }
6458 }
6459 }
6460 }
6461 __syncthreads();
6462 }
6463 }
6464 }
6465 // Not fully coalesced write :(... shmem (and fully coalesced) version is actually slightly slower, though
6466 if (isPxInImg) {
6467 if (scale) {
6468 #pragma unroll
6469 for (int i = 0; i < imgsPerThread; i++) {
6470 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6471 #pragma unroll
6472 for (int c = 0; c < colorsPerThread; c++) {
6473 targets[shColors[c] + i * 16] = scaleTargets * targets[shColors[c] + i * 16] + scaleOutputs * prod[c][i];
6474 }
6475 }
6476 }
6477 } else {
6478 #pragma unroll
6479 for (int i = 0; i < imgsPerThread; i++) {
6480 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * 16 < numImages) {
6481 #pragma unroll
6482 for (int c = 0; c < colorsPerThread; c++) {
6483 targets[shColors[c] + i * 16] = scaleOutputs * prod[c][i];
6484 }
6485 }
6486 }
6487 }
6488 }
6489 }
6490
6491 /*
6492 * Block size: B_YxB_X.
6493 * blockIdx.x determines case in batches of B_X*imgsPerThread, also color in batches of B_Y*colorsPerThread.
6494 * In essence, blockIdx.x.x = 1..numImages/(B_X*imgsPerThread)
6495 * blockIdx.x.y = 1..numImgColors/(B_Y*colorsPerThread)
6496 * blockIdx.y determines image pixel in target image, sample idx.
6497 * In essence, blockIdx.y.x = 1..imgPixels
6498 * blockIdx.y.y = 1..overSample
6499 *
6500 * threadIdx.x determines case.
6501 * threadIdx.y determines color.
6502 *
6503 * overSample := numFilterColors*numGroups/numImgColors
6504 * ^ this is the number of groups that each color channel is connected to
6505 *
6506 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
6507 * filters: (numFilterColors, filterPixels, numFilters) if conv
6508 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
6509 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
6510 *
6511 * colorIndices: (numGroups, numFilterColors)
6512 *
6513  * Each block reconstructs B_Y*colorsPerThread colors of one pixel for B_X*imgsPerThread cases.
6514 *
6515 * numImages must be divisible by B_X*imgsPerThread if checkCaseBounds is false.
6516 * numFiltersPerGroup must be divisible by 16.
6517 * numFilterColors*numGroups must be divisible by numImgColors.
6518 *
6519 * B_X * imgsPerThread must be divisible by 32.
6520 * numFilterColors must be divisible by B_Y*colorsPerThread.
6521 * B_X*B_Y must be divisible by 32.
6522 *
6523 * This version loads 32 cases at a time, so it gets full coalescing on that load.
6524 * It only loads 16 weights at a time, so those aren't fully coalesced.
6525 * This version conserves shared memory by loading 16 filters at a time rather than 32.
6526 *
6527 * To be used when there are >= 16 color channels.
6528 */
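/*
 * Example (added annotation, indices hypothetical): if this block's group has
 * colorIndices row {5, 2, 7, 0} and numFilterColors = 4, then filter color c is
 * written to image channel colorIndices[c]; shColors in the kernel body caches
 * the corresponding offsets colorIndices[c] * imgPixels * numImages.
 */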
6529 template <int B_Y, int B_X, int imgsPerThread, int colorsPerThread, bool scale, bool checkCaseBounds, bool conv>
6530 __global__ void img_acts_manycolor_sparse_rand(const float* hidActs, const float* filters, float* targets, int* colorIndices,
6531 const int numModulesY, const int numModulesX, const int numImages, const int numFilters,
6532 const int filterSize, const int imgSizeY, const int imgSizeX, const int paddingStart, const int moduleStride,
6533 const int numImgColors, const int numFilterColors, const int numGroups,
6534 const float scaleTargets, const float scaleOutputs) {
6535 __shared__ float shFilters[colorsPerThread*B_Y][16 + 1]; // TODO: perhaps reconsider this 16
6536 __shared__ float shHidActs[16][B_X*imgsPerThread];
6537 __shared__ int shColors[colorsPerThread * B_Y]; // not really necessary -- can repurpose the other shmems
6538
6539 const int numImgBlocks = DIVUP(numImages,B_X*imgsPerThread);
6540 const int blockCaseIdx = (blockIdx.x % numImgBlocks) * B_X*imgsPerThread;
6541
6542 const int filterPixels = filterSize * filterSize;
6543 const int imgPixels = imgSizeY * imgSizeX;
6544 const int tidx = threadIdx.y * B_X + threadIdx.x;
6545 const int hidActLoadY = tidx / 32, hidActLoadX = tidx % 32;
6546 const int filtersLoadY = tidx / 16, filtersLoadX = tidx % 16;
6547 const int numModules = numModulesY * numModulesX;
6548
6549 const int overSample = gridDim.y / imgPixels;
6550 const int blockSample = blockIdx.y / imgPixels;
6551 const int groupsPerSample = numGroups / overSample;
6552
6553 // const int overSample = (numFilterColors * numGroups) / numImgColors;
6554 const int imgColorIdx = (blockIdx.x / numImgBlocks) * B_Y * colorsPerThread; // color idx globally
6555 const int blockGroupIdx = imgColorIdx / numFilterColors + blockSample * groupsPerSample;
6556 // const int filterColorsPerSample = numFilterColors / overSample;
6557
6558 const int blockPixelIdx = blockIdx.y % imgPixels;
6559 const int blockPixelIdxX = blockPixelIdx % imgSizeX;
6560 const int blockPixelIdxY = blockPixelIdx / imgSizeX;
6561
6562 const int filterColorIdx = imgColorIdx % numFilterColors; // color idx within group
6563 const int numFiltersPerGroup = numFilters / numGroups;
6564 const int blockFilterIdx = blockGroupIdx * numFiltersPerGroup;
6565
6566 hidActs += blockCaseIdx + (blockFilterIdx + hidActLoadY) * numImages * numModules + hidActLoadX;
6567 filters += blockFilterIdx + (filterColorIdx + filtersLoadY) * filterPixels * numFilters + filtersLoadX;
6568 targets += blockSample * numImgColors * imgPixels * numImages + blockPixelIdx * numImages + blockCaseIdx + threadIdx.x;
6569
6570 float prod[colorsPerThread][imgsPerThread];
6571 #pragma unroll
6572 for (int c = 0; c < colorsPerThread; c++) {
6573 #pragma unroll
6574 for (int i = 0; i < imgsPerThread; i++) {
6575 prod[c][i] = 0;
6576 }
6577 }
6578
6579 const int startY = blockPixelIdxY - paddingStart < filterSize ? 0
6580 : 1 + (blockPixelIdxY - paddingStart - filterSize) / moduleStride;
6581 const int endY = MIN(numModulesY, 1 + (blockPixelIdxY - paddingStart) / moduleStride);
6582 const int startX = blockPixelIdxX - paddingStart < filterSize ? 0
6583 : 1 + (blockPixelIdxX - paddingStart - filterSize) / moduleStride;
6584 const int endX = MIN(numModulesX, 1 + (blockPixelIdxX - paddingStart) / moduleStride);
6585
6586 float* shFilterLoad = &shFilters[filtersLoadY][filtersLoadX];
6587 float* shHidActLoad = &shHidActs[hidActLoadY][hidActLoadX];
6588
6589 if (tidx < colorsPerThread * B_Y) {
6590 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * numImages;
6591 }
6592
6593 for (int my = startY; my < endY; my++) {
6594 const int moduleTop = paddingStart + my * moduleStride;
6595 const int pxInFilterY = blockPixelIdxY - moduleTop;
6596
6597 for (int mx = startX; mx < endX; mx++) {
6598 const int moduleIdx = my * numModulesX + mx;
6599 const int moduleLeft = paddingStart + mx * moduleStride;
6600 const int pxInFilterX = blockPixelIdxX - moduleLeft;
6601
6602 const int pxIdxInFilter = pxInFilterY * filterSize + pxInFilterX;
6603
6604 for (int f = 0; f < numFiltersPerGroup; f += 16) { // multiply with 16 filters at a time
6605 const float* hLoad = &hidActs[(moduleIdx + f * numModules) * numImages];
6606 #pragma unroll
6607 for (int i = 0; i < imgsPerThread * B_X; i += 32) {
6608 if (!checkCaseBounds || blockCaseIdx + hidActLoadX + i < numImages) {
6609 #pragma unroll
6610                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6611 shHidActLoad[j * B_X * imgsPerThread + i] = hLoad[j * numModules * numImages + i];
6612 }
6613 } else {
6614 #pragma unroll
6615                     for (int j = 0; j < 16; j += B_X*B_Y/32) { // load 16 rows of B_X*imgsPerThread cols, (B_X*B_Y/32) * 32 elements at a time.
6616 shHidActLoad[j * B_X * imgsPerThread + i] = 0;
6617 }
6618 }
6619 }
6620
6621 const float* fLoad = conv ? &filters[pxIdxInFilter * numFilters + f]
6622 : &filters[moduleIdx * numFilterColors * filterPixels * numFilters + pxIdxInFilter * numFilters + f];
6623 #pragma unroll
6624 for (int i = 0; i < colorsPerThread*B_Y; i+= B_X*B_Y/16) {
6625 if ((colorsPerThread*B_Y) % (B_X*B_Y/16) == 0 || i + filtersLoadY < colorsPerThread*B_Y) {
6626 shFilterLoad[i * (16 + 1)] = fLoad[i * filterPixels * numFilters];
6627 }
6628 }
6629
6630 __syncthreads();
6631 // Do some actual computation
6632 #pragma unroll
6633 for (int c = 0; c < colorsPerThread; c++) {
6634 #pragma unroll
6635 for (int w = 0; w < 16; w++) {
6636 #pragma unroll
6637 for (int i = 0; i < imgsPerThread; i++) {
6638 prod[c][i] += shFilters[c * B_Y + threadIdx.y][w] * shHidActs[w][threadIdx.x + i * B_X];
6639 }
6640 }
6641 }
6642 __syncthreads();
6643 }
6644 }
6645 }
6646
6647 if (scale) {
6648 #pragma unroll
6649 for (int i = 0; i < imgsPerThread; i++) {
6650 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6651 #pragma unroll
6652 for (int c = 0; c < colorsPerThread; c++) {
6653 targets[shColors[c * B_Y + threadIdx.y] + i * B_X] = scaleTargets * targets[shColors[c * B_Y + threadIdx.y] + i * B_X] + scaleOutputs * prod[c][i];
6654 }
6655 }
6656 }
6657 } else {
6658 #pragma unroll
6659 for (int i = 0; i < imgsPerThread; i++) {
6660 if (!checkCaseBounds || blockCaseIdx + threadIdx.x + i * B_X < numImages) {
6661 #pragma unroll
6662 for (int c = 0; c < colorsPerThread; c++) {
6663 targets[shColors[c * B_Y + threadIdx.y] + i * B_X] = scaleOutputs * prod[c][i];
6664 }
6665 }
6666 }
6667 }
6668 }
6669
6670 /*
6671 * hidActs: (numFilters, numModules, numImages)
6672 * filters: (numFilterColors, filterPixels, numFilters) if conv
6673 * (numModules, numFilterColors, filterPixels, numFilters) otherwise
6674  * targets:     (numImgColors, imgPixels, numImages)
6675 *
6676 * Note: all of these convolution routines are optimized for the case when
6677 * the number of images (i.e. the minibatch size) is a multiple of 128.
6678  * Other batch sizes will work, but I made no attempt whatsoever
6679 * to make them work fast.
6680 */
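/*
 * Usage sketch (added annotation; sizes hypothetical, not from the original
 * source): reconstruct 128 RGB 32x32 images from the activations of a conv
 * layer with 64 5x5 filters, stride 1, zero padding:
 *
 *     NVMatrix hidActs;   // (64 * 28*28) x 128, laid out as documented above
 *     NVMatrix filters;   // (3 * 5*5) x 64
 *     NVMatrix targets;   // resized inside to (3 * 32*32) x 128
 *     _imgActs(hidActs, filters, targets,
 *              32, 32,    // imgSizeY, imgSizeX
 *              28,        // numModulesY (numModulesX = numModules / numModulesY)
 *              0, 1,      // paddingStart (must be <= 0), moduleStride
 *              3, 1,      // numImgColors, numGroups
 *              0, 1,      // scaleTargets = 0 -> overwrite targets; scaleOutput
 *              true);     // conv: filters shared across modules
 */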
6681 void _imgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
6682 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
6683 float scaleTargets, float scaleOutput, bool conv) {
6684 int numFilterColors = numImgColors / numGroups;
6685 int numImages = hidActs.getNumCols();
6686 int numFilters = filters.getNumCols();
6687 int numModules = hidActs.getNumRows() / numFilters;
6688 int filterModuleMult = conv ? 1 : numModules;
6689 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
6690 int filterSize = sqrt((double)filterPixels);
6691 int imgPixels = imgSizeY * imgSizeX;
6692 int numModulesX = numModules / numModulesY;
6693
6694 assert(numImgColors % numGroups == 0);
6695 assert(numFilters % (16*numGroups) == 0);
6696 assert(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 2 == 0)));
6697 assert(numGroups == 1 || numFilterColors % 4 == 0);
6698
6699 assert(filterPixels == filterSize * filterSize);
6700 assert(hidActs.getNumRows() == numModules * numFilters);
6701 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
6702 assert(numModules == numModulesY * numModulesX);
6703
6704 assert(hidActs.isContiguous());
6705 assert(filters.isContiguous());
6706
6707 assert(!hidActs.isTrans());
6708 assert(!filters.isTrans());
6709 assert(!targets.isTrans());
6710 // These routines don't handle the case when only part of the image is visited in the convolution
6711 assert(paddingStart <= 0);
6712 // assert changed into if statement by Ian Goodfellow
6713 if (paddingStart + (numModulesX-1)*moduleStride + filterSize < imgSizeX)
6714 {
6715 printf("imgSizeX: %d\n", imgSizeX);
6716 printf("Bound on image size: %d\n", paddingStart + (numModulesX-1)*moduleStride+filterSize);
6717 printf("paddingStart: %d\n", paddingStart);
6718 printf("numModulesX: %d\n", numModulesX);
6719 printf("moduleStride: %d\n", moduleStride);
6720 printf("filterSize: %d\n", filterSize);
6721 assert(false);
6722 }
6723 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
6724 assert(moduleStride <= filterSize);
6725
6726 assert(targets.isContiguous()); // no stride support here!
6727
6728 dim3 blocks;
6729 dim3 threads(16,16);
6730 int colorsPerThread;
6731 int imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
6732 if (numFilterColors % 8 == 0) {
6733 threads = dim3(32, 4);
6734 colorsPerThread = numFilterColors % 16 == 0 ? 4 : 2;
6735 imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
6736 assert(numFilterColors % (threads.y * colorsPerThread) == 0);
6737
6738 blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), imgPixels);
6739 } else if (numFilterColors > 3) {
6740 colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
6741 blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread) * (numImgColors / colorsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
6742 } else {
6743 blocks = dim3(DIVUP(numImages,threads.x*imgsPerThread), DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
6744 }
6745 bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
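    /* Note (added): the branching below only selects a template instantiation;
     * imgsPerThread, colorsPerThread, scale, checkCaseBounds and conv must be
     * compile-time constants so the kernels' #pragma unroll loops fully unroll. */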
6746
6747 if (scaleTargets == 0) { // do not scale or use targets matrix
6748 targets.resize(numImgColors*imgPixels, numImages);
6749 } else {
6750 assert(targets.getNumRows() == numImgColors * imgPixels);
6751 assert(targets.getNumCols() == numImages);
6752 }
6753 if (conv) { // convolutional units
6754 if (scaleTargets == 0) { // do not scale or use targets matrix
6755 if (numFilterColors % 8 == 0) {
6756 if (imgsPerThread == 4) {
6757 if (checkCaseBounds) {
6758 if (numFilterColors % 16 == 0) {
6759 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, true, true>, cudaFuncCachePreferShared);
6760 conv_img_acts_manycolor<4, 32, 4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6761 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6762 } else {
6763 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, true, true>, cudaFuncCachePreferShared);
6764 conv_img_acts_manycolor<4, 32, 4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6765 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6766 }
6767 } else {
6768 if (numFilterColors % 16 == 0) {
6769 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, false, true>, cudaFuncCachePreferShared);
6770 conv_img_acts_manycolor<4, 32, 4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6771 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6772 } else {
6773 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, false, true>, cudaFuncCachePreferShared);
6774 conv_img_acts_manycolor<4, 32, 4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6775 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6776 }
6777 }
6778 } else if (imgsPerThread == 2) {
6779 if (checkCaseBounds) {
6780 if (numFilterColors % 16 == 0) {
6781 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, true, true>, cudaFuncCachePreferShared);
6782 conv_img_acts_manycolor<4, 32, 2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6783 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6784 } else {
6785 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, true, true>, cudaFuncCachePreferShared);
6786 conv_img_acts_manycolor<4, 32, 2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6787 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6788 }
6789 } else {
6790 if (numFilterColors % 16 == 0) {
6791 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, false, true>, cudaFuncCachePreferShared);
6792 conv_img_acts_manycolor<4, 32, 2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6793 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6794 } else {
6795 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, false, true>, cudaFuncCachePreferShared);
6796 conv_img_acts_manycolor<4, 32, 2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6797 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6798 }
6799 }
6800 } else {
6801 if (checkCaseBounds) {
6802 if (numFilterColors % 16 == 0) {
6803 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, true, true>, cudaFuncCachePreferShared);
6804 conv_img_acts_manycolor<4, 32, 1, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6805 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6806 } else {
6807 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, true, true>, cudaFuncCachePreferShared);
6808 conv_img_acts_manycolor<4, 32, 1, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6809 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6810 }
6811 } else {
6812 if (numFilterColors % 16 == 0) {
6813 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, false, true>, cudaFuncCachePreferShared);
6814 conv_img_acts_manycolor<4, 32, 1, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6815 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6816 } else {
6817 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, false, true>, cudaFuncCachePreferShared);
6818 conv_img_acts_manycolor<4, 32, 1, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6819 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6820 }
6821 }
6822 }
6823 } else if (numFilterColors > 3) {
6824 if (imgsPerThread == 8) {
6825 if (checkCaseBounds) {
6826 if (colorsPerThread == 4) {
6827 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, true, true>, cudaFuncCachePreferShared);
6828 img_acts_mediumcolor<8, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6829 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6830 } else {
6831 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, true, true>, cudaFuncCachePreferShared);
6832 img_acts_mediumcolor<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6833 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6834 }
6835 } else {
6836 if (colorsPerThread == 4) {
6837 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, false, true>, cudaFuncCachePreferShared);
6838 img_acts_mediumcolor<8, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6839 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6840 } else {
6841 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, false, true>, cudaFuncCachePreferShared);
6842 img_acts_mediumcolor<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6843 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6844 }
6845 }
6846 } else if (imgsPerThread == 4) {
6847 if (checkCaseBounds) {
6848 if (colorsPerThread == 4) {
6849 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, true, true>, cudaFuncCachePreferShared);
6850 img_acts_mediumcolor<4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6851 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6852 } else {
6853 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, true, true>, cudaFuncCachePreferShared);
6854 img_acts_mediumcolor<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6855 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6856 }
6857 } else {
6858 if (colorsPerThread == 4) {
6859 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, false, true>, cudaFuncCachePreferShared);
6860 img_acts_mediumcolor<4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6861 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6862 } else {
6863 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, false, true>, cudaFuncCachePreferShared);
6864 img_acts_mediumcolor<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6865 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6866 }
6867 }
6868 } else {
6869 if (checkCaseBounds) {
6870 if (colorsPerThread == 4) {
6871 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, true, true>, cudaFuncCachePreferShared);
6872 img_acts_mediumcolor<2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6873 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6874 } else {
6875 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, true, true>, cudaFuncCachePreferShared);
6876 img_acts_mediumcolor<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6877 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6878 }
6879 } else {
6880 if (colorsPerThread == 4) {
6881 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, false, true>, cudaFuncCachePreferShared);
6882 img_acts_mediumcolor<2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6883 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6884 } else {
6885 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, false, true>, cudaFuncCachePreferShared);
6886 img_acts_mediumcolor<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6887 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6888 }
6889 }
6890 }
6891 } else {
6892 if (imgsPerThread == 8) {
6893 if (checkCaseBounds) {
6894 if (numFilterColors == 1) {
6895 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, true, true>, cudaFuncCachePreferShared);
6896 img_acts_color<8, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6897 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6898 } else if (numFilterColors == 2) {
6899 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, true, true>, cudaFuncCachePreferShared);
6900 img_acts_color<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6901 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6902 } else if (numFilterColors == 3) {
6903 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, true, true>, cudaFuncCachePreferShared);
6904 img_acts_color<8, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6905 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6906 }
6907 } else {
6908 if (numFilterColors == 1) {
6909 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, false, true>, cudaFuncCachePreferShared);
6910 img_acts_color<8, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6911 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6912 } else if (numFilterColors == 2) {
6913 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, false, true>, cudaFuncCachePreferShared);
6914 img_acts_color<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6915 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6916 } else if (numFilterColors == 3) {
6917 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, false, true>, cudaFuncCachePreferShared);
6918 img_acts_color<8, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6919 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6920 }
6921 }
6922 } else if (imgsPerThread == 4) {
6923 if (checkCaseBounds) {
6924 if (numFilterColors == 1) {
6925 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, true, true>, cudaFuncCachePreferShared);
6926 img_acts_color<4, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6927 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6928 } else if (numFilterColors == 2) {
6929 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, true, true>, cudaFuncCachePreferShared);
6930 img_acts_color<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6931 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6932 } else if (numFilterColors == 3) {
6933 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, true, true>, cudaFuncCachePreferShared);
6934 img_acts_color<4, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6935 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6936 }
6937 } else {
6938 if (numFilterColors == 1) {
6939 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, false, true>, cudaFuncCachePreferShared);
6940 img_acts_color<4, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6941 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6942 } else if (numFilterColors == 2) {
6943 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, false, true>, cudaFuncCachePreferShared);
6944 img_acts_color<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6945 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6946 } else if (numFilterColors == 3) {
6947 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, false, true>, cudaFuncCachePreferShared);
6948 img_acts_color<4, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6949 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6950 }
6951 }
6952 } else {
6953 if (checkCaseBounds) {
6954 if (numFilterColors == 1) {
6955 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, true, true>, cudaFuncCachePreferShared);
6956 img_acts_color<2, 1, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6957 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6958 } else if (numFilterColors == 2) {
6959 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, true, true>, cudaFuncCachePreferShared);
6960 img_acts_color<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6961 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6962 } else if (numFilterColors == 3) {
6963 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, true, true>, cudaFuncCachePreferShared);
6964 img_acts_color<2, 3, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6965 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6966 }
6967 } else {
6968 if (numFilterColors == 1) {
6969 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, false, true>, cudaFuncCachePreferShared);
6970 img_acts_color<2, 1, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6971 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6972 } else if (numFilterColors == 2) {
6973 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, false, true>, cudaFuncCachePreferShared);
6974 img_acts_color<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6975 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6976 } else if (numFilterColors == 3) {
6977 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, false, true>, cudaFuncCachePreferShared);
6978 img_acts_color<2, 3, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6979 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
6980 }
6981 }
6982 }
6983 }
6984 } else { // do scale
6985 if (numFilterColors % 8 == 0) {
6986 if (imgsPerThread == 4) {
6987 if (checkCaseBounds) {
6988 if (numFilterColors % 16 == 0) {
6989 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, true, true>, cudaFuncCachePreferShared);
6990 conv_img_acts_manycolor<4, 32, 4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6991 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6992 } else {
6993 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, true, true>, cudaFuncCachePreferShared);
6994 conv_img_acts_manycolor<4, 32, 4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
6995 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
6996 }
6997 } else {
6998 if (numFilterColors % 16 == 0) {
6999 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, false, true>, cudaFuncCachePreferShared);
7000 conv_img_acts_manycolor<4, 32, 4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7001 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7002 } else {
7003 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, false, true>, cudaFuncCachePreferShared);
7004 conv_img_acts_manycolor<4, 32, 4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7005 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7006 }
7007 }
7008 } else if (imgsPerThread == 2) {
7009 if (checkCaseBounds) {
7010 if (numFilterColors % 16 == 0) {
7011 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, true, true>, cudaFuncCachePreferShared);
7012 conv_img_acts_manycolor<4, 32, 2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7013 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7014 } else {
7015 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, true, true>, cudaFuncCachePreferShared);
7016 conv_img_acts_manycolor<4, 32, 2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7017 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7018 }
7019 } else {
7020 if (numFilterColors % 16 == 0) {
7021 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, false, true>, cudaFuncCachePreferShared);
7022 conv_img_acts_manycolor<4, 32, 2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7023 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7024 } else {
7025 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, false, true>, cudaFuncCachePreferShared);
7026 conv_img_acts_manycolor<4, 32, 2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7027 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7028 }
7029 }
7030 } else {
7031 if (checkCaseBounds) {
7032 if (numFilterColors % 16 == 0) {
7033 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, true, true>, cudaFuncCachePreferShared);
7034 conv_img_acts_manycolor<4, 32, 1, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7035 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7036 } else {
7037 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, true, true>, cudaFuncCachePreferShared);
7038 conv_img_acts_manycolor<4, 32, 1, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7039 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7040 }
7041 } else {
7042 if (numFilterColors % 16 == 0) {
7043 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, false, true>, cudaFuncCachePreferShared);
7044 conv_img_acts_manycolor<4, 32, 1, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7045 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7046 } else {
7047 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, false, true>, cudaFuncCachePreferShared);
7048 conv_img_acts_manycolor<4, 32, 1, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7049 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7050 }
7051 }
7052 }
7053 } else if (numFilterColors > 3) {
7054 if (imgsPerThread == 8) {
7055 if (checkCaseBounds) {
7056 if (colorsPerThread == 4) {
7057 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, true, true>, cudaFuncCachePreferShared);
7058 img_acts_mediumcolor<8, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7059 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7060 } else {
7061 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, true, true>, cudaFuncCachePreferShared);
7062 img_acts_mediumcolor<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7063 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7064 }
7065 } else {
7066 if (colorsPerThread == 4) {
7067 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, false, true>, cudaFuncCachePreferShared);
7068 img_acts_mediumcolor<8, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7069 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7070 } else {
7071 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, false, true>, cudaFuncCachePreferShared);
7072 img_acts_mediumcolor<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7073 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7074 }
7075 }
7076 } else if (imgsPerThread == 4) {
7077 if (checkCaseBounds) {
7078 if (colorsPerThread == 4) {
7079 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, true, true>, cudaFuncCachePreferShared);
7080 img_acts_mediumcolor<4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7081 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7082 } else {
7083 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, true, true>, cudaFuncCachePreferShared);
7084 img_acts_mediumcolor<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7085 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7086 }
7087 } else {
7088 if (colorsPerThread == 4) {
7089 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, false, true>, cudaFuncCachePreferShared);
7090 img_acts_mediumcolor<4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7091 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7092 } else {
7093 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, false, true>, cudaFuncCachePreferShared);
7094 img_acts_mediumcolor<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7095 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7096 }
7097 }
7098 } else {
7099 if (checkCaseBounds) {
7100 if (colorsPerThread == 4) {
7101 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, true, true>, cudaFuncCachePreferShared);
7102 img_acts_mediumcolor<2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7103 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7104 } else {
7105 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, true, true>, cudaFuncCachePreferShared);
7106 img_acts_mediumcolor<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7107 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7108 }
7109 } else {
7110 if (colorsPerThread == 4) {
7111 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, false, true>, cudaFuncCachePreferShared);
7112 img_acts_mediumcolor<2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7113 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7114 } else {
7115 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, false, true>, cudaFuncCachePreferShared);
7116 img_acts_mediumcolor<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7117 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7118 }
7119 }
7120 }
7121 } else {
7122 if (imgsPerThread == 8) {
7123 if (checkCaseBounds) {
7124 if (numFilterColors == 1) {
7125 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, true, true>, cudaFuncCachePreferShared);
7126 img_acts_color<8, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7127 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7128 } else if (numFilterColors == 2) {
7129 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, true, true>, cudaFuncCachePreferShared);
7130 img_acts_color<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7131 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7132 } else if (numFilterColors == 3) {
7133 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, true, true>, cudaFuncCachePreferShared);
7134 img_acts_color<8, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7135 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7136 }
7137 } else {
7138 if (numFilterColors == 1) {
7139 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, false, true>, cudaFuncCachePreferShared);
7140 img_acts_color<8, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7141 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7142 } else if (numFilterColors == 2) {
7143 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, false, true>, cudaFuncCachePreferShared);
7144 img_acts_color<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7145 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7146 } else if (numFilterColors == 3) {
7147 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, false, true>, cudaFuncCachePreferShared);
7148 img_acts_color<8, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7149 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7150 }
7151 }
7152 } else if (imgsPerThread == 4) {
7153 if (checkCaseBounds) {
7154 if (numFilterColors == 1) {
7155 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, true, true>, cudaFuncCachePreferShared);
7156 img_acts_color<4, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7157 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7158 } else if (numFilterColors == 2) {
7159 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, true, true>, cudaFuncCachePreferShared);
7160 img_acts_color<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7161 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7162 } else if (numFilterColors == 3) {
7163 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, true, true>, cudaFuncCachePreferShared);
7164 img_acts_color<4, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7165 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7166 }
7167 } else {
7168 if (numFilterColors == 1) {
7169 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, false, true>, cudaFuncCachePreferShared);
7170 img_acts_color<4, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7171 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7172 } else if (numFilterColors == 2) {
7173 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, false, true>, cudaFuncCachePreferShared);
7174 img_acts_color<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7175 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7176 } else if (numFilterColors == 3) {
7177 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, false, true>, cudaFuncCachePreferShared);
7178 img_acts_color<4, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7179 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7180 }
7181 }
7182 } else {
7183 if (checkCaseBounds) {
7184 if (numFilterColors == 1) {
7185 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, true, true>, cudaFuncCachePreferShared);
7186 img_acts_color<2, 1, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7187 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7188 } else if (numFilterColors == 2) {
7189 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, true, true>, cudaFuncCachePreferShared);
7190 img_acts_color<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7191 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7192 } else if (numFilterColors == 3) {
7193 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, true, true>, cudaFuncCachePreferShared);
7194 img_acts_color<2, 3, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7195 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7196 }
7197 } else {
7198 if (numFilterColors == 1) {
7199 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, false, true>, cudaFuncCachePreferShared);
7200 img_acts_color<2, 1, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7201 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7202 } else if (numFilterColors == 2) {
7203 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, false, true>, cudaFuncCachePreferShared);
7204 img_acts_color<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7205 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7206 } else if (numFilterColors == 3) {
7207 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, false, true>, cudaFuncCachePreferShared);
7208 img_acts_color<2, 3, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7209 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7210 }
7211 }
7212 }
7213 }
7214 }
7215 } else { // local, unshared units
7216 if (scaleTargets == 0) { // do not scale or use targets matrix
7217 if (numFilterColors % 8 == 0) {
7218 if (imgsPerThread == 4) {
7219 if (checkCaseBounds) {
7220 if (numFilterColors % 16 == 0) {
7221 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, true, false>, cudaFuncCachePreferShared);
7222 conv_img_acts_manycolor<4, 32, 4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7223 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7224 } else {
7225 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, true, false>, cudaFuncCachePreferShared);
7226 conv_img_acts_manycolor<4, 32, 4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7227 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7228 }
7229 } else {
7230 if (numFilterColors % 16 == 0) {
7231 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, false, false, false>, cudaFuncCachePreferShared);
7232 conv_img_acts_manycolor<4, 32, 4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7233 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7234 } else {
7235 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, false, false, false>, cudaFuncCachePreferShared);
7236 conv_img_acts_manycolor<4, 32, 4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7237 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7238 }
7239 }
7240 } else if (imgsPerThread == 2) {
7241 if (checkCaseBounds) {
7242 if (numFilterColors % 16 == 0) {
7243 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, true, false>, cudaFuncCachePreferShared);
7244 conv_img_acts_manycolor<4, 32, 2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7245 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7246 } else {
7247 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, true, false>, cudaFuncCachePreferShared);
7248 conv_img_acts_manycolor<4, 32, 2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7249 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7250 }
7251 } else {
7252 if (numFilterColors % 16 == 0) {
7253 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, false, false, false>, cudaFuncCachePreferShared);
7254 conv_img_acts_manycolor<4, 32, 2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7255 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7256 } else {
7257 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, false, false, false>, cudaFuncCachePreferShared);
7258 conv_img_acts_manycolor<4, 32, 2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7259 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7260 }
7261 }
7262 } else {
7263 if (checkCaseBounds) {
7264 if (numFilterColors % 16 == 0) {
7265 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, true, false>, cudaFuncCachePreferShared);
7266 conv_img_acts_manycolor<4, 32, 1, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7267 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7268 } else {
7269 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, true, false>, cudaFuncCachePreferShared);
7270 conv_img_acts_manycolor<4, 32, 1, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7271 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7272 }
7273 } else {
7274 if (numFilterColors % 16 == 0) {
7275 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, false, false, false>, cudaFuncCachePreferShared);
7276 conv_img_acts_manycolor<4, 32, 1, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7277 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7278 } else {
7279 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, false, false, false>, cudaFuncCachePreferShared);
7280 conv_img_acts_manycolor<4, 32, 1, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7281 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7282 }
7283 }
7284 }
7285 } else if (numFilterColors > 3) {
7286 if (imgsPerThread == 8) {
7287 if (checkCaseBounds) {
7288 if (colorsPerThread == 4) {
7289 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, true, false>, cudaFuncCachePreferShared);
7290 img_acts_mediumcolor<8, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7291 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7292 } else {
7293 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, true, false>, cudaFuncCachePreferShared);
7294 img_acts_mediumcolor<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7295 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7296 }
7297 } else {
7298 if (colorsPerThread == 4) {
7299 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, false, false, false>, cudaFuncCachePreferShared);
7300 img_acts_mediumcolor<8, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7301 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7302 } else {
7303 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, false, false, false>, cudaFuncCachePreferShared);
7304 img_acts_mediumcolor<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7305 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7306 }
7307 }
7308 } else if (imgsPerThread == 4) {
7309 if (checkCaseBounds) {
7310 if (colorsPerThread == 4) {
7311 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, true, false>, cudaFuncCachePreferShared);
7312 img_acts_mediumcolor<4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7313 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7314 } else {
7315 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, true, false>, cudaFuncCachePreferShared);
7316 img_acts_mediumcolor<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7317 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7318 }
7319 } else {
7320 if (colorsPerThread == 4) {
7321 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, false, false, false>, cudaFuncCachePreferShared);
7322 img_acts_mediumcolor<4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7323 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7324 } else {
7325 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, false, false, false>, cudaFuncCachePreferShared);
7326 img_acts_mediumcolor<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7327 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7328 }
7329 }
7330 } else {
7331 if (checkCaseBounds) {
7332 if (colorsPerThread == 4) {
7333 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, true, false>, cudaFuncCachePreferShared);
7334 img_acts_mediumcolor<2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7335 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7336 } else {
7337 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, true, false>, cudaFuncCachePreferShared);
7338 img_acts_mediumcolor<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7339 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7340 }
7341 } else {
7342 if (colorsPerThread == 4) {
7343 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, false, false, false>, cudaFuncCachePreferShared);
7344 img_acts_mediumcolor<2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7345 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7346 } else {
7347 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, false, false, false>, cudaFuncCachePreferShared);
7348 img_acts_mediumcolor<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7349 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7350 }
7351 }
7352 }
7353 } else {
7354 if (imgsPerThread == 8) {
7355 if (checkCaseBounds) {
7356 if (numFilterColors == 1) {
7357 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, true, false>, cudaFuncCachePreferShared);
7358 img_acts_color<8, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7359 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7360 } else if (numFilterColors == 2) {
7361 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, true, false>, cudaFuncCachePreferShared);
7362 img_acts_color<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7363 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7364 } else if (numFilterColors == 3) {
7365 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, true, false>, cudaFuncCachePreferShared);
7366 img_acts_color<8, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7367 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7368 }
7369 } else {
7370 if (numFilterColors == 1) {
7371 cudaFuncSetCacheConfig(img_acts_color<8, 1, false, false, false>, cudaFuncCachePreferShared);
7372 img_acts_color<8, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7373 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7374 } else if (numFilterColors == 2) {
7375 cudaFuncSetCacheConfig(img_acts_color<8, 2, false, false, false>, cudaFuncCachePreferShared);
7376 img_acts_color<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7377 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7378 } else if (numFilterColors == 3) {
7379 cudaFuncSetCacheConfig(img_acts_color<8, 3, false, false, false>, cudaFuncCachePreferShared);
7380 img_acts_color<8, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7381 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7382 }
7383 }
7384 } else if (imgsPerThread == 4) {
7385 if (checkCaseBounds) {
7386 if (numFilterColors == 1) {
7387 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, true, false>, cudaFuncCachePreferShared);
7388 img_acts_color<4, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7389 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7390 } else if (numFilterColors == 2) {
7391 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, true, false>, cudaFuncCachePreferShared);
7392 img_acts_color<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7393 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7394 } else if (numFilterColors == 3) {
7395 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, true, false>, cudaFuncCachePreferShared);
7396 img_acts_color<4, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7397 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7398 }
7399 } else {
7400 if (numFilterColors == 1) {
7401 cudaFuncSetCacheConfig(img_acts_color<4, 1, false, false, false>, cudaFuncCachePreferShared);
7402 img_acts_color<4, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7403 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7404 } else if (numFilterColors == 2) {
7405 cudaFuncSetCacheConfig(img_acts_color<4, 2, false, false, false>, cudaFuncCachePreferShared);
7406 img_acts_color<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7407 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7408 } else if (numFilterColors == 3) {
7409 cudaFuncSetCacheConfig(img_acts_color<4, 3, false, false, false>, cudaFuncCachePreferShared);
7410 img_acts_color<4, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7411 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7412 }
7413 }
7414 } else {
7415 if (checkCaseBounds) {
7416 if (numFilterColors == 1) {
7417 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, true, false>, cudaFuncCachePreferShared);
7418 img_acts_color<2, 1, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7419 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7420 } else if (numFilterColors == 2) {
7421 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, true, false>, cudaFuncCachePreferShared);
7422 img_acts_color<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7423 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7424 } else if (numFilterColors == 3) {
7425 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, true, false>, cudaFuncCachePreferShared);
7426 img_acts_color<2, 3, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7427 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7428 }
7429 } else {
7430 if (numFilterColors == 1) {
7431 cudaFuncSetCacheConfig(img_acts_color<2, 1, false, false, false>, cudaFuncCachePreferShared);
7432 img_acts_color<2, 1, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7433 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7434 } else if (numFilterColors == 2) {
7435 cudaFuncSetCacheConfig(img_acts_color<2, 2, false, false, false>, cudaFuncCachePreferShared);
7436 img_acts_color<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7437 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7438 } else if (numFilterColors == 3) {
7439 cudaFuncSetCacheConfig(img_acts_color<2, 3, false, false, false>, cudaFuncCachePreferShared);
7440 img_acts_color<2, 3, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7441 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7442 }
7443 }
7444 }
7445 }
7446 } else { // do scale
7447 if (numFilterColors % 8 == 0) {
7448 if (imgsPerThread == 4) {
7449 if (checkCaseBounds) {
7450 if (numFilterColors % 16 == 0) {
7451 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, true, false>, cudaFuncCachePreferShared);
7452 conv_img_acts_manycolor<4, 32, 4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7453 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7454 } else {
7455 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, true, false>, cudaFuncCachePreferShared);
7456 conv_img_acts_manycolor<4, 32, 4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7457 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7458 }
7459 } else {
7460 if (numFilterColors % 16 == 0) {
7461 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 4, true, false, false>, cudaFuncCachePreferShared);
7462 conv_img_acts_manycolor<4, 32, 4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7463 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7464 } else {
7465 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 4, 2, true, false, false>, cudaFuncCachePreferShared);
7466 conv_img_acts_manycolor<4, 32, 4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7467 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7468 }
7469 }
7470 } else if (imgsPerThread == 2) {
7471 if (checkCaseBounds) {
7472 if (numFilterColors % 16 == 0) {
7473 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, true, false>, cudaFuncCachePreferShared);
7474 conv_img_acts_manycolor<4, 32, 2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7475 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7476 } else {
7477 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, true, false>, cudaFuncCachePreferShared);
7478 conv_img_acts_manycolor<4, 32, 2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7479 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7480 }
7481 } else {
7482 if (numFilterColors % 16 == 0) {
7483 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 4, true, false, false>, cudaFuncCachePreferShared);
7484 conv_img_acts_manycolor<4, 32, 2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7485 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7486 } else {
7487 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 2, 2, true, false, false>, cudaFuncCachePreferShared);
7488 conv_img_acts_manycolor<4, 32, 2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7489 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7490 }
7491 }
7492 } else {
7493 if (checkCaseBounds) {
7494 if (numFilterColors % 16 == 0) {
7495 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, true, false>, cudaFuncCachePreferShared);
7496 conv_img_acts_manycolor<4, 32, 1, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7497 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7498 } else {
7499 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, true, false>, cudaFuncCachePreferShared);
7500 conv_img_acts_manycolor<4, 32, 1, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7501 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7502 }
7503 } else {
7504 if (numFilterColors % 16 == 0) {
7505 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 4, true, false, false>, cudaFuncCachePreferShared);
7506 conv_img_acts_manycolor<4, 32, 1, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7507 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7508 } else {
7509 cudaFuncSetCacheConfig(conv_img_acts_manycolor<4, 32, 1, 2, true, false, false>, cudaFuncCachePreferShared);
7510 conv_img_acts_manycolor<4, 32, 1, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7511 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7512 }
7513 }
7514 }
7515 } else if (numFilterColors > 3) {
7516 if (imgsPerThread == 8) {
7517 if (checkCaseBounds) {
7518 if (colorsPerThread == 4) {
7519 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, true, false>, cudaFuncCachePreferShared);
7520 img_acts_mediumcolor<8, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7521 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7522 } else {
7523 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, true, false>, cudaFuncCachePreferShared);
7524 img_acts_mediumcolor<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7525 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7526 }
7527 } else {
7528 if (colorsPerThread == 4) {
7529 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 4, true, false, false>, cudaFuncCachePreferShared);
7530 img_acts_mediumcolor<8, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7531 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7532 } else {
7533 cudaFuncSetCacheConfig(img_acts_mediumcolor<8, 2, true, false, false>, cudaFuncCachePreferShared);
7534 img_acts_mediumcolor<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7535 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7536 }
7537 }
7538 } else if (imgsPerThread == 4) {
7539 if (checkCaseBounds) {
7540 if (colorsPerThread == 4) {
7541 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, true, false>, cudaFuncCachePreferShared);
7542 img_acts_mediumcolor<4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7543 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7544 } else {
7545 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, true, false>, cudaFuncCachePreferShared);
7546 img_acts_mediumcolor<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7547 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7548 }
7549 } else {
7550 if (colorsPerThread == 4) {
7551 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 4, true, false, false>, cudaFuncCachePreferShared);
7552 img_acts_mediumcolor<4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7553 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7554 } else {
7555 cudaFuncSetCacheConfig(img_acts_mediumcolor<4, 2, true, false, false>, cudaFuncCachePreferShared);
7556 img_acts_mediumcolor<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7557 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7558 }
7559 }
7560 } else {
7561 if (checkCaseBounds) {
7562 if (colorsPerThread == 4) {
7563 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, true, false>, cudaFuncCachePreferShared);
7564 img_acts_mediumcolor<2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7565 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7566 } else {
7567 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, true, false>, cudaFuncCachePreferShared);
7568 img_acts_mediumcolor<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7569 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7570 }
7571 } else {
7572 if (colorsPerThread == 4) {
7573 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 4, true, false, false>, cudaFuncCachePreferShared);
7574 img_acts_mediumcolor<2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7575 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7576 } else {
7577 cudaFuncSetCacheConfig(img_acts_mediumcolor<2, 2, true, false, false>, cudaFuncCachePreferShared);
7578 img_acts_mediumcolor<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7579 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput);
7580 }
7581 }
7582 }
7583 } else {
7584 if (imgsPerThread == 8) {
7585 if (checkCaseBounds) {
7586 if (numFilterColors == 1) {
7587 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, true, false>, cudaFuncCachePreferShared);
7588 img_acts_color<8, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7589 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7590 } else if (numFilterColors == 2) {
7591 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, true, false>, cudaFuncCachePreferShared);
7592 img_acts_color<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7593 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7594 } else if (numFilterColors == 3) {
7595 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, true, false>, cudaFuncCachePreferShared);
7596 img_acts_color<8, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7597 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7598 }
7599 } else {
7600 if (numFilterColors == 1) {
7601 cudaFuncSetCacheConfig(img_acts_color<8, 1, true, false, false>, cudaFuncCachePreferShared);
7602 img_acts_color<8, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7603 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7604 } else if (numFilterColors == 2) {
7605 cudaFuncSetCacheConfig(img_acts_color<8, 2, true, false, false>, cudaFuncCachePreferShared);
7606 img_acts_color<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7607 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7608 } else if (numFilterColors == 3) {
7609 cudaFuncSetCacheConfig(img_acts_color<8, 3, true, false, false>, cudaFuncCachePreferShared);
7610 img_acts_color<8, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7611 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7612 }
7613 }
7614 } else if (imgsPerThread == 4) {
7615 if (checkCaseBounds) {
7616 if (numFilterColors == 1) {
7617 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, true, false>, cudaFuncCachePreferShared);
7618 img_acts_color<4, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7619 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7620 } else if (numFilterColors == 2) {
7621 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, true, false>, cudaFuncCachePreferShared);
7622 img_acts_color<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7623 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7624 } else if (numFilterColors == 3) {
7625 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, true, false>, cudaFuncCachePreferShared);
7626 img_acts_color<4, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7627 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7628 }
7629 } else {
7630 if (numFilterColors == 1) {
7631 cudaFuncSetCacheConfig(img_acts_color<4, 1, true, false, false>, cudaFuncCachePreferShared);
7632 img_acts_color<4, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7633 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7634 } else if (numFilterColors == 2) {
7635 cudaFuncSetCacheConfig(img_acts_color<4, 2, true, false, false>, cudaFuncCachePreferShared);
7636 img_acts_color<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7637 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7638 } else if (numFilterColors == 3) {
7639 cudaFuncSetCacheConfig(img_acts_color<4, 3, true, false, false>, cudaFuncCachePreferShared);
7640 img_acts_color<4, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7641 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7642 }
7643 }
7644 } else {
7645 if (checkCaseBounds) {
7646 if (numFilterColors == 1) {
7647 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, true, false>, cudaFuncCachePreferShared);
7648 img_acts_color<2, 1, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7649 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7650 } else if (numFilterColors == 2) {
7651 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, true, false>, cudaFuncCachePreferShared);
7652 img_acts_color<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7653 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7654 } else if (numFilterColors == 3) {
7655 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, true, false>, cudaFuncCachePreferShared);
7656 img_acts_color<2, 3, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7657 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7658 }
7659 } else {
7660 if (numFilterColors == 1) {
7661 cudaFuncSetCacheConfig(img_acts_color<2, 1, true, false, false>, cudaFuncCachePreferShared);
7662 img_acts_color<2, 1, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7663 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7664 } else if (numFilterColors == 2) {
7665 cudaFuncSetCacheConfig(img_acts_color<2, 2, true, false, false>, cudaFuncCachePreferShared);
7666 img_acts_color<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7667 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7668 } else if (numFilterColors == 3) {
7669 cudaFuncSetCacheConfig(img_acts_color<2, 3, true, false, false>, cudaFuncCachePreferShared);
7670 img_acts_color<2, 3, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(),
7671 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, scaleTargets, scaleOutput);
7672 }
7673 }
7674 }
7675 }
7676 }
7677 }
7678
7679 cutilCheckMsg("imgActs: kernel execution failed");
7680 }
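/*
 * Editorial sketch (not part of the original library): the wall of branches in
 * _imgActs above exists to turn the runtime flags (scaleTargets != 0,
 * checkCaseBounds, conv) and the imgsPerThread/colorsPerThread tile sizes into
 * compile-time template arguments, so each instantiated kernel can drop the
 * branches and bounds checks it does not need. A minimal illustration of the
 * same idiom, using hypothetical names:
 */
template <bool scale, bool checkBounds>
__global__ void exampleDispatchKernel(float* tgt, const int n, const float a) {
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (!checkBounds || i < n) {                    // branch vanishes when checkBounds == false
        tgt[i] = scale ? a * tgt[i] + 1.0f : 1.0f;  // 'scale' is resolved at compile time
    }
}

void exampleDispatch(float* tgt, const int n, const float a, const bool scale) {
    const dim3 threads(128), blocks(DIVUP(n, 128));
    // The fast (unchecked) path is only legal when the grid covers n exactly,
    // mirroring the checkCaseBounds logic used by _imgActs above.
    const bool checkBounds = n % 128 != 0;
    if (scale) {
        if (checkBounds) { exampleDispatchKernel<true, true><<<blocks, threads>>>(tgt, n, a); }
        else             { exampleDispatchKernel<true, false><<<blocks, threads>>>(tgt, n, a); }
    } else {
        if (checkBounds) { exampleDispatchKernel<false, true><<<blocks, threads>>>(tgt, n, a); }
        else             { exampleDispatchKernel<false, false><<<blocks, threads>>>(tgt, n, a); }
    }
}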
7681
7682
7683 void convImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7684 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
7685 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, true);
7686 }
7687
7688 void convImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7689 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
7690 float scaleTargets, float scaleOutput) {
7691 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, true);
7692 }
7693
7694 void localImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7695 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
7696 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, 0, 1, false);
7697 }
7698
7699 void localImgActs(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets,
7700 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numGroups,
7701 float scaleTargets, float scaleOutput) {
7702 _imgActs(hidActs, filters, targets, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride, numImgColors, numGroups, scaleTargets, scaleOutput, false);
7703 }
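/*
 * Usage sketch (illustrative sizes, not from the original source): for a
 * convolutional layer with 16 image colors, 64 filters of size 5x5 applied to
 * 32x32 inputs with stride 1 and zero padding of 2, the backward-data pass
 * could be invoked as:
 *
 *     // hidActs: (numFilters * numModulesY * numModulesX, numImages)
 *     // filters: (numFilterColors * filterPixels, numFilters)
 *     // targets: resized by _imgActs to (numImgColors * imgPixels, numImages)
 *     convImgActs(hidActs, filters, targets,
 *                 32, 32,  // imgSizeY, imgSizeX
 *                 32,      // numModulesY (32x32 module grid, "same"-style padding)
 *                 -2,      // paddingStart (non-positive; -2 means 2 pixels of padding)
 *                 1,       // moduleStride
 *                 16,      // numImgColors
 *                 1);      // numGroups
 */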
7704
7705
7706 /*
7707 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
7708 * filters: (numFilterColors, filterPixels, numFilters) if conv
7709 * (numModulesY, numModulesX, numFilterColors, filterPixels, numFilters) otherwise
7710 * targets: (overSample, numImgColors, imgSizeY, imgSizeX, numImages)
7711 * colorIndices: (numGroups, numFilterColors)
7712 *
7713 * where overSample := (numFilterColors * numGroups) / numImgColors
7714 *
7715 * Note: all of these convolution routines are optimized for the case when
7716 * the number of images (i.e. the minibatch size) is a multiple of 128.
7717  * Other batch sizes will work, but I made no attempt whatsoever
7718 * to make them work fast.
7719 */
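/*
 * Worked example of overSample (illustrative numbers): with numImgColors = 32,
 * numGroups = 4 and numFilterColors = 16, each filter group reads a subset of
 * 16 of the 32 image colors, so overSample = (16 * 4) / 32 = 2. The targets
 * matrix is accordingly resized below to overSample * numImgColors * imgPixels
 * rows, i.e. it holds two partial image-gradient copies per color, presumably
 * to be summed over the overSample dimension downstream.
 */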
7720 void _imgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
7721 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride,
7722 int numImgColors, int numFilterColors, int numGroups,
7723 float scaleTargets, float scaleOutput, bool conv) {
7724 int numImages = hidActs.getNumCols();
7725 int numFilters = filters.getNumCols();
7726 // int numFiltersPerGroup = numFilters / numGroups;
7727 int numModules = hidActs.getNumRows() / numFilters;
7728 int filterModuleMult = conv ? 1 : numModules;
7729 int filterPixels = filters.getNumRows() / (filterModuleMult * numFilterColors);
7730 int filterSize = sqrt((double)filterPixels);
7731 int imgPixels = imgSizeY * imgSizeX;
7732 int numModulesX = numModules / numModulesY;
7733 int overSample = (numFilterColors * numGroups) / numImgColors;
7734
7735 assert(numImgColors % numFilterColors == 0);
7736 assert(numFilters % (16*numGroups) == 0);
7737 assert((numFilterColors * numGroups) % numImgColors == 0);
7738 assert(numGroups > 1);
7739 assert(numFilterColors > 3 && numFilterColors % 2 == 0);
7740
7741 assert(filterPixels == filterSize * filterSize);
7742 assert(hidActs.getNumRows() == numModules * numFilters);
7743 assert(filters.getNumRows() == filterModuleMult * numFilterColors * filterPixels);
7744 assert(numModules == numModulesY * numModulesX);
7745
7746 assert(hidActs.isContiguous());
7747 assert(filters.isContiguous());
7748
7749 assert(!hidActs.isTrans());
7750 assert(!filters.isTrans());
7751 assert(!targets.isTrans());
7752 // These routines don't handle the case when only part of the image is visited in the convolution
7753 assert(paddingStart <= 0);
7754 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
7755 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
7756 assert(moduleStride <= filterSize);
7757
7758 assert(targets.isContiguous()); // no stride support here!
7759
7760 dim3 blocks;
7761 dim3 threads;
7762 int colorsPerThread;
7763 int imgsPerThread;
7764 if (numFilterColors % 8 == 0) {
7765 threads = dim3(32, 4);
7766 colorsPerThread = numFilterColors % 16 == 0 ? 4 : 2;
7767 imgsPerThread = numImages % 128 == 0 ? 4 : numImages % 64 == 0 ? 2 : 1;
7768 assert(numFilterColors % (threads.y * colorsPerThread) == 0);
7769 blocks = dim3(DIVUP(numImages, threads.x*imgsPerThread) * (numImgColors/(threads.y*colorsPerThread)), overSample * imgPixels);
7770 } else if (numFilterColors > 3) {
7771 imgsPerThread = numImages % 128 == 0 ? 8 : numImages % 64 == 0 ? 4 : 2;
7772 threads = dim3(16, 16);
7773 colorsPerThread = numFilterColors % 4 == 0 ? 4 : 2;
7774 blocks = dim3(DIVUP(numImages,16*imgsPerThread) * (numImgColors / colorsPerThread), overSample * DIVUP(imgSizeY,4) * DIVUP(imgSizeX,4));
7775 }
7776
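    // Bounds-checked kernel variants are only needed when numImages does not
    // divide evenly into the per-block work; otherwise the fast path is taken.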
7777 bool checkCaseBounds = numImages % (threads.x * imgsPerThread) != 0;
7778
7779 if (scaleTargets == 0) { // do not scale or use targets matrix
7780 targets.resize(overSample*numImgColors*imgPixels, numImages);
7781 } else {
7782 assert(targets.getNumRows() == overSample * numImgColors * imgPixels);
7783 assert(targets.getNumCols() == numImages);
7784 }
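    // Annotation: scaleTargets == 0 means targets is overwritten outright
    // (and may be resized); otherwise the kernels compute
    // targets = scaleTargets * targets + scaleOutput * result, which is why
    // the existing shape must already match.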
7785
7786 if (conv) {
7787 if (scaleTargets == 0) { // do not scale or use targets matrix
7788 if (numFilterColors % 8 == 0) {
7789 if (imgsPerThread == 4) {
7790 if (checkCaseBounds) {
7791 if (numFilterColors % 16 == 0) {
7792 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, true>, cudaFuncCachePreferShared);
7793 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7794 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7795 } else {
7796 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, true>, cudaFuncCachePreferShared);
7797 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7798 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7799 }
7800 } else {
7801 if (numFilterColors % 16 == 0) {
7802 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, true>, cudaFuncCachePreferShared);
7803 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7804 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7805 } else {
7806 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, true>, cudaFuncCachePreferShared);
7807 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7808 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7809 }
7810 }
7811 } else if (imgsPerThread == 2) {
7812 if (checkCaseBounds) {
7813 if (numFilterColors % 16 == 0) {
7814 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, true>, cudaFuncCachePreferShared);
7815 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7816 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7817 } else {
7818 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, true>, cudaFuncCachePreferShared);
7819 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7820 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7821 }
7822 } else {
7823 if (numFilterColors % 16 == 0) {
7824 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, true>, cudaFuncCachePreferShared);
7825 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7826 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7827 } else {
7828 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, true>, cudaFuncCachePreferShared);
7829 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7830 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7831 }
7832 }
7833 } else {
7834 if (checkCaseBounds) {
7835 if (numFilterColors % 16 == 0) {
7836 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, true>, cudaFuncCachePreferShared);
7837 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7838 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7839 } else {
7840 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, true>, cudaFuncCachePreferShared);
7841 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7842 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7843 }
7844 } else {
7845 if (numFilterColors % 16 == 0) {
7846 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, true>, cudaFuncCachePreferShared);
7847 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7848 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7849 } else {
7850 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, true>, cudaFuncCachePreferShared);
7851 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7852 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7853 }
7854 }
7855 }
7856
7857 } else if (numFilterColors > 3) {
7858 if (imgsPerThread == 8) {
7859 if (checkCaseBounds) {
7860 if (colorsPerThread == 4) {
7861 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, true, true>, cudaFuncCachePreferShared);
7862 img_acts_mediumcolor_sparse_rand<8, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7863 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7864 } else {
7865 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, true, true>, cudaFuncCachePreferShared);
7866 img_acts_mediumcolor_sparse_rand<8, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7867 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7868 }
7869 } else {
7870 if (colorsPerThread == 4) {
7871 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, false, true>, cudaFuncCachePreferShared);
7872 img_acts_mediumcolor_sparse_rand<8, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7873 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7874 } else {
7875 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, false, true>, cudaFuncCachePreferShared);
7876 img_acts_mediumcolor_sparse_rand<8, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7877 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7878 }
7879 }
7880 } else if (imgsPerThread == 4) {
7881 if (checkCaseBounds) {
7882 if (colorsPerThread == 4) {
7883 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, true, true>, cudaFuncCachePreferShared);
7884 img_acts_mediumcolor_sparse_rand<4, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7885 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7886 } else {
7887 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, true, true>, cudaFuncCachePreferShared);
7888 img_acts_mediumcolor_sparse_rand<4, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7889 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7890 }
7891 } else {
7892 if (colorsPerThread == 4) {
7893 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, false, true>, cudaFuncCachePreferShared);
7894 img_acts_mediumcolor_sparse_rand<4, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7895 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7896 } else {
7897 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, false, true>, cudaFuncCachePreferShared);
7898 img_acts_mediumcolor_sparse_rand<4, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7899 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7900 }
7901 }
7902 } else {
7903 if (checkCaseBounds) {
7904 if (colorsPerThread == 4) {
7905 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, true, true>, cudaFuncCachePreferShared);
7906 img_acts_mediumcolor_sparse_rand<2, 4, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7907 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7908 } else {
7909 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, true, true>, cudaFuncCachePreferShared);
7910 img_acts_mediumcolor_sparse_rand<2, 2, false, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7911 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7912 }
7913 } else {
7914 if (colorsPerThread == 4) {
7915 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, false, true>, cudaFuncCachePreferShared);
7916 img_acts_mediumcolor_sparse_rand<2, 4, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7917 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7918 } else {
7919 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, false, true>, cudaFuncCachePreferShared);
7920 img_acts_mediumcolor_sparse_rand<2, 2, false, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7921 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7922 }
7923 }
7924 }
7925 }
7926 } else { // do scale
7927 if (numFilterColors % 8 == 0) {
7928 if (imgsPerThread == 4) {
7929 if (checkCaseBounds) {
7930 if (numFilterColors % 16 == 0) {
7931 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, true>, cudaFuncCachePreferShared);
7932 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7933 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7934 } else {
7935 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, true>, cudaFuncCachePreferShared);
7936 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7937 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7938 }
7939 } else {
7940 if (numFilterColors % 16 == 0) {
7941 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, true>, cudaFuncCachePreferShared);
7942 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7943 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7944 } else {
7945 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, true>, cudaFuncCachePreferShared);
7946 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7947 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7948 }
7949 }
7950 } else if (imgsPerThread == 2) {
7951 if (checkCaseBounds) {
7952 if (numFilterColors % 16 == 0) {
7953 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, true>, cudaFuncCachePreferShared);
7954 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7955 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7956 } else {
7957 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, true>, cudaFuncCachePreferShared);
7958 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7959 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7960 }
7961 } else {
7962 if (numFilterColors % 16 == 0) {
7963 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, true>, cudaFuncCachePreferShared);
7964 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7965 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7966 } else {
7967 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, true>, cudaFuncCachePreferShared);
7968 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7969 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7970 }
7971 }
7972 } else {
7973 if (checkCaseBounds) {
7974 if (numFilterColors % 16 == 0) {
7975 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, true>, cudaFuncCachePreferShared);
7976 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7977 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7978 } else {
7979 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, true>, cudaFuncCachePreferShared);
7980 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7981 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7982 }
7983 } else {
7984 if (numFilterColors % 16 == 0) {
7985 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, true>, cudaFuncCachePreferShared);
7986 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7987 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7988 } else {
7989 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, true>, cudaFuncCachePreferShared);
7990 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
7991 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
7992 }
7993 }
7994 }
7995
7996 } else if (numFilterColors > 3) {
7997 if (imgsPerThread == 8) {
7998 if (checkCaseBounds) {
7999 if (colorsPerThread == 4) {
8000 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, true, true>, cudaFuncCachePreferShared);
8001 img_acts_mediumcolor_sparse_rand<8, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8002 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8003 } else {
8004 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, true, true>, cudaFuncCachePreferShared);
8005 img_acts_mediumcolor_sparse_rand<8, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8006 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8007 }
8008 } else {
8009 if (colorsPerThread == 4) {
8010 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, false, true>, cudaFuncCachePreferShared);
8011 img_acts_mediumcolor_sparse_rand<8, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8012 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8013 } else {
8014 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, false, true>, cudaFuncCachePreferShared);
8015 img_acts_mediumcolor_sparse_rand<8, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8016 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8017 }
8018 }
8019 } else if (imgsPerThread == 4) {
8020 if (checkCaseBounds) {
8021 if (colorsPerThread == 4) {
8022 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, true, true>, cudaFuncCachePreferShared);
8023 img_acts_mediumcolor_sparse_rand<4, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8024 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8025 } else {
8026 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, true, true>, cudaFuncCachePreferShared);
8027 img_acts_mediumcolor_sparse_rand<4, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8028 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8029 }
8030 } else {
8031 if (colorsPerThread == 4) {
8032 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, false, true>, cudaFuncCachePreferShared);
8033 img_acts_mediumcolor_sparse_rand<4, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8034 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8035 } else {
8036 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, false, true>, cudaFuncCachePreferShared);
8037 img_acts_mediumcolor_sparse_rand<4, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8038 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8039 }
8040 }
8041 } else {
8042 if (checkCaseBounds) {
8043 if (colorsPerThread == 4) {
8044 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, true, true>, cudaFuncCachePreferShared);
8045 img_acts_mediumcolor_sparse_rand<2, 4, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8046 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8047 } else {
8048 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, true, true>, cudaFuncCachePreferShared);
8049 img_acts_mediumcolor_sparse_rand<2, 2, true, true, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8050 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8051 }
8052 } else {
8053 if (colorsPerThread == 4) {
8054 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, false, true>, cudaFuncCachePreferShared);
8055 img_acts_mediumcolor_sparse_rand<2, 4, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8056 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8057 } else {
8058 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, false, true>, cudaFuncCachePreferShared);
8059 img_acts_mediumcolor_sparse_rand<2, 2, true, false, true><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8060 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8061 }
8062 }
8063 }
8064 }
8065 }
8066 } else {
8067 if (scaleTargets == 0) { // do not scale or use targets matrix
8068 if (numFilterColors % 8 == 0) {
8069 if (imgsPerThread == 4) {
8070 if (checkCaseBounds) {
8071 if (numFilterColors % 16 == 0) {
8072 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, false>, cudaFuncCachePreferShared);
8073 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8074 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8075 } else {
8076 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, false>, cudaFuncCachePreferShared);
8077 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8078 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8079 }
8080 } else {
8081 if (numFilterColors % 16 == 0) {
8082 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, false>, cudaFuncCachePreferShared);
8083 img_acts_manycolor_sparse_rand<4, 32, 4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8084 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8085 } else {
8086 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, false>, cudaFuncCachePreferShared);
8087 img_acts_manycolor_sparse_rand<4, 32, 4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8088 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8089 }
8090 }
8091 } else if (imgsPerThread == 2) {
8092 if (checkCaseBounds) {
8093 if (numFilterColors % 16 == 0) {
8094 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, false>, cudaFuncCachePreferShared);
8095 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8096 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8097 } else {
8098 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, false>, cudaFuncCachePreferShared);
8099 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8100 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8101 }
8102 } else {
8103 if (numFilterColors % 16 == 0) {
8104 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, false>, cudaFuncCachePreferShared);
8105 img_acts_manycolor_sparse_rand<4, 32, 2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8106 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8107 } else {
8108 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, false>, cudaFuncCachePreferShared);
8109 img_acts_manycolor_sparse_rand<4, 32, 2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8110 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8111 }
8112 }
8113 } else {
8114 if (checkCaseBounds) {
8115 if (numFilterColors % 16 == 0) {
8116 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, false>, cudaFuncCachePreferShared);
8117 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8118 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8119 } else {
8120 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, false>, cudaFuncCachePreferShared);
8121 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8122 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8123 }
8124 } else {
8125 if (numFilterColors % 16 == 0) {
8126 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, false>, cudaFuncCachePreferShared);
8127 img_acts_manycolor_sparse_rand<4, 32, 1, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8128 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8129 } else {
8130 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, false>, cudaFuncCachePreferShared);
8131 img_acts_manycolor_sparse_rand<4, 32, 1, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8132 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8133 }
8134 }
8135 }
8136
8137 } else if (numFilterColors > 3) {
8138 if (imgsPerThread == 8) {
8139 if (checkCaseBounds) {
8140 if (colorsPerThread == 4) {
8141 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, true, false>, cudaFuncCachePreferShared);
8142 img_acts_mediumcolor_sparse_rand<8, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8143 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8144 } else {
8145 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, true, false>, cudaFuncCachePreferShared);
8146 img_acts_mediumcolor_sparse_rand<8, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8147 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8148 }
8149 } else {
8150 if (colorsPerThread == 4) {
8151 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, false, false, false>, cudaFuncCachePreferShared);
8152 img_acts_mediumcolor_sparse_rand<8, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8153 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8154 } else {
8155 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, false, false, false>, cudaFuncCachePreferShared);
8156 img_acts_mediumcolor_sparse_rand<8, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8157 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8158 }
8159 }
8160 } else if (imgsPerThread == 4) {
8161 if (checkCaseBounds) {
8162 if (colorsPerThread == 4) {
8163 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, true, false>, cudaFuncCachePreferShared);
8164 img_acts_mediumcolor_sparse_rand<4, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8165 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8166 } else {
8167 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, true, false>, cudaFuncCachePreferShared);
8168 img_acts_mediumcolor_sparse_rand<4, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8169 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8170 }
8171 } else {
8172 if (colorsPerThread == 4) {
8173 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, false, false, false>, cudaFuncCachePreferShared);
8174 img_acts_mediumcolor_sparse_rand<4, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8175 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8176 } else {
8177 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, false, false, false>, cudaFuncCachePreferShared);
8178 img_acts_mediumcolor_sparse_rand<4, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8179 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8180 }
8181 }
8182 } else {
8183 if (checkCaseBounds) {
8184 if (colorsPerThread == 4) {
8185 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, true, false>, cudaFuncCachePreferShared);
8186 img_acts_mediumcolor_sparse_rand<2, 4, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8187 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8188 } else {
8189 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, true, false>, cudaFuncCachePreferShared);
8190 img_acts_mediumcolor_sparse_rand<2, 2, false, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8191 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8192 }
8193 } else {
8194 if (colorsPerThread == 4) {
8195 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, false, false, false>, cudaFuncCachePreferShared);
8196 img_acts_mediumcolor_sparse_rand<2, 4, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8197 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8198 } else {
8199 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, false, false, false>, cudaFuncCachePreferShared);
8200 img_acts_mediumcolor_sparse_rand<2, 2, false, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8201 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8202 }
8203 }
8204 }
8205 }
8206 } else { // do scale
8207 if (numFilterColors % 8 == 0) {
8208 if (imgsPerThread == 4) {
8209 if (checkCaseBounds) {
8210 if (numFilterColors % 16 == 0) {
8211 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, false>, cudaFuncCachePreferShared);
8212 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8213 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8214 } else {
8215 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, false>, cudaFuncCachePreferShared);
8216 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8217 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8218 }
8219 } else {
8220 if (numFilterColors % 16 == 0) {
8221 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, false>, cudaFuncCachePreferShared);
8222 img_acts_manycolor_sparse_rand<4, 32, 4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8223 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8224 } else {
8225 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, false>, cudaFuncCachePreferShared);
8226 img_acts_manycolor_sparse_rand<4, 32, 4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8227 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8228 }
8229 }
8230 } else if (imgsPerThread == 2) {
8231 if (checkCaseBounds) {
8232 if (numFilterColors % 16 == 0) {
8233 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, false>, cudaFuncCachePreferShared);
8234 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8235 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8236 } else {
8237 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, false>, cudaFuncCachePreferShared);
8238 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8239 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8240 }
8241 } else {
8242 if (numFilterColors % 16 == 0) {
8243 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, false>, cudaFuncCachePreferShared);
8244 img_acts_manycolor_sparse_rand<4, 32, 2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8245 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8246 } else {
8247 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, false>, cudaFuncCachePreferShared);
8248 img_acts_manycolor_sparse_rand<4, 32, 2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8249 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8250 }
8251 }
8252 } else {
8253 if (checkCaseBounds) {
8254 if (numFilterColors % 16 == 0) {
8255 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, false>, cudaFuncCachePreferShared);
8256 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8257 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8258 } else {
8259 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, false>, cudaFuncCachePreferShared);
8260 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8261 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8262 }
8263 } else {
8264 if (numFilterColors % 16 == 0) {
8265 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, false>, cudaFuncCachePreferShared);
8266 img_acts_manycolor_sparse_rand<4, 32, 1, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8267 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8268 } else {
8269 cudaFuncSetCacheConfig(img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, false>, cudaFuncCachePreferShared);
8270 img_acts_manycolor_sparse_rand<4, 32, 1, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8271 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8272 }
8273 }
8274 }
8275 } else if (numFilterColors > 3) {
8276 if (imgsPerThread == 8) {
8277 if (checkCaseBounds) {
8278 if (colorsPerThread == 4) {
8279 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, true, false>, cudaFuncCachePreferShared);
8280 img_acts_mediumcolor_sparse_rand<8, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8281 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8282 } else {
8283 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, true, false>, cudaFuncCachePreferShared);
8284 img_acts_mediumcolor_sparse_rand<8, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8285 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8286 }
8287 } else {
8288 if (colorsPerThread == 4) {
8289 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 4, true, false, false>, cudaFuncCachePreferShared);
8290 img_acts_mediumcolor_sparse_rand<8, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8291 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8292 } else {
8293 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<8, 2, true, false, false>, cudaFuncCachePreferShared);
8294 img_acts_mediumcolor_sparse_rand<8, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8295 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8296 }
8297 }
8298 } else if (imgsPerThread == 4) {
8299 if (checkCaseBounds) {
8300 if (colorsPerThread == 4) {
8301 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, true, false>, cudaFuncCachePreferShared);
8302 img_acts_mediumcolor_sparse_rand<4, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8303 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8304 } else {
8305 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, true, false>, cudaFuncCachePreferShared);
8306 img_acts_mediumcolor_sparse_rand<4, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8307 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8308 }
8309 } else {
8310 if (colorsPerThread == 4) {
8311 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 4, true, false, false>, cudaFuncCachePreferShared);
8312 img_acts_mediumcolor_sparse_rand<4, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8313 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8314 } else {
8315 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<4, 2, true, false, false>, cudaFuncCachePreferShared);
8316 img_acts_mediumcolor_sparse_rand<4, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8317 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8318 }
8319 }
8320 } else {
8321 if (checkCaseBounds) {
8322 if (colorsPerThread == 4) {
8323 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, true, false>, cudaFuncCachePreferShared);
8324 img_acts_mediumcolor_sparse_rand<2, 4, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8325 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8326 } else {
8327 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, true, false>, cudaFuncCachePreferShared);
8328 img_acts_mediumcolor_sparse_rand<2, 2, true, true, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8329 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8330 }
8331 } else {
8332 if (colorsPerThread == 4) {
8333 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 4, true, false, false>, cudaFuncCachePreferShared);
8334 img_acts_mediumcolor_sparse_rand<2, 4, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8335 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8336 } else {
8337 cudaFuncSetCacheConfig(img_acts_mediumcolor_sparse_rand<2, 2, true, false, false>, cudaFuncCachePreferShared);
8338 img_acts_mediumcolor_sparse_rand<2, 2, true, false, false><<<blocks, threads>>>(hidActs.getDevData(), filters.getDevData(), targets.getDevData(), dColorIndices,
8339 numModulesY, numModulesX, numImages, numFilters, filterSize, imgSizeY, imgSizeX, paddingStart, moduleStride, numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput);
8340 }
8341 }
8342 }
8343 }
8344 }
8345 }
8346
8347 cutilCheckMsg("imgActsSparse: kernel execution failed");
8348 }
8349
8350 void convImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8351 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups) {
8352 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart,
8353 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, true);
8354 }
8355
8356 void convImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8357 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups,
8358 float scaleTargets, float scaleOutput) {
8359 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride,
8360 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, true);
8361 }
8362
8363 void localImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8364 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups) {
8365 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart,
8366 moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, false);
8367 }
8368
8369 void localImgActsSparse(NVMatrix& hidActs, NVMatrix& filters, NVMatrix& targets, int* dColorIndices,
8370 int imgSizeY, int imgSizeX, int numModulesY, int paddingStart, int moduleStride, int numImgColors, int numFilterColors, int numGroups,
8371 float scaleTargets, float scaleOutput) {
8372 _imgActsSparse(hidActs, filters, targets, dColorIndices, imgSizeY, imgSizeX, numModulesY, paddingStart, moduleStride,
8373 numImgColors, numFilterColors, numGroups, scaleTargets, scaleOutput, false);
8374 }
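/*
 * Minimal usage sketch (annotation, not in the original source; names and
 * sizes are hypothetical). Shapes follow the _imgActsSparse contract above:
 *
 *   NVMatrix hidActs;   // (numFilters * numModulesY * numModulesX, numImages)
 *   NVMatrix filters;   // (numFilterColors * filterPixels, numFilters)
 *   NVMatrix targets;   // resized internally when scaleTargets == 0
 *   int* dColorIndices; // device array, (numGroups, numFilterColors)
 *
 *   convImgActsSparse(hidActs, filters, targets, dColorIndices,
 *                     imgSizeY, imgSizeX, numModulesY, paddingStart,
 *                     moduleStride, numImgColors, numFilterColors, numGroups);
 *
 * The overload taking scaleTargets/scaleOutput instead accumulates:
 * targets = scaleTargets * targets + scaleOutput * imgActs.
 */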
8375
8376 /*
8377  * Copyright (c) 2011, Alex Krizhevsky (akrizhevsky@gmail.com)
8378  * All rights reserved. Released under the same BSD license reproduced in
8379  * full at the top of this gist.
8380  */
8401
8402 #ifndef _WEIGHT_ACTS_EXPORT
8403 #define _WEIGHT_ACTS_EXPORT
8404 #endif
8405
8406 #include <weight_acts.cuh>
8407 #include <cudaconv2.cuh>
8408
8409 #define LO16(x) ((x) & 0x0000FFFF)
8410 #define HI16(x) ((x) >> 16)
8411
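// Annotation: these macros pack two small non-negative ints into one 32-bit
// word, e.g. p = (row << 16) + col gives HI16(p) == row and LO16(p) == col.
// conv_weight_acts_c below uses them to cache per-pixel (y, x) filter
// coordinates in the shared pxDivs array.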
8412 /*
8413 * Each block computes weight gradients for B_Y * pixelsPerThread pixels and B_X filters
8414 * threadIdx.x determines filter
8415 * threadIdx.y determines pixel in filter
8416 *
8417 * blockIdx.x determines filter batch of B_X, module batch of partialSum
8418 * blockIdx.y determines pixel batch of B_Y * pixelsPerThread
8419 *
8420 * Number of filters must be divisible by B_X
8421 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8422 *
8423 * images: (numColors, imgSizeY, imgSizeX, numImages), with stride given
8424 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8425 *
8426 * targets: (numModulesY*numModulesX/partialSum, numColors, filterPixels, numFilters)
8427 *
8428 * B_Y * B_X should be divisible by preloadCases.
8429 * preloadCases one of 16, 32.
8430 * B_X one of 4, 8, 16, 32
8431 * B_Y arbitrary (satisfying divisibility constraints)
8432 * numModules must be divisible by partialSum
8433 *
8434 * After adding pixelsPerThread, register usage went from 20 to 23 (when pixelsPerThread = 1)...
8435 * so the compiler is messing up here somehow. It's unable to optimize that case away.
8436 */
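/*
 * Implied launch geometry (annotation; the host-side dispatcher is not part
 * of this excerpt). Following the blockIdx decomposition inside the kernel,
 * one would expect something like
 *   blocks  = dim3((numModules / partialSum) * (numFilters / B_X),
 *                  DIVUP(filterPixels, B_Y * pixelsPerThread));
 *   threads = dim3(B_X, B_Y);
 */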
8437 template <int B_Y, int B_X, int pixelsPerThread, int preloadCases, int numColors, bool scale, bool checkCaseBounds>
8438 __global__ void conv_weight_acts_c(float* images, float* hidActs, float* targets,
8439 const int numImages, const int numFilters,
8440 const int numModulesY, const int numModulesX,
8441 const int imgSizeY, const int imgSizeX, const int filterSize,
8442 const int paddingStart, const int moduleStride, const int imgStride,
8443 const int partialSum,
8444 const float scaleTargets, const float scaleOutputs) {
8445 __shared__ float shImages[pixelsPerThread * B_Y * numColors][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8446 __shared__ float shHidActs[B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidActs
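    // Annotation: the "+ 1" pads each shHidActs row so that strided
    // column-wise accesses presumably hit distinct shared-memory banks,
    // avoiding bank conflicts.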
8447
8448 const int tidx = B_X * threadIdx.y + threadIdx.x;
8449 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8450
8451 const int filterPixels = filterSize * filterSize;
8452 const int imgPixels = imgSizeY * imgSizeX;
8453
8454 const int filterBlocksPerModule = numFilters / B_X;
8455 const int outputModuleIdx = blockIdx.x / filterBlocksPerModule;
8456 const int moduleIdx = partialSum * outputModuleIdx;
8457 const int blockFilterIdx = B_X * (blockIdx.x % filterBlocksPerModule);
8458
8459 // const int moduleStride = (imgSize - filterSize + 1) / numModulesX;
8460 const int numModules = numModulesY * numModulesX;
8461
8462 const int blockPixelOffset = blockIdx.y * B_Y * pixelsPerThread;
8463
8464 images += loadX;
8465 hidActs += moduleIdx * numImages
8466 + blockFilterIdx * numImages * numModules
8467 + loadY * numImages * numModules
8468 + loadX;
8469
8470 targets += (outputModuleIdx * numFilters) * filterPixels * numColors
8471 + blockPixelOffset * numFilters
8472 + blockFilterIdx
8473 + threadIdx.y * numFilters + threadIdx.x;
8474
8475 float* shImgLoad = &shImages[loadY][loadX];
8476 float* shHidActLoad = &shHidActs[loadY][loadX];
8477
8478 float prod[numColors][pixelsPerThread];
8479 #pragma unroll
8480 for (int c = 0; c < numColors; c++) {
8481 #pragma unroll
8482 for (int p = 0; p < pixelsPerThread; p++) {
8483 prod[c][p] = 0;
8484 }
8485 }
8486
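// Editor's note: as in the kernels further down, each pixel's (row, col)
// within the filter is precomputed and packed into one int so the load loop
// avoids an integer division per pixel.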
8487 __shared__ int pxDivs[B_Y*pixelsPerThread];
8488 if (tidx < B_Y * pixelsPerThread) {
8489 pxDivs[tidx] = (((blockPixelOffset + tidx) / filterSize) << 16) + ((blockPixelOffset + tidx) % filterSize);
8490 }
8491 __syncthreads();
8492 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8493 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8494 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8495 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8496 if (loadY < B_Y * pixelsPerThread) {
8497 /*
8498 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8499 * number of times.
8500 *
8501 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8502 * but the code does not produce any output for those pixels (see last lines).
8503 */
8504 // #pragma unroll
8505 for (int y = 0; y < B_Y * pixelsPerThread; y += (B_X * B_Y) / preloadCases) {
8506 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8507 if ((B_Y * pixelsPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y * pixelsPerThread) {
8508 const int pxIdx = loadY + y; // pixel idx in filter
8509
8510 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8511 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]); // pixel x,y coords in image
8512 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8513 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8514 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride;
8515 #pragma unroll
8516 for (int c = 0; c < numColors; c++) {
8517 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = images[caseIdx + c * imgPixels * imgStride + pixIdx];
8518 }
8519 } else {
8520 #pragma unroll
8521 for (int c = 0; c < numColors; c++) {
8522 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = 0;
8523 }
8524 }
8525 } else {
8526 #pragma unroll
8527 for (int c = 0; c < numColors; c++) {
8528 shImgLoad[(y + c * pixelsPerThread * B_Y) * preloadCases] = 0;
8529 }
8530 }
8531 }
8532 }
8533 }
8534 if (loadY < B_X && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8535 #pragma unroll
8536 for (int y = 0; y < B_X; y += (B_X * B_Y) / preloadCases) {
8537 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8538 if (B_X % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X) {
8539 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8540 }
8541 }
8542 }
8543
8544 __syncthreads();
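// Editor's note: each thread owns a numColors x pixelsPerThread tile of
// accumulators for one filter (threadIdx.x); for every preloaded case i it
// adds image pixel value * that filter's hidden activation into prod[c][p].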
8545 #pragma unroll
8546 for (int p = 0; p < pixelsPerThread; p++) {
8547 #pragma unroll
8548 for (int i = 0; i < preloadCases; i++) {
8549 #pragma unroll
8550 for (int c = 0; c < numColors; c++) {
8551 prod[c][p] += shImages[threadIdx.y + p * B_Y + c * pixelsPerThread * B_Y][i] * shHidActs[threadIdx.x][i];
8552 }
8553 }
8554 }
8555 __syncthreads();
8556 }
8557 hidActs += numImages;
8558 }
8559
8560 if (scale) {
8561 #pragma unroll
8562 for (int p = 0; p < pixelsPerThread; p++) {
8563 if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
8564 #pragma unroll
8565 for (int c = 0; c < numColors; c++) {
8566 targets[p * B_Y * numFilters + c * filterPixels * numFilters] = scaleTargets * targets[p * B_Y * numFilters + c * filterPixels * numFilters] + scaleOutputs * prod[c][p];
8567 }
8568 }
8569 }
8570 } else {
8571 #pragma unroll
8572 for (int p = 0; p < pixelsPerThread; p++) {
8573 if (blockPixelOffset + p * B_Y + threadIdx.y < filterPixels) {
8574 #pragma unroll
8575 for (int c = 0; c < numColors; c++) {
8576 targets[p * B_Y * numFilters + c * filterPixels * numFilters] = scaleOutputs * prod[c][p];
8577 }
8578 }
8579 }
8580 }
8581 }
8582
8583 /*
8584 * Each block computes weight gradients for B_Y pixels and B_X * filtersPerThread filters
8585 * threadIdx.x determines filter
8586 * threadIdx.y determines pixel in filter
8587 *
8588 * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum
8589 * blockIdx.y determines pixel, color batch of B_Y * colorsPerThread
8590 * In essence, blockIdx.y.x = 0...numFilterColors / colorsPerThread
8591 * blockIdx.y.y = 0...DIVUP(numPixels, B_Y)
8592 * ============
8593 * CONSTRAINTS:
8594 * ============
8595 * numFilters/numGroups must be divisible by B_X * filtersPerThread
8596 * numImgColors/numGroups must be divisible by colorsPerThread
8597 * numFilters must be divisible by numGroups
8598 * numImgColors must be divisible by numGroups
8599 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8600 *
8601 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8602 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8603 *
8604 * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
8605 *
8606 * B_Y * B_X should be divisible by preloadCases.
8607 * preloadCases one of 16, 32.
8608 * B_X one of 4, 8, 16, 32
8609 * B_Y arbitrary (satisfying divisibility constraints)
8610 *
8611 * This routine is especially fast when numFilters >= 32. That's when it should be used.
8612 */
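/*
 * Editor's sketch of the blockIdx.y decoding described above, with
 * illustrative numbers: numFilterColors = 16 and colorsPerThread = 8 give two
 * color batches, so
 *     blockPixelOffset = (blockIdx.y / 2) * B_Y;  // pixel batch
 *     filterColorIdx   = (blockIdx.y % 2) * 8;    // color batch
 * i.e. the grid's y dimension is DIVUP(filterPixels, B_Y) * 2 blocks tall,
 * matching the dim3 computed in _weightActs below.
 */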
8613 template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale, bool checkCaseBounds>
8614 __global__ void conv_weight_acts_mc_mf(float* images, float* hidActs, float* targets,
8615 const int numImages, const int numFilters,
8616 const int numModulesY, const int numModulesX,
8617 const int imgSizeY, const int imgSizeX, const int filterSize,
8618 const int paddingStart, const int moduleStride, const int imgStride,
8619 const int numImgColors, const int numGroups, const int partialSum,
8620 const float scaleTargets, const float scaleOutputs) {
8621 __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8622 __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
8623
8624 const int tidx = B_X * threadIdx.y + threadIdx.x;
8625 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8626
8627 const int filterPixels = filterSize * filterSize;
8628 const int imgPixels = imgSizeY * imgSizeX;
8629
8630 const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
8631 const int outputModuleIdx = blockIdx.x / numFilterBlocks;
8632 const int moduleIdx = partialSum * outputModuleIdx;
8633 const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
8634 const int numModules = numModulesY * numModulesX;
8635
8636 const int numFiltersPerGroup = numFilters / numGroups;
8637 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
8638 const int numFilterColors = numImgColors / numGroups;
8639
8640 const int blockPixelOffset = (blockIdx.y / (numFilterColors/colorsPerThread)) * B_Y;
8641 const int filterColorIdx = (blockIdx.y % (numFilterColors/colorsPerThread)) * colorsPerThread;
8642 const int imgColorIdx = filterColorIdx + blockGroupIdx * numFilterColors;
8643
8644 images += imgColorIdx * imgPixels * imgStride + loadX;
8645
8646 hidActs += moduleIdx * numImages
8647 + blockFilterIdx * numImages * numModules
8648 + loadY * numImages * numModules
8649 + loadX;
8650
8651 targets += outputModuleIdx * numFilters * filterPixels * numFilterColors
8652 + filterColorIdx * filterPixels * numFilters
8653 + blockPixelOffset * numFilters
8654 + blockFilterIdx
8655 + threadIdx.y * numFilters + threadIdx.x;
8656
8657 float* shHidActLoad = &shHidActs[loadY][loadX];
8658 float* shImgLoad = &shImages[loadY][loadX];
8659 float prod[colorsPerThread][filtersPerThread];
8660 #pragma unroll
8661 for (int c = 0; c < colorsPerThread; c++) {
8662 #pragma unroll
8663 for (int f = 0; f < filtersPerThread; f++) {
8664 prod[c][f] = 0;
8665 }
8666 }
8667 // This avoids doing a division in an inner loop
8668 __shared__ int pxDivs[B_Y];
8669 if (tidx < B_Y) {
8670 pxDivs[tidx] = (((blockPixelOffset + tidx) / filterSize) << 16) + (blockPixelOffset + tidx) % filterSize;
8671 }
8672 __syncthreads();
8673 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8674 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8675 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8676 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8677 if (loadY < B_Y) {
8678 /*
8679 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8680 * number of times.
8681 *
8682 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8683 * but the code does not produce any output for those pixels (see last lines).
8684 */
8685 // #pragma unroll
8686 for (int y = 0; y < B_Y; y += (B_X * B_Y) / preloadCases) {
8687 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8688 if (B_Y % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y) {
8689 const int pxIdx = loadY + y; // pixel idx in filter
8690
8691 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8692 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]);//pxIdx / filterSize; // pixel x,y coords in image
8693 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8694 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8695 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
8696 #pragma unroll
8697 for (int c = 0; c < colorsPerThread; c++) {
8698 shImgLoad[(y + c * B_Y) * preloadCases] = images[caseIdx + c * imgPixels * imgStride + pixIdx];
8699 }
8700 } else {
8701 #pragma unroll
8702 for (int c = 0; c < colorsPerThread; c++) {
8703 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8704 }
8705 }
8706 } else {
8707 #pragma unroll
8708 for (int c = 0; c < colorsPerThread; c++) {
8709 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8710 }
8711 }
8712 }
8713 }
8714 }
8715 if (loadY < B_X * filtersPerThread && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8716 #pragma unroll
8717 for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
8718 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8719 if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) {
8720 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8721 }
8722 }
8723 }
8724
8725 __syncthreads();
8726
8727 #pragma unroll
8728 for (int c = 0; c < colorsPerThread; c++) {
8729 #pragma unroll
8730 for (int i = 0; i < preloadCases; i++) {
8731 #pragma unroll
8732 for (int f = 0; f < filtersPerThread; f++) {
8733 prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i];
8734 }
8735 }
8736 }
8737 __syncthreads();
8738 }
8739 hidActs += numImages;
8740 }
8741 if (blockPixelOffset + threadIdx.y < filterPixels) {
8742 if (scale) {
8743 #pragma unroll
8744 for (int f = 0; f < filtersPerThread; f++) {
8745 #pragma unroll
8746 for (int c = 0; c < colorsPerThread; c++) {
8747 targets[c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f];
8748 }
8749 }
8750 } else {
8751 #pragma unroll
8752 for (int f = 0; f < filtersPerThread; f++) {
8753 #pragma unroll
8754 for (int c = 0; c < colorsPerThread; c++) {
8755 targets[c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f];
8756 }
8757 }
8758 }
8759 }
8760 }
8761
8762 /*
8763 * Each block computes weight gradients for B_Y pixels and B_X * filtersPerThread filters
8764 * threadIdx.x determines filter
8765 * threadIdx.y determines pixel in filter
8766 *
8767 * blockIdx.x determines filter batch of B_X * filtersPerThread, module batch of partialSum
8768 * blockIdx.y determines pixel, color batch of B_Y * colorsPerThread
8769 * In essence, blockIdx.y.x = 0...numFilterColors / colorsPerThread
8770 * blockIdx.y.y = 0...DIVUP(numPixels, B_Y)
8771 * ============
8772 * CONSTRAINTS:
8773 * ============
8774 * numFilters/numGroups must be divisible by B_X * filtersPerThread
8775 * numFilterColors must be divisible by colorsPerThread
8776 * numFilters must be divisible by numGroups
8777 * numImgColors must be divisible by numFilterColors
8778 * Number of images (cases) should be divisible by preloadCases if checkCaseBounds is false.
8779 *
8780 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8781 * hidActs: (numFilters, numModulesY, numModulesX, numImages)
8782 *
8783 * targets: (numModules, numFilterColors, filterPixels, numFilters)
8784 * colorIndices: (numGroups, numFilterColors)
8785 *
8786 * B_Y * B_X should be divisible by preloadCases.
8787 * preloadCases one of 16, 32.
8788 * B_X one of 4, 8, 16, 32
8789 * B_Y arbitrary (satisfying divisibility constraints)
8790 *
8791 * This routine is especially fast when numFilters >= 32. That's when it should be used.
8792 */
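/*
 * Editor's note on the sparse variant below: instead of a contiguous color
 * range, the block's colorsPerThread input colors are gathered through
 * colorIndices. Each index is converted once into an element offset,
 *
 *     shColors[c] = colorIndices[blockGroupIdx * numFilterColors
 *                                + filterColorIdx + c] * imgPixels * imgStride;
 *
 * so the load loop can address images[caseIdx + shColors[c] + pixIdx]
 * without redoing the indirection per case.
 */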
8793 template <int B_Y, int B_X, int filtersPerThread, int colorsPerThread, int preloadCases, bool scale, bool checkCaseBounds>
8794 __global__ void conv_weight_acts_mc_mf_rand(float* images, float* hidActs, float* targets, int* colorIndices,
8795 const int numImages, const int numFilters,
8796 const int numModulesY, const int numModulesX,
8797 const int imgSizeY, const int imgSizeX, const int filterSize,
8798 const int paddingStart, const int moduleStride, const int imgStride,
8799 const int numFilterColors, const int numGroups, const int partialSum,
8800 const float scaleTargets, const float scaleOutputs) {
8801 __shared__ float shImages[colorsPerThread * B_Y][preloadCases]; // preload preloadCases cases of B_Y * pixelsPerThread pixels
8802 __shared__ float shHidActs[filtersPerThread * B_X][preloadCases + 1]; // preload preloadCases cases of B_X hidacts
8803 __shared__ int shColors[colorsPerThread];
8804 // This avoids doing a division in an inner loop
8805 __shared__ int pxDivs[B_Y];
8806 const int tidx = B_X * threadIdx.y + threadIdx.x;
8807 const int loadY = tidx / preloadCases, loadX = tidx % preloadCases;
8808
8809 const int filterPixels = filterSize * filterSize;
8810 const int imgPixels = imgSizeY * imgSizeX;
8811
8812 const int numFilterBlocks = numFilters / (B_X * filtersPerThread);
8813 const int outputModuleIdx = blockIdx.x / numFilterBlocks;
8814 const int moduleIdx = partialSum * outputModuleIdx;
8815 const int blockFilterIdx = filtersPerThread * B_X * (blockIdx.x % numFilterBlocks);
8816 const int numModules = numModulesY * numModulesX;
8817
8818 const int numFiltersPerGroup = numFilters / numGroups;
8819 const int blockGroupIdx = blockFilterIdx / numFiltersPerGroup;
8820
8821 const int blockPixelOffset = (blockIdx.y / (numFilterColors/colorsPerThread)) * B_Y;
8822 const int filterColorIdx = (blockIdx.y % (numFilterColors/colorsPerThread)) * colorsPerThread;
8823 // const int imgColorIdx = filterColorIdx + blockGroupIdx * numFilterColors;
8824
8825 images += loadX;
8826
8827 hidActs += moduleIdx * numImages
8828 + blockFilterIdx * numImages * numModules
8829 + loadY * numImages * numModules
8830 + loadX;
8831
8832 targets += outputModuleIdx * numFilters * filterPixels * numFilterColors
8833 + filterColorIdx * filterPixels * numFilters
8834 + blockPixelOffset * numFilters
8835 + blockFilterIdx
8836 + threadIdx.y * numFilters + threadIdx.x;
8837
8838 float* shHidActLoad = &shHidActs[loadY][loadX];
8839 float* shImgLoad = &shImages[loadY][loadX];
8840 float prod[colorsPerThread][filtersPerThread];
8841 #pragma unroll
8842 for (int c = 0; c < colorsPerThread; c++) {
8843 #pragma unroll
8844 for (int f = 0; f < filtersPerThread; f++) {
8845 prod[c][f] = 0;
8846 }
8847 }
8848
8849 if (tidx < B_Y) {
8850 pxDivs[tidx] = ((blockPixelOffset + tidx) / filterSize << 16) + ((blockPixelOffset + tidx) % filterSize);
8851 }
8852 if (tidx < colorsPerThread) {
8853 shColors[tidx] = colorIndices[blockGroupIdx * numFilterColors + filterColorIdx + tidx] * imgPixels * imgStride;
8854 }
8855 __syncthreads();
8856 for (int m = moduleIdx; m < moduleIdx + partialSum; m++) {
8857 const int imgLoadModPosY = paddingStart + (m / numModulesX) * moduleStride;
8858 const int imgLoadModPosX = paddingStart + (m % numModulesX) * moduleStride;
8859 for (int caseIdx = 0; caseIdx < numImages; caseIdx += preloadCases) {
8860 if (loadY < B_Y) {
8861 /*
8862 * As long as B_Y * B_X is divisible by preloadCases this will loop the right
8863 * number of times.
8864 *
8865 * This will load some images from filter pixels that don't exist (it'll set those to 0),
8866 * but the code does not produce any output for those pixels (see last lines).
8867 */
8868 // #pragma unroll
8869 for (int y = 0; y < B_Y; y += (B_X * B_Y) / preloadCases) {
8870 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8871 if (B_Y % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_Y) {
8872 const int pxIdx = loadY + y; // pixel idx in filter
8873
8874 if (pxIdx + blockPixelOffset < filterPixels && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8875 const int pxY = imgLoadModPosY + HI16(pxDivs[pxIdx]);//pxIdx / filterSize; // pixel x,y coords in image
8876 const int pxX = imgLoadModPosX + LO16(pxDivs[pxIdx]);
8877 if (pxY >= 0 && pxY < imgSizeY && pxX >= 0 && pxX < imgSizeX) {
8878 const int pixIdx = (pxY * imgSizeX + pxX) * imgStride; // pixel idx in image
8879 #pragma unroll
8880 for (int c = 0; c < colorsPerThread; c++) {
8881 shImgLoad[(y + c * B_Y) * preloadCases] = images[caseIdx + shColors[c] + pixIdx];
8882 }
8883 } else {
8884 #pragma unroll
8885 for (int c = 0; c < colorsPerThread; c++) {
8886 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8887 }
8888 }
8889 } else {
8890 #pragma unroll
8891 for (int c = 0; c < colorsPerThread; c++) {
8892 shImgLoad[(y + c * B_Y) * preloadCases] = 0;
8893 }
8894 }
8895 }
8896 }
8897 }
8898 if (loadY < B_X * filtersPerThread && (!checkCaseBounds || caseIdx + loadX < numImages)) {
8899 #pragma unroll
8900 for (int y = 0; y < B_X * filtersPerThread; y += (B_X * B_Y) / preloadCases) {
8901 // Make sure number of rows in the array is divisible by number of rows filled per iteration
8902 if ((B_X * filtersPerThread) % (B_X * B_Y / preloadCases) == 0 || y + loadY < B_X * filtersPerThread) {
8903 shHidActLoad[y * (preloadCases + 1)] = hidActs[caseIdx + y * numImages * numModules];
8904 }
8905 }
8906 }
8907
8908 __syncthreads();
8909
8910 #pragma unroll
8911 for (int c = 0; c < colorsPerThread; c++) {
8912 #pragma unroll
8913 for (int i = 0; i < preloadCases; i++) {
8914 #pragma unroll
8915 for (int f = 0; f < filtersPerThread; f++) {
8916 prod[c][f] += shImages[threadIdx.y + c * B_Y][i] * shHidActs[threadIdx.x + f * B_X][i];
8917 }
8918 }
8919 }
8920 __syncthreads();
8921 }
8922 hidActs += numImages;
8923 }
8924 if (blockPixelOffset + threadIdx.y < filterPixels) {
8925 if (scale) {
8926 #pragma unroll
8927 for (int f = 0; f < filtersPerThread; f++) {
8928 #pragma unroll
8929 for (int c = 0; c < colorsPerThread; c++) {
8930 targets[c * filterPixels * numFilters + f * B_X] = scaleTargets * targets[c * filterPixels * numFilters + f * B_X] + scaleOutputs * prod[c][f];
8931 }
8932 }
8933 } else {
8934 #pragma unroll
8935 for (int f = 0; f < filtersPerThread; f++) {
8936 #pragma unroll
8937 for (int c = 0; c < colorsPerThread; c++) {
8938 targets[c * filterPixels * numFilters + f * B_X] = scaleOutputs * prod[c][f];
8939 }
8940 }
8941 }
8942 }
8943 }
8944
8945 /*
8946 * images: (numImgColors, imgSizeY, imgSizeX, numImages), with stride given
8947 * hidActs: (numFilters, numModules, numImages)
8948 *
8949 * targets: (numModulesY*numModulesX/partialSum, numFilterColors, filterPixels, numFilters)
8950 *
8951 * TODO: you can get a slight speed boost for local non-convolutional units by writing special
8952 * routines for partialSum = 1. But I dunno if the code duplication is worth it...
8953 *
8954 * Note: all of these convolution routines are optimized for the case when
8955 * the number of images (i.e. the minibatch size) is a multiple of 128.
8956 * Other batch sizes will work, but I made no attempt whatsoever
8957 * to make them work fast.
8958 */
8959 void _weightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
8960 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors,
8961 int numGroups, int partialSum, float scaleTargets, float scaleOutput) {
8962 int numFilterColors = numImgColors / numGroups;
8963 int imgStride = images.getStride();
8964 int numImages = images.getNumCols();
8965 int imgPixels = images.getNumRows() / numImgColors;
8966 int imgSizeX = imgPixels / imgSizeY;
8967 int numModules = numModulesY * numModulesX;
8968 int numFilters = hidActs.getNumRows() / numModules;
8969 int numFiltersPerGroup = numFilters / numGroups;
8970
8971 assert(numImgColors % numGroups == 0);
8972 assert(numFilters % (16*numGroups) == 0);
8973 if (!(numGroups > 1 || (numImgColors > 0 && (numImgColors <= 3 || numImgColors % 4 == 0))))
8974 {
8975 printf("numGroups: %d\n", numGroups);
8976 printf("numImgColors: %d\n", numImgColors);
8977 assert(false);
8978 }
8979 assert(numGroups == 1 || numFilterColors % 4 == 0);
8980 assert(imgSizeY * imgSizeX == imgPixels);
8981 assert(images.getNumRows() == imgPixels * numImgColors);
8982
8983 int filterPixels = filterSize * filterSize;
8984 partialSum = partialSum == 0 ? numModules : partialSum;
8985
8986 assert(numModules % partialSum == 0);
8987 assert(hidActs.getNumCols() == numImages);
8988
8989 // These routines don't handle the case when only part of the image is visited in the convolution
8990 assert(paddingStart <= 0);
8991 // assert changed to if statement by Ian Goodfellow
8992 if (paddingStart + (numModulesX-1)*moduleStride + filterSize < imgSizeX)
8993 {
8994 printf("imgSizeX: %d\n", imgSizeX);
8995 printf("numModulesX: %d\n", numModulesX);
8996 assert(false);
8997 }
8998 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
8999 assert(moduleStride <= filterSize);
9000
9001 assert(numModules * numFilters == hidActs.getNumRows());
9002
9003 assert(!images.isTrans());
9004 assert(!hidActs.isTrans());
9005 assert(hidActs.isContiguous());
9006
9007 assert(!targets.isTrans());
9008 assert(targets.isContiguous());
9009
9010 int preloadCases = 32;
9011
9012 dim3 blocks, threads;
9013 int bx, by;
9014 int pixelsPerThread, filtersPerThread, colorsPerThread;
9015 // Worth playing with these parameters to find best values for your problem.
9016 // These values work relatively well, but are not optimal for all problems.
9017 if (numFilterColors > 3) {
9018 filtersPerThread = numFiltersPerGroup % 32 == 0 ? 2 : 1;
9019 colorsPerThread = numFilterColors % 8 == 0 ? 8 : 4;
9020 by = numFiltersPerGroup % 64 == 0 ? 4 : 8;
9021 bx = numFiltersPerGroup % 64 == 0 ? 32 : 16;
9022 blocks = dim3((numModules/partialSum)*(numFilters/(bx*filtersPerThread)), DIVUP(filterPixels, by) * (numFilterColors / colorsPerThread));
9023 } else {
9024 assert(numGroups == 1); // Just for sanity
9025 pixelsPerThread = numFilters % 32 == 0 ? (numImgColors == 1 ? 8 : 5) : (numImgColors == 1 ? 5 : 2);
9026 by = numFilters % 32 == 0 ? 4 : 8; // by == 4 seems to work best
9027 bx = numFilters % 32 == 0 ? 32 : 16;
9028 blocks = dim3((numModules/partialSum)*(numFilters/bx), DIVUP(filterPixels, by*pixelsPerThread));
9029 }
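/*
 * Editor's worked example of the heuristic above (illustrative numbers): for
 * numFilterColors = 16 and numFiltersPerGroup = 64 it picks filtersPerThread
 * = 2, colorsPerThread = 8, by = 4, bx = 32 -- 128 threads per block -- and,
 * for a single group, a grid of
 *     x: (numModules / partialSum) * (64 / (32 * 2)) = numModules / partialSum
 *     y: DIVUP(filterPixels, 4) * (16 / 8)
 */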
9030 assert((by * bx) % preloadCases == 0);
9031 threads = dim3(bx, by);
9032 bool checkCaseBounds = numImages % 32 != 0;
9033
9034 /* Modified by Ian Goodfellow. I removed the branch here, because our wrapper doesn't
9035 support resizing when the data isn't owned by the NVMatrix. Also, the resize should
9036 always be a no-op, because in the context we're likely to use this, we should always
9037 have allocated the right size of NVMatrix to receive the gradient.
9038 if (scaleTargets == 0) {
9039 targets.resize((numModules/partialSum) * numFilterColors*filterPixels, numFilters);
9040 } else {
9041 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9042 assert(targets.getNumCols() == numFilters);
9043 }
9044 */
9045
9046
9047 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9048 assert(targets.getNumCols() == numFilters);
9049
9050 if (numFilterColors > 3) {
9051 if (scaleTargets == 0) { // do not scale
9052 if (numFiltersPerGroup % 64 == 0) {
9053 if (numFilterColors % 8 == 0) {
9054 if (checkCaseBounds) {
9055 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, false, true>, cudaFuncCachePreferShared);
9056 conv_weight_acts_mc_mf<4,32,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9057 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9058 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9059 } else {
9060 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, false, false>, cudaFuncCachePreferShared);
9061 conv_weight_acts_mc_mf<4,32,2,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9062 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9063 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9064 }
9065 } else {
9066 if (checkCaseBounds) {
9067 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, false, true>, cudaFuncCachePreferShared);
9068 conv_weight_acts_mc_mf<4,32,2,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9069 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9070 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9071 } else {
9072 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, false, false>, cudaFuncCachePreferShared);
9073 conv_weight_acts_mc_mf<4,32,2,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9074 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9075 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9076 }
9077 }
9078 } else if (numFiltersPerGroup % 32 == 0) {
9079 if (numFilterColors % 8 == 0) {
9080 if (checkCaseBounds) {
9081 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, false, true>, cudaFuncCachePreferShared);
9082 conv_weight_acts_mc_mf<8,16,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9083 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9084 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9085 } else {
9086 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, false, false>, cudaFuncCachePreferShared);
9087 conv_weight_acts_mc_mf<8,16,2,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9088 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9089 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9090 }
9091 } else {
9092 if (checkCaseBounds) {
9093 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, false, true>, cudaFuncCachePreferShared);
9094 conv_weight_acts_mc_mf<8,16,2,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9095 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9096 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9097 } else {
9098 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, false, false>, cudaFuncCachePreferShared);
9099 conv_weight_acts_mc_mf<8,16,2,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9100 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9101 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9102 }
9103 }
9104 } else {
9105 if (numFilterColors % 8 == 0) {
9106 if (checkCaseBounds) {
9107 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, false, true>, cudaFuncCachePreferShared);
9108 conv_weight_acts_mc_mf<8,16,1,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9109 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9110 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9111 } else {
9112 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, false, false>, cudaFuncCachePreferShared);
9113 conv_weight_acts_mc_mf<8,16,1,8,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9114 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9115 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9116 }
9117 } else {
9118 if (checkCaseBounds) {
9119 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, false, true>, cudaFuncCachePreferShared);
9120 conv_weight_acts_mc_mf<8,16,1,4,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9121 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9122 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9123 } else {
9124 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, false, false>, cudaFuncCachePreferShared);
9125 conv_weight_acts_mc_mf<8,16,1,4,32,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9126 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9127 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9128 }
9129 }
9130 }
9131 } else {
9132
9133 if (numFiltersPerGroup % 64 == 0) {
9134 if (numFilterColors % 8 == 0) {
9135 if (checkCaseBounds) {
9136 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, true, true>, cudaFuncCachePreferShared);
9137 conv_weight_acts_mc_mf<4,32,2,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9138 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9139 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9140 } else {
9141 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,8,32, true, false>, cudaFuncCachePreferShared);
9142 conv_weight_acts_mc_mf<4,32,2,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9143 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9144 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9145 }
9146 } else {
9147 if (checkCaseBounds) {
9148 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, true, true>, cudaFuncCachePreferShared);
9149 conv_weight_acts_mc_mf<4,32,2,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9150 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9151 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9152 } else {
9153 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<4,32,2,4,32, true, false>, cudaFuncCachePreferShared);
9154 conv_weight_acts_mc_mf<4,32,2,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9155 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9156 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9157 }
9158 }
9159 } else if (numFiltersPerGroup % 32 == 0) {
9160 if (numFilterColors % 8 == 0) {
9161 if (checkCaseBounds) {
9162 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, true, true>, cudaFuncCachePreferShared);
9163 conv_weight_acts_mc_mf<8,16,2,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9164 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9165 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9166 } else {
9167 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,8,32, true, false>, cudaFuncCachePreferShared);
9168 conv_weight_acts_mc_mf<8,16,2,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9169 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9170 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9171 }
9172 } else {
9173 if (checkCaseBounds) {
9174 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, true, true>, cudaFuncCachePreferShared);
9175 conv_weight_acts_mc_mf<8,16,2,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9176 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9177 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9178 } else {
9179 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,2,4,32, true, false>, cudaFuncCachePreferShared);
9180 conv_weight_acts_mc_mf<8,16,2,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9181 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9182 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9183 }
9184 }
9185 } else {
9186 if (numFilterColors % 8 == 0) {
9187 if (checkCaseBounds) {
9188 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, true, true>, cudaFuncCachePreferShared);
9189 conv_weight_acts_mc_mf<8,16,1,8,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9190 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9191 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9192 } else {
9193 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,8,32, true, false>, cudaFuncCachePreferShared);
9194 conv_weight_acts_mc_mf<8,16,1,8,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9195 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9196 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9197 }
9198 } else {
9199 if (checkCaseBounds) {
9200 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, true, true>, cudaFuncCachePreferShared);
9201 conv_weight_acts_mc_mf<8,16,1,4,32,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9202 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9203 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9204 } else {
9205 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf<8,16,1,4,32, true, false>, cudaFuncCachePreferShared);
9206 conv_weight_acts_mc_mf<8,16,1,4,32,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9207 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
9208 paddingStart, moduleStride, imgStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9209 }
9210 }
9211 }
9212 }
9213 } else { // numColors in 1,2,3
9214 if (scaleTargets == 0) { // do not scale
9215 if (numFilterColors == 1) {
9216 if (checkCaseBounds) {
9217 if (numFilters % 32 == 0) {
9218 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, false, true>, cudaFuncCachePreferShared);
9219 conv_weight_acts_c<4,32,8,32,1,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9220 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9221 } else {
9222 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, false, true>, cudaFuncCachePreferShared);
9223 conv_weight_acts_c<8,16,5,32,1,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9224 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9225 }
9226 } else {
9227 if (numFilters % 32 == 0) {
9228 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, false, false>, cudaFuncCachePreferShared);
9229 conv_weight_acts_c<4,32,8,32,1,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9230 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9231 } else {
9232 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, false, false>, cudaFuncCachePreferShared);
9233 conv_weight_acts_c<8,16,5,32,1,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9234 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9235 }
9236 }
9237 } else if (numFilterColors == 2) {
9238 if (checkCaseBounds) {
9239 if (numFilters % 32 == 0) {
9240 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, false, true>, cudaFuncCachePreferShared);
9241 conv_weight_acts_c<4,32,5,32,2,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9242 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9243 } else {
9244 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, false, true>, cudaFuncCachePreferShared);
9245 conv_weight_acts_c<8,16,2,32,2,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9246 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9247 }
9248 } else {
9249 if (numFilters % 32 == 0) {
9250 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, false, false>, cudaFuncCachePreferShared);
9251 conv_weight_acts_c<4,32,5,32,2,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9252 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9253 } else {
9254 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, false, false>, cudaFuncCachePreferShared);
9255 conv_weight_acts_c<8,16,2,32,2,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9256 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9257 }
9258 }
9259 } else if (numFilterColors == 3) {
9260 if (checkCaseBounds) {
9261 if (numFilters % 32 == 0) {
9262 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, false, true>, cudaFuncCachePreferShared);
9263 conv_weight_acts_c<4,32,5,32,3,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9264 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9265 } else {
9266 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, false, true>, cudaFuncCachePreferShared);
9267 conv_weight_acts_c<8,16,2,32,3,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9268 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9269 }
9270 } else {
9271 if (numFilters % 32 == 0) {
9272 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, false, false>, cudaFuncCachePreferShared);
9273 conv_weight_acts_c<4,32,5,32,3,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9274 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9275 } else {
9276 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, false, false>, cudaFuncCachePreferShared);
9277 conv_weight_acts_c<8,16,2,32,3,false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9278 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9279 }
9280 }
9281 }
9282
9283 } else { // do scale
9284 if (numFilterColors == 1) {
9285 if (checkCaseBounds) {
9286 if (numFilters % 32 == 0) {
9287 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, true, true>, cudaFuncCachePreferShared);
9288 conv_weight_acts_c<4,32,8,32,1,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9289 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9290 } else {
9291 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, true, true>, cudaFuncCachePreferShared);
9292 conv_weight_acts_c<8,16,5,32,1,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9293 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9294 }
9295 } else {
9296 if (numFilters % 32 == 0) {
9297 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,8,32,1, true, false>, cudaFuncCachePreferShared);
9298 conv_weight_acts_c<4,32,8,32,1,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9299 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9300 } else {
9301 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,5,32,1, true, false>, cudaFuncCachePreferShared);
9302 conv_weight_acts_c<8,16,5,32,1,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9303 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9304 }
9305 }
9306 } else if (numFilterColors == 2) {
9307 if (checkCaseBounds) {
9308 if (numFilters % 32 == 0) {
9309 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, true, true>, cudaFuncCachePreferShared);
9310 conv_weight_acts_c<4,32,5,32,2,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9311 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9312 } else {
9313 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, true, true>, cudaFuncCachePreferShared);
9314 conv_weight_acts_c<8,16,2,32,2,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9315 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9316 }
9317 } else {
9318 if (numFilters % 32 == 0) {
9319 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,2, true, false>, cudaFuncCachePreferShared);
9320 conv_weight_acts_c<4,32,5,32,2,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9321 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9322 } else {
9323 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,2, true, false>, cudaFuncCachePreferShared);
9324 conv_weight_acts_c<8,16,2,32,2,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9325 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9326 }
9327 }
9328 } else if (numFilterColors == 3) {
9329 if (checkCaseBounds) {
9330 if (numFilters % 32 == 0) {
9331 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, true, true>, cudaFuncCachePreferShared);
9332 conv_weight_acts_c<4,32,5,32,3,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9333 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9334 } else {
9335 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, true, true>, cudaFuncCachePreferShared);
9336 conv_weight_acts_c<8,16,2,32,3,true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9337 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9338 }
9339 } else {
9340 if (numFilters % 32 == 0) {
9341 cudaFuncSetCacheConfig(conv_weight_acts_c<4,32,5,32,3, true, false>, cudaFuncCachePreferShared);
9342 conv_weight_acts_c<4,32,5,32,3,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9343 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9344 } else {
9345 cudaFuncSetCacheConfig(conv_weight_acts_c<8,16,2,32,3, true, false>, cudaFuncCachePreferShared);
9346 conv_weight_acts_c<8,16,2,32,3,true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(),
9347 numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize, paddingStart, moduleStride, imgStride, partialSum, scaleTargets, scaleOutput);
9348 }
9349 }
9350 }
9351 }
9352 }
9353 cutilCheckMsg("weightActs: kernel execution failed");
9354 }
9355
9356 void convWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9357 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum) {
9358 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, 0, 1);
9359 }
9360
9361 void convWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9362 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups, int partialSum,
9363 float scaleTargets, float scaleOutput) {
9364 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, partialSum, scaleTargets, scaleOutput);
9365 }
9366
9367 void localWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9368 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride, int numImgColors, int numGroups) {
9369 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1, 0, 1);
9370 }
9371
9372 void localWeightActs(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets,
9373 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
9374 int numImgColors, int numGroups, float scaleTargets, float scaleOutput) {
9375 _weightActs(images, hidActs, targets, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart, moduleStride, numImgColors, numGroups, 1,
9376 scaleTargets, scaleOutput);
9377 }
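// Editor's usage sketch (hedged: shapes per the comments above, call not from
// the original source). With partialSum = 0 the wrapper sums over all
// modules, so targets holds one gradient per (color, pixel, filter):
//
//     convWeightActs(images,   // (numImgColors * imgPixels, numImages)
//                    hidActs,  // (numFilters * numModules, numImages)
//                    targets,  // (numFilterColors * filterPixels, numFilters)
//                    imgSizeY, numModulesY, numModulesX, filterSize,
//                    paddingStart, moduleStride, numImgColors, numGroups,
//                    0);       // partialSum == 0 -> one full sum over modules
//
// The shorter overloads fix scaleTargets = 0 (overwrite) and scaleOutput = 1;
// localWeightActs hardwires partialSum = 1, matching local (untied) layers
// that keep a separate filter per module.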
9378
9379 /*
9380 * images: (numImgColors, imgPixels, numImages), with stride given
9381 * hidActs: (numFilters, numModules, numImages)
9382 *
9383 * targets: (numModules/partialSum, numFilterColors, filterPixels, numFilters)
9384 * colorIndices: (numGroups, numFilterColors)
9385 *
9386 * Note: all of these convolution routines are optimized for the case when
9387 * the number of images (i.e. the minibatch size) is a multiple of 128.
9388 * Other batch sizes will work, but I made no attempt whatsoever
9389 * to make them work fast.
9390 */
9391 void _weightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
9392 int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
9393 int numImgColors, int numFilterColors, int numGroups, int partialSum,
9394 float scaleTargets, float scaleOutput) {
9395 int imgStride = images.getStride();
9396 int numImages = images.getNumCols();
9397 int imgPixels = images.getNumRows() / numImgColors;
9398 int imgSizeX = imgPixels / imgSizeY;
9399 int numModules = numModulesY * numModulesX;
9400 int numFilters = hidActs.getNumRows() / numModules;
9401 int numFiltersPerGroup = numFilters / numGroups;
9402
9403 assert(numGroups > 1);
9404 assert(numImgColors % numFilterColors == 0);
9405 assert((numFilterColors * numGroups) % numImgColors == 0);
9406 assert(numFilters % (16*numGroups) == 0);
9407 assert(numFilterColors % 4 == 0);
9408 assert(imgSizeY * imgSizeX == imgPixels);
9409 assert(images.getNumRows() == imgPixels * numImgColors);
9410
9411 int filterPixels = filterSize * filterSize;
9412 partialSum = partialSum == 0 ? numModules : partialSum;
9413
9414 assert(numModules % partialSum == 0);
9415 assert(hidActs.getNumCols() == numImages);
9416
9417 // These routines don't handle the case when only part of the image is visited in the convolution
9418 assert(paddingStart <= 0);
9419 assert(paddingStart + (numModulesX-1)*moduleStride + filterSize >= imgSizeX);
9420 assert(paddingStart + (numModulesY-1)*moduleStride + filterSize >= imgSizeY);
9421 assert(moduleStride <= filterSize);
9422
9423 assert(numModules * numFilters == hidActs.getNumRows());
9424
9425 assert(!images.isTrans());
9426 assert(!hidActs.isTrans());
9427 assert(hidActs.isContiguous());
9428
9429 assert(!targets.isTrans());
9430 assert(targets.isContiguous());
9431
9432 int preloadCases = 32;
9433
9434 dim3 blocks, threads;
9435 int bx, by;
9436 int filtersPerThread, colorsPerThread;
9437
9438 filtersPerThread = numFiltersPerGroup % 32 == 0 ? 2 : 1;
9439 colorsPerThread = numFilterColors % 8 == 0 ? 8 : 4;
9440 by = numFiltersPerGroup % 64 == 0 ? 4 : 8;
9441 bx = numFiltersPerGroup % 64 == 0 ? 32 : 16;
9442 blocks = dim3((numModules/partialSum)*(numFilters/(bx*filtersPerThread)), DIVUP(filterPixels, by) * (numFilterColors / colorsPerThread));
9443
9444 assert((by * bx) % preloadCases == 0);
9445 threads = dim3(bx, by);
9446 bool checkCaseBounds = numImages % 32 != 0;
9447
9448 if (scaleTargets == 0) {
9449 targets.resize((numModules/partialSum) * numFilterColors*filterPixels, numFilters);
9450 } else {
9451 assert(targets.getNumRows() == (numModules/partialSum) * numFilterColors*filterPixels);
9452 assert(targets.getNumCols() == numFilters);
9453 }
9454
9455 if (scaleTargets == 0) { // do not scale
9456 if (numFiltersPerGroup % 64 == 0) {
9457 if (numFilterColors % 8 == 0) {
9458 if (checkCaseBounds) {
9459 cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, true>, cudaFuncCachePreferShared);
9460 conv_weight_acts_mc_mf_rand<4,32,2,8,32,false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else if (numFiltersPerGroup % 32 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, false, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        }

    } else {
        // Same dispatch tree, with the kernels instantiated for scale == true.
        // Each cudaFuncSetCacheConfig names the exact instantiation launched below it.
        if (numFiltersPerGroup % 64 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<4,32,2,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else if (numFiltersPerGroup % 32 == 0) {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,2,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        } else {
            if (numFilterColors % 8 == 0) {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,8,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            } else {
                if (checkCaseBounds) {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, true>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, true><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                } else {
                    cudaFuncSetCacheConfig(conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, false>, cudaFuncCachePreferShared);
                    conv_weight_acts_mc_mf_rand<8,16,1,4,32, true, false><<<blocks, threads>>>(images.getDevData(), hidActs.getDevData(), targets.getDevData(), dColorIndices,
                        numImages, numFilters, numModulesY, numModulesX, imgSizeY, imgSizeX, filterSize,
                        paddingStart, moduleStride, imgStride, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
                }
            }
        }
    }
    cutilCheckMsg("weightActsSparse: kernel execution failed");
}

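/*
 * Convenience wrappers around _weightActsSparse. The conv* variants use one
 * filter bank shared across all modules (partialSum = 0, which in this code
 * base means "sum the weight gradient over every module"); the local*
 * variants use unshared, per-module filters (partialSum = 1).
 */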
void convWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                          int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                          int numImgColors, int numFilterColors, int numGroups) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 0, 1, 0);
}

void convWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                          int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                          int numImgColors, int numFilterColors, int numGroups, int partialSum, float scaleTargets, float scaleOutput) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, partialSum, scaleTargets, scaleOutput);
}

void localWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                           int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                           int numImgColors, int numFilterColors, int numGroups) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 1, 1, 0);
}

void localWeightActsSparse(NVMatrix& images, NVMatrix& hidActs, NVMatrix& targets, int* dColorIndices,
                           int imgSizeY, int numModulesY, int numModulesX, int filterSize, int paddingStart, int moduleStride,
                           int numImgColors, int numFilterColors, int numGroups, float scaleTargets, float scaleOutput) {
    _weightActsSparse(images, hidActs, targets, dColorIndices, imgSizeY, numModulesY, numModulesX, filterSize, paddingStart,
                      moduleStride, numImgColors, numFilterColors, numGroups, 1, scaleTargets, scaleOutput);
}

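One detail worth flagging in the dispatch tree above: cudaFuncSetCacheConfig binds a cache preference to one specific kernel, so for a template kernel it must name exactly the same instantiation that is launched immediately after it; otherwise the preference is attached to an instantiation that never runs and the launched kernel keeps the default shared-memory/L1 split. A minimal sketch of the pairing, with a hypothetical kernel kSketch standing in for conv_weight_acts_mc_mf_rand:

#include <cuda_runtime.h>

template <int B_X, bool CHECK>
__global__ void kSketch(float* out, int n) {
    const int i = blockIdx.x * B_X + threadIdx.x;
    // CHECK toggles the bounds test, like checkCaseBounds above; when CHECK
    // is false the caller must guarantee the grid exactly covers n elements.
    if (!CHECK || i < n) {
        out[i] = 1.0f;
    }
}

void launchSketch(float* out, int n, dim3 blocks, dim3 threads, bool check) {
    if (check) {
        // The instantiation named here must be the one launched below.
        cudaFuncSetCacheConfig(kSketch<32, true>, cudaFuncCachePreferShared);
        kSketch<32, true><<<blocks, threads>>>(out, n);
    } else {
        cudaFuncSetCacheConfig(kSketch<32, false>, cudaFuncCachePreferShared);
        kSketch<32, false><<<blocks, threads>>>(out, n);
    }
}
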
===============================
In file included from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1804:0,
from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/ndarrayobject.h:17,
from /usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/arrayobject.h:4,
from /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/cuda_ndarray.cuh:35,
from /home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/nvmatrix.cuh:49,
from mod.cu:130:
/usr/local/lib/python2.7/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h:15:2: warning: #warning "Using deprecated NumPy API, disable it by " "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" [-Wcpp]
#warning "Using deprecated NumPy API, disable it by " \
^
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(90): error: more than one instance of overloaded function "cublasGetVersion_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(102): error: more than one instance of overloaded function "cublasSnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(103): error: more than one instance of overloaded function "cublasDnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(104): error: more than one instance of overloaded function "cublasScnrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(105): error: more than one instance of overloaded function "cublasDznrm2_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(108): error: more than one instance of overloaded function "cublasSdot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(110): error: more than one instance of overloaded function "cublasDdot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(112): error: more than one instance of overloaded function "cublasCdotu_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(114): error: more than one instance of overloaded function "cublasCdotc_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(116): error: more than one instance of overloaded function "cublasZdotu_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(118): error: more than one instance of overloaded function "cublasZdotc_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(122): error: more than one instance of overloaded function "cublasSscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(123): error: more than one instance of overloaded function "cublasDscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(124): error: more than one instance of overloaded function "cublasCscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(125): error: more than one instance of overloaded function "cublasZscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(127): error: more than one instance of overloaded function "cublasCsscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(128): error: more than one instance of overloaded function "cublasZdscal_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(131): error: more than one instance of overloaded function "cublasSaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(133): error: more than one instance of overloaded function "cublasDaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(135): error: more than one instance of overloaded function "cublasCaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(137): error: more than one instance of overloaded function "cublasZaxpy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(141): error: more than one instance of overloaded function "cublasScopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(143): error: more than one instance of overloaded function "cublasDcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(145): error: more than one instance of overloaded function "cublasCcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(147): error: more than one instance of overloaded function "cublasZcopy_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(151): error: more than one instance of overloaded function "cublasSswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(152): error: more than one instance of overloaded function "cublasDswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(153): error: more than one instance of overloaded function "cublasCswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(154): error: more than one instance of overloaded function "cublasZswap_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(157): error: more than one instance of overloaded function "cublasIsamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(158): error: more than one instance of overloaded function "cublasIdamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(159): error: more than one instance of overloaded function "cublasIcamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(160): error: more than one instance of overloaded function "cublasIzamax_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(163): error: more than one instance of overloaded function "cublasIsamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(164): error: more than one instance of overloaded function "cublasIdamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(166): error: more than one instance of overloaded function "cublasIcamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(167): error: more than one instance of overloaded function "cublasIzamin_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(170): error: more than one instance of overloaded function "cublasSasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(171): error: more than one instance of overloaded function "cublasDasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(172): error: more than one instance of overloaded function "cublasScasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(173): error: more than one instance of overloaded function "cublasDzasum_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(176): error: more than one instance of overloaded function "cublasSrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(178): error: more than one instance of overloaded function "cublasDrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(180): error: more than one instance of overloaded function "cublasCrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(182): error: more than one instance of overloaded function "cublasZrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(185): error: more than one instance of overloaded function "cublasCsrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(187): error: more than one instance of overloaded function "cublasZdrot_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(191): error: more than one instance of overloaded function "cublasSrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(192): error: more than one instance of overloaded function "cublasDrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(193): error: more than one instance of overloaded function "cublasCrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(195): error: more than one instance of overloaded function "cublasZrotg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(199): error: more than one instance of overloaded function "cublasSrotm_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(201): error: more than one instance of overloaded function "cublasDrotm_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(205): error: more than one instance of overloaded function "cublasSrotmg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(207): error: more than one instance of overloaded function "cublasDrotmg_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(212): error: more than one instance of overloaded function "cublasSgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(215): error: more than one instance of overloaded function "cublasDgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(218): error: more than one instance of overloaded function "cublasCgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(221): error: more than one instance of overloaded function "cublasZgemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(226): error: more than one instance of overloaded function "cublasSgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(230): error: more than one instance of overloaded function "cublasDgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(234): error: more than one instance of overloaded function "cublasCgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(238): error: more than one instance of overloaded function "cublasZgbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(244): error: more than one instance of overloaded function "cublasStrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(246): error: more than one instance of overloaded function "cublasDtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(248): error: more than one instance of overloaded function "cublasCtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(250): error: more than one instance of overloaded function "cublasZtrmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(254): error: more than one instance of overloaded function "cublasStbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(256): error: more than one instance of overloaded function "cublasDtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(258): error: more than one instance of overloaded function "cublasCtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(260): error: more than one instance of overloaded function "cublasZtbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(264): error: more than one instance of overloaded function "cublasStpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(266): error: more than one instance of overloaded function "cublasDtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(268): error: more than one instance of overloaded function "cublasCtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(270): error: more than one instance of overloaded function "cublasZtpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(273): error: more than one instance of overloaded function "cublasStrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(275): error: more than one instance of overloaded function "cublasDtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(277): error: more than one instance of overloaded function "cublasCtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(279): error: more than one instance of overloaded function "cublasZtrsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(283): error: more than one instance of overloaded function "cublasStpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(286): error: more than one instance of overloaded function "cublasDtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(288): error: more than one instance of overloaded function "cublasCtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(290): error: more than one instance of overloaded function "cublasZtpsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(294): error: more than one instance of overloaded function "cublasStbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(298): error: more than one instance of overloaded function "cublasDtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(301): error: more than one instance of overloaded function "cublasCtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(305): error: more than one instance of overloaded function "cublasZtbsv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(310): error: more than one instance of overloaded function "cublasSsymv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(313): error: more than one instance of overloaded function "cublasDsymv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(316): error: more than one instance of overloaded function "cublasChemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(319): error: more than one instance of overloaded function "cublasZhemv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(324): error: more than one instance of overloaded function "cublasSsbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(327): error: more than one instance of overloaded function "cublasDsbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(330): error: more than one instance of overloaded function "cublasChbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(333): error: more than one instance of overloaded function "cublasZhbmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(338): error: more than one instance of overloaded function "cublasSspmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(341): error: more than one instance of overloaded function "cublasDspmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(344): error: more than one instance of overloaded function "cublasChpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(347): error: more than one instance of overloaded function "cublasZhpmv_v2" has "C" linkage
/usr/local/cuda/bin/../targets/x86_64-linux/include/cublas.h(353): error: more than one instance of overloaded function "cublasSger_v2" has "C" linkage
Error limit reached.
100 errors detected in the compilation of "/tmp/tmpxft_000009bf_00000000-8_mod.cpp1.ii".
Compilation terminated.
ERROR (pylearn2.sandbox.cuda_convnet.convnet_compile): Failed to compile /home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/mod.cu ('nvmatrix_kernels.cu', 'nvmatrix.cu', 'conv_util.cu', 'filter_acts.cu', 'img_acts.cu', 'weight_acts.cu'): ('nvcc return status', 4, 'for cmd', 'nvcc -shared -g -O3 -arch=sm_30 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=d67f7c8a21306c67152a70a88a837011,-fPIC -Xlinker -rpath,/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray -I/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/ -I/usr/local/lib/python2.7/dist-packages/numpy/core/include -I/usr/include/python2.7 -I/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda -o /home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/cuda_convnet.so mod.cu -L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray -L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet -LNone/lib -LNone/lib64 -L/usr/lib -lpython2.7 -lcublas -lcudart')
['nvcc', '-shared', '-g', '-O3', '-arch=sm_30', '-m64', '-Xcompiler', '-DCUDA_NDARRAY_CUH=d67f7c8a21306c67152a70a88a837011,-fPIC', '-Xlinker', '-rpath,/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray', '-I/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/', '-I/usr/local/lib/python2.7/dist-packages/numpy/core/include', '-I/usr/include/python2.7', '-I/usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda', '-o', '/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet/cuda_convnet.so', 'mod.cu', '-L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_ndarray', '-L/home/ubuntu/.theano/compiledir_Linux-3.13.0-35-generic-x86_64-with-Ubuntu-14.04-trusty-x86_64-2.7.6-64/cuda_convnet', '-LNone/lib', '-LNone/lib64', '-L/usr/lib', '-lpython2.7', '-lcublas', '-lcudart']
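All of the cublas.h errors above are one diagnostic repeated: C++ allows a function name to be overloaded, but at most one overload may be declared with "C" linkage, and nvcc reports this as soon as it sees a second extern "C" declaration of an already-declared name with a different prototype. Here that most likely happens because mod.cu ends up pulling both the legacy cuBLAS header (cublas.h) and the v2 API declarations into one translation unit. A cuBLAS-free two-liner reproduces the diagnostic (cublas_like is a made-up name used only for illustration):

extern "C" void cublas_like(int n);
extern "C" void cublas_like(float x); // error: more than one instance of overloaded
                                      //        function "cublas_like" has "C" linkage
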
Traceback (most recent call last):
File "/home/ubuntu/pylearn2/pylearn2/scripts/train.py", line 252, in <module>
args.verbose_logging, args.debug)
File "/home/ubuntu/pylearn2/pylearn2/scripts/train.py", line 197, in train
train_obj = serial.load_train_file(config)
File "/home/ubuntu/pylearn2/pylearn2/utils/serial.py", line 524, in load_train_file
return yaml_parse.load_path(config_file_path, environ=environ)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 379, in load_path
return load(content, instantiate=instantiate, environ=environ, **kwargs)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 337, in load
return _instantiate(proxy_graph)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 280, in _instantiate
return _instantiate_proxy_tuple(proxy, bindings)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 229, in _instantiate_proxy_tuple
for k, v in proxy.keywords.iteritems())
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 229, in <genexpr>
for k, v in proxy.keywords.iteritems())
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 280, in _instantiate
return _instantiate_proxy_tuple(proxy, bindings)
File "/home/ubuntu/pylearn2/pylearn2/config/yaml_parse.py", line 230, in _instantiate_proxy_tuple
obj = checked_call(proxy.callable, kwargs)
File "/home/ubuntu/pylearn2/pylearn2/utils/call_check.py", line 99, in checked_call
return to_call(**kwargs)
File "/home/ubuntu/pylearn2/pylearn2/models/mlp.py", line 490, in __init__
self._update_layer_input_spaces()
File "/home/ubuntu/pylearn2/pylearn2/models/mlp.py", line 555, in _update_layer_input_spaces
layers[0].set_input_space(self.get_input_space())
File "/home/ubuntu/pylearn2/pylearn2/models/maxout.py", line 803, in set_input_space
dummy_p = dummy_p.eval()
File "/usr/local/lib/python2.7/dist-packages/theano/gof/graph.py", line 420, in eval
self._fn = theano.function(self._fn_inputs, self)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function.py", line 223, in function
profile=profile)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/pfunc.py", line 512, in pfunc
on_unused_input=on_unused_input)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1312, in orig_function
defaults)
File "/usr/local/lib/python2.7/dist-packages/theano/compile/function_module.py", line 1181, in create
_fn, _i, _o = self.linker.make_thunk(input_storage=input_storage_lists)
File "/usr/local/lib/python2.7/dist-packages/theano/gof/link.py", line 434, in make_thunk
output_storage=output_storage)[:3]
File "/usr/local/lib/python2.7/dist-packages/theano/gof/vm.py", line 847, in make_all
no_recycling))
File "/home/ubuntu/pylearn2/pylearn2/sandbox/cuda_convnet/pool.py", line 334, in make_thunk
raise RuntimeError('Could not compile cuda_convnet')
RuntimeError: ('The following error happened while compiling the node', <pylearn2.sandbox.cuda_convnet.pool.MaxPool object at 0x7f82776f8410>(GpuContiguous.0), '\n', 'Could not compile cuda_convnet')