@MarkusPfundstein
Created June 9, 2015 13:49
CUDA Tesla K80 Errors
========= CUDA-MEMCHECK
========= Program hit cudaErrorInvalidValue (error 11) due to "invalid argument" on CUDA API call to cudaMemsetAsync.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2e4263]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x232029]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x11a11d]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x119840]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0xe7f12]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_Z6corrMMP11CudaNdarrayS0_S0_iiiii + 0x63d) [0x2c2d]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_ZN62_GLOBAL__N__38_tmpxft_000046f7_00000000_9_mod_cpp1_ii_9233ac7753__struct_compiled_op_6b20a8021c12925d87ac05c6d9b33c6c3runEv + 0x625) [0x37d5]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x36eb]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4128]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x3a00]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x47b7]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x77891]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x5a96f]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0xb4b2c]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCode + 0x32) [0xf9ed2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_FileExFlags + 0xb0) [0x119e10]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_SimpleFileExFlags + 0xef) [0x119fef]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (Py_Main + 0xca4) [0x12f8f4]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:python [0x649]
=========
========= Program hit cudaErrorInvalidResourceHandle (error 33) due to "invalid resource handle" on CUDA API call to cudaEventRecord.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2e4263]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x231732]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x2140a]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x11a2b2]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0x119840]
========= Host Frame:/usr/local/cuda-7.0/lib64/libcublas.so.7.0 [0xe7f12]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_Z6corrMMP11CudaNdarrayS0_S0_iiiii + 0x63d) [0x2c2d]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/tmpWEAUFm/6b20a8021c12925d87ac05c6d9b33c6c.so (_ZN62_GLOBAL__N__38_tmpxft_000046f7_00000000_9_mod_cpp1_ii_9233ac7753__struct_compiled_op_6b20a8021c12925d87ac05c6d9b33c6c3runEv + 0x625) [0x37d5]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x36eb]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4128]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x3a00]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x4064]
========= Host Frame:/home/<xy>net/.theano/compiledir_Linux-3.13--generic-x86_64-with-debian-jessie-sid-x86_64-2.7.10-64/lazylinker_ext/lazylinker_ext.so [0x47b7]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x77891]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0x5a96f]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 [0xb4b2c]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyObject_Call + 0x53) [0x48333]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x4c42) [0xf74b2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalFrameEx + 0x5ee0) [0xf8750]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCodeEx + 0x88e) [0xf9dbe]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyEval_EvalCode + 0x32) [0xf9ed2]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_FileExFlags + 0xb0) [0x119e10]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (PyRun_SimpleFileExFlags + 0xef) [0x119fef]
========= Host Frame:/home/<xy>net/anaconda/bin/../lib/libpython2.7.so.1.0 (Py_Main + 0xca4) [0x12f8f4]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21ec5]
========= Host Frame:python [0x649]
=========
========= ERROR SUMMARY: 2 errors
Tue Jun 9 08:45:54 2015
+------------------------------------------------------+
| NVIDIA-SMI 346.59     Driver Version: 346.59         |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           Off  | 0000:83:00.0    Off  |                    0 |
| N/A   34C    P0    59W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla K80           Off  | 0000:84:00.0    Off  |                    0 |
| N/A   32C    P0    75W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   2  Tesla K80           Off  | 0000:87:00.0    Off  |                    0 |
| N/A   34C    P0    60W / 149W |     55MiB / 11519MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   3  Tesla K80           Off  | 0000:88:00.0    Off  |                    0 |
| N/A   31C    P0    75W / 149W |     55MiB / 11519MiB |     97%      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
./deviceQuery OUTPUT:
NVIDIA_CUDA-7.0_Samples/bin/x86_64/linux/release/deviceQuery Starting...
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 4 CUDA Capable device(s)
Device 0: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 131 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 1: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 132 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 2: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 135 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Device 3: "Tesla K80"
CUDA Driver Version / Runtime Version 7.0 / 7.0
CUDA Capability Major/Minor version number: 3.7
Total amount of global memory: 11520 MBytes (12079136768 bytes)
(13) Multiprocessors, (192) CUDA Cores/MP: 2496 CUDA Cores
GPU Max Clock rate: 824 MHz (0.82 GHz)
Memory Clock rate: 2505 Mhz
Memory Bus Width: 384-bit
L2 Cache Size: 1572864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
Maximum Layered 1D Texture Size, (num) layers 1D=(16384), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(16384, 16384), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 2048
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 2 copy engine(s)
Run time limit on kernels: No
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Enabled
Device supports Unified Addressing (UVA): Yes
Device PCI Domain ID / Bus ID / location ID: 0 / 136 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU0) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU1) : No
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU2) : No
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU3) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU1) : No
> Peer access from Tesla K80 (GPU1) -> Tesla K80 (GPU2) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU2) -> Tesla K80 (GPU2) : No
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU0) : Yes
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU1) : Yes
> Peer access from Tesla K80 (GPU3) -> Tesla K80 (GPU2) : Yes
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 7.0, CUDA Runtime Version = 7.0, NumDevs = 4, Device0 = Tesla K80, Device1 = Tesla K80, Device2 = Tesla K80, Device3 = Tesla K80
Result = PASS
Traceback (most recent call last):
  File "/home/<xy>net/<xy>nn/main.py", line 281, in <module>
    args.func(args)
  File "/home/<xy>net/<xy>nn/main.py", line 72, in train
    nn.train(data_provider)
  File "/home/<xy>net/<xy>nn/NN/neural_net.py", line 399, in train
    cost = self._train(batch_offset[0], batch_offset[1])
  File "/home/<xy>net/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 606, in __call__
    storage_map=self.fn.storage_map)
  File "/home/<xy>net/anaconda/lib/python2.7/site-packages/theano/compile/function_module.py", line 595, in __call__
    outputs = self.fn()
RuntimeError: GpuCorrMM encountered a CUBLAS error: an internal operation failed
This could be a known bug in CUDA, please see the GpuCorrMM() documentation.
Apply node that caused the error: GpuCorrMM{valid, (1, 1)}(GpuContiguous.0, GpuContiguous.0)
Inputs types: [CudaNdarrayType(float32, 4D), CudaNdarrayType(float32, 4D)]
Inputs shapes: [(3, 5, 128, 128), (3, 5, 118, 118)]
Inputs strides: [(81920, 16384, 128, 1), (69620, 13924, 118, 1)]
Inputs values: ['not shown', 'not shown']
HINT: Re-running with most Theano optimization disabled could give you a back-trace of when this node was created. This can be done by setting the Theano flag 'optimizer=fast_compile'. If that does not work, Theano optimizations can be disabled with 'optimizer=None'.
HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.
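The two HINT lines above describe Theano's debugging flags. A minimal sketch of one way to apply them (the main.py 'train' entry point is taken from the traceback above; how exactly the training run is launched is an assumption):

    import os

    # THEANO_FLAGS must be set before theano is imported anywhere in the process.
    # 'optimizer=fast_compile' keeps a back-trace of where the failing node was
    # created; 'exception_verbosity=high' adds a debugprint and storage map of
    # the failing apply node, as the HINT lines suggest.
    os.environ['THEANO_FLAGS'] = 'optimizer=fast_compile,exception_verbosity=high'

    import theano  # imported only after the flags are in place

    # ... the rest of the training entry point (e.g. main.py 'train') runs here
    # with the debug flags active.

Equivalently, THEANO_FLAGS can be exported in the shell before launching the script.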
NVIDIA drivers:
<xy>net@train-k80:~/kaggle$ cat /proc/driver/nvidia/version
NVRM version: NVIDIA UNIX x86_64 Kernel Module 346.59 Tue Mar 31 14:10:31 PDT 2015
GCC version: gcc version 4.8.2 (Ubuntu 4.8.2-19ubuntu1)
Theano:
<xy>net@train-k80:~/kaggle$ python -c 'import theano; print theano.__version__'
Using gpu device 0: Tesla K80
0.7.0
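As a quick sanity check that this Theano 0.7.0 install actually executes on the K80 rather than silently falling back to the CPU, a sketch adapted from the standard Theano GPU test can be run (array size and iteration count are arbitrary):

    import time
    import numpy
    import theano
    import theano.tensor as T

    # Small elementwise benchmark: with device=gpu the compiled graph should
    # contain GpuElemwise nodes instead of plain (CPU) Elemwise nodes.
    vlen = 10 * 30 * 768
    iters = 1000

    rng = numpy.random.RandomState(22)
    x = theano.shared(numpy.asarray(rng.rand(vlen), theano.config.floatX))
    f = theano.function([], T.exp(x))

    t0 = time.time()
    for _ in range(iters):
        r = f()
    print('Looping %d times took %f seconds' % (iters, time.time() - t0))

    if numpy.any([isinstance(node.op, T.Elemwise)
                  for node in f.maker.fgraph.toposort()]):
        print('Used the CPU')
    else:
        print('Used the GPU')

If this prints 'Used the CPU' even with the device=gpu0 flag set, the CUDA/Theano setup itself is suspect rather than the GpuCorrMM op in particular.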