Theano function-profiler output (gist: zomux/5e80052300ce675e21fe) — per-Function, per-Class, per-Op, and per-Apply timing breakdowns for a GPU (CUDA sandbox) training run; includes Scan Op sub-profiles for the LSTM scan functions.
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:73 | |
Time in 16 calls to Function.__call__: 9.254583e+00s | |
Time in Function.fn.__call__: 9.252455e+00s (99.977%) | |
Time in thunks: 9.208560e+00s (99.503%) | |
Total compile time: 1.111225e+01s | |
Number of Apply nodes: 344 | |
Theano Optimizer time: 9.731210e+00s | |
Theano validate time: 2.255249e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.302246e+00s | |
Import time 8.582520e-02s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
55.9% 55.9% 5.147s 1.07e-01s Py 48 3 theano.scan_module.scan_op.Scan | |
30.9% 86.8% 2.847s 1.62e-02s C 176 11 theano.sandbox.cuda.blas.GpuDot22 | |
6.9% 93.8% 0.640s 4.00e-02s C 16 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
4.9% 98.7% 0.452s 4.03e-03s C 112 7 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.5% 99.2% 0.047s 9.72e-05s Py 480 30 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.3% 99.4% 0.025s 1.57e-03s C 16 1 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.2% 99.7% 0.021s 4.35e-04s Py 48 3 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.1% 99.8% 0.013s 1.35e-04s C 96 6 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.007s 1.51e-04s C 48 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 99.9% 0.003s 2.00e-06s C 1696 106 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.001s 3.77e-05s C 32 2 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.0% 100.0% 0.001s 1.26e-06s C 704 44 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 1.42e-06s C 576 36 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.0% 100.0% 0.001s 1.39e-06s C 496 31 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.001s 3.24e-05s Py 16 1 theano.tensor.basic.ARange | |
0.0% 100.0% 0.000s 2.26e-06s C 208 13 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.0% 100.0% 0.000s 1.14e-05s Py 32 2 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.000s 7.79e-07s C 448 28 theano.tensor.basic.ScalarFromTensor | |
0.0% 100.0% 0.000s 2.14e-05s C 16 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.0% 100.0% 0.000s 1.12e-05s C 16 1 theano.tensor.basic.Join | |
... (remaining 6 Classes account for 0.01%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
42.3% 42.3% 3.892s 2.43e-01s Py 16 1 forall_inplace,gpu,scan_fn} | |
30.9% 73.2% 2.847s 1.62e-02s C 176 11 GpuDot22 | |
13.6% 86.8% 1.255s 3.92e-02s Py 32 2 forall_inplace,gpu,scan_fn} | |
6.9% 93.8% 0.640s 4.00e-02s C 16 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
2.5% 96.2% 0.226s 4.72e-03s C 48 3 GpuElemwise{Mul}[(0, 0)] | |
2.4% 98.7% 0.225s 7.02e-03s C 32 2 GpuElemwise{Add}[(0, 0)] | |
0.4% 99.1% 0.040s 1.47e-04s Py 272 17 GpuReshape{2} | |
0.3% 99.4% 0.025s 1.57e-03s C 16 1 GpuJoin | |
0.2% 99.6% 0.021s 4.35e-04s Py 48 3 GpuAdvancedSubtensor1 | |
0.1% 99.7% 0.013s 1.35e-04s C 96 6 GpuAlloc{memset_0=True} | |
0.1% 99.8% 0.007s 1.51e-04s C 48 3 GpuFromHost | |
0.1% 99.9% 0.007s 3.19e-05s Py 208 13 GpuReshape{3} | |
0.0% 99.9% 0.001s 3.77e-05s C 32 2 GpuCAReduce{add}{1} | |
0.0% 99.9% 0.001s 1.39e-06s C 496 31 MakeVector | |
0.0% 99.9% 0.001s 5.96e-06s C 112 7 Elemwise{sub,no_inplace} | |
0.0% 99.9% 0.001s 3.97e-05s C 16 1 GpuElemwise{Composite{(log(clip(i0, i1, i2)) * i3)}}[(0, 0)] | |
0.0% 99.9% 0.001s 3.24e-05s Py 16 1 ARange | |
0.0% 99.9% 0.000s 1.09e-06s C 400 25 Shape_i{0} | |
0.0% 99.9% 0.000s 1.50e-06s C 272 17 Shape_i{1} | |
0.0% 99.9% 0.000s 2.30e-05s C 16 1 GpuElemwise{Composite{((-i0) / i1)}}[(0, 0)] | |
... (remaining 55 Ops account for 0.06%(0.01s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
42.3% 42.3% 3.892s 2.43e-01s 16 309 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
24.0% 66.3% 2.214s 1.38e-01s 16 324 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
7.2% 73.5% 0.661s 4.13e-02s 16 298 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
6.9% 80.4% 0.640s 4.00e-02s 16 336 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(Gp | |
uContiguous.0) | |
6.4% 86.9% 0.594s 3.71e-02s 16 297 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
2.4% 89.3% 0.221s 1.38e-02s 16 329 GpuElemwise{Add}[(0, 0)](GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
2.3% 91.6% 0.215s 1.34e-02s 16 333 GpuElemwise{Mul}[(0, 0)](GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
1.2% 92.8% 0.112s 7.01e-03s 16 307 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.7% 93.5% 0.061s 3.78e-03s 16 313 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 94.1% 0.058s 3.61e-03s 16 148 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 94.7% 0.058s 3.60e-03s 16 151 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 95.4% 0.058s 3.59e-03s 16 145 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 96.0% 0.057s 3.59e-03s 16 146 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 96.6% 0.057s 3.59e-03s 16 149 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 97.2% 0.057s 3.59e-03s 16 147 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 97.9% 0.057s 3.59e-03s 16 152 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.6% 98.5% 0.057s 3.59e-03s 16 150 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 98.8% 0.025s 1.57e-03s 16 304 GpuJoin(TensorConstant{2}, GpuDimShuffle{1,0,2}.0, GpuSubtensor{::, ::i | |
nt64}.0) | |
0.2% 99.0% 0.021s 1.29e-03s 16 312 GpuReshape{2}(GpuDimShuffle{1,0,2}.0, MakeVector.0) | |
0.2% 99.1% 0.014s 8.71e-04s 16 142 GpuReshape{2}(GpuDimShuffle{1,0,2}.0, MakeVector.0) | |
... (remaining 324 Apply instances account for 0.86%(0.08s) of the runtime) | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/sandbox/cuda/dnn.py:206 | |
Time in 1 calls to Function.__call__: 2.312660e-05s | |
Time in Function.fn.__call__: 6.914139e-06s (29.897%) | |
Total compile time: 3.566572e+00s | |
Number of Apply nodes: 1 | |
Theano Optimizer time: 2.098083e-05s | |
Theano validate time: 0.000000e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.560112e+00s | |
Import time 1.200914e-03s | |
Time in all call to theano.grad() 8.831870e+00s | |
No execution time accumulated (hint: try config profiling.time_thunks=1) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 381 steps) 5.888572e-01s | |
Total time spent in calling the VM 5.220029e-01s (88.647%) | |
Total overhead (computing slices..) 6.685424e-02s (11.353%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
84.0% 84.0% 0.434s 2.85e-04s C 1524 4 theano.sandbox.cuda.blas.GpuGemm | |
10.2% 94.2% 0.053s 4.61e-05s C 1143 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.4% 99.5% 0.028s 3.63e-05s C 762 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.002s 3.15e-06s C 762 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
84.0% 84.0% 0.434s 2.85e-04s C 1524 4 GpuGemm{no_inplace} | |
5.4% 89.3% 0.028s 3.63e-05s C 762 2 GpuFromHost | |
3.8% 93.1% 0.020s 5.16e-05s C 381 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.7% 96.9% 0.019s 5.07e-05s C 381 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.014s 3.61e-05s C 381 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.002s 3.15e-06s C 762 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.6% 21.6% 0.112s 2.93e-04s 381 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.0% 42.6% 0.108s 2.84e-04s 381 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.7% 63.3% 0.107s 2.81e-04s 381 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.7% 84.0% 0.107s 2.81e-04s 381 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.8% 0.020s 5.16e-05s 381 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.7% 91.5% 0.019s 5.07e-05s 381 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.0% 94.5% 0.016s 4.09e-05s 381 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.2% 0.014s 3.61e-05s 381 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.3% 99.5% 0.012s 3.18e-05s 381 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.002s 3.97e-06s 381 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.001s 2.32e-06s 381 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 381 steps) 6.570778e-01s | |
Total time spent in calling the VM 5.623438e-01s (85.583%) | |
Total overhead (computing slices..) 9.473395e-02s (14.417%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
78.6% 78.6% 0.438s 2.87e-04s C 1524 4 theano.sandbox.cuda.blas.GpuGemm | |
15.6% 94.3% 0.087s 7.60e-05s C 1143 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.3% 99.5% 0.029s 3.84e-05s C 762 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.003s 3.53e-06s C 762 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
78.6% 78.6% 0.438s 2.87e-04s C 1524 4 GpuGemm{no_inplace} | |
9.4% 88.0% 0.052s 1.37e-04s C 381 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
5.3% 93.3% 0.029s 3.84e-05s C 762 2 GpuFromHost | |
3.6% 96.9% 0.020s 5.30e-05s C 381 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
2.6% 99.5% 0.015s 3.83e-05s C 381 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.003s 3.53e-06s C 762 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
20.4% 20.4% 0.114s 2.99e-04s 381 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
19.6% 40.1% 0.109s 2.87e-04s 381 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
19.3% 59.4% 0.107s 2.82e-04s 381 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
19.3% 78.6% 0.107s 2.81e-04s 381 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
9.4% 88.0% 0.052s 1.37e-04s 381 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.6% 91.6% 0.020s 5.30e-05s 381 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.0% 94.6% 0.017s 4.37e-05s 381 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.6% 97.2% 0.015s 3.83e-05s 381 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.3% 99.5% 0.013s 3.31e-05s 381 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.002s 4.43e-06s 381 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.001s 2.63e-06s 381 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 16 calls of the op (for a total of 632 steps) 3.881260e+00s | |
Total time spent in calling the VM 3.777264e+00s (97.321%) | |
Total overhead (computing slices..) 1.039963e-01s (2.679%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
36.8% 36.8% 1.384s 4.38e-04s C 3160 5 theano.sandbox.cuda.blas.GpuDot22 | |
35.1% 71.9% 1.318s 2.61e-04s C 5056 8 theano.sandbox.cuda.blas.GpuGemm | |
18.3% 90.2% 0.689s 1.82e-04s C 3792 6 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.2% 95.4% 0.195s 3.09e-04s C 632 1 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.8% 97.3% 0.069s 1.09e-04s C 632 1 theano.sandbox.cuda.blas.GpuGemv | |
0.9% 98.2% 0.035s 2.73e-05s Py 1264 2 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 98.9% 0.027s 4.31e-05s C 632 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 99.4% 0.019s 2.96e-05s C 632 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.4% 99.8% 0.014s 2.27e-05s C 632 1 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.003s 7.27e-07s C 4424 7 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.003s 2.24e-06s C 1264 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.001s 5.34e-07s C 2528 4 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 4.34e-07s C 1264 2 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.000s 5.48e-07s C 632 1 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
36.8% 36.8% 1.384s 4.38e-04s C 3160 5 GpuDot22 | |
35.1% 71.9% 1.318s 2.61e-04s C 5056 8 GpuGemm{inplace} | |
7.6% 79.4% 0.284s 4.50e-04s C 632 1 GpuElemwise{mul,no_inplace} | |
5.3% 84.7% 0.198s 3.13e-04s C 632 1 GpuElemwise{add,no_inplace} | |
5.2% 89.9% 0.195s 3.09e-04s C 632 1 GpuCAReduce{add}{0,1,0} | |
3.4% 93.3% 0.129s 2.04e-04s C 632 1 GpuElemwise{Tanh}[(0, 0)] | |
1.8% 95.2% 0.069s 1.09e-04s C 632 1 GpuGemv{inplace} | |
1.1% 96.2% 0.040s 6.33e-05s C 632 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))},no_inplace} | |
0.9% 97.2% 0.035s 2.73e-05s Py 1264 2 GpuReshape{2} | |
0.7% 97.9% 0.027s 4.31e-05s C 632 1 GpuFromHost | |
0.7% 98.6% 0.027s 4.28e-05s C 632 1 GpuElemwise{Composite{(scalar_sigmoid((i0 + i1)) * tanh(i2)) | |
},no_inplace} | |
0.5% 99.1% 0.019s 2.96e-05s C 632 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
0.4% 99.5% 0.014s 2.27e-05s C 632 1 GpuAlloc{memset_0=True} | |
0.3% 99.8% 0.011s 1.79e-05s C 632 1 GpuElemwise{Mul}[(0, 0)] | |
0.1% 99.8% 0.002s 3.92e-06s C 632 1 Elemwise{Cast{float32}} | |
0.0% 99.9% 0.001s 6.19e-07s C 1264 2 Shape_i{0} | |
0.0% 99.9% 0.001s 4.50e-07s C 1264 2 Shape_i{1} | |
0.0% 99.9% 0.001s 4.34e-07s C 1264 2 MakeVector | |
0.0% 99.9% 0.001s 8.45e-07s C 632 1 GpuDimShuffle{0} | |
0.0% 99.9% 0.001s 8.12e-07s C 632 1 GpuDimShuffle{0,1,x,x} | |
... (remaining 7 Ops account for 0.08%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
8.2% 8.2% 0.309s 4.89e-04s 632 31 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
8.0% 16.3% 0.302s 4.78e-04s 632 28 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
8.0% 24.3% 0.301s 4.76e-04s 632 30 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
8.0% 32.3% 0.301s 4.76e-04s 632 29 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
7.6% 39.8% 0.284s 4.50e-04s 632 26 GpuElemwise{mul,no_inplace}(GpuDimShuffle{0,1,x}.0, <CudaNdarrayType(fl | |
oat32, 3D)>) | |
5.3% 45.1% 0.198s 3.13e-04s 632 11 GpuElemwise{add,no_inplace}(GpuDimShuffle{0,x,1}.0, GpuDimShuffle{0,1,2 | |
}.0) | |
5.2% 50.3% 0.195s 3.09e-04s 632 27 GpuCAReduce{add}{0,1,0}(GpuElemwise{mul,no_inplace}.0) | |
4.6% 54.8% 0.171s 2.71e-04s 632 3 GpuDot22(<CudaNdarrayType(float32, matrix)>, W_wa_copy[cuda]) | |
4.4% 59.2% 0.165s 2.61e-04s 632 39 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
4.4% 63.6% 0.165s 2.61e-04s 632 35 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wo2_copy[cuda], TensorConstant{1.0}) | |
4.4% 68.0% 0.165s 2.61e-04s 632 38 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.4% 72.4% 0.165s 2.61e-04s 632 36 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.4% 76.8% 0.165s 2.61e-04s 632 37 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.4% 81.2% 0.164s 2.60e-04s 632 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wf2_copy[cuda], TensorConstant{1.0}) | |
4.4% 85.5% 0.164s 2.60e-04s 632 32 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wc2_copy[cuda], TensorConstant{1.0}) | |
4.4% 89.9% 0.164s 2.60e-04s 632 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wi2_copy[cuda], TensorConstant{1.0}) | |
3.4% 93.3% 0.129s 2.04e-04s 632 15 GpuElemwise{Tanh}[(0, 0)](GpuReshape{2}.0) | |
1.8% 95.2% 0.069s 1.09e-04s 632 17 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, GpuEle | |
mwise{Tanh}[(0, 0)].0, GpuDimShuffle{0}.0, TensorConstant{0.0}) | |
1.1% 96.2% 0.040s 6.33e-05s 632 40 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0) | |
0.7% 97.0% 0.027s 4.31e-05s 632 9 GpuFromHost(Elemwise{Cast{float32}}.0) | |
... (remaining 22 Apply instances account for 3.04%(0.11s) of the runtime) | |
Function profiling | |
================== | |
Message: /usr/local/lib/python2.7/dist-packages/theano/tensor/blas_c.py:733 | |
Time in 1 calls to Function.__call__: 2.198935e-03s | |
Time in Function.fn.__call__: 2.163887e-03s (98.406%) | |
Time in thunks: 2.101898e-03s (95.587%) | |
Total compile time: 1.613784e-02s | |
Number of Apply nodes: 5 | |
Theano Optimizer time: 5.366087e-03s | |
Theano validate time: 1.032352e-04s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.407883e-03s | |
Import time 3.850460e-04s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
93.7% 93.7% 0.002s 6.57e-04s C 3 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
5.2% 99.0% 0.000s 1.10e-04s C 1 1 theano.sandbox.cuda.blas.GpuGemv | |
1.0% 100.0% 0.000s 2.19e-05s C 1 1 theano.sandbox.cuda.basic_ops.HostFromGpu | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
93.7% 93.7% 0.002s 6.57e-04s C 3 3 GpuFromHost | |
5.2% 99.0% 0.000s 1.10e-04s C 1 1 GpuGemv{no_inplace} | |
1.0% 100.0% 0.000s 2.19e-05s C 1 1 HostFromGpu | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
81.6% 81.6% 0.002s 1.72e-03s 1 2 GpuFromHost(aa) | |
10.9% 92.6% 0.000s 2.30e-04s 1 0 GpuFromHost(yy) | |
5.2% 97.8% 0.000s 1.10e-04s 1 3 GpuGemv{no_inplace}(GpuFromHost.0, TensorConstant{1.0}, GpuFromHost.0, | |
GpuFromHost.0, TensorConstant{0.0}) | |
1.1% 99.0% 0.000s 2.41e-05s 1 1 GpuFromHost(xx) | |
1.0% 100.0% 0.000s 2.19e-05s 1 4 HostFromGpu(GpuGemv{no_inplace}.0) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Function profiling | |
================== | |
Message: /home/ubuntu/deepy/deepy/trainers/trainers.py:319 | |
Time in 297 calls to Function.__call__: 6.786967e+02s | |
Time in Function.fn.__call__: 6.786298e+02s (99.990%) | |
Time in thunks: 6.739271e+02s (99.297%) | |
Total compile time: 1.045786e+02s | |
Number of Apply nodes: 1348 | |
Theano Optimizer time: 8.847273e+01s | |
Theano validate time: 1.659854e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.573371e+01s | |
Import time 2.074945e-01s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
63.8% 63.8% 430.046s 2.41e-01s Py 1782 6 theano.scan_module.scan_op.Scan | |
19.8% 83.6% 133.139s 1.18e-02s C 11286 38 theano.sandbox.cuda.blas.GpuDot22 | |
6.6% 90.2% 44.408s 2.93e-03s C 15147 51 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
2.7% 92.8% 17.878s 8.85e-04s C 20196 68 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.5% 94.3% 10.271s 3.46e-02s C 297 1 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1 | |
1.5% 95.8% 9.947s 3.35e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
1.4% 97.2% 9.632s 3.24e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
1.0% 98.2% 6.487s 4.37e-03s C 1485 5 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.7% 98.9% 4.421s 2.48e-04s C 17820 60 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.4% 99.3% 2.981s 1.43e-03s Py 2079 7 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.2% 99.5% 1.539s 7.40e-05s Py 20790 70 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.2% 99.7% 1.464s 1.90e-04s C 7722 26 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.0% 99.8% 0.317s 3.56e-04s Py 891 3 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.0% 99.8% 0.278s 9.37e-04s Py 297 1 theano.sandbox.cuda.basic_ops.GpuSplit | |
0.0% 99.9% 0.272s 1.81e-06s C 150579 507 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.222s 7.47e-04s C 297 1 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.0% 99.9% 0.162s 2.73e-04s C 594 2 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1_dev20 | |
0.0% 100.0% 0.150s 6.32e-05s C 2376 8 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 100.0% 0.067s 1.62e-06s C 41283 139 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.060s 2.83e-06s C 21384 72 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 13 Classes account for 0.03%(0.18s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
45.5% 45.5% 306.715s 1.03e+00s Py 297 1 forall_inplace,gpu,grad_of_scan_fn} | |
19.8% 65.3% 133.139s 1.18e-02s C 11286 38 GpuDot22 | |
9.2% 74.5% 61.915s 1.04e-01s Py 594 2 forall_inplace,gpu,grad_of_scan_fn} | |
7.9% 82.3% 53.129s 1.79e-01s Py 297 1 forall_inplace,gpu,scan_fn} | |
4.7% 87.1% 31.945s 3.36e-03s C 9504 32 GpuCAReduce{pre=sqr,red=add}{1,1} | |
1.8% 88.9% 12.223s 2.06e-02s C 594 2 GpuCAReduce{add}{1,1,0} | |
1.5% 90.4% 10.271s 3.46e-02s C 297 1 GpuAdvancedIncSubtensor1{inplace,inc} | |
1.5% 91.9% 9.947s 3.35e-02s C 297 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
1.4% 93.3% 9.632s 3.24e-02s C 297 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo | |
='accurate'} | |
1.2% 94.6% 8.287s 1.40e-02s Py 594 2 forall_inplace,gpu,scan_fn} | |
1.0% 95.5% 6.487s 4.37e-03s C 1485 5 GpuDot22Scalar | |
0.9% 96.4% 6.301s 4.82e-04s C 13068 44 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i | |
3, i4, ((i5 * i4) / i6))) * i7))}}[(0, 0)] | |
0.7% 97.1% 4.421s 2.48e-04s C 17820 60 GpuAlloc{memset_0=True} | |
0.5% 97.6% 3.673s 6.18e-03s C 594 2 GpuElemwise{add,no_inplace} | |
0.5% 98.2% 3.644s 4.09e-03s C 891 3 GpuElemwise{mul,no_inplace} | |
0.5% 98.7% 3.467s 5.84e-03s C 594 2 GpuElemwise{Mul}[(0, 0)] | |
0.4% 99.1% 2.974s 2.00e-03s Py 1485 5 GpuFlatten{2} | |
0.2% 99.3% 1.288s 9.64e-05s Py 13365 45 GpuReshape{2} | |
0.1% 99.4% 0.517s 3.48e-04s C 1485 5 GpuIncSubtensor{Inc;:int64:} | |
0.1% 99.5% 0.494s 1.39e-04s C 3564 12 GpuIncSubtensor{InplaceInc;int64::} | |
... (remaining 161 Ops account for 0.51%(3.46s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
45.5% 45.5% 306.715s 1.03e+00s 297 1063 forall_inplace,gpu,grad_of_scan_fn}(Shape_i{1}.0, GpuDimShuffle{0,2,1} | |
.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64: | |
int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=Tru | |
e}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
7.9% 53.4% 53.129s 1.79e-01s 297 959 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
6.4% 59.8% 42.972s 1.45e-01s 297 1044 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
5.8% 65.5% 38.873s 1.31e-01s 297 1136 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
5.1% 70.7% 34.667s 1.17e-01s 297 999 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
4.8% 75.5% 32.447s 1.09e-01s 297 1043 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
3.4% 78.9% 23.042s 7.76e-02s 297 1137 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
1.8% 80.8% 12.410s 4.18e-02s 297 1050 GpuCAReduce{pre=sqr,red=add}{1,1}(GpuDimShuffle{0,1}.0) | |
1.8% 82.5% 12.040s 4.05e-02s 297 1042 GpuCAReduce{add}{1,1,0}(GpuReshape{3}.0) | |
1.5% 84.1% 10.271s 3.46e-02s 297 1031 GpuAdvancedIncSubtensor1{inplace,inc}(GpuAlloc{memset_0=True}.0, GpuEl | |
emwise{Composite{((i0 * i1 * i2) / (i3 * i4))},no_inplace}.0, Elemwise{Composite{((i0 * i1) + i2)}}.0) | |
1.5% 85.5% 9.947s 3.35e-02s 297 1019 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(G | |
puContiguous.0) | |
1.4% 87.0% 9.632s 3.24e-02s 297 1037 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate | |
'}(GpuContiguous.0, GpuContiguous.0) | |
0.6% 87.6% 4.149s 1.40e-02s 297 881 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
0.6% 88.2% 4.138s 1.39e-02s 297 906 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
0.5% 88.7% 3.584s 1.21e-02s 297 1004 GpuElemwise{add,no_inplace}(GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
0.5% 89.2% 3.438s 1.16e-02s 297 1012 GpuElemwise{mul,no_inplace}(GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
0.5% 89.7% 3.342s 1.13e-02s 297 1039 GpuElemwise{Mul}[(0, 0)](GpuDimShuffle{0,1}.0, GpuDimShuffle{0,x}.0) | |
0.4% 90.2% 2.833s 9.54e-03s 297 1346 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i3, i4, ((i5 | |
* i4) / i6))) * i7))}}[(0, 0)](W_embed, GpuFromHost.0, CudaNdarrayConstant{[[ 0.01]]}, GpuFromHost.0, GpuAdvancedIncSubtensor1_dev2 | |
0{inplace,inc}.0, CudaNdarrayConstant{[[ 3.]]}, GpuDimShuffle{x,x}.0, GpuDimShuffle{x,x}.0) | |
0.4% 90.6% 2.804s 9.44e-03s 297 1116 GpuDot22(GpuFlatten{2}.0, GpuReshape{2}.0) | |
0.4% 91.0% 2.800s 9.43e-03s 297 1119 GpuDot22Scalar(GpuFlatten{2}.0, GpuReshape{2}.0, TensorConstant{3.0}) | |
... (remaining 1328 Apply instances account for 9.01%(60.69s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 2621 steps) 4.064922e+00s | |
Total time spent in calling the VM 3.681994e+00s (90.580%) | |
Total overhead (computing slices..) 3.829279e-01s (9.420%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
83.7% 83.7% 3.048s 2.91e-04s C 10484 4 theano.sandbox.cuda.blas.GpuGemm | |
10.3% 94.1% 0.377s 4.79e-05s C 7863 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.4% 99.5% 0.197s 3.77e-05s C 5242 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.019s 3.61e-06s C 5242 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
83.7% 83.7% 3.048s 2.91e-04s C 10484 4 GpuGemm{no_inplace} | |
5.4% 89.1% 0.197s 3.77e-05s C 5242 2 GpuFromHost | |
3.8% 93.0% 0.140s 5.34e-05s C 2621 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.8% 96.8% 0.138s 5.26e-05s C 2621 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.099s 3.78e-05s C 2621 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.019s 3.61e-06s C 5242 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.8% 21.8% 0.792s 3.02e-04s 2621 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.1% 42.9% 0.768s 2.93e-04s 2621 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.4% 63.3% 0.745s 2.84e-04s 2621 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.4% 83.7% 0.743s 2.84e-04s 2621 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.6% 0.140s 5.34e-05s 2621 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.8% 91.3% 0.138s 5.26e-05s 2621 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.0% 94.4% 0.110s 4.21e-05s 2621 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.1% 0.099s 3.78e-05s 2621 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.4% 99.5% 0.087s 3.33e-05s 2621 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.012s 4.62e-06s 2621 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.007s 2.60e-06s 2621 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 2621 steps) 4.061944e+00s | |
Total time spent in calling the VM 3.678530e+00s (90.561%) | |
Total overhead (computing slices..) 3.834136e-01s (9.439%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
83.8% 83.8% 3.046s 2.90e-04s C 10484 4 theano.sandbox.cuda.blas.GpuGemm | |
10.3% 94.0% 0.373s 4.74e-05s C 7863 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
5.5% 99.5% 0.199s 3.80e-05s C 5242 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.5% 100.0% 0.018s 3.49e-06s C 5242 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
83.8% 83.8% 3.046s 2.90e-04s C 10484 4 GpuGemm{no_inplace} | |
5.5% 89.2% 0.199s 3.80e-05s C 5242 2 GpuFromHost | |
3.8% 93.0% 0.138s 5.27e-05s C 2621 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)] | |
3.8% 96.8% 0.137s 5.22e-05s C 2621 1 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2 | |
)) * i3) + (i4 * i5))},no_inplace} | |
2.7% 99.5% 0.098s 3.74e-05s C 2621 1 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace} | |
0.5% 100.0% 0.018s 3.49e-06s C 5242 2 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
21.7% 21.7% 0.790s 3.01e-04s 2621 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
21.1% 42.9% 0.768s 2.93e-04s 2621 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
20.5% 63.3% 0.744s 2.84e-04s 2621 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
20.4% 83.8% 0.743s 2.84e-04s 2621 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
3.8% 87.6% 0.138s 5.27e-05s 2621 8 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)](<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0) | |
3.8% 91.3% 0.137s 5.22e-05s 2621 10 GpuElemwise{Composite{(((scalar_sigmoid((i0 + i1)) * tanh(i2)) * i3) + | |
(i4 * i5))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{no_inplace}.0, GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * | |
i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuFromHost.0) | |
3.1% 94.4% 0.111s 4.25e-05s 2621 7 GpuFromHost(Elemwise{Cast{float32}}.0) | |
2.7% 97.1% 0.098s 3.74e-05s 2621 9 GpuElemwise{Composite{((i0 * i1) + (i2 * i3))},no_inplace}(GpuElemwise{ | |
Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmoid((i3 + i4)) * tanh((i5 + i6))))}}[(0, 1)].0, GpuFromHost.0, <CudaNdarra | |
yType(float32, matrix)>, GpuFromHost.0) | |
2.4% 99.5% 0.088s 3.35e-05s 2621 6 GpuFromHost(Elemwise{Cast{float32}}.0) | |
0.3% 99.8% 0.011s 4.38e-06s 2621 1 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
0.2% 100.0% 0.007s 2.59e-06s 2621 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 298 calls of the op (for a total of 9881 steps) 5.314603e+01s | |
Total time spent in calling the VM 5.168440e+01s (97.250%) | |
Total overhead (computing slices..) 1.461627e+00s (2.750%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
42.3% 42.3% 21.728s 4.40e-04s C 49405 5 theano.sandbox.cuda.blas.GpuDot22 | |
40.4% 82.7% 20.740s 2.62e-04s C 79048 8 theano.sandbox.cuda.blas.GpuGemm | |
10.1% 92.8% 5.189s 8.75e-05s C 59286 6 theano.sandbox.cuda.basic_ops.GpuElemwise | |
2.9% 95.7% 1.479s 1.50e-04s C 9881 1 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.1% 96.8% 0.555s 2.81e-05s Py 19762 2 theano.sandbox.cuda.basic_ops.GpuReshape | |
1.0% 97.8% 0.525s 5.32e-05s C 9881 1 theano.sandbox.cuda.blas.GpuGemv | |
0.9% 98.7% 0.474s 4.80e-05s C 9881 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.6% 99.3% 0.298s 3.02e-05s C 9881 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.4% 99.7% 0.212s 2.15e-05s C 9881 1 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.8% 0.055s 8.02e-07s C 69167 7 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.042s 2.14e-06s C 19762 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.023s 5.93e-07s C 39524 4 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.010s 5.06e-07s C 19762 2 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.006s 6.45e-07s C 9881 1 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
42.3% 42.3% 21.728s 4.40e-04s C 49405 5 GpuDot22 | |
40.4% 82.7% 20.740s 2.62e-04s C 79048 8 GpuGemm{inplace} | |
3.5% 86.2% 1.773s 1.79e-04s C 9881 1 GpuElemwise{mul,no_inplace} | |
2.9% 89.1% 1.479s 1.50e-04s C 9881 1 GpuCAReduce{add}{0,1,0} | |
2.6% 91.6% 1.312s 1.33e-04s C 9881 1 GpuElemwise{add,no_inplace} | |
1.7% 93.3% 0.848s 8.59e-05s C 9881 1 GpuElemwise{Tanh}[(0, 0)] | |
1.3% 94.5% 0.642s 6.50e-05s C 9881 1 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (s | |
calar_sigmoid((i3 + i4)) * tanh((i5 + i6))))},no_inplace} | |
1.1% 95.6% 0.555s 2.81e-05s Py 19762 2 GpuReshape{2} | |
1.0% 96.6% 0.525s 5.32e-05s C 9881 1 GpuGemv{inplace} | |
0.9% 97.5% 0.474s 4.80e-05s C 9881 1 GpuFromHost | |
0.9% 98.4% 0.437s 4.43e-05s C 9881 1 GpuElemwise{Composite{(scalar_sigmoid((i0 + i1)) * tanh(i2)) | |
},no_inplace} | |
0.6% 99.0% 0.298s 3.02e-05s C 9881 1 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
0.4% 99.4% 0.212s 2.15e-05s C 9881 1 GpuAlloc{memset_0=True} | |
0.3% 99.7% 0.176s 1.78e-05s C 9881 1 GpuElemwise{Mul}[(0, 0)] | |
0.1% 99.8% 0.037s 3.76e-06s C 9881 1 Elemwise{Cast{float32}} | |
0.0% 99.8% 0.013s 6.49e-07s C 19762 2 Shape_i{0} | |
0.0% 99.8% 0.011s 5.38e-07s C 19762 2 Shape_i{1} | |
0.0% 99.9% 0.010s 5.06e-07s C 19762 2 MakeVector | |
0.0% 99.9% 0.009s 9.41e-07s C 9881 1 GpuDimShuffle{0} | |
0.0% 99.9% 0.009s 8.76e-07s C 9881 1 GpuDimShuffle{0,1,x,x} | |
... (remaining 7 Ops account for 0.10%(0.05s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
9.4% 9.4% 4.842s 4.90e-04s 9881 31 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
9.2% 18.7% 4.749s 4.81e-04s 9881 28 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
9.2% 27.9% 4.725s 4.78e-04s 9881 30 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
9.2% 37.1% 4.722s 4.78e-04s 9881 29 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
5.2% 42.3% 2.690s 2.72e-04s 9881 3 GpuDot22(<CudaNdarrayType(float32, matrix)>, W_wa_copy[cuda]) | |
5.1% 47.4% 2.600s 2.63e-04s 9881 35 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wo2_copy[cuda], TensorConstant{1.0}) | |
5.1% 52.4% 2.598s 2.63e-04s 9881 39 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
5.1% 57.5% 2.597s 2.63e-04s 9881 38 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
5.1% 62.6% 2.596s 2.63e-04s 9881 37 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
5.1% 67.6% 2.595s 2.63e-04s 9881 36 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, <CudaNdarrayT | |
ype(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
5.0% 72.7% 2.588s 2.62e-04s 9881 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wf2_copy[cuda], TensorConstant{1.0}) | |
5.0% 77.7% 2.583s 2.61e-04s 9881 32 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wc2_copy[cuda], TensorConstant{1.0}) | |
5.0% 82.7% 2.582s 2.61e-04s 9881 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, W_wi2_copy[cuda], TensorConstant{1.0}) | |
3.5% 86.2% 1.773s 1.79e-04s 9881 26 GpuElemwise{mul,no_inplace}(GpuDimShuffle{0,1,x}.0, <CudaNdarrayType(fl | |
oat32, 3D)>) | |
2.9% 89.1% 1.479s 1.50e-04s 9881 27 GpuCAReduce{add}{0,1,0}(GpuElemwise{mul,no_inplace}.0) | |
2.6% 91.6% 1.312s 1.33e-04s 9881 11 GpuElemwise{add,no_inplace}(GpuDimShuffle{0,x,1}.0, GpuDimShuffle{0,1,2 | |
}.0) | |
1.7% 93.3% 0.848s 8.59e-05s 9881 15 GpuElemwise{Tanh}[(0, 0)](GpuReshape{2}.0) | |
1.3% 94.5% 0.642s 6.50e-05s 9881 40 GpuElemwise{Composite{((scalar_sigmoid((i0 + i1)) * i2) + (scalar_sigmo | |
id((i3 + i4)) * tanh((i5 + i6))))},no_inplace}(<CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, matrix | |
)>, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0, <CudaNdarrayType(float32, row)>, GpuGemm{inplace}.0) | |
1.0% 95.5% 0.525s 5.32e-05s 9881 17 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, GpuEle | |
mwise{Tanh}[(0, 0)].0, GpuDimShuffle{0}.0, TensorConstant{0.0}) | |
0.9% 96.5% 0.474s 4.80e-05s 9881 9 GpuFromHost(Elemwise{Cast{float32}}.0) | |
... (remaining 22 Apply instances account for 3.54%(1.82s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 9841 steps) 3.057151e+02s | |
Total time spent in calling the VM 2.697911e+02s (88.249%) | |
Total overhead (computing slices..) 3.592400e+01s (11.751%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
43.3% 43.3% 116.105s 4.07e-04s C 285389 29 theano.sandbox.cuda.blas.GpuDot22 | |
31.4% 74.6% 84.122s 3.17e-04s C 265707 27 theano.sandbox.cuda.blas.GpuGemm | |
19.0% 93.7% 51.008s 1.10e-04s C 462527 47 theano.sandbox.cuda.basic_ops.GpuElemwise | |
2.7% 96.4% 7.295s 6.18e-05s C 118092 12 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.8% 97.2% 2.231s 2.83e-05s Py 78728 8 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 97.9% 1.760s 5.96e-05s C 29523 3 theano.sandbox.cuda.blas.GpuGemv | |
0.6% 98.5% 1.567s 3.18e-05s C 49205 5 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.5% 98.9% 1.313s 6.67e-05s C 19682 2 theano.sandbox.cuda.blas.GpuGer | |
0.3% 99.3% 0.838s 4.26e-05s C 19682 2 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.5% 0.557s 5.66e-05s C 9841 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 99.7% 0.533s 2.71e-05s C 19682 2 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
0.1% 99.8% 0.388s 1.16e-06s C 334594 34 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.1% 99.9% 0.305s 3.10e-05s C 9841 1 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
0.0% 100.0% 0.100s 5.10e-06s C 19682 2 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.054s 6.80e-07s C 78728 8 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.031s 7.96e-07s C 39364 4 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.022s 5.61e-07s C 39364 4 theano.sandbox.cuda.basic_ops.GpuContiguous | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
43.3% 43.3% 116.105s 4.07e-04s C 285389 29 GpuDot22 | |
30.2% 73.4% 80.882s 3.16e-04s C 255866 26 GpuGemm{inplace} | |
10.0% 83.4% 26.697s 2.09e-04s C 127933 13 GpuElemwise{add,no_inplace} | |
2.4% 85.8% 6.452s 2.19e-04s C 29523 3 GpuElemwise{mul,no_inplace} | |
1.8% 87.6% 4.794s 2.44e-04s C 19682 2 GpuElemwise{Composite{(((i0 * i1) + (i2 * i1)) + i3)},no_in | |
place} | |
1.4% 88.9% 3.628s 7.37e-05s C 49205 5 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 90.2% 3.275s 1.11e-04s C 29523 3 GpuCAReduce{add}{0,1,0} | |
1.2% 91.4% 3.240s 3.29e-04s C 9841 1 GpuGemm{no_inplace} | |
0.8% 92.2% 2.277s 4.63e-05s C 49205 5 GpuElemwise{Mul}[(0, 0)] | |
0.8% 93.0% 2.235s 3.24e-05s C 68887 7 GpuCAReduce{add}{1,0} | |
0.7% 93.7% 1.785s 9.07e-05s C 19682 2 GpuCAReduce{add}{0,0,1} | |
0.6% 94.3% 1.567s 3.18e-05s C 49205 5 GpuAlloc{memset_0=True} | |
0.6% 94.9% 1.546s 1.57e-04s C 9841 1 GpuElemwise{Composite{tanh((i0 + i1))},no_inplace} | |
0.5% 95.4% 1.313s 6.67e-05s C 19682 2 GpuGer{inplace} | |
0.5% 95.8% 1.209s 3.07e-05s Py 39364 4 GpuReshape{2} | |
0.4% 96.2% 1.094s 5.56e-05s C 19682 2 GpuGemv{inplace} | |
0.3% 96.6% 0.925s 4.70e-05s C 19682 2 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
0.3% 96.9% 0.844s 2.86e-05s C 29523 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
0.2% 97.1% 0.665s 6.76e-05s C 9841 1 GpuGemv{no_inplace} | |
0.2% 97.4% 0.612s 3.11e-05s C 19682 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
... (remaining 33 Ops account for 2.64%(7.09s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
2.2% 2.2% 5.779s 5.87e-04s 9841 59 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_wo_copy.T_replace[cuda]) | |
2.1% 4.3% 5.764s 5.86e-04s 9841 79 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_w | |
i_copy.T_replace[cuda]) | |
2.1% 6.5% 5.762s 5.85e-04s 9841 116 GpuDot22(GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, W_wc_copy. | |
T_replace[cuda]) | |
2.1% 8.6% 5.733s 5.83e-04s 9841 69 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_wf_copy.T_replace[cuda]) | |
2.1% 10.7% 5.724s 5.82e-04s 9841 82 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_w | |
f_copy.T_replace[cuda]) | |
2.1% 12.9% 5.722s 5.81e-04s 9841 74 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_wi_copy.T_replace[cuda]) | |
2.1% 15.0% 5.718s 5.81e-04s 9841 106 GpuDot22(GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, W_wc_copy. | |
T_replace[cuda]) | |
2.0% 17.0% 5.343s 5.43e-04s 9841 103 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, TensorConstant{1.0}) | |
2.0% 19.0% 5.336s 5.42e-04s 9841 95 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
2.0% 21.0% 5.334s 5.42e-04s 9841 125 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuDimShuffle{1,0}.0, | |
GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, TensorConstant{1.0}) | |
1.8% 22.8% 4.888s 4.97e-04s 9841 86 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{(((i0 * i1) * i2) | |
* i3)}}[(0, 2)].0) | |
1.8% 24.6% 4.883s 4.96e-04s 9841 110 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{((i0 * i1) * i2)}} | |
[(0, 0)].0) | |
1.8% 26.4% 4.877s 4.96e-04s 9841 78 GpuDot22(GpuDimShuffle{1,0}.0, GpuElemwise{Composite{(((i0 * i1) * i2) | |
* i3)},no_inplace}.0) | |
1.8% 28.2% 4.811s 4.89e-04s 9841 41 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wf_copy[cuda]) | |
1.8% 30.0% 4.757s 4.83e-04s 9841 38 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wo_copy[cuda]) | |
1.8% 31.8% 4.747s 4.82e-04s 9841 40 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wc_copy[cuda]) | |
1.8% 33.5% 4.745s 4.82e-04s 9841 39 GpuDot22(GpuCAReduce{add}{0,1,0}.0, W_wi_copy[cuda]) | |
1.6% 35.1% 4.339s 4.41e-04s 9841 136 GpuElemwise{Composite{(((i0 * i1) + (i2 * i1)) + i3)},no_inplace}(GpuEl | |
emwise{Composite{(((i0 + i1) + i2) + i3)}}[(0, 0)].0, GpuDimShuffle{0,1,x}.0, GpuElemwise{Composite{((i0 + i1) + i2)}}[(0, 0)].0, <C | |
udaNdarrayType(float32, 3D)>) | |
1.2% 36.4% 3.319s 3.37e-04s 9841 121 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
1.2% 37.6% 3.303s 3.36e-04s 9841 132 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 171 Apply instances account for 62.39%(167.34s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 2611 steps) 3.853760e+01s | |
Total time spent in calling the VM 1.844704e+01s (47.868%) | |
Total overhead (computing slices..) 2.009057e+01s (52.132%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
46.6% 46.6% 8.528s 2.97e-04s C 28721 11 theano.sandbox.cuda.blas.GpuGemm | |
25.0% 71.7% 4.578s 2.92e-04s C 15666 6 theano.sandbox.cuda.blas.GpuDot22 | |
23.4% 95.0% 4.272s 5.28e-05s C 80941 31 theano.sandbox.cuda.basic_ops.GpuElemwise | |
3.4% 98.4% 0.617s 3.38e-05s C 18277 7 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.4% 99.8% 0.264s 5.06e-05s C 5222 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 100.0% 0.027s 5.26e-06s C 5222 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
30.1% 30.1% 5.506s 3.01e-04s C 18277 7 GpuGemm{inplace} | |
25.0% 55.1% 4.578s 2.92e-04s C 15666 6 GpuDot22 | |
16.5% 71.7% 3.022s 2.89e-04s C 10444 4 GpuGemm{no_inplace} | |
13.1% 84.7% 2.386s 1.14e-04s C 20888 8 GpuElemwise{add,no_inplace} | |
3.4% 88.1% 0.617s 3.38e-05s C 18277 7 GpuCAReduce{add}{1,0} | |
1.5% 89.6% 0.280s 3.57e-05s C 7833 3 GpuElemwise{mul,no_inplace} | |
1.4% 91.1% 0.264s 5.06e-05s C 5222 2 GpuFromHost | |
1.4% 92.5% 0.255s 3.25e-05s C 7833 3 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 93.7% 0.220s 2.80e-05s C 7833 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
1.0% 94.7% 0.184s 3.52e-05s C 5222 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
0.7% 95.4% 0.133s 5.09e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) + ((i2 * i1) + (i3 * i4))) | |
+ i5)},no_inplace} | |
0.7% 96.1% 0.133s 2.54e-05s C 5222 2 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)] | |
0.7% 96.8% 0.128s 4.90e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inp | |
lace} | |
0.6% 97.5% 0.117s 2.23e-05s C 5222 2 GpuElemwise{sub,no_inplace} | |
0.4% 97.9% 0.081s 3.08e-05s C 2611 1 GpuElemwise{Composite{tanh(((i0 * i1) + (i2 * i3)))},no_inpl | |
ace} | |
0.4% 98.3% 0.078s 2.99e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)] | |
0.4% 98.8% 0.078s 2.97e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)] | |
0.4% 99.2% 0.075s 2.87e-05s C 2611 1 GpuElemwise{Composite{tanh((i0 + i1))}}[(0, 1)] | |
0.4% 99.6% 0.073s 2.79e-05s C 2611 1 GpuElemwise{Composite{((i0 * i1) * (i2 - sqr(i3)))}}[(0, 0)] | |
0.3% 99.8% 0.053s 2.04e-05s C 2611 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
... (remaining 1 Ops account for 0.15%(0.03s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
4.5% 4.5% 0.827s 3.17e-04s 2611 31 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_ui_copy.T_replace[cuda]) | |
4.5% 9.0% 0.823s 3.15e-04s 2611 52 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 13.5% 0.813s 3.11e-04s 2611 21 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_uo_copy.T_replace[cuda]) | |
4.4% 17.9% 0.812s 3.11e-04s 2611 40 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, Gp | |
uElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 22.3% 0.811s 3.11e-04s 2611 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{(((i0 * i1) * i2) * i3)},no_inplace}.0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 26.7% 0.805s 3.08e-04s 2611 48 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.3% 31.1% 0.794s 3.04e-04s 2611 37 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_u | |
i_copy.T_replace[cuda]) | |
4.2% 35.3% 0.770s 2.95e-04s 2611 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.2% 39.5% 0.763s 2.92e-04s 2611 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.1% 43.6% 0.752s 2.88e-04s 2611 56 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, TensorConstant{1.0}) | |
4.1% 47.7% 0.752s 2.88e-04s 2611 43 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
4.1% 51.8% 0.751s 2.88e-04s 2611 44 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, TensorConstant{1.0}) | |
4.1% 55.9% 0.745s 2.85e-04s 2611 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.1% 59.9% 0.745s 2.85e-04s 2611 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
3.9% 63.9% 0.716s 2.74e-04s 2611 30 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.9% 67.8% 0.714s 2.74e-04s 2611 51 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 | |
* i1) * i2)}}[(0, 0)].0) | |
3.9% 71.7% 0.714s 2.73e-04s 2611 33 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.2% 74.9% 0.582s 2.23e-04s 2611 54 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.2% 78.0% 0.576s 2.21e-04s 2611 58 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 81.1% 0.569s 2.18e-04s 2611 53 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 39 Apply instances account for 18.89%(3.45s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 297 calls of the op (for a total of 2611 steps) 2.270440e+01s | |
Total time spent in calling the VM 1.848128e+01s (81.400%) | |
Total overhead (computing slices..) 4.223124e+00s (18.600%) | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
46.5% 46.5% 8.523s 2.97e-04s C 28721 11 theano.sandbox.cuda.blas.GpuGemm | |
25.0% 71.6% 4.588s 2.93e-04s C 15666 6 theano.sandbox.cuda.blas.GpuDot22 | |
23.5% 95.0% 4.299s 5.31e-05s C 80941 31 theano.sandbox.cuda.basic_ops.GpuElemwise | |
3.4% 98.4% 0.619s 3.39e-05s C 18277 7 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.5% 99.9% 0.266s 5.10e-05s C 5222 2 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.1% 100.0% 0.027s 5.12e-06s C 5222 2 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
30.0% 30.0% 5.501s 3.01e-04s C 18277 7 GpuGemm{inplace} | |
25.0% 55.1% 4.588s 2.93e-04s C 15666 6 GpuDot22 | |
16.5% 71.6% 3.022s 2.89e-04s C 10444 4 GpuGemm{no_inplace} | |
13.2% 84.7% 2.411s 1.15e-04s C 20888 8 GpuElemwise{add,no_inplace} | |
3.4% 88.1% 0.619s 3.39e-05s C 18277 7 GpuCAReduce{add}{1,0} | |
1.5% 89.6% 0.277s 3.53e-05s C 7833 3 GpuElemwise{mul,no_inplace} | |
1.5% 91.1% 0.266s 5.10e-05s C 5222 2 GpuFromHost | |
1.4% 92.5% 0.264s 3.37e-05s C 7833 3 GpuElemwise{Composite{((i0 + i1) + i2)},no_inplace} | |
1.2% 93.7% 0.220s 2.81e-05s C 7833 3 GpuElemwise{Composite{scalar_sigmoid((i0 + i1))}}[(0, 1)] | |
1.0% 94.7% 0.181s 3.47e-05s C 5222 2 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace} | |
0.7% 95.4% 0.133s 5.09e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) + ((i2 * i1) + (i3 * i4))) | |
+ i5)},no_inplace} | |
0.7% 96.1% 0.132s 2.52e-05s C 5222 2 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)] | |
0.7% 96.8% 0.129s 4.92e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inp | |
lace} | |
0.6% 97.5% 0.117s 2.25e-05s C 5222 2 GpuElemwise{sub,no_inplace} | |
0.4% 97.9% 0.079s 3.04e-05s C 2611 1 GpuElemwise{Composite{tanh(((i0 * i1) + (i2 * i3)))},no_inpl | |
ace} | |
0.4% 98.3% 0.078s 2.98e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)] | |
0.4% 98.8% 0.077s 2.96e-05s C 2611 1 GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)] | |
0.4% 99.2% 0.075s 2.87e-05s C 2611 1 GpuElemwise{Composite{tanh((i0 + i1))}}[(0, 1)] | |
0.4% 99.6% 0.072s 2.77e-05s C 2611 1 GpuElemwise{Composite{((i0 * i1) * (i2 - sqr(i3)))}}[(0, 0)] | |
0.3% 99.9% 0.053s 2.03e-05s C 2611 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
... (remaining 1 Ops account for 0.15%(0.03s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
4.5% 4.5% 0.825s 3.16e-04s 2611 31 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)},no_inplace}.0, | |
W_ui_copy.T_replace[cuda]) | |
4.5% 9.0% 0.820s 3.14e-04s 2611 52 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 13.4% 0.814s 3.12e-04s 2611 21 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * (i3 - i2))},no_inpla | |
ce}.0, W_uo_copy.T_replace[cuda]) | |
4.4% 17.8% 0.812s 3.11e-04s 2611 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{(((i0 * i1) * i2) * i3)},no_inplace}.0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 22.3% 0.809s 3.10e-04s 2611 40 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, Gp | |
uElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, W_uf_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.4% 26.7% 0.805s 3.08e-04s 2611 48 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite | |
{((i0 * i1) * i2)}}[(0, 0)].0, W_uc_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.3% 31.0% 0.794s 3.04e-04s 2611 37 GpuDot22(GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, W_u | |
i_copy.T_replace[cuda]) | |
4.2% 35.2% 0.771s 2.95e-04s 2611 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uc_copy[cuda], TensorConstant{1.0}) | |
4.2% 39.4% 0.762s 2.92e-04s 2611 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uf_copy[cuda], TensorConstant{1.0}) | |
4.1% 43.5% 0.753s 2.88e-04s 2611 56 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, TensorConstant{1.0}) | |
4.1% 47.6% 0.752s 2.88e-04s 2611 43 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 3)].0, TensorConstant{1.0}) | |
4.1% 51.7% 0.751s 2.88e-04s 2611 44 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(floa | |
t32, matrix)>, GpuElemwise{Composite{(((i0 * i1) * i2) * i3)}}[(0, 2)].0, TensorConstant{1.0}) | |
4.1% 55.7% 0.745s 2.85e-04s 2611 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_uo_copy[cuda], TensorConstant{1.0}) | |
4.1% 59.8% 0.744s 2.85e-04s 2611 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{ | |
1.0}, <CudaNdarrayType(float32, matrix)>, W_ui_copy[cuda], TensorConstant{1.0}) | |
4.0% 63.8% 0.724s 2.77e-04s 2611 51 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 | |
* i1) * i2)}}[(0, 0)].0) | |
3.9% 67.7% 0.716s 2.74e-04s 2611 30 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.9% 71.6% 0.715s 2.74e-04s 2611 33 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{(((i | |
0 * i1) * i2) * i3)},no_inplace}.0) | |
3.2% 74.8% 0.592s 2.27e-04s 2611 54 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 77.9% 0.574s 2.20e-04s 2611 58 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
3.1% 81.0% 0.573s 2.19e-04s 2611 53 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm | |
{inplace}.0) | |
... (remaining 39 Apply instances account for 18.95%(3.47s) of the runtime) | |
Function profiling | |
================== | |
Message: Sum of all(4) printed profiles at exit excluding Scan op profile. | |
Time in 315 calls to Function.__call__: 6.879535e+02s | |
Time in Function.fn.__call__: 6.878845e+02s (99.990%) | |
Time in thunks: 6.831378e+02s (99.300%) | |
Total compile time: 1.192735e+02s | |
Number of Apply nodes: 344 | |
Theano Optimizer time: 9.820933e+01s | |
Theano validate time: 1.885482e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.060047e+01s | |
Import time 2.949057e-01s | |
Time in all call to theano.grad() 8.831870e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
63.7% 63.7% 435.193s 2.38e-01s Py 1830 9 theano.scan_module.scan_op.Scan | |
19.9% 83.6% 135.986s 1.19e-02s C 11462 49 theano.sandbox.cuda.blas.GpuDot22 | |
6.5% 90.1% 44.409s 2.93e-03s C 15179 53 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
2.7% 92.8% 18.330s 9.03e-04s C 20308 75 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.5% 94.3% 10.587s 3.38e-02s C 313 2 theano.sandbox.cuda.dnn.GpuDnnSoftmax | |
1.5% 95.8% 10.271s 3.46e-02s C 297 1 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1 | |
1.4% 97.3% 9.632s 3.24e-02s C 297 1 theano.sandbox.cuda.dnn.GpuDnnSoftmaxGrad | |
0.9% 98.2% 6.487s 4.37e-03s C 1485 5 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.6% 98.9% 4.434s 2.47e-04s C 17916 66 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.4% 99.3% 2.982s 1.41e-03s Py 2111 9 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.2% 99.5% 1.586s 7.46e-05s Py 21270 100 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.2% 99.7% 1.464s 1.90e-04s C 7722 26 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.0% 99.8% 0.338s 3.60e-04s Py 939 6 theano.sandbox.cuda.basic_ops.GpuAdvancedSubtensor1 | |
0.0% 99.8% 0.278s 9.37e-04s Py 297 1 theano.sandbox.cuda.basic_ops.GpuSplit | |
0.0% 99.9% 0.275s 1.81e-06s C 152275 613 theano.tensor.elemwise.Elemwise | |
0.0% 99.9% 0.247s 7.89e-04s C 313 2 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.0% 99.9% 0.162s 2.73e-04s C 594 2 theano.sandbox.cuda.basic_ops.GpuAdvancedIncSubtensor1_dev20 | |
0.0% 100.0% 0.159s 6.57e-05s C 2427 14 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 100.0% 0.068s 1.62e-06s C 41987 183 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.061s 2.82e-06s C 21592 85 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 15 Classes account for 0.03%(0.19s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
44.9% 44.9% 306.715s 1.03e+00s Py 297 1 forall_inplace,gpu,grad_of_scan_fn} | |
19.9% 64.8% 135.986s 1.19e-02s C 11462 49 GpuDot22 | |
9.1% 73.9% 61.915s 1.04e-01s Py 594 2 forall_inplace,gpu,grad_of_scan_fn} | |
8.3% 82.2% 57.021s 1.82e-01s Py 313 2 forall_inplace,gpu,scan_fn} | |
4.7% 86.9% 31.945s 3.36e-03s C 9504 32 GpuCAReduce{pre=sqr,red=add}{1,1} | |
1.8% 88.7% 12.223s 2.06e-02s C 594 2 GpuCAReduce{add}{1,1,0} | |
1.5% 90.2% 10.587s 3.38e-02s C 313 2 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='ac | |
curate'} | |
1.5% 91.7% 10.271s 3.46e-02s C 297 1 GpuAdvancedIncSubtensor1{inplace,inc} | |
1.4% 93.1% 9.632s 3.24e-02s C 297 1 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo | |
='accurate'} | |
1.4% 94.5% 9.542s 1.52e-02s Py 626 4 forall_inplace,gpu,scan_fn} | |
0.9% 95.5% 6.487s 4.37e-03s C 1485 5 GpuDot22Scalar | |
0.9% 96.4% 6.301s 4.82e-04s C 13068 44 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i | |
3, i4, ((i5 * i4) / i6))) * i7))}}[(0, 0)] | |
0.6% 97.1% 4.434s 2.47e-04s C 17916 66 GpuAlloc{memset_0=True} | |
0.5% 97.6% 3.693s 5.75e-03s C 642 5 GpuElemwise{Mul}[(0, 0)] | |
0.5% 98.1% 3.673s 6.18e-03s C 594 2 GpuElemwise{add,no_inplace} | |
0.5% 98.7% 3.644s 4.09e-03s C 891 3 GpuElemwise{mul,no_inplace} | |
0.4% 99.1% 2.974s 2.00e-03s Py 1485 5 GpuFlatten{2} | |
0.2% 99.3% 1.328s 9.74e-05s Py 13637 62 GpuReshape{2} | |
0.1% 99.4% 0.517s 3.48e-04s C 1485 5 GpuIncSubtensor{Inc;:int64:} | |
0.1% 99.5% 0.494s 1.39e-04s C 3564 12 GpuIncSubtensor{InplaceInc;int64::} | |
... (remaining 172 Ops account for 0.55%(3.75s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
44.9% 44.9% 306.715s 1.03e+00s 297 1063 forall_inplace,gpu,grad_of_scan_fn}(Shape_i{1}.0, GpuDimShuffle{0,2,1} | |
.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64: | |
int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=Tru | |
e}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
7.8% 52.7% 53.129s 1.79e-01s 297 959 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
6.3% 59.0% 42.972s 1.45e-01s 297 1044 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
5.7% 64.7% 38.873s 1.31e-01s 297 1136 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
5.1% 69.7% 34.667s 1.17e-01s 297 999 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
4.7% 74.5% 32.447s 1.09e-01s 297 1043 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
3.4% 77.9% 23.042s 7.76e-02s 297 1137 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum | |
(minimum(minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, GpuDimShuffle{0,2,1}.0, Elemwise{sub,no_inplace}.0, GpuSubt | |
ensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, | |
GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
1.8% 79.7% 12.410s 4.18e-02s 297 1050 GpuCAReduce{pre=sqr,red=add}{1,1}(GpuDimShuffle{0,1}.0) | |
1.8% 81.4% 12.040s 4.05e-02s 297 1042 GpuCAReduce{add}{1,1,0}(GpuReshape{3}.0) | |
1.5% 82.9% 10.271s 3.46e-02s 297 1031 GpuAdvancedIncSubtensor1{inplace,inc}(GpuAlloc{memset_0=True}.0, GpuEl | |
emwise{Composite{((i0 * i1 * i2) / (i3 * i4))},no_inplace}.0, Elemwise{Composite{((i0 * i1) + i2)}}.0) | |
1.5% 84.4% 9.947s 3.35e-02s 297 1019 GpuDnnSoftmax{tensor_format='bc01', mode='channel', algo='accurate'}(G | |
puContiguous.0) | |
1.4% 85.8% 9.632s 3.24e-02s 297 1037 GpuDnnSoftmaxGrad{tensor_format='bc01', mode='channel', algo='accurate | |
'}(GpuContiguous.0, GpuContiguous.0) | |
0.6% 86.4% 4.149s 1.40e-02s 297 881 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSub | |
tensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{ | |
memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuf | |
0.6% 87.0% 4.138s 1.39e-02s 297 906 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum( | |
minimum(i0, i1), i2), i3), i4)}}.0, InplaceDimShuffle{0,1,x}.0, Elemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSu | |
btensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuAlloc{memset_0=True}.0, GpuAl | |
loc{memset_0=True}.0, W_uo, W_uf, W_ui, W_uc, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDim | |
0.6% 87.6% 3.892s 2.43e-01s 16 309 forall_inplace,gpu,scan_fn}(Shape_i{1}.0, GpuSubtensor{int64:int64:int8 | |
}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, W_wa, W_wo, W_wo2, W_uo, W_wf, W_wf2, W_uf, W_wi, W_wi2, W_ui, W_wc, W_wc | |
2, W_uc, GpuJoin.0, GpuReshape{3}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}.0, Elemwis | |
e{neq,no_inplace}.0, GpuReshape{2}.0) | |
0.5% 88.1% 3.584s 1.21e-02s 297 1004 GpuElemwise{add,no_inplace}(GpuReshape{3}.0, GpuDimShuffle{x,x,0}.0) | |
0.5% 88.6% 3.438s 1.16e-02s 297 1012 GpuElemwise{mul,no_inplace}(GpuReshape{2}.0, GpuDimShuffle{0,x}.0) | |
0.5% 89.1% 3.342s 1.13e-02s 297 1039 GpuElemwise{Mul}[(0, 0)](GpuDimShuffle{0,1}.0, GpuDimShuffle{0,x}.0) | |
0.4% 89.5% 2.833s 9.54e-03s 297 1346 GpuElemwise{Composite{(i0 - (Switch(i1, (i2 * i0), Switch(i3, i4, ((i5 | |
* i4) / i6))) * i7))}}[(0, 0)](W_embed, GpuFromHost.0, CudaNdarrayConstant{[[ 0.01]]}, GpuFromHost.0, GpuAdvancedIncSubtensor1_dev2 | |
0{inplace,inc}.0, CudaNdarrayConstant{[[ 3.]]}, GpuDimShuffle{x,x}.0, GpuDimShuffle{x,x}.0) | |
0.4% 89.9% 2.804s 9.44e-03s 297 1116 GpuDot22(GpuFlatten{2}.0, GpuReshape{2}.0) | |
... (remaining 1678 Apply instances account for 10.07%(68.81s) of the runtime) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment