Theano function profiling output (gist skaae/aacb3ea0c1a5223c5dee, created January 16, 2015).
Contains the top-level Function profile followed by per-Scan-op profiles (three forward
scan_fn passes and two grad_of_scan_fn passes) for a bidirectional LSTM experiment
(experiment.py:196). Trailing " | |" on each line is residue from the HTML table extraction.
Function profiling | |
================== | |
Message: experiment.py:196 | |
Time in 1 calls to Function.__call__: 7.953641e+00s | |
Time in Function.fn.__call__: 7.953413e+00s (99.997%) | |
Time in thunks: 7.929524e+00s (99.697%) | |
Total compile time: 1.766550e+02s | |
Number of Apply nodes: 1214 | |
Theano Optimizer time: 1.688680e+02s | |
Theano validate time: 2.550483e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 7.392569e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
88.0% 88.0% 6.978s 1.16e+00s Py 6 6 theano.scan_module.scan_op.Scan | |
4.9% 92.9% 0.390s 2.17e-02s C 18 18 theano.sandbox.cuda.blas.GpuDot22 | |
1.5% 94.4% 0.118s 1.23e-03s C 96 96 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.4% 95.8% 0.109s 1.36e-02s C 8 8 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.3% 97.0% 0.100s 2.62e-03s Py 38 38 theano.sandbox.cuda.basic_ops.GpuReshape | |
1.0% 98.0% 0.077s 2.20e-03s C 35 35 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.7% 98.7% 0.055s 1.07e-03s C 51 51 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.5% 99.2% 0.042s 7.04e-03s C 6 6 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.3% 99.5% 0.022s 7.34e-03s Py 3 3 theano.tensor.basic.Split | |
0.2% 99.7% 0.017s 5.73e-03s C 3 3 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.2% 99.9% 0.013s 1.20e-03s C 11 11 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 99.9% 0.002s 2.30e-03s C 1 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
0.0% 99.9% 0.002s 2.06e-03s C 1 1 theano.sandbox.cuda.nnet.GpuSoftmaxWithBias | |
0.0% 100.0% 0.001s 1.24e-03s C 1 1 theano.sandbox.cuda.blas.GpuGemm | |
0.0% 100.0% 0.001s 1.11e-03s C 1 1 theano.tensor.elemwise.Sum | |
0.0% 100.0% 0.001s 1.57e-06s C 507 507 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.000s 2.56e-06s C 113 113 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.0% 100.0% 0.000s 2.44e-06s C 82 82 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.0% 100.0% 0.000s 1.82e-06s C 89 89 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.000s 1.30e-05s Py 12 12 theano.compile.ops.Rebroadcast | |
... (remaining 3 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
39.2% 39.2% 3.110s 3.11e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
23.2% 62.4% 1.839s 1.84e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
16.6% 79.0% 1.312s 1.31e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
4.9% 83.9% 0.390s 2.17e-02s C 18 18 GpuDot22 | |
3.9% 87.8% 0.307s 3.07e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
2.9% 90.7% 0.229s 2.29e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
2.3% 92.9% 0.180s 1.80e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
1.4% 94.3% 0.107s 1.79e-02s C 6 6 GpuCAReduce{add}{1,1,0} | |
1.2% 95.5% 0.099s 3.81e-03s Py 26 26 GpuReshape{2} | |
0.7% 96.2% 0.055s 1.07e-03s C 51 51 GpuAlloc{memset_0=True} | |
0.5% 96.7% 0.042s 7.04e-03s C 6 6 HostFromGpu | |
0.5% 97.2% 0.037s 6.16e-03s C 6 6 GpuIncSubtensor{Inc;:int64:} | |
0.4% 97.6% 0.033s 4.17e-03s C 8 8 GpuElemwise{add,no_inplace} | |
0.3% 98.0% 0.026s 2.19e-03s C 12 12 GpuIncSubtensor{InplaceInc;int64::} | |
0.3% 98.2% 0.022s 7.34e-03s Py 3 3 Split{2} | |
0.3% 98.5% 0.022s 9.13e-04s C 24 24 GpuElemwise{Add}[(0, 0)] | |
0.2% 98.7% 0.017s 5.73e-03s C 3 3 GpuJoin | |
0.2% 98.9% 0.015s 1.62e-03s C 9 9 GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace} | |
0.2% 99.1% 0.014s 6.80e-04s C 20 20 GpuElemwise{Composite{[sub(mul(i0, i1), mul(i2, i3))]}}[(0, 1)] | |
0.2% 99.3% 0.013s 1.20e-03s C 11 11 GpuFromHost | |
... (remaining 126 Ops account for 0.75%(0.06s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
39.2% 39.2% 3.110s 3.11e+00s 1 1119 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
23.2% 62.4% 1.839s 1.84e+00s 1 1175 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
16.6% 79.0% 1.312s 1.31e+00s 1 1063 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
3.9% 82.8% 0.307s 3.07e-01s 1 405 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
2.9% 85.7% 0.229s 2.29e-01s 1 680 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
2.3% 88.0% 0.180s 1.80e-01s 1 972 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
0.4% 88.4% 0.035s 3.55e-02s 1 1146 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 88.9% 0.035s 3.55e-02s 1 490 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 89.3% 0.035s 3.54e-02s 1 501 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 89.8% 0.035s 3.47e-02s 1 1149 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 90.2% 0.032s 3.16e-02s 1 1150 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
0.4% 90.6% 0.032s 3.16e-02s 1 1147 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
0.4% 90.9% 0.029s 2.94e-02s 1 1090 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 91.3% 0.029s 2.92e-02s 1 1141 GpuCAReduce{add}{1,1,0}(GpuIncSubtensor{InplaceInc;int64::}.0) | |
0.4% 91.7% 0.028s 2.83e-02s 1 1139 GpuCAReduce{add}{1,1,0}(GpuIncSubtensor{InplaceInc;int64::}.0) | |
0.3% 92.0% 0.027s 2.71e-02s 1 1093 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.3% 92.3% 0.023s 2.29e-02s 1 1110 HostFromGpu(GpuDimShuffle{1,0,2}.0) | |
0.3% 92.6% 0.022s 2.21e-02s 1 778 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 92.9% 0.022s 2.20e-02s 1 768 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 93.1% 0.020s 2.01e-02s 1 1094 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
... (remaining 1194 Apply instances account for 6.89%(0.55s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 3.071730e-01s | |
Total time spent in calling the VM 2.666941e-01s (86.822%) | |
Total overhead (computing slices..) 4.047894e-02s (13.178%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
83.9% 83.9% 0.220s 5.09e-04s C 432 2 theano.sandbox.cuda.blas.GpuGemm | |
15.8% 99.7% 0.042s 4.81e-05s C 864 4 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 100.0% 0.001s 7.63e-07s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
83.9% 83.9% 0.220s 5.09e-04s C 432 2 GpuGemm{no_inplace} | |
5.1% 89.0% 0.013s 6.20e-05s C 216 1 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace} | |
4.2% 93.2% 0.011s 5.06e-05s C 216 1 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace} | |
3.3% 96.5% 0.009s 4.01e-05s C 216 1 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace} | |
3.3% 99.7% 0.009s 3.95e-05s C 216 1 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace} | |
0.3% 100.0% 0.001s 7.63e-07s C 864 4 GpuSubtensor{::, int64:int64:} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
43.2% 43.2% 0.113s 5.24e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
40.7% 83.9% 0.107s 4.94e-04s 216 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
5.1% 89.0% 0.013s 6.20e-05s 216 6 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0) | |
4.2% 93.2% 0.011s 5.06e-05s 216 7 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace}(cell_init_bck[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, col)>, <CudaNdarrayType(float32, col)>) | |
3.3% 96.5% 0.009s 4.01e-05s 216 9 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace}(GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}.0, <CudaNdarrayType(float32, col)>, hid_init_bck[t-1][cuda], <CudaNdarrayType(float32, col)>) | |
3.3% 99.7% 0.009s 3.95e-05s 216 8 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}.0) | |
0.1% 99.8% 0.000s 1.22e-06s 216 3 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{156}) | |
0.1% 99.9% 0.000s 9.18e-07s 216 5 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{156}) | |
0.0% 100.0% 0.000s 4.90e-07s 216 2 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{312}, Constant{468}) | |
0.0% 100.0% 0.000s 4.19e-07s 216 4 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{312}, Constant{468}) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 2.288671e-01s | |
Total time spent in calling the VM 1.899178e-01s (82.982%) | |
Total overhead (computing slices..) 3.894925e-02s (17.018%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
75.8% 75.8% 0.139s 3.23e-04s C 432 2 theano.sandbox.cuda.blas.GpuGemm | |
23.9% 99.7% 0.044s 5.09e-05s C 864 4 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 100.0% 0.001s 6.65e-07s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
75.8% 75.8% 0.139s 3.23e-04s C 432 2 GpuGemm{no_inplace} | |
6.9% 82.7% 0.013s 5.87e-05s C 216 1 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace} | |
6.9% 89.5% 0.013s 5.84e-05s C 216 1 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace} | |
5.4% 94.9% 0.010s 4.63e-05s C 216 1 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace} | |
4.7% 99.7% 0.009s 4.03e-05s C 216 1 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace} | |
0.3% 100.0% 0.001s 6.65e-07s C 864 4 GpuSubtensor{::, int64:int64:} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
41.7% 41.7% 0.077s 3.55e-04s 216 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
34.1% 75.8% 0.063s 2.90e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
6.9% 82.7% 0.013s 5.87e-05s 216 6 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0) | |
6.9% 89.5% 0.013s 5.84e-05s 216 7 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace}(cell_init_bck[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, col)>, <CudaNdarrayType(float32, col)>) | |
5.4% 94.9% 0.010s 4.63e-05s 216 8 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}.0) | |
4.7% 99.7% 0.009s 4.03e-05s 216 9 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace}(GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}.0, <CudaNdarrayType(float32, col)>, hid_init_bck[t-1][cuda], <CudaNdarrayType(float32, col)>) | |
0.1% 99.8% 0.000s 9.83e-07s 216 3 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{300}) | |
0.1% 99.9% 0.000s 9.48e-07s 216 5 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{300}) | |
0.0% 100.0% 0.000s 3.75e-07s 216 4 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{600}, Constant{900}) | |
0.0% 100.0% 0.000s 3.52e-07s 216 2 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{600}, Constant{900}) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 1.795299e-01s | |
Total time spent in calling the VM 1.442864e-01s (80.369%) | |
Total overhead (computing slices..) 3.524351e-02s (19.631%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
72.3% 72.3% 0.101s 2.34e-04s C 432 2 theano.sandbox.cuda.blas.GpuGemm | |
27.3% 99.6% 0.038s 4.43e-05s C 864 4 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.4% 100.0% 0.001s 6.75e-07s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
72.3% 72.3% 0.101s 2.34e-04s C 432 2 GpuGemm{no_inplace} | |
9.9% 82.1% 0.014s 6.40e-05s C 216 1 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace} | |
7.1% 89.3% 0.010s 4.63e-05s C 216 1 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace} | |
5.7% 95.0% 0.008s 3.72e-05s C 216 1 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace} | |
4.6% 99.6% 0.006s 2.98e-05s C 216 1 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace} | |
0.4% 100.0% 0.001s 6.75e-07s C 864 4 GpuSubtensor{::, int64:int64:} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
36.4% 36.4% 0.051s 2.36e-04s 216 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
35.9% 72.3% 0.050s 2.33e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
9.9% 82.1% 0.014s 6.40e-05s 216 7 GpuElemwise{Composite{[add(mul(add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4))), i5), mul(i0, i6))]},no_inplace}(cell_init_bck[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, col)>, <CudaNdarrayType(float32, col)>) | |
7.1% 89.3% 0.010s 4.63e-05s 216 6 GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, row)>, GpuSubtensor{::, int64:int64:}.0) | |
5.7% 95.0% 0.008s 3.72e-05s 216 8 GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}(cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, row)>, GpuElemwise{Composite{[add(mul(scalar_sigmoid(mul(i0, i1)), i0), mul(scalar_sigmoid(add(i2, mul(i0, i3))), tanh(i4)))]},no_inplace}.0) | |
4.6% 99.6% 0.006s 2.98e-05s 216 9 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace}(GpuElemwise{Composite{[mul(scalar_sigmoid(mul(i0, i1)), tanh(i2))]},no_inplace}.0, <CudaNdarrayType(float32, col)>, hid_init_bck[t-1][cuda], <CudaNdarrayType(float32, col)>) | |
0.1% 99.7% 0.000s 9.63e-07s 216 3 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{102}) | |
0.1% 99.9% 0.000s 8.41e-07s 216 5 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{0}, Constant{102}) | |
0.1% 99.9% 0.000s 4.72e-07s 216 2 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{204}, Constant{306}) | |
0.1% 100.0% 0.000s 4.23e-07s 216 4 GpuSubtensor{::, int64:int64:}(GpuGemm{no_inplace}.0, Constant{204}, Constant{306}) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 1.312062e+00s | |
Total time spent in calling the VM 1.180757e+00s (89.992%) | |
Total overhead (computing slices..) 1.313052e-01s (10.008%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
39.9% 39.9% 0.433s 2.51e-04s C 1728 8 theano.sandbox.cuda.blas.GpuGemm | |
33.9% 73.8% 0.369s 3.41e-05s C 10800 50 theano.sandbox.cuda.basic_ops.GpuElemwise | |
8.5% 82.3% 0.092s 2.13e-04s C 432 2 theano.sandbox.cuda.blas.GpuDot22 | |
8.1% 90.3% 0.088s 4.05e-05s C 2160 10 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
7.8% 98.1% 0.084s 2.17e-05s C 3888 18 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
1.8% 99.8% 0.019s 4.42e-05s C 432 2 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.001s 1.20e-06s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.1% 100.0% 0.001s 7.25e-07s C 864 4 theano.compile.ops.Shape_i | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
27.5% 27.5% 0.299s 2.77e-04s C 1080 5 GpuGemm{no_inplace} | |
12.4% 39.9% 0.134s 2.07e-04s C 648 3 GpuGemm{inplace} | |
8.5% 48.4% 0.092s 2.13e-04s C 432 2 GpuDot22 | |
8.1% 56.4% 0.088s 4.05e-05s C 2160 10 GpuCAReduce{add}{1,0} | |
7.6% 64.0% 0.082s 4.22e-05s C 1944 9 GpuElemwise{mul,no_inplace} | |
4.7% 68.7% 0.051s 3.95e-05s C 1296 6 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace} | |
3.4% 72.1% 0.037s 1.70e-05s C 2160 10 GpuElemwise{Mul}[(0, 0)] | |
3.3% 75.4% 0.036s 1.67e-05s C 2160 10 GpuIncSubtensor{InplaceInc;int64:int64:} | |
2.7% 78.1% 0.030s 6.89e-05s C 432 2 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace} | |
2.3% 80.4% 0.025s 5.82e-05s C 432 2 GpuIncSubtensor{Inc;::, int64:int64:} | |
2.3% 82.7% 0.025s 3.79e-05s C 648 3 GpuElemwise{Composite{[mul(mul(i0, i1), i2)]},no_inplace} | |
2.1% 84.8% 0.023s 1.77e-05s C 1296 6 GpuIncSubtensor{InplaceInc;::, int64:int64:} | |
1.9% 86.7% 0.021s 4.83e-05s C 432 2 GpuElemwise{Add}[(0, 0)] | |
1.8% 88.5% 0.019s 4.42e-05s C 432 2 GpuAlloc{memset_0=True} | |
1.4% 89.9% 0.015s 3.48e-05s C 432 2 GpuElemwise{Tanh}[(0, 0)] | |
1.0% 90.9% 0.011s 5.04e-05s C 216 1 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace} | |
1.0% 91.8% 0.011s 4.87e-05s C 216 1 GpuElemwise{Composite{[add(add(add(add(mul(i0, i1), mul(i2, i3)), mul(i4, i5)), mul(i6, i7)), i8)]},no_inplace} | |
1.0% 92.8% 0.011s 4.87e-05s C 216 1 GpuElemwise{sub,no_inplace} | |
0.9% 93.7% 0.010s 2.34e-05s C 432 2 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 0)] | |
0.9% 94.6% 0.010s 4.42e-05s C 216 1 GpuElemwise{Composite{[add(i0, add(i1, i1))]},no_inplace} | |
... (remaining 13 Ops account for 5.38%(0.06s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
7.3% 7.3% 0.079s 3.66e-04s 216 73 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
5.7% 13.0% 0.062s 2.88e-04s 216 57 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
4.9% 17.9% 0.053s 2.46e-04s 216 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
4.9% 22.8% 0.053s 2.45e-04s 216 74 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.8% 27.5% 0.052s 2.40e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
4.5% 32.1% 0.049s 2.28e-04s 216 67 GpuDot22(GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda]) | |
4.2% 36.3% 0.046s 2.12e-04s 216 58 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_bck_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.2% 40.4% 0.045s 2.10e-04s 216 76 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.0% 44.4% 0.043s 2.00e-04s 216 77 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
3.9% 48.4% 0.043s 1.98e-04s 216 68 GpuDot22(hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0) | |
1.4% 49.8% 0.015s 7.06e-05s 216 27 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
1.3% 51.1% 0.015s 6.73e-05s 216 28 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
1.2% 52.3% 0.013s 6.14e-05s 216 13 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}(<CudaNdarrayType(float32, matrix)>, cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>) | |
1.2% 53.5% 0.013s 5.89e-05s 216 24 GpuElemwise{mul,no_inplace}(GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, <CudaNdarrayType(float32, row)>) | |
1.2% 54.6% 0.013s 5.84e-05s 216 48 GpuIncSubtensor{Inc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, Constant{0}, Constant{102}) | |
1.2% 55.8% 0.013s 5.80e-05s 216 46 GpuIncSubtensor{Inc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, Constant{0}, Constant{102}) | |
1.1% 56.9% 0.012s 5.73e-05s 216 85 GpuElemwise{Add}[(0, 0)](GpuGemm{inplace}.0, GpuGemm{no_inplace}.0) | |
1.1% 58.0% 0.012s 5.36e-05s 216 80 GpuCAReduce{add}{1,0}(GpuElemwise{Mul}[(0, 0)].0) | |
1.0% 59.0% 0.011s 5.04e-05s 216 64 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace}(GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 0)].0, <CudaNdarrayType(float32, row)>) | |
1.0% 60.0% 0.011s 4.93e-05s 216 21 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Shape_i{0}.0, Shape_i{1}.0) | |
... (remaining 78 Apply instances account for 40.01%(0.43s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 3.109816e+00s | |
Total time spent in calling the VM 2.880412e+00s (92.623%) | |
Total overhead (computing slices..) 2.294040e-01s (7.377%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
55.4% 55.4% 1.540s 8.91e-04s C 1728 8 theano.sandbox.cuda.blas.GpuGemm | |
18.9% 74.3% 0.524s 4.85e-05s C 10800 50 theano.sandbox.cuda.basic_ops.GpuElemwise | |
15.0% 89.2% 0.416s 9.62e-04s C 432 2 theano.sandbox.cuda.blas.GpuDot22 | |
6.2% 95.4% 0.171s 4.41e-05s C 3888 18 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
3.5% 98.9% 0.097s 4.51e-05s C 2160 10 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.0% 99.9% 0.028s 6.57e-05s C 432 2 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.0% 100.0% 0.001s 1.41e-06s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.0% 100.0% 0.001s 1.15e-06s C 864 4 theano.compile.ops.Shape_i | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
35.6% 35.6% 0.989s 9.16e-04s C 1080 5 GpuGemm{no_inplace} | |
19.8% 55.4% 0.551s 8.50e-04s C 648 3 GpuGemm{inplace} | |
15.0% 70.4% 0.416s 9.62e-04s C 432 2 GpuDot22 | |
3.5% 73.9% 0.097s 4.51e-05s C 2160 10 GpuCAReduce{add}{1,0} | |
3.0% 76.9% 0.084s 4.33e-05s C 1944 9 GpuElemwise{mul,no_inplace} | |
3.0% 79.9% 0.084s 6.49e-05s C 1296 6 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace} | |
2.9% 82.8% 0.080s 1.85e-04s C 432 2 GpuIncSubtensor{Inc;::, int64:int64:} | |
2.3% 85.1% 0.064s 1.48e-04s C 432 2 GpuElemwise{Add}[(0, 0)] | |
1.8% 86.9% 0.050s 2.31e-05s C 2160 10 GpuIncSubtensor{InplaceInc;int64:int64:} | |
1.7% 88.6% 0.047s 2.15e-05s C 2160 10 GpuElemwise{Mul}[(0, 0)] | |
1.5% 90.1% 0.042s 3.22e-05s C 1296 6 GpuIncSubtensor{InplaceInc;::, int64:int64:} | |
1.2% 91.3% 0.035s 8.00e-05s C 432 2 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace} | |
1.1% 92.4% 0.030s 1.39e-04s C 216 1 GpuElemwise{Composite{[add(add(i0, i1), i2)]}}[(0, 0)] | |
1.0% 93.4% 0.028s 6.57e-05s C 432 2 GpuAlloc{memset_0=True} | |
0.9% 94.3% 0.024s 3.76e-05s C 648 3 GpuElemwise{Composite{[mul(mul(i0, i1), i2)]},no_inplace} | |
0.8% 95.1% 0.023s 1.09e-04s C 216 1 GpuElemwise{Composite{[add(i0, add(i1, i1))]},no_inplace} | |
0.6% 95.8% 0.018s 4.17e-05s C 432 2 GpuElemwise{Tanh}[(0, 0)] | |
0.5% 96.3% 0.015s 6.97e-05s C 216 1 GpuElemwise{Composite{[add(add(add(add(mul(i0, i1), mul(i2, i3)), mul(i4, i5)), mul(i6, i7)), i8)]},no_inplace} | |
0.5% 96.8% 0.013s 2.94e-05s C 432 2 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 0)] | |
0.4% 97.2% 0.010s 4.82e-05s C 216 1 GpuElemwise{Composite{[add(add(add(add(add(i0, i1), mul(i2, i3)), add(add(i4, i5), mul(i6, i3))), add(add(add(i7, i8), i9), mul(i10, i3))), i11)]}}[(0, 0)] | |
... (remaining 13 Ops account for 2.84%(0.08s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
13.0% 13.0% 0.360s 1.67e-03s 216 73 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
11.8% 24.8% 0.328s 1.52e-03s 216 57 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
10.9% 35.6% 0.303s 1.40e-03s 216 77 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
10.5% 46.1% 0.291s 1.35e-03s 216 68 GpuDot22(hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0) | |
4.7% 50.8% 0.131s 6.08e-04s 216 74 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.5% 55.3% 0.125s 5.78e-04s 216 67 GpuDot22(GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda]) | |
4.5% 59.8% 0.124s 5.75e-04s 216 58 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_bck_copy.T_replace[cuda], TensorConstant{1.0}) | |
4.5% 64.3% 0.124s 5.74e-04s 216 76 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
3.8% 68.0% 0.104s 4.83e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
2.4% 70.4% 0.065s 3.03e-04s 216 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
2.0% 72.4% 0.057s 2.62e-04s 216 48 GpuIncSubtensor{Inc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, Constant{0}, Constant{300}) | |
1.8% 74.2% 0.051s 2.36e-04s 216 86 GpuElemwise{Add}[(0, 0)](GpuGemm{inplace}.0, GpuGemm{no_inplace}.0) | |
1.1% 75.3% 0.030s 1.39e-04s 216 72 GpuElemwise{Composite{[add(add(i0, i1), i2)]}}[(0, 0)](GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0) | |
0.8% 76.2% 0.023s 1.09e-04s 216 11 GpuElemwise{Composite{[add(i0, add(i1, i1))]},no_inplace}(<CudaNdarrayType(float32, vector)>, <CudaNdarrayType(float32, vector)>) | |
0.8% 77.0% 0.023s 1.07e-04s 216 46 GpuIncSubtensor{Inc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, Constant{0}, Constant{300}) | |
0.8% 77.8% 0.022s 1.03e-04s 216 8 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}(<CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>) | |
0.8% 78.6% 0.022s 1.01e-04s 216 13 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}(<CudaNdarrayType(float32, matrix)>, cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>) | |
0.7% 79.3% 0.018s 8.54e-05s 216 28 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
0.6% 79.8% 0.016s 7.46e-05s 216 27 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
0.5% 80.4% 0.015s 7.06e-05s 216 24 GpuElemwise{mul,no_inplace}(GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, <CudaNdarrayType(float32, row)>) | |
... (remaining 78 Apply instances account for 19.62%(0.55s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 1 calls of the op (for a total of 216 steps) 1.839203e+00s | |
Total time spent in calling the VM 1.672098e+00s (90.914%) | |
Total overhead (computing slices..) 1.671045e-01s (9.086%) | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
46.4% 46.4% 0.731s 4.23e-04s C 1728 8 theano.sandbox.cuda.blas.GpuGemm | |
26.7% 73.1% 0.421s 3.90e-05s C 10800 50 theano.sandbox.cuda.basic_ops.GpuElemwise | |
11.7% 84.8% 0.184s 4.26e-04s C 432 2 theano.sandbox.cuda.blas.GpuDot22 | |
7.2% 92.1% 0.114s 2.93e-05s C 3888 18 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
5.9% 98.0% 0.093s 4.31e-05s C 2160 10 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.9% 99.9% 0.030s 6.95e-05s C 432 2 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.001s 1.26e-06s C 864 4 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.1% 100.0% 0.001s 9.80e-07s C 864 4 theano.compile.ops.Shape_i | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
30.0% 30.0% 0.472s 4.37e-04s C 1080 5 GpuGemm{no_inplace} | |
16.4% 46.4% 0.258s 3.99e-04s C 648 3 GpuGemm{inplace} | |
11.7% 58.1% 0.184s 4.26e-04s C 432 2 GpuDot22 | |
5.9% 64.0% 0.093s 4.31e-05s C 2160 10 GpuCAReduce{add}{1,0} | |
4.8% 68.8% 0.075s 3.85e-05s C 1944 9 GpuElemwise{mul,no_inplace} | |
3.5% 72.3% 0.055s 4.25e-05s C 1296 6 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace} | |
2.9% 75.2% 0.046s 2.15e-05s C 2160 10 GpuElemwise{Mul}[(0, 0)] | |
2.5% 77.7% 0.040s 3.07e-05s C 1296 6 GpuIncSubtensor{InplaceInc;::, int64:int64:} | |
2.5% 80.3% 0.040s 1.84e-05s C 2160 10 GpuIncSubtensor{InplaceInc;int64:int64:} | |
2.2% 82.4% 0.034s 7.91e-05s C 432 2 GpuIncSubtensor{Inc;::, int64:int64:} | |
2.0% 84.4% 0.031s 7.18e-05s C 432 2 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace} | |
1.9% 86.3% 0.030s 6.95e-05s C 432 2 GpuAlloc{memset_0=True} | |
1.7% 88.0% 0.026s 4.08e-05s C 648 3 GpuElemwise{Composite{[mul(mul(i0, i1), i2)]},no_inplace} | |
1.7% 89.6% 0.026s 1.21e-04s C 216 1 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), sub(i3, i2))]},no_inplace} | |
1.5% 91.1% 0.023s 1.06e-04s C 216 1 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace} | |
1.3% 92.4% 0.020s 4.57e-05s C 432 2 GpuElemwise{Add}[(0, 0)] | |
1.1% 93.4% 0.017s 7.95e-05s C 216 1 GpuElemwise{Composite{[add(add(i0, i1), i2)]}}[(0, 0)] | |
0.8% 94.2% 0.012s 2.80e-05s C 432 2 GpuElemwise{Tanh}[(0, 0)] | |
0.7% 94.9% 0.011s 2.49e-05s C 432 2 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 0)] | |
0.7% 95.6% 0.010s 4.79e-05s C 216 1 GpuElemwise{Composite{[add(i0, add(i1, i1))]},no_inplace} | |
... (remaining 13 Ops account for 4.45%(0.07s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
6.3% 6.3% 0.099s 4.57e-04s 216 74 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
6.1% 12.3% 0.096s 4.43e-04s 216 73 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
6.1% 18.4% 0.095s 4.41e-04s 216 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_fwd[t-1][cuda], W_hid_to_gates_fwd_copy[cuda], TensorConstant{1.0}) | |
6.0% 24.4% 0.094s 4.34e-04s 216 0 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1][cuda], W_hid_to_gates_bck_copy[cuda], TensorConstant{1.0}) | |
5.9% 30.3% 0.094s 4.33e-04s 216 67 GpuDot22(GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda]) | |
5.8% 36.0% 0.091s 4.20e-04s 216 68 GpuDot22(hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0) | |
5.6% 41.7% 0.089s 4.11e-04s 216 57 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, hid_init_bck[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
5.6% 47.3% 0.089s 4.10e-04s 216 58 GpuGemm{inplace}(GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_bck_copy.T_replace[cuda], TensorConstant{1.0}) | |
5.4% 52.7% 0.085s 3.95e-04s 216 76 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, W_hid_to_gates_fwd_copy.T_replace[cuda], TensorConstant{1.0}) | |
5.4% 58.1% 0.084s 3.91e-04s 216 77 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, hid_init_fwd[t-1].T_replace[cuda], GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, TensorConstant{1.0}) | |
1.7% 59.8% 0.026s 1.21e-04s 216 36 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), sub(i3, i2))]},no_inplace}(GpuElemwise{mul,no_inplace}.0, GpuElemwise{Tanh}[(0, 0)].0, GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}.0, CudaNdarrayConstant{[[ 1.]]}) | |
1.5% 61.2% 0.023s 1.06e-04s 216 64 GpuElemwise{Composite{[add(mul(i0, i1), mul(i2, i3))]},no_inplace}(GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, <CudaNdarrayType(float32, row)>, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 0)].0, <CudaNdarrayType(float32, row)>) | |
1.4% 62.6% 0.022s 1.01e-04s 216 48 GpuIncSubtensor{Inc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}.0, Constant{0}, Constant{156}) | |
1.3% 63.9% 0.021s 9.61e-05s 216 17 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Shape_i{0}.0, Shape_i{1}.0) | |
1.2% 65.1% 0.018s 8.56e-05s 216 27 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
1.1% 66.2% 0.017s 7.95e-05s 216 72 GpuElemwise{Composite{[add(add(i0, i1), i2)]}}[(0, 0)](GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0, GpuIncSubtensor{InplaceInc;::, int64:int64:}.0) | |
0.9% 67.1% 0.015s 6.79e-05s 216 13 GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]},no_inplace}(<CudaNdarrayType(float32, matrix)>, cell_init_fwd[t-1][cuda], <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>) | |
0.8% 68.0% 0.013s 6.12e-05s 216 49 GpuIncSubtensor{InplaceInc;::, int64:int64:}(GpuAlloc{memset_0=True}.0, GpuElemwise{Composite{[mul(mul(mul(i0, i1), i2), i3)]}}[(0, 3)].0, Constant{0}, Constant{156}) | |
0.8% 68.8% 0.013s 5.85e-05s 216 86 GpuElemwise{Add}[(0, 0)](GpuGemm{inplace}.0, GpuGemm{no_inplace}.0) | |
0.8% 69.6% 0.013s 5.79e-05s 216 28 GpuElemwise{Composite{[scalar_sigmoid(add(i0, i1))]},no_inplace}(GpuSubtensor{::, int64:int64:}.0, <CudaNdarrayType(float32, matrix)>) | |
... (remaining 78 Apply instances account for 30.45%(0.48s) of the runtime) | |
Function profiling | |
================== | |
Message: experiment.py:197 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 1.279212e+01s | |
Number of Apply nodes: 0 | |
Theano Optimizer time: 1.090553e+01s | |
Theano validate time: 4.676583e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.666232e+00s | |
Function profiling | |
================== | |
Message: experiment.py:198 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 1.224835e+01s | |
Number of Apply nodes: 0 | |
Theano Optimizer time: 1.069219e+01s | |
Theano validate time: 2.016317e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.361080e+00s | |
Function profiling | |
================== | |
Message: Sum of all(3) printed profiles at exit excluding Scan op profile. | |
Time in 1 calls to Function.__call__: 7.953641e+00s | |
Time in Function.fn.__call__: 7.953413e+00s (99.997%) | |
Time in thunks: 7.929524e+00s (99.697%) | |
Total compile time: 2.016955e+02s | |
Number of Apply nodes: 1214 | |
Theano Optimizer time: 1.904657e+02s | |
Theano validate time: 5.034458e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.041988e+01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
88.0% 88.0% 6.978s 1.16e+00s Py 6 6 theano.scan_module.scan_op.Scan | |
4.9% 92.9% 0.390s 2.17e-02s C 18 18 theano.sandbox.cuda.blas.GpuDot22 | |
1.5% 94.4% 0.118s 1.23e-03s C 96 96 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.4% 95.8% 0.109s 1.36e-02s C 8 8 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
1.3% 97.0% 0.100s 2.62e-03s Py 38 38 theano.sandbox.cuda.basic_ops.GpuReshape | |
1.0% 98.0% 0.077s 2.20e-03s C 35 35 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.7% 98.7% 0.055s 1.07e-03s C 51 51 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.5% 99.2% 0.042s 7.04e-03s C 6 6 theano.sandbox.cuda.basic_ops.HostFromGpu | |
0.3% 99.5% 0.022s 7.34e-03s Py 3 3 theano.tensor.basic.Split | |
0.2% 99.7% 0.017s 5.73e-03s C 3 3 theano.sandbox.cuda.basic_ops.GpuJoin | |
0.2% 99.9% 0.013s 1.20e-03s C 11 11 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 99.9% 0.002s 2.30e-03s C 1 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
0.0% 99.9% 0.002s 2.06e-03s C 1 1 theano.sandbox.cuda.nnet.GpuSoftmaxWithBias | |
0.0% 100.0% 0.001s 1.24e-03s C 1 1 theano.sandbox.cuda.blas.GpuGemm | |
0.0% 100.0% 0.001s 1.11e-03s C 1 1 theano.tensor.elemwise.Sum | |
0.0% 100.0% 0.001s 1.57e-06s C 507 507 theano.tensor.elemwise.Elemwise | |
0.0% 100.0% 0.000s 2.56e-06s C 113 113 theano.sandbox.cuda.basic_ops.GpuDimShuffle | |
0.0% 100.0% 0.000s 2.44e-06s C 82 82 theano.sandbox.cuda.basic_ops.GpuSubtensor | |
0.0% 100.0% 0.000s 1.82e-06s C 89 89 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.000s 1.30e-05s Py 12 12 theano.compile.ops.Rebroadcast | |
... (remaining 3 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
39.2% 39.2% 3.110s 3.11e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
23.2% 62.4% 1.839s 1.84e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
16.6% 79.0% 1.312s 1.31e+00s Py 1 1 forall_inplace,gpu,grad_of_scan_fn} | |
4.9% 83.9% 0.390s 2.17e-02s C 18 18 GpuDot22 | |
3.9% 87.8% 0.307s 3.07e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
2.9% 90.7% 0.229s 2.29e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
2.3% 92.9% 0.180s 1.80e-01s Py 1 1 forall_inplace,gpu,scan_fn} | |
1.4% 94.3% 0.107s 1.79e-02s C 6 6 GpuCAReduce{add}{1,1,0} | |
1.2% 95.5% 0.099s 3.81e-03s Py 26 26 GpuReshape{2} | |
0.7% 96.2% 0.055s 1.07e-03s C 51 51 GpuAlloc{memset_0=True} | |
0.5% 96.7% 0.042s 7.04e-03s C 6 6 HostFromGpu | |
0.5% 97.2% 0.037s 6.16e-03s C 6 6 GpuIncSubtensor{Inc;:int64:} | |
0.4% 97.6% 0.033s 4.17e-03s C 8 8 GpuElemwise{add,no_inplace} | |
0.3% 98.0% 0.026s 2.19e-03s C 12 12 GpuIncSubtensor{InplaceInc;int64::} | |
0.3% 98.2% 0.022s 7.34e-03s Py 3 3 Split{2} | |
0.3% 98.5% 0.022s 9.13e-04s C 24 24 GpuElemwise{Add}[(0, 0)] | |
0.2% 98.7% 0.017s 5.73e-03s C 3 3 GpuJoin | |
0.2% 98.9% 0.015s 1.62e-03s C 9 9 GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace} | |
0.2% 99.1% 0.014s 6.80e-04s C 20 20 GpuElemwise{Composite{[sub(mul(i0, i1), mul(i2, i3))]}}[(0, 1)] | |
0.2% 99.3% 0.013s 1.20e-03s C 11 11 GpuFromHost | |
... (remaining 126 Ops account for 0.75%(0.06s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
39.2% 39.2% 3.110s 3.11e+00s 1 1119 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
23.2% 62.4% 1.839s 1.84e+00s 1 1175 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
16.6% 79.0% 1.312s 1.31e+00s 1 1063 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{tanh,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{sub,no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]},no_inplace}.0, GpuElemwise{Composite{[scalar_sigmoid(mul(i0, i1))]}, | |
3.9% 82.8% 0.307s 3.07e-01s 1 405 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
2.9% 85.7% 0.229s 2.29e-01s 1 680 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
2.3% 88.0% 0.180s 1.80e-01s 1 972 forall_inplace,gpu,scan_fn}(Elemwise{Composite{[minimum(minimum(i0, i1), i2)]}}.0, GpuElemwise{sub,no_inplace}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, GpuIncSubtensor{InplaceSet;:int64:}.0, W_hid_to_gates_fwd, W_hid_to_gates_bck, GpuDimShuffle{x,0}.0, GpuDimShuffle{x,0}. | |
0.4% 88.4% 0.035s 3.55e-02s 1 1146 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 88.9% 0.035s 3.55e-02s 1 490 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 89.3% 0.035s 3.54e-02s 1 501 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 89.8% 0.035s 3.47e-02s 1 1149 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 90.2% 0.032s 3.16e-02s 1 1150 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
0.4% 90.6% 0.032s 3.16e-02s 1 1147 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
0.4% 90.9% 0.029s 2.94e-02s 1 1090 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.4% 91.3% 0.029s 2.92e-02s 1 1141 GpuCAReduce{add}{1,1,0}(GpuIncSubtensor{InplaceInc;int64::}.0) | |
0.4% 91.7% 0.028s 2.83e-02s 1 1139 GpuCAReduce{add}{1,1,0}(GpuIncSubtensor{InplaceInc;int64::}.0) | |
0.3% 92.0% 0.027s 2.71e-02s 1 1093 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
0.3% 92.3% 0.023s 2.29e-02s 1 1110 HostFromGpu(GpuDimShuffle{1,0,2}.0) | |
0.3% 92.6% 0.022s 2.21e-02s 1 778 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 92.9% 0.022s 2.20e-02s 1 768 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.3% 93.1% 0.020s 2.01e-02s 1 1094 GpuDot22(GpuDimShuffle{1,0}.0, GpuReshape{2}.0) | |
... (remaining 1194 Apply instances account for 6.89%(0.55s) of the runtime) | |
bn09574:lasagne sorensonderby$ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment