theano profile
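The dumps below are Theano function profiles emitted from a Theano-era Keras training script (the Message lines point at keras/models.py); the first run appears to be on CPU, the later ones on GPU. As a rough, minimal sketch of how output in this format is produced (the toy graph is hypothetical, not the profiled model):

    import numpy as np
    import theano
    import theano.tensor as T

    # Collect per-Class/Op/Apply timings and print them at interpreter exit,
    # equivalent to running with THEANO_FLAGS=profile=True.
    theano.config.profile = True

    x = T.matrix('x')                      # dtype follows theano.config.floatX
    W = theano.shared(np.random.randn(128, 64).astype(theano.config.floatX))

    # profile=True can also be requested per compiled function; the report
    # groups time by Class, Op and Apply node exactly as in the dumps below.
    f = theano.function([x], T.nnet.softmax(T.dot(x, W)), profile=True)
    f(np.random.randn(32, 128).astype(theano.config.floatX))

    # Print this function's profile immediately instead of waiting for exit.
    f.profile.summary()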
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:328 | |
Time in 8 calls to Function.__call__: 1.477107e+02s | |
Time in Function.fn.__call__: 1.474529e+02s (99.825%) | |
Time in thunks: 1.470501e+02s (99.553%) | |
Total compile time: 1.377320e+02s | |
Number of Apply nodes: 525 | |
Theano Optimizer time: 2.186309e+01s | |
Theano validate time: 2.525887e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.157349e+02s | |
Import time 3.757732e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
64.7% 64.7% 95.128s 9.91e-01s C 96 12 theano.tensor.blas.Dot22 | |
11.5% 76.2% 16.857s 1.05e+00s Py 16 2 theano.scan_module.scan_op.Scan | |
9.5% 85.6% 13.954s 1.74e+00s C 8 1 theano.tensor.nnet.nnet.Softmax | |
7.9% 93.6% 11.631s 5.37e-03s C 2168 271 theano.tensor.elemwise.Elemwise | |
2.1% 95.7% 3.089s 3.86e-01s C 8 1 theano.tensor.blas.Dot22Scalar | |
1.6% 97.3% 2.389s 3.32e-02s C 72 9 theano.tensor.elemwise.Sum | |
1.4% 98.7% 2.029s 2.54e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
0.4% 99.1% 0.618s 7.72e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
0.3% 99.4% 0.454s 1.89e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
0.3% 99.7% 0.453s 1.95e-03s C 232 29 theano.tensor.basic.Reshape | |
0.2% 99.9% 0.278s 2.32e-03s C 120 15 theano.tensor.basic.Alloc | |
0.1% 100.0% 0.164s 2.28e-03s C 72 9 theano.tensor.subtensor.IncSubtensor | |
0.0% 100.0% 0.002s 2.15e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 2.26e-06s C 424 53 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 3.05e-06s C 264 33 theano.tensor.elemwise.DimShuffle | |
0.0% 100.0% 0.001s 4.06e-06s C 184 23 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.001s 3.71e-06s C 200 25 theano.tensor.subtensor.Subtensor | |
0.0% 100.0% 0.000s 1.16e-06s C 248 31 theano.tensor.basic.ScalarFromTensor | |
0.0% 100.0% 0.000s 2.26e-05s C 8 1 theano.tensor.basic.Join | |
0.0% 100.0% 0.000s 2.40e-06s C 24 3 theano.tensor.basic.Flatten | |
... (remaining 1 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
64.7% 64.7% 95.128s 9.91e-01s C 96 12 Dot22 | |
9.5% 74.2% 13.954s 1.74e+00s C 8 1 Softmax | |
7.1% 81.3% 10.468s 1.31e+00s Py 8 1 forall_inplace,cpu,grad_of_scan_fn} | |
5.4% 86.7% 7.876s 9.85e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
4.3% 91.0% 6.388s 7.99e-01s Py 8 1 forall_inplace,cpu,scan_fn} | |
2.1% 93.1% 3.089s 3.86e-01s C 8 1 Dot22Scalar | |
1.4% 94.5% 2.029s 2.54e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
1.3% 95.8% 1.907s 4.77e-02s C 40 5 Sum{axis=[0, 1], acc_dtype=float64} | |
1.2% 97.0% 1.788s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
0.4% 97.4% 0.618s 7.72e-02s C 8 1 SoftmaxGrad | |
0.4% 97.8% 0.557s 1.51e-03s C 368 46 Elemwise{add,no_inplace} | |
0.3% 98.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
0.3% 98.4% 0.454s 1.89e-02s Py 24 3 AdvancedSubtensor | |
0.3% 98.7% 0.453s 2.83e-03s C 160 20 Reshape{2} | |
0.3% 99.0% 0.418s 5.22e-03s C 80 10 Elemwise{mul,no_inplace} | |
0.2% 99.2% 0.318s 3.98e-02s C 8 1 Elemwise{clip,no_inplace} | |
0.2% 99.4% 0.278s 2.32e-03s C 120 15 Alloc | |
0.2% 99.6% 0.262s 8.19e-03s C 32 4 Elemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))}}[(0, 0)] | |
0.1% 99.7% 0.206s 1.84e-03s C 112 14 Elemwise{Composite{(i0 * sqr(i1))}} | |
0.1% 99.8% 0.101s 3.15e-03s C 32 4 IncSubtensor{Inc;:int64:} | |
... (remaining 98 Ops account for 0.19%(0.27s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
11.8% 11.8% 17.349s 2.17e+00s 8 92 Dot22(Reshape{2}.0, Reshape{2}.0) | |
9.5% 21.3% 13.954s 1.74e+00s 8 397 Softmax(Reshape{2}.0) | |
9.1% 30.4% 13.341s 1.67e+00s 8 489 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
7.7% 38.1% 11.309s 1.41e+00s 8 93 Dot22(Reshape{2}.0, Reshape{2}.0) | |
7.1% 45.2% 10.468s 1.31e+00s 8 446 forall_inplace,cpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Elemwise{mul,no_inplace}.0, Elemwise{mul,no_inplace}.0, InplaceDimShuffle{0,2,1}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{::int64}.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Elem | |
6.6% 51.8% 9.739s 1.22e+00s 8 94 Dot22(Reshape{2}.0, Reshape{2}.0) | |
6.5% 58.2% 9.491s 1.19e+00s 8 91 Dot22(Reshape{2}.0, Reshape{2}.0) | |
5.6% 63.9% 8.252s 1.03e+00s 8 491 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
5.6% 69.5% 8.230s 1.03e+00s 8 433 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
5.4% 74.8% 7.876s 9.85e-01s 8 411 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
4.3% 79.2% 6.388s 7.99e-01s 8 337 forall_inplace,cpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Alloc.0, Alloc.0, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>) | |
4.0% 83.2% 5.945s 7.43e-01s 8 485 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
3.0% 86.2% 4.401s 5.50e-01s 8 493 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
2.1% 88.3% 3.094s 3.87e-01s 8 486 Dot22(Flatten{2}.0, Reshape{2}.0) | |
2.1% 90.4% 3.089s 3.86e-01s 8 487 Dot22Scalar(Flatten{2}.0, Reshape{2}.0, TensorConstant{0.0010000000475}) | |
1.4% 91.8% 2.092s 2.62e-01s 8 434 Dot22(Reshape{2}.0, InplaceDimShuffle{1,0}.0) | |
1.4% 93.2% 2.029s 2.54e-01s 8 425 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.3% 94.5% 1.885s 2.36e-01s 8 370 Dot22(Reshape{2}.0, Reshape{2}.0) | |
1.2% 95.7% 1.788s 2.24e-01s 8 424 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
1.1% 96.8% 1.561s 1.95e-01s 8 430 Sum{axis=[0, 1], acc_dtype=float64}(InplaceDimShuffle{1,0,2}.0) | |
... (remaining 505 Apply instances account for 3.24%(4.77s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 6.386277e+00s | |
Total time spent in calling the VM 6.356155e+00s (99.528%) | |
Total overhead (computing slices..) 3.012228e-02s (0.472%) | |
Time in all call to theano.grad() 4.098411e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
97.9% 97.9% 6.218s 6.48e-03s C 960 4 theano.tensor.blas.Gemm | |
2.1% 100.0% 0.135s 1.88e-04s C 720 3 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
97.9% 97.9% 6.218s 6.48e-03s C 960 4 Gemm{no_inplace} | |
1.3% 99.2% 0.085s 3.52e-04s C 240 1 Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}} | |
0.8% 100.0% 0.051s 1.06e-04s C 480 2 Elemwise{mul,no_inplace} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
33.6% 33.6% 2.132s 8.88e-03s 240 2 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
23.1% 56.6% 1.467s 6.11e-03s 240 1 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
20.8% 77.5% 1.322s 5.51e-03s 240 4 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
20.4% 97.9% 1.297s 5.40e-03s 240 3 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, Elemwise{mul,no_inplace}.0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
1.3% 99.2% 0.085s 3.52e-04s 240 5 Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}}(Gemm{no_inplace}.0, <TensorType(int8, col)>, <TensorType(float32, matrix)>, Gemm{no_inplace}.0, Gemm{no_inplace}.0) | |
0.5% 99.8% 0.035s 1.45e-04s 240 0 Elemwise{mul,no_inplace}(<TensorType(int8, col)>, <TensorType(float32, matrix)>) | |
0.2% 100.0% 0.016s 6.60e-05s 240 6 Elemwise{mul,no_inplace}(Gemm{no_inplace}.0, Elemwise{Composite{((i0 * i1 * i2) + (i3 * i4))}}.0) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 1.046483e+01s | |
Total time spent in calling the VM 1.024789e+01s (97.927%) | |
Total overhead (computing slices..) 2.169311e-01s (2.073%) | |
Time in all call to theano.grad() 4.098411e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
60.8% 60.8% 6.226s 2.36e-03s C 2640 11 theano.tensor.blas.Gemm | |
32.6% 93.5% 3.340s 2.32e-03s C 1440 6 theano.tensor.blas.Dot22 | |
6.5% 100.0% 0.667s 1.85e-04s C 3600 15 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
37.8% 37.8% 3.867s 2.30e-03s C 1680 7 Gemm{inplace} | |
32.6% 70.4% 3.340s 2.32e-03s C 1440 6 Dot22 | |
23.0% 93.5% 2.359s 2.46e-03s C 960 4 Gemm{no_inplace} | |
3.2% 96.6% 0.323s 2.24e-04s C 1440 6 Elemwise{add,no_inplace} | |
1.1% 97.7% 0.111s 4.64e-04s C 240 1 Elemwise{Composite{(i0 + (i1 * i2 * i3 * i4) + (i1 * i5 * i2))}} | |
0.9% 98.7% 0.096s 1.00e-04s C 960 4 Elemwise{mul} | |
0.9% 99.5% 0.088s 3.65e-04s C 240 1 Elemwise{Composite{(i0 + ((i1 + i2) * i3) + (i4 * i3))}} | |
0.3% 99.8% 0.029s 6.00e-05s C 480 2 Elemwise{Mul}[(0, 1)] | |
0.2% 100.0% 0.020s 8.42e-05s C 240 1 Elemwise{Mul}[(0, 2)] | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
9.2% 9.2% 0.938s 3.91e-03s 240 26 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
7.3% 16.4% 0.744s 3.10e-03s 240 18 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{mul}.0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
7.1% 23.6% 0.731s 3.04e-03s 240 17 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>) | |
7.0% 30.6% 0.720s 3.00e-03s 240 0 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
6.1% 36.7% 0.626s 2.61e-03s 240 19 Dot22(<TensorType(float32, matrix)>, Elemwise{Mul}[(0, 2)].0) | |
5.7% 42.4% 0.584s 2.43e-03s 240 24 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
5.7% 48.1% 0.581s 2.42e-03s 240 10 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>) | |
5.5% 53.6% 0.562s 2.34e-03s 240 2 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
5.5% 59.1% 0.561s 2.34e-03s 240 1 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
5.2% 64.3% 0.529s 2.20e-03s 240 14 Dot22(<TensorType(float32, matrix)>, Elemwise{mul}.0) | |
5.0% 69.3% 0.515s 2.15e-03s 240 4 Gemm{no_inplace}(<TensorType(float32, matrix)>, TensorConstant{1.0}, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
4.4% 73.7% 0.446s 1.86e-03s 240 25 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 2)].0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
4.3% 78.0% 0.440s 1.83e-03s 240 16 Dot22(<TensorType(float32, matrix)>, Elemwise{mul}.0) | |
4.2% 82.2% 0.434s 1.81e-03s 240 9 Dot22(Elemwise{mul}.0, <TensorType(float32, matrix)>) | |
4.0% 86.2% 0.411s 1.71e-03s 240 27 Gemm{inplace}(Gemm{inplace}.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 1)].0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
3.8% 90.0% 0.391s 1.63e-03s 240 22 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, <TensorType(float32, matrix)>, Elemwise{mul}.0, TensorConstant{1.0}) | |
3.4% 93.5% 0.352s 1.47e-03s 240 21 Gemm{inplace}(Dot22.0, TensorConstant{1.0}, Elemwise{Mul}[(0, 1)].0, <TensorType(float32, matrix)>, TensorConstant{1.0}) | |
1.1% 94.6% 0.111s 4.64e-04s 240 7 Elemwise{Composite{(i0 + (i1 * i2 * i3 * i4) + (i1 * i5 * i2))}}(<TensorType(float32, matrix)>, <TensorType(int8, col)>, Gemm{no_inplace}.0, <TensorType(float32, matrix)>, Gemm{no_inplace}.0, <TensorType(float32, matrix)>) | |
0.9% 95.4% 0.088s 3.65e-04s 240 31 Elemwise{Composite{(i0 + ((i1 + i2) * i3) + (i4 * i3))}}(<TensorType(float32, matrix)>, Gemm{inplace}.0, Gemm{inplace}.0, <TensorType(int8, col)>, Gemm{inplace}.0) | |
0.7% 96.2% 0.075s 3.12e-04s 240 30 Elemwise{add,no_inplace}(<TensorType(float32, matrix)>, Gemm{inplace}.0) | |
... (remaining 12 Apply instances account for 3.85%(0.39s) of the runtime) | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:330 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.309456e+01s | |
Number of Apply nodes: 536 | |
Theano Optimizer time: 1.634476e+01s | |
Theano validate time: 2.549255e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.595885e+00s | |
Import time 7.425475e-02s | |
Time in all call to theano.grad() 4.098411e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:332 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 8.350207e+00s | |
Number of Apply nodes: 134 | |
Theano Optimizer time: 1.283817e+00s | |
Theano validate time: 5.622768e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 7.014123e+00s | |
Import time 3.218102e-02s | |
Time in all call to theano.grad() 4.098411e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:334 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 5.702252e+00s | |
Number of Apply nodes: 152 | |
Theano Optimizer time: 1.817622e+00s | |
Theano validate time: 7.331610e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.821592e+00s | |
Import time 1.338577e-02s | |
Time in all call to theano.grad() 4.098411e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:336 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 1.727933e+00s | |
Number of Apply nodes: 163 | |
Theano Optimizer time: 1.428595e+00s | |
Theano validate time: 7.478404e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.332819e-01s | |
Import time 0.000000e+00s | |
Time in all call to theano.grad() 4.098411e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Function profiling | |
================== | |
Message: Sum of all(5) printed profiles at exit excluding Scan op profile. | |
Time in 8 calls to Function.__call__: 1.477107e+02s | |
Time in Function.fn.__call__: 1.474529e+02s (99.825%) | |
Time in thunks: 1.470501e+02s (99.553%) | |
Total compile time: 1.766070e+02s | |
Number of Apply nodes: 525 | |
Theano Optimizer time: 4.273788e+01s | |
Theano validate time: 7.118421e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.333998e+02s | |
Import time 4.955947e-01s | |
Time in all call to theano.grad() 4.098411e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
64.7% 64.7% 95.128s 9.91e-01s C 96 12 theano.tensor.blas.Dot22 | |
11.5% 76.2% 16.857s 1.05e+00s Py 16 2 theano.scan_module.scan_op.Scan | |
9.5% 85.6% 13.954s 1.74e+00s C 8 1 theano.tensor.nnet.nnet.Softmax | |
7.9% 93.6% 11.631s 5.37e-03s C 2168 271 theano.tensor.elemwise.Elemwise | |
2.1% 95.7% 3.089s 3.86e-01s C 8 1 theano.tensor.blas.Dot22Scalar | |
1.6% 97.3% 2.389s 3.32e-02s C 72 9 theano.tensor.elemwise.Sum | |
1.4% 98.7% 2.029s 2.54e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
0.4% 99.1% 0.618s 7.72e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
0.3% 99.4% 0.454s 1.89e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
0.3% 99.7% 0.453s 1.95e-03s C 232 29 theano.tensor.basic.Reshape | |
0.2% 99.9% 0.278s 2.32e-03s C 120 15 theano.tensor.basic.Alloc | |
0.1% 100.0% 0.164s 2.28e-03s C 72 9 theano.tensor.subtensor.IncSubtensor | |
0.0% 100.0% 0.002s 2.15e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 2.26e-06s C 424 53 theano.compile.ops.Shape_i | |
0.0% 100.0% 0.001s 3.05e-06s C 264 33 theano.tensor.elemwise.DimShuffle | |
0.0% 100.0% 0.001s 4.06e-06s C 184 23 theano.tensor.opt.MakeVector | |
0.0% 100.0% 0.001s 3.71e-06s C 200 25 theano.tensor.subtensor.Subtensor | |
0.0% 100.0% 0.000s 1.16e-06s C 248 31 theano.tensor.basic.ScalarFromTensor | |
0.0% 100.0% 0.000s 2.26e-05s C 8 1 theano.tensor.basic.Join | |
0.0% 100.0% 0.000s 2.40e-06s C 24 3 theano.tensor.basic.Flatten | |
... (remaining 1 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
64.7% 64.7% 95.128s 9.91e-01s C 96 12 Dot22 | |
9.5% 74.2% 13.954s 1.74e+00s C 8 1 Softmax | |
7.1% 81.3% 10.468s 1.31e+00s Py 8 1 forall_inplace,cpu,grad_of_scan_fn} | |
5.4% 86.7% 7.876s 9.85e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
4.3% 91.0% 6.388s 7.99e-01s Py 8 1 forall_inplace,cpu,scan_fn} | |
2.1% 93.1% 3.089s 3.86e-01s C 8 1 Dot22Scalar | |
1.4% 94.5% 2.029s 2.54e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
1.3% 95.8% 1.907s 4.77e-02s C 40 5 Sum{axis=[0, 1], acc_dtype=float64} | |
1.2% 97.0% 1.788s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
0.4% 97.4% 0.618s 7.72e-02s C 8 1 SoftmaxGrad | |
0.4% 97.8% 0.557s 1.51e-03s C 368 46 Elemwise{add,no_inplace} | |
0.3% 98.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
0.3% 98.4% 0.454s 1.89e-02s Py 24 3 AdvancedSubtensor | |
0.3% 98.7% 0.453s 2.83e-03s C 160 20 Reshape{2} | |
0.3% 99.0% 0.418s 5.22e-03s C 80 10 Elemwise{mul,no_inplace} | |
0.2% 99.2% 0.318s 3.98e-02s C 8 1 Elemwise{clip,no_inplace} | |
0.2% 99.4% 0.278s 2.32e-03s C 120 15 Alloc | |
0.2% 99.6% 0.262s 8.19e-03s C 32 4 Elemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))}}[(0, 0)] | |
0.1% 99.7% 0.206s 1.84e-03s C 112 14 Elemwise{Composite{(i0 * sqr(i1))}} | |
0.1% 99.8% 0.101s 3.15e-03s C 32 4 IncSubtensor{Inc;:int64:} | |
... (remaining 98 Ops account for 0.19%(0.27s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
11.8% 11.8% 17.349s 2.17e+00s 8 92 Dot22(Reshape{2}.0, Reshape{2}.0) | |
9.5% 21.3% 13.954s 1.74e+00s 8 397 Softmax(Reshape{2}.0) | |
9.1% 30.4% 13.341s 1.67e+00s 8 489 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
7.7% 38.1% 11.309s 1.41e+00s 8 93 Dot22(Reshape{2}.0, Reshape{2}.0) | |
7.1% 45.2% 10.468s 1.31e+00s 8 446 forall_inplace,cpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Elemwise{mul,no_inplace}.0, Elemwise{mul,no_inplace}.0, InplaceDimShuffle{0,2,1}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, Subtensor{::int64}.0, Alloc.0, Alloc.0, Alloc.0, Alloc.0, Elem | |
6.6% 51.8% 9.739s 1.22e+00s 8 94 Dot22(Reshape{2}.0, Reshape{2}.0) | |
6.5% 58.2% 9.491s 1.19e+00s 8 91 Dot22(Reshape{2}.0, Reshape{2}.0) | |
5.6% 63.9% 8.252s 1.03e+00s 8 491 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
5.6% 69.5% 8.230s 1.03e+00s 8 433 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
5.4% 74.8% 7.876s 9.85e-01s 8 411 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
4.3% 79.2% 6.388s 7.99e-01s 8 337 forall_inplace,cpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, Alloc.0, Alloc.0, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>, <TensorType(float32, matrix)>) | |
4.0% 83.2% 5.945s 7.43e-01s 8 485 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
3.0% 86.2% 4.401s 5.50e-01s 8 493 Dot22(InplaceDimShuffle{1,0}.0, Reshape{2}.0) | |
2.1% 88.3% 3.094s 3.87e-01s 8 486 Dot22(Flatten{2}.0, Reshape{2}.0) | |
2.1% 90.4% 3.089s 3.86e-01s 8 487 Dot22Scalar(Flatten{2}.0, Reshape{2}.0, TensorConstant{0.0010000000475}) | |
1.4% 91.8% 2.092s 2.62e-01s 8 434 Dot22(Reshape{2}.0, InplaceDimShuffle{1,0}.0) | |
1.4% 93.2% 2.029s 2.54e-01s 8 425 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.3% 94.5% 1.885s 2.36e-01s 8 370 Dot22(Reshape{2}.0, Reshape{2}.0) | |
1.2% 95.7% 1.788s 2.24e-01s 8 424 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
1.1% 96.8% 1.561s 1.95e-01s 8 430 Sum{axis=[0, 1], acc_dtype=float64}(InplaceDimShuffle{1,0,2}.0) | |
... (remaining 505 Apply instances account for 3.24%(4.77s) of the runtime) | |
training time 148.004755974 |
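The next profile (training time ~18 s versus ~148 s above) appears to be the same training run repeated on the old CUDA backend: the Class/Op tables now show GpuDot22, GpuGemm and CudaNdarrayType variables. A minimal sketch of how that backend is typically selected, assuming the environment-variable route; the exact flags used for this run are not recorded in the gist:

    import os
    os.environ['THEANO_FLAGS'] = 'device=gpu,floatX=float32,profile=True'

    import theano                  # must be imported after THEANO_FLAGS is set
    print(theano.config.device)    # 'gpu' (or 'gpu0', ...) if the CUDA backend initialised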
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:328 | |
Time in 8 calls to Function.__call__: 1.782110e+01s | |
Time in Function.fn.__call__: 1.747024e+01s (98.031%) | |
Time in thunks: 1.711180e+01s (96.020%) | |
Total compile time: 2.021389e+01s | |
Number of Apply nodes: 530 | |
Theano Optimizer time: 1.830422e+01s | |
Theano validate time: 2.977979e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.739421e+00s | |
Import time 1.584115e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
60.1% 60.1% 10.281s 6.15e-03s C 1672 209 theano.tensor.elemwise.Elemwise | |
12.1% 72.2% 2.077s 2.60e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
5.1% 77.3% 0.874s 5.47e-02s Py 16 2 theano.scan_module.scan_op.Scan | |
4.2% 81.6% 0.725s 7.56e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22 | |
3.7% 85.3% 0.634s 2.64e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu | |
3.6% 88.8% 0.610s 7.63e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
2.8% 91.6% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum | |
2.6% 94.2% 0.444s 1.85e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
2.4% 96.7% 0.417s 1.74e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
1.0% 97.7% 0.168s 1.05e-02s C 16 2 theano.tensor.basic.Alloc | |
0.7% 98.4% 0.124s 5.76e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 99.1% 0.124s 2.45e-04s C 504 63 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 99.4% 0.055s 1.38e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.2% 99.6% 0.028s 3.87e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.8% 0.027s 2.62e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.8% 0.015s 1.92e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax | |
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.1% 100.0% 0.009s 1.16e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.002s 2.42e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 2.13e-06s C 416 52 theano.compile.ops.Shape_i | |
... (remaining 10 Classes account for 0.02%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
46.1% 46.1% 7.889s 9.86e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
12.1% 58.2% 2.077s 2.60e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
10.4% 68.7% 1.783s 2.23e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
4.2% 72.9% 0.725s 7.56e-03s C 96 12 GpuDot22 | |
4.1% 77.0% 0.704s 8.80e-02s Py 8 1 forall_inplace,gpu,grad_of_scan_fn} | |
3.7% 80.7% 0.634s 2.64e-02s C 24 3 HostFromGpu | |
3.6% 84.3% 0.610s 7.63e-02s C 8 1 SoftmaxGrad | |
2.8% 87.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
2.6% 89.7% 0.444s 1.85e-02s Py 24 3 AdvancedSubtensor | |
2.4% 92.1% 0.417s 1.74e-02s C 24 3 GpuFromHost | |
1.8% 94.0% 0.315s 3.94e-02s C 8 1 Elemwise{clip,no_inplace} | |
1.7% 95.7% 0.290s 1.21e-02s C 24 3 Elemwise{mul,no_inplace} | |
1.0% 96.7% 0.170s 2.13e-02s Py 8 1 forall_inplace,gpu,scan_fn} | |
1.0% 97.6% 0.168s 1.05e-02s C 16 2 Alloc | |
0.7% 98.4% 0.124s 8.16e-04s C 152 19 GpuReshape{2} | |
0.3% 98.7% 0.055s 1.38e-03s C 40 5 GpuCAReduce{add}{1,1,0} | |
0.2% 98.9% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)] | |
0.2% 99.0% 0.027s 2.78e-04s C 96 12 GpuAlloc{memset_0=True} | |
0.1% 99.2% 0.025s 2.41e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace} | |
0.1% 99.3% 0.022s 3.85e-04s C 56 7 GpuElemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))},no_inplace} | |
... (remaining 107 Ops account for 0.71%(0.12s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
46.1% 46.1% 7.889s 9.86e-01s 8 418 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
12.1% 58.2% 2.077s 2.60e-01s 8 421 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
10.4% 68.7% 1.783s 2.23e-01s 8 419 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
4.1% 72.8% 0.704s 8.80e-02s 8 451 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{Mul}[(0, 1)].0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
3.6% 76.3% 0.610s 7.63e-02s 8 425 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0) | |
1.9% 78.2% 0.319s 3.99e-02s 8 406 HostFromGpu(GpuReshape{3}.0) | |
1.8% 80.0% 0.315s 3.94e-02s 8 414 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}) | |
1.8% 81.9% 0.314s 3.93e-02s 8 401 HostFromGpu(GpuSoftmax.0) | |
1.7% 83.6% 0.290s 3.63e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0) | |
1.3% 84.9% 0.228s 2.84e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.3% 86.2% 0.216s 2.70e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.2% 87.4% 0.209s 2.61e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>) | |
1.2% 88.6% 0.208s 2.60e-02s 8 427 GpuFromHost(SoftmaxGrad.0) | |
1.0% 89.6% 0.170s 2.13e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType( | |
1.0% 90.6% 0.167s 2.09e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0) | |
0.9% 91.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0) | |
0.9% 92.5% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0) | |
0.9% 93.4% 0.161s 2.01e-02s 8 415 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0) | |
0.4% 93.8% 0.066s 8.26e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 94.2% 0.065s 8.16e-03s 8 435 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
... (remaining 510 Apply instances account for 5.84%(1.00s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 1.655595e-01s | |
Total time spent in calling the VM 1.615922e-01s (97.604%) | |
Total overhead (computing slices..) 3.967285e-03s (2.396%) | |
Time in all call to theano.grad() 4.073255e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
81.1% 81.1% 0.129s 1.35e-04s C 960 4 theano.sandbox.cuda.blas.GpuGemm | |
13.8% 95.0% 0.022s 3.06e-05s C 720 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
4.9% 99.8% 0.008s 3.23e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 100.0% 0.000s 1.12e-06s C 240 1 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
81.1% 81.1% 0.129s 1.35e-04s C 960 4 GpuGemm{no_inplace} | |
8.6% 89.7% 0.014s 2.84e-05s C 480 2 GpuElemwise{mul,no_inplace} | |
5.3% 95.0% 0.008s 3.50e-05s C 240 1 GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace} | |
4.9% 99.8% 0.008s 3.23e-05s C 240 1 GpuFromHost | |
0.2% 100.0% 0.000s 1.12e-06s C 240 1 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
20.7% 20.7% 0.033s 1.38e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
20.5% 41.2% 0.033s 1.36e-04s 240 6 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
20.0% 61.2% 0.032s 1.33e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
19.9% 81.1% 0.032s 1.32e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
5.3% 86.4% 0.008s 3.50e-05s 240 7 GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace}(GpuGemm{no_inplace}.0, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuGemm{no_inplace}.0, GpuGemm{no_inplace}.0) | |
5.2% 91.6% 0.008s 3.45e-05s 240 2 GpuElemwise{mul,no_inplace}(GpuFromHost.0, <CudaNdarrayType(float32, matrix)>) | |
4.9% 96.5% 0.008s 3.23e-05s 240 1 GpuFromHost(Elemwise{Cast{float32}}.0) | |
3.4% 99.8% 0.005s 2.23e-05s 240 8 GpuElemwise{mul,no_inplace}(GpuGemm{no_inplace}.0, GpuElemwise{Composite{((i0 * (i1 * i2)) + (i3 * i4))},no_inplace}.0) | |
0.2% 100.0% 0.000s 1.12e-06s 240 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 6.921008e-01s | |
Total time spent in calling the VM 6.207879e-01s (89.696%) | |
Total overhead (computing slices..) 7.131290e-02s (10.304%) | |
Time in all call to theano.grad() 4.073255e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
53.6% 53.6% 0.329s 1.25e-04s C 2640 11 theano.sandbox.cuda.blas.GpuGemm | |
24.6% 78.2% 0.151s 1.05e-04s C 1440 6 theano.sandbox.cuda.blas.GpuDot22 | |
20.2% 98.4% 0.124s 3.24e-05s C 3840 16 theano.sandbox.cuda.basic_ops.GpuElemwise | |
1.5% 99.9% 0.009s 3.94e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.1% 100.0% 0.000s 1.36e-06s C 240 1 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
32.2% 32.2% 0.198s 1.18e-04s C 1680 7 GpuGemm{inplace} | |
24.6% 56.8% 0.151s 1.05e-04s C 1440 6 GpuDot22 | |
21.4% 78.2% 0.131s 1.37e-04s C 960 4 GpuGemm{no_inplace} | |
8.6% 86.8% 0.053s 3.67e-05s C 1440 6 GpuElemwise{add,no_inplace} | |
4.2% 91.0% 0.026s 2.69e-05s C 960 4 GpuElemwise{mul,no_inplace} | |
2.8% 93.7% 0.017s 2.36e-05s C 720 3 GpuElemwise{Mul}[(0, 1)] | |
1.9% 95.6% 0.012s 4.86e-05s C 240 1 GpuElemwise{Composite{((((i0 + i1) * i2) + (i3 * i2)) + i4)},no_inplace} | |
1.9% 97.5% 0.012s 4.85e-05s C 240 1 GpuElemwise{Composite{((((i0 * i1) * i2) + ((i3 * i1) * i2)) + i4)},no_inplace} | |
1.5% 99.1% 0.009s 3.94e-05s C 240 1 GpuFromHost | |
0.9% 99.9% 0.005s 2.25e-05s C 240 1 GpuElemwise{Mul}[(0, 0)] | |
0.1% 100.0% 0.000s 1.36e-06s C 240 1 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
5.6% 5.6% 0.034s 1.43e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
5.3% 10.9% 0.033s 1.36e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
5.3% 16.2% 0.032s 1.35e-04s 240 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
5.2% 21.4% 0.032s 1.33e-04s 240 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.7% 26.1% 0.029s 1.21e-04s 240 28 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 0)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.7% 30.8% 0.029s 1.21e-04s 240 18 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.7% 35.5% 0.029s 1.20e-04s 240 24 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.7% 40.2% 0.029s 1.19e-04s 240 30 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.6% 44.7% 0.028s 1.17e-04s 240 10 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>) | |
4.5% 49.2% 0.028s 1.15e-04s 240 21 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>) | |
4.5% 53.7% 0.028s 1.15e-04s 240 25 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0, TensorConstant{1.0}) | |
4.5% 58.2% 0.027s 1.15e-04s 240 29 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
4.5% 62.7% 0.027s 1.14e-04s 240 27 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
4.4% 67.1% 0.027s 1.14e-04s 240 9 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>) | |
3.7% 70.8% 0.023s 9.53e-05s 240 17 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0) | |
3.7% 74.5% 0.023s 9.42e-05s 240 22 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 0)].0) | |
3.7% 78.2% 0.023s 9.40e-05s 240 20 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0) | |
2.0% 80.2% 0.012s 5.16e-05s 240 31 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
2.0% 82.2% 0.012s 5.06e-05s 240 32 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
2.0% 84.1% 0.012s 5.06e-05s 240 33 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
... (remaining 15 Apply instances account for 15.86%(0.10s) of the runtime) | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:330 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.060873e+01s | |
Number of Apply nodes: 541 | |
Theano Optimizer time: 1.904841e+01s | |
Theano validate time: 8.013768e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 1.383735e+00s | |
Import time 1.165748e-02s | |
Time in all call to theano.grad() 4.073255e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:332 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 1.925702e+00s | |
Number of Apply nodes: 137 | |
Theano Optimizer time: 1.554818e+00s | |
Theano validate time: 5.621839e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.086841e-01s | |
Import time 7.719040e-03s | |
Time in all call to theano.grad() 4.073255e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:334 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.623655e+00s | |
Number of Apply nodes: 155 | |
Theano Optimizer time: 2.232972e+00s | |
Theano validate time: 7.306647e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.238871e-01s | |
Import time 5.110025e-03s | |
Time in all call to theano.grad() 4.073255e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:336 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.094579e+00s | |
Number of Apply nodes: 166 | |
Theano Optimizer time: 1.699915e+00s | |
Theano validate time: 7.587528e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 3.269272e-01s | |
Import time 0.000000e+00s | |
Time in all call to theano.grad() 4.073255e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Function profiling | |
================== | |
Message: Sum of all(5) printed profiles at exit excluding Scan op profile. | |
Time in 8 calls to Function.__call__: 1.782110e+01s | |
Time in Function.fn.__call__: 1.747024e+01s (98.031%) | |
Time in thunks: 1.711180e+01s (96.020%) | |
Total compile time: 4.746655e+01s | |
Number of Apply nodes: 530 | |
Theano Optimizer time: 4.284033e+01s | |
Theano validate time: 1.304335e+00s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.082654e+00s | |
Import time 1.828980e-01s | |
Time in all call to theano.grad() 4.073255e-01s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
60.1% 60.1% 10.281s 6.15e-03s C 1672 209 theano.tensor.elemwise.Elemwise | |
12.1% 72.2% 2.077s 2.60e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
5.1% 77.3% 0.874s 5.47e-02s Py 16 2 theano.scan_module.scan_op.Scan | |
4.2% 81.6% 0.725s 7.56e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22 | |
3.7% 85.3% 0.634s 2.64e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu | |
3.6% 88.8% 0.610s 7.63e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
2.8% 91.6% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum | |
2.6% 94.2% 0.444s 1.85e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
2.4% 96.7% 0.417s 1.74e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
1.0% 97.7% 0.168s 1.05e-02s C 16 2 theano.tensor.basic.Alloc | |
0.7% 98.4% 0.124s 5.76e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 99.1% 0.124s 2.45e-04s C 504 63 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 99.4% 0.055s 1.38e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.2% 99.6% 0.028s 3.87e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.8% 0.027s 2.62e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.8% 0.015s 1.92e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax | |
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.1% 100.0% 0.009s 1.16e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.002s 2.42e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 2.13e-06s C 416 52 theano.compile.ops.Shape_i | |
... (remaining 10 Classes account for 0.02%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
46.1% 46.1% 7.889s 9.86e-01s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
12.1% 58.2% 2.077s 2.60e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
10.4% 68.7% 1.783s 2.23e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
4.2% 72.9% 0.725s 7.56e-03s C 96 12 GpuDot22 | |
4.1% 77.0% 0.704s 8.80e-02s Py 8 1 forall_inplace,gpu,grad_of_scan_fn} | |
3.7% 80.7% 0.634s 2.64e-02s C 24 3 HostFromGpu | |
3.6% 84.3% 0.610s 7.63e-02s C 8 1 SoftmaxGrad | |
2.8% 87.1% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
2.6% 89.7% 0.444s 1.85e-02s Py 24 3 AdvancedSubtensor | |
2.4% 92.1% 0.417s 1.74e-02s C 24 3 GpuFromHost | |
1.8% 94.0% 0.315s 3.94e-02s C 8 1 Elemwise{clip,no_inplace} | |
1.7% 95.7% 0.290s 1.21e-02s C 24 3 Elemwise{mul,no_inplace} | |
1.0% 96.7% 0.170s 2.13e-02s Py 8 1 forall_inplace,gpu,scan_fn} | |
1.0% 97.6% 0.168s 1.05e-02s C 16 2 Alloc | |
0.7% 98.4% 0.124s 8.16e-04s C 152 19 GpuReshape{2} | |
0.3% 98.7% 0.055s 1.38e-03s C 40 5 GpuCAReduce{add}{1,1,0} | |
0.2% 98.9% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)] | |
0.2% 99.0% 0.027s 2.78e-04s C 96 12 GpuAlloc{memset_0=True} | |
0.1% 99.2% 0.025s 2.41e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace} | |
0.1% 99.3% 0.022s 3.85e-04s C 56 7 GpuElemwise{Composite{(i0 - ((i1 * i2) / sqrt((i3 + i4 + i5))))},no_inplace} | |
... (remaining 107 Ops account for 0.71%(0.12s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
46.1% 46.1% 7.889s 9.86e-01s 8 418 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
12.1% 58.2% 2.077s 2.60e-01s 8 421 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
10.4% 68.7% 1.783s 2.23e-01s 8 419 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
4.1% 72.8% 0.704s 8.80e-02s 8 451 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{Mul}[(0, 1)].0, GpuElemwise{mul,no_inplace}.0, GpuDimShuffle{0,2,1}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{::int64}.0, GpuAlloc{memset_0=True} | |
3.6% 76.3% 0.610s 7.63e-02s 8 425 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0) | |
1.9% 78.2% 0.319s 3.99e-02s 8 406 HostFromGpu(GpuReshape{3}.0) | |
1.8% 80.0% 0.315s 3.94e-02s 8 414 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}) | |
1.8% 81.9% 0.314s 3.93e-02s 8 401 HostFromGpu(GpuSoftmax.0) | |
1.7% 83.6% 0.290s 3.63e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0) | |
1.3% 84.9% 0.228s 2.84e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.3% 86.2% 0.216s 2.70e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.2% 87.4% 0.209s 2.61e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>) | |
1.2% 88.6% 0.208s 2.60e-02s 8 427 GpuFromHost(SoftmaxGrad.0) | |
1.0% 89.6% 0.170s 2.13e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType( | |
1.0% 90.6% 0.167s 2.09e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0) | |
0.9% 91.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0) | |
0.9% 92.5% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0) | |
0.9% 93.4% 0.161s 2.01e-02s 8 415 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0) | |
0.4% 93.8% 0.066s 8.26e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 94.2% 0.065s 8.16e-03s 8 435 GpuDot22(GpuReshape{2}.0, GpuDimShuffle{1,0}.0) | |
... (remaining 510 Apply instances account for 5.84%(1.00s) of the runtime) | |
training time 18.1833021641 |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:328 | |
Time in 8 calls to Function.__call__: 1.868278e+01s | |
Time in Function.fn.__call__: 1.834628e+01s (98.199%) | |
Time in thunks: 1.794744e+01s (96.064%) | |
Total compile time: 2.445579e+01s | |
Number of Apply nodes: 532 | |
Theano Optimizer time: 2.181430e+01s | |
Theano validate time: 2.990005e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.419208e+00s | |
Import time 1.909029e-01s | |
Time in all call to theano.grad() 1.037111e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
61.6% 61.6% 11.056s 6.61e-03s C 1672 209 theano.tensor.elemwise.Elemwise | |
11.4% 73.0% 2.045s 2.56e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
5.6% 78.6% 0.999s 6.24e-02s Py 16 2 theano.scan_module.scan_op.Scan | |
4.2% 82.8% 0.753s 7.84e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22 | |
3.6% 86.4% 0.647s 2.69e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu | |
3.4% 89.8% 0.613s 7.66e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
2.7% 92.5% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum | |
2.4% 94.9% 0.436s 1.82e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
1.8% 96.7% 0.321s 1.34e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.9% 97.6% 0.169s 1.06e-02s C 16 2 theano.tensor.basic.Alloc | |
0.7% 98.3% 0.129s 5.98e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 99.1% 0.129s 2.48e-04s C 520 65 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 99.4% 0.054s 1.35e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.2% 99.6% 0.044s 6.08e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.8% 0.028s 2.67e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.016s 1.99e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax | |
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.1% 100.0% 0.009s 1.15e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.002s 2.06e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 1.92e-06s C 416 52 theano.compile.ops.Shape_i | |
... (remaining 10 Classes account for 0.01%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
48.1% 48.1% 8.638s 1.08e+00s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
11.4% 59.5% 2.045s 2.56e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
10.0% 69.5% 1.796s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
4.6% 74.1% 0.826s 1.03e-01s Py 8 1 forall_inplace,gpu,grad_of_scan_fn} | |
4.2% 78.3% 0.753s 7.84e-03s C 96 12 GpuDot22 | |
3.6% 81.9% 0.647s 2.69e-02s C 24 3 HostFromGpu | |
3.4% 85.3% 0.613s 7.66e-02s C 8 1 SoftmaxGrad | |
2.7% 88.0% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
2.4% 90.5% 0.436s 1.82e-02s Py 24 3 AdvancedSubtensor | |
1.8% 92.3% 0.321s 1.34e-02s C 24 3 GpuFromHost | |
1.8% 94.0% 0.317s 3.96e-02s C 8 1 Elemwise{clip,no_inplace} | |
1.7% 95.7% 0.302s 1.26e-02s C 24 3 Elemwise{mul,no_inplace} | |
1.0% 96.7% 0.173s 2.17e-02s Py 8 1 forall_inplace,gpu,scan_fn} | |
0.9% 97.6% 0.169s 1.06e-02s C 16 2 Alloc | |
0.7% 98.3% 0.129s 8.48e-04s C 152 19 GpuReshape{2} | |
0.3% 98.6% 0.054s 1.35e-03s C 40 5 GpuCAReduce{add}{1,1,0} | |
0.2% 98.8% 0.035s 1.11e-03s C 32 4 GpuIncSubtensor{Inc;:int64:} | |
0.2% 99.0% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)] | |
0.2% 99.1% 0.027s 2.84e-04s C 96 12 GpuAlloc{memset_0=True} | |
0.1% 99.3% 0.025s 2.39e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace} | |
... (remaining 109 Ops account for 0.71%(0.13s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
48.1% 48.1% 8.638s 1.08e+00s 8 424 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
11.4% 59.5% 2.045s 2.56e-01s 8 427 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
10.0% 69.5% 1.796s 2.24e-01s 8 425 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
4.6% 74.1% 0.826s 1.03e-01s 8 453 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Tanh}[(0, 0)].0, GpuDimShuffle{0,2,1}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0 | |
3.4% 77.5% 0.613s 7.66e-02s 8 431 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0) | |
1.8% 79.4% 0.326s 4.07e-02s 8 406 HostFromGpu(GpuReshape{3}.0) | |
1.8% 81.2% 0.320s 4.01e-02s 8 401 HostFromGpu(GpuSoftmax.0) | |
1.8% 82.9% 0.317s 3.96e-02s 8 416 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}) | |
1.7% 84.6% 0.302s 3.77e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0) | |
1.2% 85.8% 0.218s 2.73e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.2% 87.0% 0.218s 2.72e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.0% 88.0% 0.173s 2.17e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType( | |
0.9% 88.9% 0.169s 2.11e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0) | |
0.9% 89.9% 0.166s 2.07e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>) | |
0.9% 90.8% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0) | |
0.9% 91.6% 0.161s 2.01e-02s 8 426 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0) | |
0.9% 92.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0) | |
0.9% 93.4% 0.155s 1.93e-02s 8 433 GpuFromHost(SoftmaxGrad.0) | |
0.5% 93.9% 0.092s 1.15e-02s 8 91 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 94.3% 0.066s 8.25e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
... (remaining 512 Apply instances account for 5.72%(1.03s) of the runtime) | |
Scan Op profiling ( scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 1.686399e-01s | |
Total time spent in calling the VM 1.647773e-01s (97.710%) | |
Total overhead (computing slices..) 3.862619e-03s (2.290%) | |
Time in all call to theano.grad() 1.037111e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
79.9% 79.9% 0.130s 1.35e-04s C 960 4 theano.sandbox.cuda.blas.GpuGemm | |
15.2% 95.1% 0.025s 3.44e-05s C 720 3 theano.sandbox.cuda.basic_ops.GpuElemwise | |
4.7% 99.8% 0.008s 3.18e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.2% 100.0% 0.000s 1.09e-06s C 240 1 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
79.9% 79.9% 0.130s 1.35e-04s C 960 4 GpuGemm{no_inplace} | |
5.9% 85.8% 0.010s 4.00e-05s C 240 1 GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace} | |
5.2% 91.0% 0.008s 3.50e-05s C 240 1 GpuElemwise{mul,no_inplace} | |
4.7% 95.7% 0.008s 3.18e-05s C 240 1 GpuFromHost | |
4.2% 99.8% 0.007s 2.81e-05s C 240 1 GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace} | |
0.2% 100.0% 0.000s 1.09e-06s C 240 1 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
20.6% 20.6% 0.033s 1.39e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
20.1% 40.7% 0.033s 1.36e-04s 240 6 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
19.7% 60.4% 0.032s 1.33e-04s 240 5 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
19.5% 79.9% 0.032s 1.32e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
5.9% 85.8% 0.010s 4.00e-05s 240 7 GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuFromHost.0, <CudaNdarrayType(float32, matrix)>, GpuGemm{no_inplace}.0, GpuGemm{no_inplace}.0) | |
5.2% 91.0% 0.008s 3.50e-05s 240 2 GpuElemwise{mul,no_inplace}(GpuFromHost.0, <CudaNdarrayType(float32, matrix)>) | |
4.7% 95.7% 0.008s 3.18e-05s 240 1 GpuFromHost(Elemwise{Cast{float32}}.0) | |
4.2% 99.8% 0.007s 2.81e-05s 240 8 GpuElemwise{Composite{(clip((i0 + i1), i2, i3) * tanh(i4))},no_inplace}(CudaNdarrayConstant{[[ 0.5]]}, GpuGemm{no_inplace}.0, CudaNdarrayConstant{[[ 0.]]}, CudaNdarrayConstant{[[ 1.]]}, GpuElemwise{Composite{((clip((i0 + i1), i2, i3) * (i4 * i5)) + (clip((i0 + i6), i2, i3) * tanh(i7)))},no_inplace}.0) | |
0.2% 100.0% 0.000s 1.09e-06s 240 0 Elemwise{Cast{float32}}(<TensorType(int8, col)>) | |
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime) | |
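Note (not part of the profiler output): within the forward scan above, roughly 80% of the per-step time goes to the four GpuGemm nodes, so the inner loop is essentially GEMM-bound. A per-Scan report like this one is produced when profiling is enabled on both the scan and the compiled function; the sketch below is a minimal toy recurrence (not the actual Keras model), assuming the Theano interfaces of this era (theano.scan's profile argument, theano.function(profile=True), ProfileStats.summary()).

import numpy as np
import theano
import theano.tensor as T

# Toy single-GEMM recurrence standing in for the real step function.
X = T.matrix('X')                                    # (n_steps, n_features)
W = theano.shared(np.eye(3, dtype=theano.config.floatX), name='W')

def step(x_t, h_prev):
    # One matrix product per step, analogous to the GpuGemm nodes above.
    return T.tanh(T.dot(h_prev, W) + x_t)

h0 = T.zeros((3,), dtype=theano.config.floatX)
h, _ = theano.scan(step, sequences=X, outputs_info=h0,
                   name='scan_fn', profile=True)     # per-Scan profile

f = theano.function([X], h[-1], profile=True)        # per-function profile
f(np.random.randn(240, 3).astype(theano.config.floatX))
f.profile.summary()                                  # prints tables in this format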
Scan Op profiling ( grad_of_scan_fn ) | |
================== | |
Message: None | |
Time in 8 calls of the op (for a total of 240 steps) 8.120770e-01s | |
Total time spent in calling the VM 6.931341e-01s (85.353%) | |
Total overhead (computing slices..) 1.189430e-01s (14.647%) | |
Time in all call to theano.grad() 1.037111e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
49.0% 49.0% 0.336s 1.27e-04s C 2640 11 theano.sandbox.cuda.blas.GpuGemm | |
27.1% 76.1% 0.185s 2.97e-05s C 6240 26 theano.sandbox.cuda.basic_ops.GpuElemwise | |
22.4% 98.5% 0.154s 1.07e-04s C 1440 6 theano.sandbox.cuda.blas.GpuDot22 | |
1.4% 100.0% 0.010s 4.07e-05s C 240 1 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.0% 100.0% 0.000s 1.42e-06s C 240 1 theano.tensor.elemwise.Elemwise | |
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
29.3% 29.3% 0.201s 1.19e-04s C 1680 7 GpuGemm{inplace} | |
22.4% 51.7% 0.154s 1.07e-04s C 1440 6 GpuDot22 | |
19.7% 71.4% 0.135s 1.41e-04s C 960 4 GpuGemm{no_inplace} | |
8.1% 79.5% 0.055s 3.83e-05s C 1440 6 GpuElemwise{add,no_inplace} | |
2.2% 81.7% 0.015s 2.10e-05s C 720 3 GpuElemwise{Add}[(0, 1)] | |
2.0% 83.7% 0.014s 2.91e-05s C 480 2 GpuElemwise{mul,no_inplace} | |
2.0% 85.7% 0.014s 2.83e-05s C 480 2 GpuElemwise{Mul}[(0, 1)] | |
1.8% 87.5% 0.012s 5.01e-05s C 240 1 GpuElemwise{Composite{((((i0 * clip(i1, i2, i3)) * i4) + ((i5 * clip(i1, i2, i3)) * i4)) + i6)},no_inplace} | |
1.7% 89.2% 0.012s 2.47e-05s C 480 2 GpuElemwise{Clip}[(0, 0)] | |
1.7% 90.9% 0.011s 4.78e-05s C 240 1 GpuElemwise{Composite{((((i0 + i1) * i2) + (i3 * i2)) + i4)},no_inplace} | |
1.5% 92.4% 0.010s 2.12e-05s C 480 2 GpuElemwise{Composite{Cast{float32}(AND(GE(i0, i1), LE(i0, i2)))},no_inplace} | |
1.4% 93.8% 0.010s 4.07e-05s C 240 1 GpuFromHost | |
1.1% 94.9% 0.008s 3.15e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)},no_inplace} | |
1.1% 96.0% 0.007s 3.08e-05s C 240 1 GpuElemwise{Mul}[(0, 3)] | |
0.9% 96.9% 0.006s 2.67e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)] | |
0.8% 97.7% 0.005s 2.29e-05s C 240 1 GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)] | |
0.8% 98.5% 0.005s 2.27e-05s C 240 1 GpuElemwise{Tanh}[(0, 0)] | |
0.8% 99.3% 0.005s 2.15e-05s C 240 1 GpuElemwise{Composite{Cast{float32}(AND(GE(i0, i1), LE(i0, i2)))}}[(0, 0)] | |
0.7% 100.0% 0.005s 1.94e-05s C 240 1 GpuElemwise{Composite{(i0 - sqr(i1))}}[(0, 1)] | |
0.0% 100.0% 0.000s 1.42e-06s C 240 1 Elemwise{Cast{float32}} | |
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
5.4% 5.4% 0.037s 1.53e-04s 240 2 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
5.0% 10.3% 0.034s 1.41e-04s 240 4 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
4.7% 15.0% 0.032s 1.35e-04s 240 1 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{0.20000000298}) | |
4.7% 19.7% 0.032s 1.34e-04s 240 3 GpuGemm{no_inplace}(<CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.4% 24.1% 0.030s 1.24e-04s 240 24 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.3% 28.4% 0.029s 1.23e-04s 240 34 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.3% 32.6% 0.029s 1.22e-04s 240 38 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.2% 36.9% 0.029s 1.21e-04s 240 29 GpuDot22(GpuElemwise{Mul}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>) | |
4.2% 41.1% 0.029s 1.20e-04s 240 41 GpuGemm{inplace}(GpuGemm{inplace}.0, TensorConstant{1.0}, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, <CudaNdarrayType(float32, matrix)>, TensorConstant{1.0}) | |
4.1% 45.2% 0.028s 1.16e-04s 240 16 GpuDot22(GpuElemwise{mul,no_inplace}.0, <CudaNdarrayType(float32, matrix)>) | |
4.1% 49.2% 0.028s 1.16e-04s 240 27 GpuDot22(GpuElemwise{Mul}[(0, 3)].0, <CudaNdarrayType(float32, matrix)>) | |
4.1% 53.3% 0.028s 1.16e-04s 240 33 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
4.1% 57.3% 0.028s 1.16e-04s 240 31 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 1)].0, TensorConstant{1.0}) | |
4.0% 61.4% 0.028s 1.15e-04s 240 42 GpuGemm{inplace}(GpuDot22.0, TensorConstant{1.0}, <CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 1)].0, TensorConstant{1.0}) | |
3.4% 64.8% 0.023s 9.66e-05s 240 23 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{mul,no_inplace}.0) | |
3.3% 68.1% 0.023s 9.52e-05s 240 26 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Mul}[(0, 3)].0) | |
3.3% 71.4% 0.023s 9.51e-05s 240 37 GpuDot22(<CudaNdarrayType(float32, matrix)>, GpuElemwise{Composite{((i0 * i1) * i2)}}[(0, 0)].0) | |
1.8% 73.3% 0.013s 5.24e-05s 240 36 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
1.8% 75.1% 0.012s 5.14e-05s 240 44 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
1.8% 76.9% 0.012s 5.11e-05s 240 39 GpuElemwise{add,no_inplace}(<CudaNdarrayType(float32, matrix)>, GpuGemm{inplace}.0) | |
... (remaining 25 Apply instances account for 23.13%(0.16s) of the runtime) | |
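Note (not part of the profiler output): a quick cross-check of the backward-scan numbers reported above; every value below is copied from the grad_of_scan_fn header and the Apply tables.

op_time = 8.120770e-01   # "Time in 8 calls of the op (for a total of 240 steps)"
calls, steps = 8, 240
print(op_time / calls)   # ~0.102 s per call, consistent with the 1.03e-01s
                         # per-call figure for Apply node 453 in the outer profile
print(op_time / steps)   # ~3.4 ms per backward step, versus ~0.7 ms per forward
                         # step (1.686399e-01s / 240 from the scan_fn profile)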
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:330 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.442388e+01s | |
Number of Apply nodes: 543 | |
Theano Optimizer time: 2.154786e+01s | |
Theano validate time: 3.120501e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 2.651706e+00s | |
Import time 1.034188e-02s | |
Time in all call to theano.grad() 1.037111e+00s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:332 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.318422e+00s | |
Number of Apply nodes: 137 | |
Theano Optimizer time: 1.716493e+00s | |
Theano validate time: 5.609584e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 4.905031e-01s | |
Import time 7.891417e-03s | |
Time in all call to theano.grad() 1.037111e+00s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:334 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 2.467041e+00s | |
Number of Apply nodes: 155 | |
Theano Optimizer time: 1.826469e+00s | |
Theano validate time: 7.342625e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 5.233629e-01s | |
Import time 5.436182e-03s | |
Time in all call to theano.grad() 1.037111e+00s | |
Function profiling | |
================== | |
Message: build/bdist.linux-x86_64/egg/keras/models.py:336 | |
Time in 0 calls to Function.__call__: 0.000000e+00s | |
Total compile time: 3.259403e+00s | |
Number of Apply nodes: 166 | |
Theano Optimizer time: 2.627834e+00s | |
Theano validate time: 7.786059e-02s | |
Theano Linker time (includes C, CUDA code generation/compiling): 5.099900e-01s | |
Import time 0.000000e+00s | |
Time in all call to theano.grad() 1.037111e+00s | |
Function profiling | |
================== | |
Message: Sum of all(5) printed profiles at exit excluding Scan op profile. | |
Time in 8 calls to Function.__call__: 1.868278e+01s | |
Time in Function.fn.__call__: 1.834628e+01s (98.199%) | |
Time in thunks: 1.794744e+01s (96.064%) | |
Total compile time: 5.692453e+01s | |
Number of Apply nodes: 532 | |
Theano Optimizer time: 4.953296e+01s | |
Theano validate time: 8.184333e-01s | |
Theano Linker time (includes C, CUDA code generation/compiling): 6.594770e+00s | |
Import time 2.145724e-01s | |
Time in all call to theano.grad() 1.037111e+00s | |
Class | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name> | |
61.6% 61.6% 11.056s 6.61e-03s C 1672 209 theano.tensor.elemwise.Elemwise | |
11.4% 73.0% 2.045s 2.56e-01s Py 8 1 theano.tensor.subtensor.AdvancedIncSubtensor | |
5.6% 78.6% 0.999s 6.24e-02s Py 16 2 theano.scan_module.scan_op.Scan | |
4.2% 82.8% 0.753s 7.84e-03s C 96 12 theano.sandbox.cuda.blas.GpuDot22 | |
3.6% 86.4% 0.647s 2.69e-02s C 24 3 theano.sandbox.cuda.basic_ops.HostFromGpu | |
3.4% 89.8% 0.613s 7.66e-02s C 8 1 theano.tensor.nnet.nnet.SoftmaxGrad | |
2.7% 92.5% 0.482s 1.51e-02s C 32 4 theano.tensor.elemwise.Sum | |
2.4% 94.9% 0.436s 1.82e-02s Py 24 3 theano.tensor.subtensor.AdvancedSubtensor | |
1.8% 96.7% 0.321s 1.34e-02s C 24 3 theano.sandbox.cuda.basic_ops.GpuFromHost | |
0.9% 97.6% 0.169s 1.06e-02s C 16 2 theano.tensor.basic.Alloc | |
0.7% 98.3% 0.129s 5.98e-04s C 216 27 theano.sandbox.cuda.basic_ops.GpuReshape | |
0.7% 99.1% 0.129s 2.48e-04s C 520 65 theano.sandbox.cuda.basic_ops.GpuElemwise | |
0.3% 99.4% 0.054s 1.35e-03s C 40 5 theano.sandbox.cuda.basic_ops.GpuCAReduce | |
0.2% 99.6% 0.044s 6.08e-04s C 72 9 theano.sandbox.cuda.basic_ops.GpuIncSubtensor | |
0.2% 99.8% 0.028s 2.67e-04s C 104 13 theano.sandbox.cuda.basic_ops.GpuAlloc | |
0.1% 99.9% 0.016s 1.99e-03s C 8 1 theano.sandbox.cuda.nnet.GpuSoftmax | |
0.1% 99.9% 0.012s 1.47e-03s C 8 1 theano.sandbox.cuda.blas.GpuDot22Scalar | |
0.1% 100.0% 0.009s 1.15e-03s Py 8 1 theano.sandbox.cuda.basic_ops.GpuFlatten | |
0.0% 100.0% 0.002s 2.06e-04s Py 8 1 theano.tensor.basic.Nonzero | |
0.0% 100.0% 0.001s 1.92e-06s C 416 52 theano.compile.ops.Shape_i | |
... (remaining 10 Classes account for 0.01%(0.00s) of the runtime) | |
Ops | |
--- | |
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name> | |
48.1% 48.1% 8.638s 1.08e+00s C 8 1 Elemwise{Composite{(i0 * log((i1 / i2)))}} | |
11.4% 59.5% 2.045s 2.56e-01s Py 8 1 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False} | |
10.0% 69.5% 1.796s 2.24e-01s C 8 1 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}} | |
4.6% 74.1% 0.826s 1.03e-01s Py 8 1 forall_inplace,gpu,grad_of_scan_fn} | |
4.2% 78.3% 0.753s 7.84e-03s C 96 12 GpuDot22 | |
3.6% 81.9% 0.647s 2.69e-02s C 24 3 HostFromGpu | |
3.4% 85.3% 0.613s 7.66e-02s C 8 1 SoftmaxGrad | |
2.7% 88.0% 0.482s 2.01e-02s C 24 3 Sum{axis=[1], acc_dtype=float64} | |
2.4% 90.5% 0.436s 1.82e-02s Py 24 3 AdvancedSubtensor | |
1.8% 92.3% 0.321s 1.34e-02s C 24 3 GpuFromHost | |
1.8% 94.0% 0.317s 3.96e-02s C 8 1 Elemwise{clip,no_inplace} | |
1.7% 95.7% 0.302s 1.26e-02s C 24 3 Elemwise{mul,no_inplace} | |
1.0% 96.7% 0.173s 2.17e-02s Py 8 1 forall_inplace,gpu,scan_fn} | |
0.9% 97.6% 0.169s 1.06e-02s C 16 2 Alloc | |
0.7% 98.3% 0.129s 8.48e-04s C 152 19 GpuReshape{2} | |
0.3% 98.6% 0.054s 1.35e-03s C 40 5 GpuCAReduce{add}{1,1,0} | |
0.2% 98.8% 0.035s 1.11e-03s C 32 4 GpuIncSubtensor{Inc;:int64:} | |
0.2% 99.0% 0.030s 3.46e-04s C 88 11 GpuElemwise{Add}[(0, 0)] | |
0.2% 99.1% 0.027s 2.84e-04s C 96 12 GpuAlloc{memset_0=True} | |
0.1% 99.3% 0.025s 2.39e-04s C 104 13 GpuElemwise{Composite{(i0 * sqr(i1))},no_inplace} | |
... (remaining 109 Ops account for 0.71%(0.13s) of the runtime) | |
Apply | |
------ | |
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name> | |
48.1% 48.1% 8.638s 1.08e+00s 8 424 Elemwise{Composite{(i0 * log((i1 / i2)))}}(AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, InplaceDimShuffle{0,x}.0) | |
11.4% 59.5% 2.045s 2.56e-01s 8 427 AdvancedIncSubtensor{inplace=False, set_instead_of_inc=False}(Alloc.0, Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
10.0% 69.5% 1.796s 2.24e-01s 8 425 Elemwise{Composite{(AND(GE(i0, i1), LE(i0, i2)) * (((i3 * i4 * i5) / i6) + i7))}}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}, TensorConstant{(1, 1) of -1.0}, InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0, Elemwise{clip,no_inplace}.0, Elemwise{true_div,no_inplace}.0) | |
4.6% 74.1% 0.826s 1.03e-01s 8 453 forall_inplace,gpu,grad_of_scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{mul,no_inplace}.0, GpuElemwise{Tanh}[(0, 0)].0, GpuDimShuffle{0,2,1}.0, GpuElemwise{Composite{(i0 - sqr(i1))},no_inplace}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, GpuSubtensor{int64:int64:int64}.0, Subtensor{int64:int64:int64}.0 | |
3.4% 77.5% 0.613s 7.66e-02s 8 431 SoftmaxGrad(Reshape{2}.0, HostFromGpu.0) | |
1.8% 79.4% 0.326s 4.07e-02s 8 406 HostFromGpu(GpuReshape{3}.0) | |
1.8% 81.2% 0.320s 4.01e-02s 8 401 HostFromGpu(GpuSoftmax.0) | |
1.8% 82.9% 0.317s 3.96e-02s 8 416 Elemwise{clip,no_inplace}(AdvancedSubtensor.0, TensorConstant{(1, 1) of 1e-07}, TensorConstant{(1, 1) of 1.0}) | |
1.7% 84.6% 0.302s 3.77e-02s 8 150 Elemwise{mul,no_inplace}(InplaceDimShuffle{0,x}.0, AdvancedSubtensor.0) | |
1.2% 85.8% 0.218s 2.73e-02s 8 75 AdvancedSubtensor(y, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.2% 87.0% 0.218s 2.72e-02s 8 412 AdvancedSubtensor(HostFromGpu.0, Subtensor{int64}.0, Subtensor{int64}.0) | |
1.0% 88.0% 0.173s 2.17e-02s 8 352 forall_inplace,gpu,scan_fn}(Elemwise{Composite{minimum(minimum(minimum(minimum(i0, i1), i2), i3), i4)}}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, GpuSubtensor{int64:int64:int8}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType( | |
0.9% 88.9% 0.169s 2.11e-02s 8 413 Alloc(TensorConstant{(1, 1, 1) of 0.0}, Shape_i{0}.0, Shape_i{1}.0, Shape_i{2}.0) | |
0.9% 89.9% 0.166s 2.07e-02s 8 6 GpuFromHost(<TensorType(float32, 3D)>) | |
0.9% 90.8% 0.161s 2.01e-02s 8 173 Sum{axis=[1], acc_dtype=float64}(Elemwise{mul,no_inplace}.0) | |
0.9% 91.6% 0.161s 2.01e-02s 8 426 Sum{axis=[1], acc_dtype=float64}(Elemwise{Composite{(i0 * log((i1 / i2)))}}.0) | |
0.9% 92.5% 0.161s 2.01e-02s 8 420 Sum{axis=[1], acc_dtype=float64}(Elemwise{clip,no_inplace}.0) | |
0.9% 93.4% 0.155s 1.93e-02s 8 433 GpuFromHost(SoftmaxGrad.0) | |
0.5% 93.9% 0.092s 1.15e-02s 8 91 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
0.4% 94.3% 0.066s 8.25e-03s 8 94 GpuDot22(GpuReshape{2}.0, GpuReshape{2}.0) | |
... (remaining 512 Apply instances account for 5.72%(1.03s) of the runtime) | |
training time 19.0255179405 |
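Note (not part of the profiler output): the per-function blocks and the final "Sum of all(5) printed profiles at exit" summary are what Theano emits when profiling is enabled for the whole process, while the "training time" line above presumably comes from the training script itself. A minimal sketch of such a setup, with a hypothetical model and fit call (not reproduced from this gist):

import os
import time

# The profile flag must be set before Theano is imported for the at-exit
# "Sum of all printed profiles" summary to be printed.
os.environ.setdefault('THEANO_FLAGS', 'profile=True,device=gpu,floatX=float32')

# ... build and compile the Keras model here (hypothetical) ...

start = time.time()
# model.fit(X_train, y_train)   # hypothetical fit call being profiled
print('training time', time.time() - start)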