Skip to content

Instantly share code, notes, and snippets.

@shackenberg
Created September 9, 2013 11:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save shackenberg/6494612 to your computer and use it in GitHub Desktop.
Save shackenberg/6494612 to your computer and use it in GitHub Desktop.
Profile results for forward function in Theano using scan
from numpy import zeros, dot, exp, tanh, array, allclose
from numpy.random import randn
from copy import deepcopy
from time import time
from theano import tensor as T
from theano import function, shared, config, scan
# dtype used for every array and shared variable below; follows Theano's
# `floatX` configuration setting.
FLOAT_PRECISION = config.floatX
class Network:
    """Reference NumPy implementation of a small recurrent layer.

    At every time step a source vector ``[1, x_t, previous output]`` is
    multiplied with three weight matrices (``WGI``, ``WGO``, ``WCI``).  The
    ``WGI``/``WGO`` products go through a logistic sigmoid, the ``WCI``
    product through ``tanh``.  All intermediate values live in preallocated
    buffers with ``maxlen`` rows that are overwritten by each ``forward`` call.
    """

    def __init__(self, ni, ns, initial=0.1, maxlen=2500):
        """Set up dimensions, random weights and work buffers.

        ni      -- number of input values per time step
        ns      -- number of state/output units
        initial -- scale factor applied to the standard-normal weights
        maxlen  -- number of time steps the work buffers can hold
        """
        # One constant 1 entry, ni inputs and ns fed-back outputs per step.
        na = 1 + ni + ns
        self.dims = ni, ns, na
        # Forward maxlen so the requested buffer length is honoured instead
        # of always falling back to init_variables' own default.
        self.init_variables(initial, maxlen)

    def init_variables(self, initial, maxlen=2500):
        """(Re)create the random weights and zero-filled work buffers.

        initial -- scale factor applied to the standard-normal weights
        maxlen  -- number of rows (time steps) allocated for each buffer
        """
        n = maxlen
        ni, ns, na = self.dims
        # Weight matrices, each of shape (ns, na).
        self.WGI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WGO = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        self.WCI = array(randn(ns, na) * initial, dtype=FLOAT_PRECISION)
        # Per-step source vectors, shape (n, na).
        self.source = array(zeros([n, na]), dtype=FLOAT_PRECISION)
        # Per-step pre-activations (``*x``) and activations, shape (n, ns).
        self.cix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.ci = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gix = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gi = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.gox = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.go = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.state = array(zeros([n, ns]), dtype=FLOAT_PRECISION)
        self.output = array(zeros([n, ns]), dtype=FLOAT_PRECISION)

    def forward(self, xs):
        """Run the network over ``xs`` and return the outputs per step.

        xs -- sequence of time steps; ``xs[t]`` is written into the ``ni``
              input slots of the source vector for step ``t``

        Returns ``self.output[:len(xs)]``.  This is a view into the internal
        buffer, so a later ``forward`` call overwrites its contents.

        Raises IndexError when ``xs`` has more steps than the buffers hold.
        """
        def ffunc(x):
            # Logistic sigmoid.
            return 1.0/(1.0+exp(-x))

        ni, ns, na = self.dims
        n = len(xs)
        capacity = len(self.source)
        if n > capacity:
            # Same exception type NumPy would raise mid-loop, but reported
            # before any buffer has been partially overwritten.
            raise IndexError(
                "sequence of length %d exceeds buffer length %d" % (n, capacity))
        # The first step sees an all-zero previous output.
        prev = zeros(ns)
        for t in range(n):
            # Assemble [1, x_t, previous output].
            self.source[t, 0] = 1
            self.source[t, 1:1+ni] = xs[t]
            self.source[t, 1+ni:] = prev
            # Write the three matrix-vector products straight into the buffers.
            dot(self.WGI, self.source[t], out=self.gix[t])
            dot(self.WGO, self.source[t], out=self.gox[t])
            dot(self.WCI, self.source[t], out=self.cix[t])
            self.gi[t] = ffunc(self.gix[t])
            self.ci[t] = tanh(self.cix[t])
            # The state depends only on the current step's values.
            self.state[t] = self.ci[t]*self.gi[t]
            self.go[t] = ffunc(self.gox[t])
            self.output[t] = tanh(self.state[t]) * self.go[t]
            prev = self.output[t]
        return self.output[:n]
class Network_Theano_Scan():
    """Theano ``scan`` version of the forward pass, sharing another net's weights.

    Weights, inputs and outputs are held in Theano shared variables.  The
    input sequence must be uploaded by the caller via
    ``Txs_shared.set_value(...)`` before ``forward`` is called; results are
    read back from ``Toutput.get_value()``.
    """

    def __init__(self, original_net, ni, ns, maxlen=2500):
        """Copy weights from ``original_net`` and compile the forward function.

        original_net -- object providing ``WGI``, ``WGO`` and ``WCI`` arrays
        ni           -- number of input values per time step
        ns           -- number of state/output units
        maxlen       -- number of time steps the shared buffers can hold
        """
        na = 1 + ni + ns
        # Copy the weights to make sure both networks produce the same results.
        self.copy_weights(original_net)
        self.uploadweightsTheano()
        self.initforwardTheano(ns, maxlen, na)

    def copy_weights(self, original_net):
        """Take independent copies of the three weight matrices."""
        self.WGI = deepcopy(original_net.WGI)
        self.WGO = deepcopy(original_net.WGO)
        self.WCI = deepcopy(original_net.WCI)

    def uploadweightsTheano(self):
        """Wrap the copied weight matrices in Theano shared variables."""
        self.TWGI_shared = shared(self.WGI)
        self.TWGO_shared = shared(self.WGO)
        self.TWCI_shared = shared(self.WCI)

    def initforwardTheano(self, ns, n, na):
        """Build the scan graph and compile it into ``self.Tforward``.

        ns -- number of state/output units
        n  -- number of rows allocated for the shared input/output buffers
        na -- length of the per-step source vector (1 + inputs + ns)
        """
        # The source vector is [1, inputs, previous output], so the number
        # of inputs follows from na and ns.
        ni = na - 1 - ns
        # Length-1 constant: used as the leading 1 of the source vector and
        # as the broadcast constant inside the sigmoid.
        Tone = array([1.0], dtype=FLOAT_PRECISION)

        def Tffunc(x):
            # Logistic sigmoid.
            return Tone/(Tone+T.exp(-x))

        self.Toutput = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        self.Tgo_shared = shared(zeros([n, ns], dtype=FLOAT_PRECISION))
        # Input buffer has one row per step and ni columns, so the compiled
        # function is shape-consistent even before the caller uploads data.
        self.Txs_shared = shared(zeros([n, ni], dtype=FLOAT_PRECISION))
        Tn = T.iscalar('Tn')
        # scan iterates over the step indices 0 .. Tn-1.
        Ta = T.arange(Tn)

        def step(Tt, Tprev_output):
            # One time step: same computation as Network.forward's loop body.
            Txs = self.Txs_shared[Tt]
            Tsource = T.concatenate([Tone, Txs, Tprev_output])
            Tgix = T.dot(self.TWGI_shared, Tsource)
            Tgox = T.dot(self.TWGO_shared, Tsource)
            Tcix = T.dot(self.TWCI_shared, Tsource)
            Tgi = Tffunc(Tgix)
            Tci = T.tanh(Tcix)
            Tstate = Tci * Tgi
            Tgo = Tffunc(Tgox)
            output = T.tanh(Tstate) * Tgo
            return output

        # The recurrence starts from an all-zero previous output.
        Toutput0 = T.zeros([ns])
        Toutput, _ = scan(step,
                          sequences=[Ta],
                          outputs_info=[Toutput0],
                          non_sequences=[])
        # Store the first Tn result rows in the shared output buffer; the
        # compiled function returns nothing and works purely via this update.
        TToutput = (self.Toutput, T.set_subtensor(self.Toutput[:Tn], Toutput))
        updates = [TToutput]
        self.Tforward = function([Tn], outputs=[], updates=updates)

    def forward(self, xs):
        """Run the compiled function for ``len(xs)`` steps.

        Only the length of ``xs`` is used; the data itself is read from
        ``self.Txs_shared``.  Nothing is returned -- the outputs are written
        into ``self.Toutput``.
        """
        n = len(xs)
        self.Tforward(n)
# Benchmark driver: run the same random sequence through the NumPy and the
# Theano/scan implementation, time both, and compare the outputs.

# init
ninput = 48
nstates = 100
seqlength = 1000
network_orig = Network(ninput, nstates)
network_theano_scan = Network_Theano_Scan(network_orig, ninput, nstates)
data = array(randn(seqlength, ninput), dtype=FLOAT_PRECISION)

# numpy
starttime = time()
output = network_orig.forward(data)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print("numpy takes {}s".format(time() - starttime))

# theano_scan
# The upload of the input data is deliberately kept outside the timed region.
network_theano_scan.Txs_shared.set_value(data)
starttime = time()
network_theano_scan.forward(data)
print("theano scan takes {}s".format(time() - starttime))
output_theano_scan = network_theano_scan.Toutput.get_value()[:seqlength]

# check the results
rtol = 1e-04
atol = 1e-05
results_match = allclose(output, output_theano_scan, rtol=rtol, atol=atol)
if not results_match:
    # Drop into the debugger so the mismatching arrays can be inspected.
    import pdb
    pdb.set_trace()
# Report "pass" only when the comparison actually succeeded.
if results_match:
    print("pass: theano scan code")
else:
    print("FAIL: theano scan output differs from numpy output")
With the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile=True
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 3.571680e-01s (99.980%)
Time in thunks: 3.571451e-01s (99.973%)
Total compile time: 7.711120e-01s
Theano Optimizer time: 5.242310e-01s
Theano validate time: 1.065993e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.425768e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.1% 100.0% 0.000s 1.99e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 9.54e-07s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 4.05e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
99.9% 99.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.1% 100.0% 0.000s 1.99e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.54e-07s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 2.15e-06s C 1 1 Elemwise{le,no_inplace}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
99.9% 99.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
0.1% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
0.0% 100.0% 0.000s 5.01e-06s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
0.0% 100.0% 0.000s 5.01e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
0.0% 100.0% 0.000s 4.77e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
0.0% 100.0% 0.000s 4.05e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
0.0% 100.0% 0.000s 3.10e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
0.0% 100.0% 0.000s 3.10e-06s 1 1 Elemwise{Cast{int64}}(Tn)
0.0% 100.0% 0.000s 2.86e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
0.0% 100.0% 0.000s 2.15e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
0.0% 100.0% 0.000s 2.15e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
0.0% 100.0% 0.000s 9.54e-07s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 9.54e-07s 1 16 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
0.0% 100.0% 0.000s 9.54e-07s 1 13 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}.0)
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 3.567450e-01s
Total time spent in calling the VM 2.880974e-01s (80.757%)
Total overhead (computing slices..) 6.864762e-02s (19.243%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
36.8% 36.8% 0.092s 3.06e-05s C 3000 3 GpuAlloc{memset_0=True}
29.1% 65.9% 0.072s 7.24e-05s C 1000 1 GpuJoin
24.4% 90.3% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
7.6% 97.8% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
1.2% 99.0% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.7% 99.7% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.3% 100.0% 0.001s 7.09e-07s C 1000 1 ScalarFromTensor
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
29.1% 29.1% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
12.5% 41.5% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.4% 53.9% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
12.0% 65.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
8.3% 74.2% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
8.1% 82.4% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.9% 90.3% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
7.6% 97.8% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.7% 98.5% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.4% 99.0% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.4% 99.4% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 99.7% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.3% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 3.572400e-01s
Time in Function.fn.__call__: 6.452653e-01s (180.625%)
Time in thunks: 6.063292e-01s (169.726%)
Total compile time: 9.907391e-01s
Theano Optimizer time: 7.288530e-01s
Theano validate time: 2.013111e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.546458e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.9% 0.002s 1.81e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 3.81e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.22e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 5.96e-06s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
58.9% 58.9% 0.357s 3.57e-01s Py 1 1 forall_inplace,gpu,scan_fn}
15.2% 74.0% 0.092s 3.06e-05s C 3001 4 GpuAlloc{memset_0=True}
11.9% 86.0% 0.072s 7.24e-05s C 1000 1 GpuJoin
10.0% 96.0% 0.061s 2.02e-05s C 3000 3 GpuGemv{inplace}
3.1% 99.1% 0.019s 1.88e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.5% 99.6% 0.003s 9.77e-07s C 3000 3 Shape_i{0}
0.3% 99.9% 0.002s 1.81e-06s C 1000 1 GpuSubtensor{int32}
0.1% 100.0% 0.001s 7.10e-07s C 1005 6 ScalarFromTensor
0.0% 100.0% 0.000s 3.22e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.50e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 3.46e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 6.91e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 4.77e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 4.05e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 3.10e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 2.86e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
58.9% 58.9% 0.357s 3.57e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
11.9% 70.8% 0.072s 7.24e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
5.1% 75.9% 0.031s 3.10e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
5.1% 81.0% 0.031s 3.09e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
4.9% 85.9% 0.030s 2.98e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 0.]}, Shape_i{0}.0)
3.4% 89.4% 0.021s 2.08e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.3% 92.7% 0.020s 2.03e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.2% 95.9% 0.020s 1.97e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
3.1% 99.1% 0.019s 1.88e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
0.3% 99.3% 0.002s 1.81e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
0.2% 99.5% 0.001s 1.10e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.2% 99.7% 0.001s 9.84e-07s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 99.8% 0.001s 8.51e-07s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
0.1% 100.0% 0.001s 7.09e-07s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
0.0% 100.0% 0.000s 1.99e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 1.09508501e+21]]}, Elemwise{Compo
0.0% 100.0% 0.000s 3.22e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
0.0% 100.0% 0.000s 1.50e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
0.0% 100.0% 0.000s 6.91e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
0.0% 100.0% 0.000s 5.96e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
0.0% 100.0% 0.000s 5.96e-06s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
With the following flags: export THEANO_FLAGS=mode=FAST_RUN,device=gpu,floatX=float32,profile_memory=True,profile=True
Function profiling
==================
Message: None
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 8.472540e-01s (99.990%)
Time in thunks: 8.466234e-01s (99.916%)
Total compile time: 7.679639e-01s
Theano Optimizer time: 5.219090e-01s
Theano validate time: 1.056147e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.417691e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
0.0% 100.0% 0.000s 2.18e-04s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 3.62e-06s C 5 5 <class 'theano.tensor.basic.ScalarFromTensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
0.0% 100.0% 0.000s 8.11e-06s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
100.0% 100.0% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
0.0% 100.0% 0.000s 2.18e-04s C 1 1 GpuAlloc{memset_0=True}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 3.62e-06s C 5 5 ScalarFromTensor
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
0.0% 100.0% 0.000s 4.05e-06s C 1 1 Elemwise{add,no_inplace}
0.0% 100.0% 0.000s 3.81e-06s C 1 1 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
100.0% 100.0% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 9 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}(Elemwise{le,no_inplace}.0,
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 8 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)](Elemwise{le,no_inp
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 8.11e-06s 1 20 GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0,
input 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 7.87e-06s 1 14 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 6.91e-06s 1 0 ScalarFromTensor(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.0% 100.0% 0.000s 5.96e-06s 1 10 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Ca
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
input 4: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 6 Elemwise{le,no_inplace}(Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int8, shape=(), strides=c
0.0% 100.0% 0.000s 5.01e-06s 1 1 Elemwise{Cast{int64}}(Tn)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 4.05e-06s 1 3 Elemwise{add,no_inplace}(TensorConstant{1}, Elemwise{Cast{int64}}.0)
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.81e-06s 1 5 Elemwise{Composite{[Switch(LT(i0, i1), i0, i1)]}}(TensorConstant{1}, Elemwise{add
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 18 ScalarFromTensor(Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 3.10e-06s 1 12 ScalarFromTensor(Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)].0
input 0: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=8, strides=c
0.0% 100.0% 0.000s 2.86e-06s 1 11 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2, i3)]}}[(0,
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
input 3: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 2 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 16 Apply account for 117B/2208117B ((0.01%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Scan Op profiling ( scan_fn )
==================
Message: None
Time in 1 calls of the op (for a total of 1000 steps) 8.461490e-01s
Total time spent in calling the VM 7.661736e-01s (90.548%)
Total overhead (computing slices..) 7.997537e-02s (9.452%)
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 <class 'theano.tensor.basic.ScalarFromTensor'>
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
37.2% 37.2% 0.131s 4.37e-05s C 3000 3 GpuAlloc{memset_0=True}
25.3% 62.6% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
24.8% 87.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
7.4% 94.8% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
3.1% 97.9% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
1.1% 99.0% 0.004s 3.88e-06s C 1000 1 ScalarFromTensor
1.0% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
... (remaining 0 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
24.8% 24.8% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
12.5% 37.3% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.5% 49.8% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
12.2% 62.0% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.5% 70.5% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 79.0% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
8.4% 87.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
7.4% 94.8% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
1.1% 95.9% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.1% 97.0% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
1.0% 98.0% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
1.0% 99.0% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
1.0% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 0 Apply instances account for 0.00%(0.00s) of the runtime)
Memory Profile
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 2KB
Max if linker=c|py: 2KB
Memory saved if gc is enabled (linker=c|py): 0KB
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
... (remaining 13 Apply account for 3616B/3616B ((100.00%)) of the Apply with dense outputs sizes)
All Apply nodes have output sizes that take less than 1024B.
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Function profiling
==================
Message: Sum of all printed profiles at exit
Time in 1 calls to Function.__call__: 8.473389e-01s
Time in Function.fn.__call__: 1.613428e+00s (190.411%)
Time in thunks: 1.198847e+00s (141.484%)
Total compile time: 9.870379e-01s
Theano Optimizer time: 7.257051e-01s
Theano validate time: 1.985645e-02s
Theano Linker time (includes C, CUDA code generation/compiling): 2.541780e-01s
Class
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Class name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 <class 'theano.scan_module.scan_op.Scan'>
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 <class 'theano.sandbox.cuda.basic_ops.GpuAlloc'>
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 <class 'theano.sandbox.cuda.blas.GpuGemv'>
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuJoin'>
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 <class 'theano.sandbox.cuda.basic_ops.GpuElemwise'>
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 <class 'theano.tensor.opt.Shape_i'>
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 <class 'theano.tensor.basic.ScalarFromTensor'>
0.3% 100.0% 0.004s 3.65e-06s C 1001 2 <class 'theano.sandbox.cuda.basic_ops.GpuSubtensor'>
0.0% 100.0% 0.000s 6.44e-06s C 11 11 <class 'theano.tensor.elemwise.Elemwise'>
0.0% 100.0% 0.000s 3.81e-05s C 1 1 <class 'theano.sandbox.cuda.basic_ops.GpuIncSubtensor'>
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 <class 'theano.tensor.basic.ARange'>
0.0% 100.0% 0.000s 1.00e-05s C 1 1 <class 'theano.tensor.subtensor.Subtensor'>
... (remaining 0 Classes account for 0.00%(0.00s) of the runtime)
Ops
---
<% time> <sum %> <apply time> <time per call> <type> <#call> <#apply> <Op name>
70.6% 70.6% 0.846s 8.46e-01s Py 1 1 forall_inplace,gpu,scan_fn}
11.0% 81.5% 0.131s 4.38e-05s C 3001 4 GpuAlloc{memset_0=True}
7.4% 89.0% 0.089s 2.97e-05s C 3000 3 GpuGemv{inplace}
7.3% 96.3% 0.087s 8.73e-05s C 1000 1 GpuJoin
2.2% 98.5% 0.026s 2.62e-05s C 1000 1 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), t
0.9% 99.4% 0.011s 3.61e-06s C 3000 3 Shape_i{0}
0.3% 99.7% 0.004s 3.88e-06s C 1005 6 ScalarFromTensor
0.3% 100.0% 0.004s 3.65e-06s C 1000 1 GpuSubtensor{int32}
0.0% 100.0% 0.000s 3.81e-05s C 1 1 GpuIncSubtensor{InplaceSet;:int32:}
0.0% 100.0% 0.000s 1.60e-05s Py 1 1 ARange
0.0% 100.0% 0.000s 5.36e-06s C 2 2 Elemwise{Composite{[Composite{[add(sub(i0, i1), i2)]}(sub(i0, i1), i2,
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Subtensor{int64:int64:int8}
0.0% 100.0% 0.000s 1.00e-05s C 1 1 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1),
0.0% 100.0% 0.000s 9.06e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 GpuSubtensor{int64:int64:int8}
0.0% 100.0% 0.000s 8.11e-06s C 1 1 Elemwise{Composite{[Switch(i0, i1, minimum(i2, i3))]}}[(0, 2)]
0.0% 100.0% 0.000s 5.96e-06s C 1 1 Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Compos
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{le,no_inplace}
0.0% 100.0% 0.000s 5.01e-06s C 1 1 Elemwise{Cast{int64}}
... (remaining 2 Ops account for 0.00%(0.00s) of the runtime)
Apply
------
<% time> <sum %> <apply time> <time per call> <#call> <id> <Apply name>
70.6% 70.6% 0.846s 8.46e-01s 1 19 forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int32, shape=(1000,), strides=c
input 2: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 3: dtype=float32, shape=(100, 149), strides=c
input 4: dtype=float32, shape=(1000, 48), strides=c
input 5: dtype=float32, shape=(100, 149), strides=c
input 6: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
7.3% 77.9% 0.087s 8.73e-05s 1000 8 GpuJoin(TensorConstant{0}, CudaNdarrayConstant{[ 1.34258989e+13]}, GpuSubtensor{
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=float32, shape=(1,), strides=c
input 2: dtype=float32, shape=(48,), strides=(1,)
input 3: dtype=float32, shape=(100,), strides=c
output 0: dtype=float32, shape=(149,), strides=(1,)
3.7% 81.5% 0.044s 4.41e-05s 1000 7 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.7% 85.2% 0.044s 4.39e-05s 1000 6 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
3.6% 88.8% 0.043s 4.31e-05s 1000 5 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[ 1.34258989e+13]}, Shape_i{0}.0)
input 0: dtype=float32, shape=(1,), strides=c
input 1: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 91.3% 0.030s 2.99e-05s 1000 9 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 93.8% 0.030s 2.97e-05s 1000 10 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.5% 96.3% 0.030s 2.96e-05s 1000 11 GpuGemv{inplace}(GpuAlloc{memset_0=True}.0, TensorConstant{1.0}, <CudaNdarrayType
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(), strides=c
input 2: dtype=float32, shape=(100, 149), strides=c
input 3: dtype=float32, shape=(149,), strides=(1,)
input 4: dtype=float32, shape=(), strides=c
output 0: dtype=float32, shape=(100,), strides=(1,)
2.2% 98.4% 0.026s 2.62e-05s 1000 12 GpuElemwise{Composite{[Composite{[Composite{[mul(scalar_sigmoid(i0), tanh(i1))]}(
input 0: dtype=float32, shape=(100,), strides=(1,)
input 1: dtype=float32, shape=(100,), strides=(1,)
input 2: dtype=float32, shape=(100,), strides=(1,)
output 0: dtype=float32, shape=(100,), strides=(1,)
0.3% 98.8% 0.004s 3.93e-06s 1000 1 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 99.1% 0.004s 3.88e-06s 1000 0 ScalarFromTensor(<TensorType(int32, scalar)>)
input 0: dtype=int32, shape=(), strides=c
output 0: dtype=int32, shape=4, strides=c
0.3% 99.4% 0.004s 3.65e-06s 1000 4 GpuSubtensor{int32}(<CudaNdarrayType(float32, matrix)>, ScalarFromTensor.0)
input 0: dtype=float32, shape=(1000, 48), strides=c
input 1: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(48,), strides=(1,)
0.3% 99.7% 0.003s 3.48e-06s 1000 2 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.3% 100.0% 0.003s 3.40e-06s 1000 3 Shape_i{0}(<CudaNdarrayType(float32, matrix)>)
input 0: dtype=float32, shape=(100, 149), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 2.18e-04s 1 15 GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composi
input 0: dtype=float32, shape=(1, 1), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=float32, shape=(1000, 100), strides=(100, 1)
0.0% 100.0% 0.000s 3.81e-05s 1 21 GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubten
input 0: dtype=float32, shape=(2500, 100), strides=c
input 1: dtype=float32, shape=(1000, 100), strides=(100, 1)
input 2: dtype=int32, shape=4, strides=c
output 0: dtype=float32, shape=(2500, 100), strides=(100, 1)
0.0% 100.0% 0.000s 1.60e-05s 1 2 ARange(TensorConstant{0}, Tn, TensorConstant{1})
input 0: dtype=int8, shape=(), strides=c
input 1: dtype=int32, shape=(), strides=c
input 2: dtype=int8, shape=(), strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 4 Elemwise{Composite{[Composite{[Composite{[Switch(LT(i0, i1), Switch(i2, i1, i3),
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int8, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
0.0% 100.0% 0.000s 1.00e-05s 1 17 Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Con
input 0: dtype=int32, shape=(1000,), strides=c
input 1: dtype=int64, shape=8, strides=c
input 2: dtype=int64, shape=8, strides=c
input 3: dtype=int8, shape=1, strides=c
output 0: dtype=int32, shape=(1000,), strides=c
0.0% 100.0% 0.000s 9.06e-06s 1 7 Elemwise{Composite{[Composite{[maximum(add(i0, i1), i1)]}(sub(i0, i1), i2)]}}(Ele
input 0: dtype=int64, shape=(), strides=c
input 1: dtype=int64, shape=(), strides=c
input 2: dtype=int64, shape=(), strides=c
output 0: dtype=int64, shape=(), strides=c
... (remaining 15 Apply instances account for 0.01%(0.00s) of the runtime)
Memory Profile (the max between all functions in that profile)
(Sparse variables are ignored)
---
Max if linker=cvm (default): unknown
Max if no gc (allow_gc=False): 395KB
Max if linker=c|py: 395KB
Memory saved if gc is enabled (linker=c|py): 0KB
This list is based on all functions in the profile
<Sum apply outputs (bytes)> <Apply outputs shape> <created/inplace/view> <Apply node>
1000000B [(2500, 100)] i GpuIncSubtensor{InplaceSet;:int32:}(<CudaNdarrayType(float32, matrix)>, GpuSubtensor{int64:int64:int8}.0, ScalarFromTensor.0)
400000B [(1000, 100)] c GpuAlloc{memset_0=True}(CudaNdarrayConstant{[[ 0.]]}, Elemwise{Composite{[Composite{[Composite{[Composite{[Composite{[Composite{[add(Cast{int64}(i0), int_div(i1, i2))]}(NEQ(i0, i1), i2, i3)]}(mod(i0, i1), i2, i0, i1)]}(Composite{[Composite{[Composite{[sub(Switch(i0, i1, i2), i3)]}(LT(i0, i1), Switch(i2, i1, i3), Switch(i4, i0, i5), i1)]}(i0, i1, LT(i2, i1), i2, LT(i0, i3), i3)]}(i0, i1, i2, i3), i4, i1)]}(i0, i1, add(i0, i2), i2, i3)]}(maximum(i0, i1), i2, i3, i4)]}}.0, TensorConstant{100})
400000B [(1000, 100)] i forall_inplace,gpu,scan_fn}(Elemwise{Cast{int64}}.0, Subtensor{int64:int64:int8}.0, GpuAlloc{memset_0=True}.0, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>, <CudaNdarrayType(float32, matrix)>)
400000B [(1000, 100)] v GpuSubtensor{int64:int64:int8}(forall_inplace,gpu,scan_fn}.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
4000B [(1000,)] c ARange(TensorConstant{0}, Tn, TensorConstant{1})
4000B [(1000,)] v Subtensor{int64:int64:int8}(ARange.0, ScalarFromTensor.0, ScalarFromTensor.0, Constant{1})
... (remaining 29 Apply account for 3733B/2211733B ((0.17%)) of the Apply with dense outputs sizes)
<created/inplace/view> is taken from the Op's declaration.
Apply nodes marked 'inplace' or 'view' may actually allocate memory, this is not reported here. If you use DebugMode, warnings will be emitted in those cases.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment