New APIs of SINGA
  1. model.Model.save_states/load_states are used when we have the model class definition. The states are saved for inference/evaluation/retraining/transfer learning.
  2. model.save/load are used to pickle the model class and its attributes. The model is saved for inference/evaluation/transfer learning, where we may not have the model class definition.
  3. sonnx.to_onnx() and SONNXModel are used to save the model in ONNX format and to load an ONNX model as a Model instance. The model is saved for inference/evaluation/transfer learning, where we may not have the model class definition.

Note: for 1, we compile the model first and then call load_states; for 2 and 3, we load the model first and then compile. This is not consistent currently; I do not have a better solution right now. The three flows are sketched below.
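For concreteness, a minimal sketch of the three restore flows under the proposed APIs (MyModel, the file names, and the compile arguments are placeholders):

# flow 1: the model class definition is available; compile, then load_states
m = MyModel()
m.compile([x], use_graph=True)
aux_states = m.load_states('mymodel.zip')

# flow 2: no class definition; load the pickled model, then compile
m = singa.model.load('mymodel.zip')
m.compile([x], use_graph=True)

# flow 3: no class definition; wrap the onnx model, then compile
ox = onnx.load('mymodel.onnx')
m = SONNXModel(ox)
m.compile([x], use_graph=True)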

class Layer:
    def __init__(self,):
        self.has_initialized = False

    def get_params(self):
        """Return the params of this layer and its sublayers as a dict.
        The param name is layername.param_name, e.g., for
        self.W = Tensor(), self.b = Tensor()
        the names of W and b are like conv1.W and conv1.b.
        """
        params = {}
        for each sublayer:
            params.update(sublayer.get_params())
        for each param in this layer:
            params[param.name] = param
        return params

    def set_params(self, params):
        """params: a dict, name --> tensor or numpy array.
        Copy the values of params into the layer's param tensors.
        """
        for each sublayer of this layer:
            sublayer.set_params(params)
        for each param of this layer:
            param.copy_from(params[param.name])

    def get_states(self):
        """Return the states of this layer and its sublayers that are necessary
        for model training/evaluation/inference.
        The states include the params and others, e.g., the running mean and var of batchnorm.
        """

    def set_states(self, states):
        """states: dict, name --> value.
        If the value is a tensor or numpy array, copy the value.
        """

    def initialize(self, *args, **kwargs):
        """Initialize internal states, e.g., the cudnn handle and the params, based on the inputs.
        Set the name of each param based on the layer name, e.g., if the layer name is 'conv1',
        then the weight param's name is 'conv1.W' and the bias's name is 'conv1.b'.
        """
        self.has_initialized = True

    def forward(self, *args, **kwargs):
        if not self.has_initialized:
            self.initialize(*args, **kwargs)
        # do the real transformation
class MyLayer(Layer):
    def get_params(self):
        if self.b is not None:
            return {self.W.name: self.W, self.b.name: self.b}
        else:
            return {self.W.name: self.W}

    def set_params(self, params):
        self.W.copy_from(params[self.W.name])
        if self.b is not None:
            self.b.copy_from(params[self.b.name])
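To illustrate the get_params/set_params contract on a single layer, a hypothetical snippet (it assumes the layer has been initialized under the name 'conv1'; per the docstring above, set_params accepts tensors or numpy arrays):

import numpy as np

layer = MyLayer()
# ... after layer.initialize(...) has run, the params are named 'conv1.W' and 'conv1.b'
params = layer.get_params()                   # {'conv1.W': Tensor, 'conv1.b': Tensor}
new_w = np.zeros(params['conv1.W'].shape, dtype=np.float32)
layer.set_params({'conv1.W': new_w})          # copies the array into the param tensor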
class Model(Layer):
    def __init__(self,):
        self.temp_arrays = {}  # to store numpy arrays loaded from disk

    def compile(self, inputs, dev, use_graph, graph_alg):
        """1. Forward through each layer to call the layer's initialize() method.
        2. Set the name of each layer and its sublayers, which will be used to create the dicts
           for get_params and get_states. Then there is no need to manually configure the layer name
           in the __init__ method of a layer.
           For instance,
               class Blk(Layer):
                   def __init__(self):
                       self.conv1 = Conv2d()
                       self.conv2 = Conv2d()

               class MyModel(Model):
                   def __init__(self):
                       self.blk1 = Blk()  --> blk1.conv1, blk1.conv2
                       self.blk2 = Blk()  --> blk2.conv1, blk2.conv2
        3. Copy self.temp_arrays into the corresponding tensors via set_states().
        """

    def save_states(self, fpath, aux_states={}):
        """Save states.

        Args:
            fpath: output file path (without the extension)
            aux_states(dict): values are standard data types or Tensor,
                e.g., epoch ID, learning rate, optimizer states
        """
        states = self.get_states() + aux_states + input_placeholders  # merge the three dicts
        tensor_dict = {}
        for k, v in states.items():
            if type(v) is Tensor:
                tensor_dict[k] = v
                states[k] = {'shape': v.shape, 'dtype': v.dtype}
        # save states as a json file
        # save tensor_dict via numpy or hdf5 or protobuf
        # zip the output files
    def load_states(self, fpath):
        """Load the model states and auxiliary states from disk.

        Usage:
            m = MyModel()
            m.compile(...)
            aux_states = m.load_states('mymodel.zip')

        Args:
            fpath: input file path (without the extension)

        Returns:
            a dict of the auxiliary states
        """
        # unzip the input file
        # load the json file --> states
        # load the tensor files --> numpy_dict
        # split states --> model_states + input_placeholders + aux_states
        # call self.set_states(model_states)
        # return the rest as a dict for the aux_states
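A hypothetical end-to-end use of save_states/load_states (the file name, the aux_states keys, and the compile arguments are placeholders):

# checkpoint after training (the class definition is available)
m = MyModel()
m.compile([x], use_graph=True)
# ... training ...
m.save_states('mymodel', aux_states={'epoch': 10, 'lr': 0.005})

# restore later: compile first, then load_states
m2 = MyModel()
m2.compile([x], use_graph=True)
aux = m2.load_states('mymodel.zip')   # aux == {'epoch': 10, 'lr': 0.005}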
def save(fpath, model):
    """Save the model class (including its methods) and all of the instance's attributes via pickle.

    Usage:
        m = MyModel()
        m.compile(...)
        # training
        singa.model.save('mymodel.zip', m)
    """
    # attributes <-- get the attributes of the model and all of its layers
    # replace each tensor in attributes with {'shape': v.shape, 'dtype': v.dtype}
    # dump the tensors via numpy or protobuf or hdf5
    # dump the model via pickle
    # zip the output files


def load(fpath):
    """Load the model from disk without knowing the model class definition.

    Usage:
        m = singa.model.load('mymodel.zip')
        m.compile(...)
    """
    # unzip the input file
    # load the model via pickle
    # load the tensors
    # put the model states into temp_arrays
    # return the model
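Putting save/load together, a hypothetical deployment flow (names are illustrative). Per compile() step 3 above, the arrays loaded into temp_arrays are only copied into the real param tensors when compile() runs:

from singa import model as singa_model

# training side, where the MyModel class is defined
singa_model.save('mymodel.zip', m)

# deployment side, without the MyModel class definition
m2 = singa_model.load('mymodel.zip')   # tensor values are kept in m2.temp_arrays as numpy arrays
m2.compile([x], use_graph=True)        # creates the tensors and copies temp_arrays into them
out = m2.forward(x)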
from singa import device, tensor
from singa.layer import Conv2D, MaxPool2D, ReLU, SoftmaxCrossEntropy
from singa.tensor import Tensor, PlaceHolder
from singa.model import Model
from singa.opt import SGD, DistOpt


class MyModel(Model):
    def __init__(self):
        super().__init__()
        self.conv = Conv2D(32, 3)
        self.relu = ReLU()
        self.pool = MaxPool2D(3, 2)
        self.loss = SoftmaxCrossEntropy()

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        return x

    def train_one_batch(self, x, y, opt):
        pred = self.forward(x)
        l = self.loss(pred, y)
        opt(l)


sgd = SGD(0.01)
dist_sgd = DistOpt(sgd)
cuda = device.create_cuda_gpu_on(dist_sgd.local_rank)
bs = 128
m = MyModel()
x = PlaceHolder((bs, 3, 32, 32), cuda, dtype=tensor.float32)
y = PlaceHolder((bs,), cuda, dtype=tensor.int32)
m.compile([x], use_graph=True)
for (x, y) in train_data:
    m.train_one_batch(x, y, dist_sgd)
class DecayScheduler:
    # to be used for decaying the learning rate, regularization coefficient, momentum, etc.
    def __init__(self, init_value):
        self.init_value = init_value

    def __call__(self, step):
        assert isinstance(step, Tensor)
        return self.call(step)

    def call(self, step):
        # step is a Tensor with a single scalar value
        # return the current value as a Tensor
        pass


class Constant(DecayScheduler):
    def call(self, step):
        return self.init_value * step / step  # multiply by step/step to return a tensor


class ExponentialDecay(DecayScheduler):
    def __init__(self, init_value, decay_steps, decay_rate, staircase=False):
        # store the input args
        super().__init__(init_value)
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def call(self, step):
        if self.staircase:
            s = step // self.decay_steps
        else:
            s = step / self.decay_steps
        ret = self.init_value * step / step  # multiply by step/step to make ret a tensor
        return ret * self.decay_rate ** s
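To make the schedule concrete, a plain-float version of the same formula (purely illustrative, no SINGA tensors involved):

def exp_decay(init_value, decay_rate, decay_steps, step, staircase=False):
    s = step // decay_steps if staircase else step / decay_steps
    return init_value * decay_rate ** s

# e.g., with init_value=0.1, decay_rate=0.5, decay_steps=100:
# step 0   -> 0.1
# step 100 -> 0.05
# step 150 -> ~0.0354 (0.05 when staircase=True)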
class Optimizer:
    def __init__(self, lr):
        """lr could be a constant scalar or a learning rate scheduler"""
        if type(lr) == float:
            self.lr = Constant(lr)
        elif isinstance(lr, DecayScheduler):
            self.lr = lr
        self.step_counter = Tensor((1,), dtype=singa.int)
        self.step_counter.set_value(0)
        self.lr_value = self.lr(self.step_counter)

    def get_states(self):
        # skip the DecayScheduler as it does not have persistent states
        return {'step_counter': self.step_counter.get_value(0)}

    def set_states(self, states):
        self.step_counter = Tensor((1,), dtype=singa.int)
        self.step_counter.set_value(states['step_counter'])
        self.lr_value = self.lr(self.step_counter)

    def __call__(self, loss):
        self.call(loss)
        self.step()

    def call(self, loss):
        for p, g in autograd.backward(loss):
            # each tensor can have a name; set the name of the param tensor in compile?
            self.apply(p.name, p, g)

    def step(self):
        self.step_counter += 1
        self.lr_value = self.lr(self.step_counter)

    def apply(self, param_name, param_value, param_grad):
        pass

    @deprecated  # for backward compatibility
    def update(self, p, g):
        if p.name is None:
            p.name = id(p)
        self.apply(p.name, p, g)
class SGD(Optimizer):
    def __init__(self, lr, momentum=0.0):
        super().__init__(lr)
        if type(momentum) == float:
            self.momentum = Constant(momentum)
        elif isinstance(momentum, DecayScheduler):
            self.momentum = momentum
        self.mom_value = self.momentum(self.step_counter)

    def apply(self, pname, pvalue, pgrad):
        pass  # update pvalue in place using pgrad, lr_value and mom_value

    def step(self):
        super().step()
        self.mom_value = self.momentum(self.step_counter)

    def get_states(self):
        states = super().get_states()
        if self.mom_value > 0:
            states['moments'] = self.moments  # a dict of the 1st-order moment tensors
        return states

    def set_states(self, states):
        super().set_states(states)
        if 'moments' in states:
            self.moments = states['moments']
            self.mom_value = self.momentum(self.step_counter)
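One intended use of the optimizer's get_states/set_states is to checkpoint the optimizer together with the model via aux_states, as the save_states docstring suggests (a sketch; the 'opt' key is an arbitrary choice):

sgd = SGD(lr=0.05, momentum=0.9)
# ... training ...
m.save_states('ckpt', aux_states={'opt': sgd.get_states()})

# restore
sgd = SGD(lr=0.05, momentum=0.9)
aux = m.load_states('ckpt.zip')
sgd.set_states(aux['opt'])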
def to_onnx(model):
    # return an onnx model
    pass


class SONNXModel(Model):
    def __init__(self, onnx_model):
        super().__init__()
        singa_rep = sonnx.prepare(onnx_model)  # will update the prepare function to remove device and batchsize
        for layer_name, layer in singa_rep.layers:
            self.__dict__[layer_name] = layer
        # store the weights here as numpy arrays
        for weight_name, weight in singa_rep.weights:
            self.temp_arrays[weight_name] = weight
        # store layer info such as the input and output names (only weights)
        self.layer_infos = {}
        for layer_name, layer_info in singa_rep.layer_infos:
            self.layer_infos[layer_name] = layer_info

    def forward(self, *x, aux_output=()):
        # run the forward propagation according to the onnx graph
        # return the last output + the requested aux outputs
        pass
class MyModel(SONNXModel):
    def __init__(self, onnx_model):
        super().__init__(onnx_model)
        self.layer1 = Conv()
        self.layer2 = Conv()

    def forward(self, x):
        x1, x2 = super().forward(x, aux_output)
        x = self.layer1.forward(x2)
        return self.layer2.forward(x1) + x

    def train_one_batch(self, x, y):
        y_ = self.forward(x)
        ...


ox = onnx.load(fpath)
x = PlaceHolder((2, 3), device=gpu, dtype=singa.float)  # PlaceHolder is an alias of Tensor
m = MyModel(ox)
# compatible with existing code, which does not have the following two statements
m.compile([x], is_train=True, use_graph=True, graph_alg='sequence')
y = PlaceHolder((2,), device=gpu)
for npx, npy in data:
    x.copy_from(npx)
    y.copy_from(npy)
    m.train_one_batch(x, y)  # build the graph in the first iter; for the old code, the params are initialized here
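For the export direction, item 3 above proposes sonnx.to_onnx(model); a hypothetical export sketch (onnx.save is from the onnx package; the file name is a placeholder):

import onnx
from singa import sonnx

onnx_model = sonnx.to_onnx(m)
onnx.save(onnx_model, 'mymodel.onnx')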