New APIs of SINGA
  1. model.Model.save_states/load_states are used when we have the model class definition. The states are saved for inference/evaluation/retraining/transfer learning.
  2. model.save/load are used to pickle the model class and its attributes. The model is saved for inference/evaluation/transfer learning, where we may not have the model class definition.
  3. sonnx.to_onnx() and SONNXModel are used to save the model in ONNX format and to load an ONNX model as a Model instance. The model is saved for inference/evaluation/transfer learning, where we may not have the model class definition.

Note: for 1, we compile the model first and then call load_states; for 2 and 3, we load the model first and then compile. This is not consistent currently; I do not have a better solution right now. The three flows are sketched below.
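For concreteness, a minimal sketch of the three restore flows under the proposed APIs (MyModel, the file names, and the compile arguments are placeholders):

# flow 1: the model class definition is available; compile, then load_states
m = MyModel()
m.compile([x], use_graph=True)
aux_states = m.load_states('mymodel.zip')

# flow 2: no class definition; load the pickled model, then compile
m = singa.model.load('mymodel.zip')
m.compile([x], use_graph=True)

# flow 3: no class definition; wrap the onnx model, then compile
ox = onnx.load('mymodel.onnx')
m = SONNXModel(ox)
m.compile([x], use_graph=True)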

class Layer:
    def __init__(self,):
        self.has_initialized = False

    def get_params(self):
        """Return the params of this layer and its sublayers as a dict.
        The param name is layername.param_name, e.g., for
        self.W = Tensor(), self.b = Tensor()
        the names of W and b are like conv1.W and conv1.b.
        """
        params = {}
        for each sublayer:
            params.update(sublayer.get_params())
        for each param in this layer:
            params[param.name] = param
        return params

    def set_params(self, params):
        """params: a dict, name --> tensor or numpy array.
        Copy the values of params into the layer's param tensors.
        """
        for each sublayer of this layer:
            sublayer.set_params(params)
        for each param of this layer:
            param.copy_from(params[param.name])

    def get_states(self):
        """Return the states of this layer and its sublayers that are necessary
        for model training/evaluation/inference.
        The states include the params and others, e.g., the running mean and var of batchnorm.
        """

    def set_states(self, states):
        """states: dict, name --> value.
        If the value is a tensor or numpy array, copy the value.
        """

    def initialize(self, *args, **kwargs):
        """Initialize internal states, e.g., the cudnn handle and the params, based on the inputs.
        Set the name of each param based on the layer name, e.g., if the layer name is 'conv1',
        then the weight param's name is 'conv1.W' and the bias's name is 'conv1.b'.
        """
        self.has_initialized = True

    def forward(self, *args, **kwargs):
        if not self.has_initialized:
            self.initialize(*args, **kwargs)
        # do the real transformation
class MyLayer(Layer):
    def get_params(self):
        if self.b is not None:
            return {self.W.name: self.W, self.b.name: self.b}
        else:
            return {self.W.name: self.W}

    def set_params(self, params):
        self.W.copy_from(params[self.W.name])
        if self.b is not None:
            self.b.copy_from(params[self.b.name])
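To illustrate the get_params/set_params contract on a single layer, a hypothetical snippet (it assumes the layer has been initialized under the name 'conv1'; per the docstring above, set_params accepts tensors or numpy arrays):

import numpy as np

layer = MyLayer()
# ... after layer.initialize(...) has run, the params are named 'conv1.W' and 'conv1.b'
params = layer.get_params()                   # {'conv1.W': Tensor, 'conv1.b': Tensor}
new_w = np.zeros(params['conv1.W'].shape, dtype=np.float32)
layer.set_params({'conv1.W': new_w})          # copies the array into the param tensor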
class Model(Layer):
    def __init__(self,):
        self.temp_arrays = {}  # to store numpy arrays loaded from disk

    def compile(self, inputs, dev, use_graph, graph_alg):
        """1. Forward through each layer to call the layer's initialize() method.
        2. Set the name of each layer and its sublayers, which will be used to create the dicts
           for get_params and get_states. Then there is no need to manually configure the layer name
           in the __init__ method of a layer.
           For instance,
               class Blk(Layer):
                   def __init__(self):
                       self.conv1 = Conv2d()
                       self.conv2 = Conv2d()

               class MyModel(Model):
                   def __init__(self):
                       self.blk1 = Blk()  --> blk1.conv1, blk1.conv2
                       self.blk2 = Blk()  --> blk2.conv1, blk2.conv2
        3. Copy self.temp_arrays into the corresponding tensors via set_states().
        """

    def save_states(self, fpath, aux_states={}):
        """Save states.

        Args:
            fpath: output file path (without the extension)
            aux_states(dict): values are standard data types or Tensor,
                e.g., epoch ID, learning rate, optimizer states
        """
        states = self.get_states() + aux_states + input_placeholders  # merge the three dicts
        tensor_dict = {}
        for k, v in states.items():
            if type(v) is Tensor:
                tensor_dict[k] = v
                states[k] = {'shape': v.shape, 'dtype': v.dtype}
        # save states as a json file
        # save tensor_dict via numpy or hdf5 or protobuf
        # zip the output files
    def load_states(self, fpath):
        """Load the model states and auxiliary states from disk.

        Usage:
            m = MyModel()
            m.compile(...)
            aux_states = m.load_states('mymodel.zip')

        Args:
            fpath: input file path (without the extension)

        Returns:
            a dict of the auxiliary states
        """
        # unzip the input file
        # load the json file --> states
        # load the tensor files --> numpy_dict
        # split states --> model_states + input_placeholders + aux_states
        # call self.set_states(model_states)
        # return the rest as a dict for the aux_states
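A hypothetical end-to-end use of save_states/load_states (the file name, the aux_states keys, and the compile arguments are placeholders):

# checkpoint after training (the class definition is available)
m = MyModel()
m.compile([x], use_graph=True)
# ... training ...
m.save_states('mymodel', aux_states={'epoch': 10, 'lr': 0.005})

# restore later: compile first, then load_states
m2 = MyModel()
m2.compile([x], use_graph=True)
aux = m2.load_states('mymodel.zip')   # aux == {'epoch': 10, 'lr': 0.005}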
def save(fpath, model):
    """Save the model class (including its methods) and all of the instance's attributes via pickle.

    Usage:
        m = MyModel()
        m.compile(...)
        # training
        singa.model.save('mymodel.zip', m)
    """
    # attributes <-- get the attributes of the model and all of its layers
    # replace each tensor in attributes with {'shape': v.shape, 'dtype': v.dtype}
    # dump the tensors via numpy or protobuf or hdf5
    # dump the model via pickle
    # zip the output files


def load(fpath):
    """Load the model from disk without knowing the model class definition.

    Usage:
        m = singa.model.load('mymodel.zip')
        m.compile(...)
    """
    # unzip the input file
    # load the model via pickle
    # load the tensors
    # put the model states into temp_arrays
    # return the model
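Putting save/load together, a hypothetical deployment flow (names are illustrative). Per compile() step 3 above, the arrays loaded into temp_arrays are only copied into the real param tensors when compile() runs:

from singa import model as singa_model

# training side, where the MyModel class is defined
singa_model.save('mymodel.zip', m)

# deployment side, without the MyModel class definition
m2 = singa_model.load('mymodel.zip')   # tensor values are kept in m2.temp_arrays as numpy arrays
m2.compile([x], use_graph=True)        # creates the tensors and copies temp_arrays into them
out = m2.forward(x)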
from singa import device, tensor
from singa.layer import Conv2D, MaxPool2D, ReLU, SoftmaxCrossEntropy
from singa.tensor import Tensor, PlaceHolder
from singa.model import Model
from singa.opt import SGD, DistOpt


class MyModel(Model):
    def __init__(self):
        super().__init__()
        self.conv = Conv2D(32, 3)
        self.relu = ReLU()
        self.pool = MaxPool2D(3, 2)
        self.loss = SoftmaxCrossEntropy()

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        return x

    def train_one_batch(self, x, y, opt):
        pred = self.forward(x)
        l = self.loss(pred, y)
        opt(l)


sgd = SGD(0.01)
dist_sgd = DistOpt(sgd)
cuda = device.create_cuda_gpu_on(dist_sgd.local_rank)
bs = 128
m = MyModel()
x = PlaceHolder((bs, 3, 32, 32), cuda, dtype=tensor.float32)
y = PlaceHolder((bs,), cuda, dtype=tensor.int32)
m.compile([x], use_graph=True)
for (x, y) in train_data:
    m.train_one_batch(x, y, dist_sgd)
class DecayScheduler:
    # to be used for decaying the learning rate, regularization coefficient, momentum, etc.
    def __init__(self, init_value):
        self.init_value = init_value

    def __call__(self, step):
        assert isinstance(step, Tensor)
        return self.call(step)

    def call(self, step):
        # step is a Tensor with a single scalar value
        # return the current value as a Tensor
        pass


class Constant(DecayScheduler):
    def call(self, step):
        return self.init_value * step / step  # multiply by step/step to return a tensor


class ExponentialDecay(DecayScheduler):
    def __init__(self, init_value, decay_steps, decay_rate, staircase=False):
        # store the input args
        super().__init__(init_value)
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.staircase = staircase

    def call(self, step):
        if self.staircase:
            s = step // self.decay_steps
        else:
            s = step / self.decay_steps
        ret = self.init_value * step / step  # multiply by step/step to make ret a tensor
        return ret * self.decay_rate ** s
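To make the schedule concrete, a plain-float version of the same formula (purely illustrative, no SINGA tensors involved):

def exp_decay(init_value, decay_rate, decay_steps, step, staircase=False):
    s = step // decay_steps if staircase else step / decay_steps
    return init_value * decay_rate ** s

# e.g., with init_value=0.1, decay_rate=0.5, decay_steps=100:
# step 0   -> 0.1
# step 100 -> 0.05
# step 150 -> ~0.0354 (0.05 when staircase=True)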
class Optimizer:
    def __init__(self, lr):
        """lr could be a constant scalar or a learning rate scheduler"""
        if type(lr) == float:
            self.lr = Constant(lr)
        elif isinstance(lr, DecayScheduler):
            self.lr = lr
        self.step_counter = Tensor((1,), dtype=singa.int)
        self.step_counter.set_value(0)
        self.lr_value = self.lr(self.step_counter)

    def get_states(self):
        # skip the DecayScheduler as it does not have persistent states
        return {'step_counter': self.step_counter.get_value(0)}

    def set_states(self, states):
        self.step_counter = Tensor((1,), dtype=singa.int)
        self.step_counter.set_value(states['step_counter'])
        self.lr_value = self.lr(self.step_counter)

    def __call__(self, loss):
        self.call(loss)
        self.step()

    def call(self, loss):
        for p, g in autograd.backward(loss):
            # each tensor can have a name; set the name of the param tensor in compile?
            self.apply(p.name, p, g)

    def step(self):
        self.step_counter += 1
        self.lr_value = self.lr(self.step_counter)

    def apply(self, param_name, param_value, param_grad):
        pass

    @deprecated  # for backward compatibility
    def update(self, p, g):
        if p.name is None:
            p.name = id(p)
        self.apply(p.name, p, g)
class SGD(Optimizer):
    def __init__(self, lr, momentum=0.0):
        super().__init__(lr)
        if type(momentum) == float:
            self.momentum = Constant(momentum)
        elif isinstance(momentum, DecayScheduler):
            self.momentum = momentum
        self.mom_value = self.momentum(self.step_counter)

    def apply(self, pname, pvalue, pgrad):
        pass  # update pvalue in place using pgrad, lr_value and mom_value

    def step(self):
        super().step()
        self.mom_value = self.momentum(self.step_counter)

    def get_states(self):
        states = super().get_states()
        if self.mom_value > 0:
            states['moments'] = self.moments  # a dict of the 1st-order moment tensors
        return states

    def set_states(self, states):
        super().set_states(states)
        if 'moments' in states:
            self.moments = states['moments']
            self.mom_value = self.momentum(self.step_counter)
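One intended use of the optimizer's get_states/set_states is to checkpoint the optimizer together with the model via aux_states, as the save_states docstring suggests (a sketch; the 'opt' key is an arbitrary choice):

sgd = SGD(lr=0.05, momentum=0.9)
# ... training ...
m.save_states('ckpt', aux_states={'opt': sgd.get_states()})

# restore
sgd = SGD(lr=0.05, momentum=0.9)
aux = m.load_states('ckpt.zip')
sgd.set_states(aux['opt'])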
def to_onnx(model):
    # return an onnx model
    pass


class SONNXModel(Model):
    def __init__(self, onnx_model):
        super().__init__()
        singa_rep = sonnx.prepare(onnx_model)  # will update the prepare function to remove device and batchsize
        for layer_name, layer in singa_rep.layers:
            self.__dict__[layer_name] = layer
        # store the weights here as numpy arrays
        for weight_name, weight in singa_rep.weights:
            self.temp_arrays[weight_name] = weight
        # store layer info such as the input and output names (only weights)
        self.layer_infos = {}
        for layer_name, layer_info in singa_rep.layer_infos:
            self.layer_infos[layer_name] = layer_info

    def forward(self, *x, aux_output=()):
        # run the forward propagation according to the onnx graph
        # return the last output + the requested aux outputs
        pass
class MyModel(SONNXModel):
    def __init__(self, onnx_model):
        super().__init__(onnx_model)
        self.layer1 = Conv()
        self.layer2 = Conv()

    def forward(self, x):
        x1, x2 = super().forward(x, aux_output)
        x = self.layer1.forward(x2)
        return self.layer2.forward(x1) + x

    def train_one_batch(self, x, y):
        y_ = self.forward(x)
        ...


ox = onnx.load(fpath)
x = PlaceHolder((2, 3), device=gpu, dtype=singa.float)  # PlaceHolder is an alias of Tensor
m = MyModel(ox)
# compatible with existing code, which does not have the following two statements
m.compile([x], is_train=True, use_graph=True, graph_alg='sequence')
y = PlaceHolder((2,), device=gpu)
for npx, npy in data:
    x.copy_from(npx)
    y.copy_from(npy)
    m.train_one_batch(x, y)  # build the graph in the first iter; for the old code, the params are initialized here
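For the export direction, item 3 above proposes sonnx.to_onnx(model); a hypothetical export sketch (onnx.save is from the onnx package; the file name is a placeholder):

import onnx
from singa import sonnx

onnx_model = sonnx.to_onnx(m)
onnx.save(onnx_model, 'mymodel.onnx')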