Text Classification Training Code (mxnet)

# common.py (file name inferred from the `from common import ...` line in the
# training script below) — data iterator, top-k metrics, custom cross-entropy op
import mxnet as mx
import numpy as np


# build and return an NDArrayIter over padded token ids, tag ids, and labels
def get_data_iter(data, tags, labels, shuffle=False, batch_size=64):
    nditer = mx.io.NDArrayIter(data={'data': data, 'tags': tags},
                               label={'labels': labels},
                               batch_size=batch_size, shuffle=shuffle)
    return nditer
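
# A minimal usage sketch for get_data_iter (dummy shapes for illustration
# only; the real arrays are loaded from .npy files in the training script):
def _demo_data_iter():
    data = np.random.randint(0, 100, size=(128, 400))   # padded token ids
    tags = np.random.randint(0, 11, size=(128, 400))    # tag ids per token
    labels = np.random.randint(0, 2, size=(128, 20))    # multi-hot class labels
    it = get_data_iter(data, tags, labels, shuffle=True, batch_size=64)
    for batch in it:
        print([d.shape for d in batch.data], [l.shape for l in batch.label])
        break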
# original (non-top-k) precision/recall functions, kept for reference
'''
def precision(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    predicted_positives = np.sum(np.round(np.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + 1e-8)
    return precision


def recall(y_true, y_pred):
    true_positives = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))
    possible_positives = np.sum(np.round(np.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + 1e-8)
    return recall
'''
# top-k metric functions
def _mark_topk(y_pred, topk):
    # build a binary matrix with 1s at each row's top-k predicted labels
    marks = np.zeros_like(y_pred)
    for idx in range(len(y_pred)):
        topk_indices = y_pred[idx].argsort()[-topk:][::-1]
        marks[idx, topk_indices] = 1.
    return marks


def precision(y_true, y_pred, topk=3):
    # mark 1 on the top-k predicted labels
    y_pred = _mark_topk(y_pred, topk)
    # a sample counts as a true positive only if every one of its true
    # labels falls inside the top-k predictions
    true_positives = np.sum(y_true * y_pred, axis=1)
    true_positives = np.sum(true_positives == np.sum(y_true, axis=1))
    predicted_positives = len(y_pred)  # same as np.sum(y_pred) / topk
    return true_positives / (predicted_positives + 1e-8)


def recall(y_true, y_pred, topk=3):
    y_pred = _mark_topk(y_pred, topk)
    true_positives = np.sum(y_true * y_pred, axis=1)
    true_positives = np.sum(true_positives == np.sum(y_true, axis=1))
    possible_positives = np.sum(y_true)
    return true_positives / (possible_positives + 1e-8)
def fbeta_score(y_true, y_pred, beta=1, prec=None, rec=None):
    if beta < 0:
        raise ValueError('The lowest choosable beta is zero (only precision).')
    # if there are no true positives, fix the F score at 0, as sklearn does
    if np.sum(np.round(np.clip(y_true, 0, 1))) == 0:
        return 0
    # use precomputed precision/recall if supplied, otherwise compute them
    p = prec if prec is not None else precision(y_true, y_pred)
    r = rec if rec is not None else recall(y_true, y_pred)
    bb = beta ** 2
    return (1 + bb) * (p * r) / (bb * p + r + 1e-8)


def fmeasure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=1)
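
# A small sanity-check sketch for the top-k metrics on toy values (assumed
# multi-hot label format, matching the training data used in this gist):
def _demo_metrics():
    y_true = np.array([[0., 1., 0., 0., 0.],
                       [1., 0., 0., 1., 0.]])
    y_pred = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                       [0.4, 0.1, 0.0, 0.4, 0.1]])
    print('precision@3:', precision(y_true, y_pred))
    print('recall@3:', recall(y_true, y_pred))
    print('f1@3:', fmeasure(y_true, y_pred))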
# custom operator layer: element-wise binary cross-entropy loss
class CrossEntropyLoss(mx.operator.CustomOp):
    eps = 1e-6    # avoid -inf when taking log(0)
    eps1 = 1. + eps
    eps_1 = 1. - eps

    def forward(self, is_train, req, in_data, out_data, aux):
        # Shapes:
        #   b = minibatch size
        #   d = number of dimensions
        actually_calculate_loss = False
        if actually_calculate_loss:
            p = in_data[0].asnumpy()  # shape=(b,d)
            y = in_data[1].asnumpy()
            out = y * np.log(p + self.eps) + (1. - y) * np.log(self.eps1 - p)
            self.assign(out_data[0], req[0], mx.nd.array(out))
        else:
            # just copy the predictions forward; only the gradient matters
            self.assign(out_data[0], req[0], in_data[0])

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        # self.approx_backward(req, out_grad, in_data, out_data, in_grad, aux)
        self.exact_backward(req, out_grad, in_data, out_data, in_grad, aux)

    def approx_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad = (y-p)/(p-p^2); if y is exactly 1 or 0, this simplifies to
        grad = 1/(p-1+y), which is more numerically stable. Note the sign:
        this is the opposite of exact_backward below.
        """
        p = in_data[0].asnumpy()  # shape=(b,d)
        y = in_data[1].asnumpy()
        grad = 1. / (p - self.eps_1 + y)
        self.assign(in_grad[0], req[0], mx.nd.array(grad))

    def exact_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        """grad = (p-y)/(p-p^2), the gradient of the negative log-likelihood."""
        p = in_data[0].asnumpy()  # shape=(b,d)
        y = in_data[1].asnumpy()
        grad = (p - y) / ((p + self.eps) * (self.eps1 - p))
        self.assign(in_grad[0], req[0], mx.nd.array(grad))
# register the custom operator with mxnet
@mx.operator.register("CrossEntropyLoss")
class CrossEntropyProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(CrossEntropyProp, self).__init__(need_top_grad=False)

    def list_arguments(self):
        return ['data', 'label']

    def list_outputs(self):
        return ['preds']

    def create_operator(self, ctx, shapes, dtypes):
        return CrossEntropyLoss()

    def infer_shape(self, in_shape):
        if in_shape[0] != in_shape[1]:
            raise ValueError("Input shapes differ. data:%s. label:%s. They must match."
                             % (str(in_shape[0]), str(in_shape[1])))
        output_shape = in_shape[0]
        return in_shape, [output_shape], []
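
# A minimal sketch binding the registered op on toy data to check the
# forward copy and the exact gradient (shapes and values are arbitrary):
def _demo_cross_entropy_op():
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('label')
    net = mx.sym.Custom(data=data, label=label, name='ce',
                        op_type='CrossEntropyLoss')
    exe = net.simple_bind(ctx=mx.cpu(), data=(2, 4), label=(2, 4))
    exe.arg_dict['data'][:] = np.random.uniform(0.1, 0.9, (2, 4))
    exe.arg_dict['label'][:] = np.random.randint(0, 2, (2, 4))
    exe.forward(is_train=True)
    exe.backward()
    print(exe.outputs[0].asnumpy())         # forward just copies predictions
    print(exe.grad_dict['data'].asnumpy())  # (p - y) / (p * (1 - p))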

# symbols.py (file name inferred from the `from symbols import Network` line
# in the training script below) — model definitions
import mxnet as mx
import os
import logging


class Network(object):
    '''
    Holds model hyper-parameters and builds model symbols by name:
    pass a method name (e.g. 'fast_text_lstm') to get_model() to obtain
    the corresponding symbol.
    '''
    def __init__(self, model_params):
        self.__init_params__(model_params)

    def __init_params__(self, model_params):
        for key, val in model_params.items():  # .iteritems() is Python 2 only
            setattr(self, key, val)

    def get_model(self, name):
        return getattr(self, name)()
    def __input_part(self):
        data = mx.sym.Variable('data')
        tags = mx.sym.Variable('tags')
        labels = mx.sym.Variable('labels')
        data_embed = mx.sym.Embedding(data=data, input_dim=self.max_features + 1,
                                      output_dim=self.embedding_dims, name='embed_data')
        tags_embed = mx.sym.Embedding(data=tags, input_dim=self.nb_tags + 1,
                                      output_dim=1, name='embed_tags')
        concat = mx.sym.Concat(data_embed, tags_embed, dim=2)
        return data, tags, labels, concat

    def get_lstm_cell(self, inputs, stack_rnn=True, bi_direction=True,
                      num_layers=1, num_hidden=32, dropout=0.5):
        # build cell
        if stack_rnn:
            cell = mx.rnn.SequentialRNNCell()
            for i in range(num_layers):
                cell.add(mx.rnn.FusedRNNCell(num_hidden, num_layers=1, mode='lstm',
                                             prefix='lstm_l%d' % i,
                                             bidirectional=bi_direction))
                if dropout > 0 and i < num_layers - 1:
                    cell.add(mx.rnn.DropoutCell(dropout, prefix='lstm_d%d' % i))
        else:
            cell = mx.rnn.FusedRNNCell(num_hidden, num_layers=num_layers,
                                       dropout=dropout, mode='lstm',
                                       bidirectional=bi_direction)
        output, _ = cell.unroll(self.maxlen, inputs=inputs)
        return output

    def fast_text_lstm(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers,
                                     num_hidden=self.nb_hidden)
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,),
                                 global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network

    def fast_text_lstm_maxpool(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers,
                                     num_hidden=self.nb_hidden)
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,),
                                 global_pool=True, pool_type='max')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network

    def fast_text(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Pooling(network, kernel=(self.maxlen,),
                                 global_pool=True, pool_type='avg')
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network

    def fast_text_conv(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network = mx.sym.Convolution(data=network, kernel=(3,), num_filter=self.filters)
        network = mx.sym.Dropout(data=network, p=0.5)
        network = mx.sym.Pooling(network, kernel=(self.maxlen,),
                                 global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network

    def bi_directional_lstm(self):
        main_input, tag_input, labels, network = self.__input_part()
        network = self.get_lstm_cell(network, num_layers=self.num_layers,
                                     num_hidden=self.nb_hidden)
        network = mx.sym.Reshape(network, shape=(-1, self.nb_hidden * 2))
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.Activation(network, act_type='sigmoid')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network

    def combine_model1(self):
        main_input, tag_input, labels, network = self.__input_part()
        # conv part
        network_part1 = mx.sym.transpose(data=network, axes=(0, 2, 1))
        network_part1 = mx.sym.Convolution(data=network_part1, kernel=(3,),
                                           num_filter=self.filters)
        network_part1 = mx.sym.Dropout(data=network_part1, p=0.5)
        # lstm part
        network_part2 = self.get_lstm_cell(network, num_layers=self.num_layers,
                                           num_hidden=self.nb_hidden)
        network_part2 = mx.sym.transpose(data=network_part2, axes=(0, 2, 1))
        network = mx.sym.Concat(network_part1, network_part2, dim=2)
        network = mx.sym.Pooling(network, kernel=(self.maxlen,),
                                 global_pool=True, pool_type='avg')
        network = mx.sym.Dropout(network, p=0.5)
        network = mx.sym.FullyConnected(data=network, num_hidden=self.nb_classes, name='pred')
        network = mx.sym.Activation(network, act_type='sigmoid')
        # network = mx.sym.SoftmaxOutput(data=network, label=labels, name='output')
        network = mx.sym.Custom(data=network, label=labels, name='output',
                                op_type='CrossEntropyLoss')
        return network
    def load_model(self, rank=0):
        # ('load_epoch' not in self) in the original fails on a plain object;
        # use hasattr instead
        if not hasattr(self, 'load_epoch') or self.load_epoch is None:
            return (None, None, None)
        assert self.model_prefix is not None
        model_prefix = self.model_prefix
        if rank > 0 and os.path.exists("%s-%d-symbol.json" % (model_prefix, rank)):
            model_prefix += "-%d" % (rank,)
        sym, arg_params, aux_params = mx.model.load_checkpoint(
            model_prefix, self.load_epoch)
        logging.info('Loaded model %s_%04d.params', model_prefix, self.load_epoch)
        return (sym, arg_params, aux_params)

    def save_model(self, rank=0):
        if self.checkpoint is None:
            return None
        dst_dir = self.checkpoint
        model_prefix = os.path.join(dst_dir, self.model_name)
        if not os.path.isdir(dst_dir):
            os.mkdir(dst_dir)
        return mx.callback.do_checkpoint(model_prefix if rank == 0 else
                                         "%s-%d" % (model_prefix, rank))
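
# A short usage sketch for the Network factory (parameter values mirror the
# model_params dict in the training script below; building a symbol with the
# custom loss assumes common.py has been imported so the op is registered):
def _demo_network():
    params = {'nb_classes': 20, 'max_features': 342786, 'embedding_dims': 256,
              'maxlen': 400, 'nb_tags': 11, 'nb_hidden': 32, 'num_layers': 1,
              'filters': 256}
    sym = Network(params).get_model('fast_text')
    print(sym.list_arguments())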

# training script — imports the common and symbols modules above
import mxnet as mx
import numpy as np
import os
import time
import logging
# train_test_split is only used by the commented-out split in prepare_data
from sklearn.model_selection import train_test_split
from symbols import Network
from common import get_data_iter, precision, recall, fbeta_score


class ParamHolder(object):
    def __init__(self, params):
        for key, val in params.items():  # .iteritems() is Python 2 only
            setattr(self, key, val)
# load train and valid data
def load_train_val():
    master_dir = '../data/'
    x_train = np.load(os.path.join(master_dir, 'x_train_save.npy'))
    tag_train = np.load(os.path.join(master_dir, 'tag_train_save.npy'))
    y_train = np.load(os.path.join(master_dir, 'y_train_save.npy'))
    x_valid = np.load(os.path.join(master_dir, 'x_valid_save.npy'))
    tag_valid = np.load(os.path.join(master_dir, 'tag_valid_save.npy'))
    y_valid = np.load(os.path.join(master_dir, 'y_valid_save.npy'))
    return x_train, tag_train, y_train, x_valid, tag_valid, y_valid
# load and prepare data
def prepare_data(args, maxlen):
    x_test = np.load(args.x_test_path)[:, -maxlen:]
    tags_test = np.load(args.tag_test_path)[:, -maxlen:]
    y_test = np.load(args.y_test_path)[:, -maxlen:]
    # train/valid split or load pre-split data (saved data maxlen = 400)
    # x_train, x_valid, tag_train, tag_valid, y_train, y_valid = \
    #     train_test_split(x_train, tags_train, y_train, test_size=0.2, random_state=2017)
    x_train, tag_train, y_train, x_valid, tag_valid, y_valid = load_train_val()
    return x_train, tag_train, y_train, x_valid, tag_valid, y_valid, x_test, tags_test, y_test
# checkpoint directory pattern
check_dir = '../checkpoints/%s'
# model params
model_params = {
    'nb_classes': 20,
    'max_features': 342786,
    'embedding_dims': 256,
    'maxlen': 400,
    'batch_size': 256,
    'filters': 256,
    'nb_hidden': 32,
    'kernel_size': 3,
    'nb_tags': 11,
    'num_layers': 1,
    'checkpoint': '../checkpoints/%s' % time.strftime('%Y%m%d'),
    'model_name': 'fast_text_lstm',
}
# train params
train_params = {
    'epoch': 40,
    'batch_size': 256,
    'x_train_path': '../data/x_train.npy',
    'tag_train_path': '../data/x_train_tags.npy',
    'y_train_path': '../data/y_train.npy',
    'x_test_path': '../data/x_test.npy',
    'tag_test_path': '../data/x_test_tags.npy',
    'y_test_path': '../data/y_test.npy',
    'check_dir': '../checkpoints/%s',
    'model_name': 'fast_text_lstm',
    'gpus': '0',
    'lr_factor': 0.1,
    'num_examples': 0,
    'load_epoch': 0,
    'kv_store': '',
    'lr_step_epochs': '20,40',
    'lr': 0.01,
    'mom': 0.9,
    'wd': 0.0001,
    'monitor': 0,
    'optimizer': 'adam',
}
# get mxnet lr scheduler
def get_lr_scheduler(args, kv):
    # no scheduler when lr_factor is unset or would not decay the rate
    # (the original `!= None or` condition made this branch always taken)
    if args.lr_factor is None or args.lr_factor >= 1:
        return (args.lr, None)
    epoch_size = int(args.num_examples / args.batch_size)
    if 'dist' in args.kv_store:
        epoch_size = int(epoch_size / kv.num_workers)
    begin_epoch = args.load_epoch if args.load_epoch else 0
    step_epochs = [int(l) for l in args.lr_step_epochs.split(',')]
    lr = args.lr
    for s in step_epochs:
        if begin_epoch >= s:
            lr *= args.lr_factor
    if lr != args.lr:
        logging.info('Adjust learning rate to %e for epoch %d' % (lr, begin_epoch))
    steps = [epoch_size * (x - begin_epoch) for x in step_epochs if x - begin_epoch > 0]
    return (lr, mx.lr_scheduler.MultiFactorScheduler(step=steps, factor=args.lr_factor))
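
# Worked example of the schedule above (illustrative numbers): with
# num_examples=100000 and batch_size=256, epoch_size is 390 updates, so
# lr_step_epochs='20,40' and load_epoch=0 give steps=[7800, 15600]; the
# learning rate is multiplied by lr_factor=0.1 at those update counts.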
# main train function
def train():
    # load data
    mx.random.seed(2017)
    args = ParamHolder(train_params)
    x_train, tag_train, y_train, x_valid, tag_valid, y_valid, x_test, tag_test, y_test = \
        prepare_data(args, model_params['maxlen'])
    # set num_examples after loading data (used by the lr scheduler)
    train_params['num_examples'] = len(x_train)
    args.num_examples = len(x_train)
    # make data iterators for the mxnet training loop
    train_data_iter = get_data_iter(x_train, tag_train, y_train,
                                    batch_size=model_params['batch_size'], shuffle=True)
    valid_data_iter = get_data_iter(x_valid, tag_valid, y_valid,
                                    batch_size=model_params['batch_size'])
    # make kv store
    kv = mx.kv.create('local')
    # initialize model instance
    model_set = Network(model_params)
    # checkpoint callback
    checkpoint = model_set.save_model(kv.rank)
    # devices for training ('is' replaced by '==' for string comparison)
    devs = mx.cpu() if args.gpus is None or args.gpus == '' else [
        mx.gpu(int(i)) for i in args.gpus.split(',')]
    # learning rate
    lr, lr_scheduler = get_lr_scheduler(args, kv)
    # load symbol
    network = model_set.get_model(args.model_name)
    # make module (the rough equivalent of a model in keras)
    model = mx.mod.Module(context=devs, symbol=network,
                          data_names=['data', 'tags'], label_names=['labels'])
    init = mx.initializer.Mixed(['bias', '.*'], [mx.init.Zero(), mx.init.Uniform(0.1)])
    # prepare optimizer params
    optimizer_params = {
        'learning_rate': lr,
        'wd': args.wd,
        'lr_scheduler': lr_scheduler}
    # monitor network parameters if requested
    monitor = mx.mon.Monitor(args.monitor, pattern=".*") if args.monitor > 0 else None
    # evaluation metrics built from the custom functions in common.py
    metric_f05 = lambda y_true, y_pred: fbeta_score(y_true, y_pred, beta=0.5)
    p, r = map(mx.metric.create, [precision, recall])
    metric_f05 = mx.metric.create(metric_f05)
    eval_metrics = [p, r, metric_f05]
    # callback that runs after each batch (the original assigned a ProgressBar
    # and then immediately overwrote it; kept here as an alternative)
    # batch_end_callback = mx.callback.ProgressBar(np.ceil(float(args.num_examples) / args.batch_size))
    batch_end_callback = mx.callback.Speedometer(args.batch_size, 50)
    # setup logger
    head = '%(asctime)-15s %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=head)
    # plot an image of the network structure
    mx.viz.plot_network(network).view()
    # call fit function
    model.fit(train_data_iter,
              begin_epoch=args.load_epoch if args.load_epoch else 0,
              num_epoch=args.epoch,
              eval_data=valid_data_iter,
              eval_metric=eval_metrics,
              kvstore=kv,
              optimizer=args.optimizer,
              optimizer_params=optimizer_params,
              initializer=init,
              batch_end_callback=batch_end_callback,
              epoch_end_callback=checkpoint,
              allow_missing=True,
              monitor=monitor)
    # end
    print('training is completed!')
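
# A hedged sketch of restoring a saved checkpoint for prediction afterwards
# (prefix and epoch are assumptions; match them to whatever do_checkpoint
# actually wrote under ../checkpoints/):
def predict_demo(prefix, epoch, data_iter):
    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
    mod = mx.mod.Module(context=mx.cpu(), symbol=sym,
                        data_names=['data', 'tags'], label_names=['labels'])
    mod.bind(data_shapes=data_iter.provide_data,
             label_shapes=data_iter.provide_label, for_training=False)
    mod.set_params(arg_params, aux_params, allow_missing=True)
    # per-class sigmoid scores; pick top-3 labels with argsort as in common.py
    return mod.predict(data_iter).asnumpy()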
if __name__ == '__main__':
    train()