Skip to content

Instantly share code, notes, and snippets.

@fty8788
Created August 21, 2017 03:55
Show Gist options
  • Save fty8788/313a9e94e5738065f482436307da8a2a to your computer and use it in GitHub Desktop.
Save fty8788/313a9e94e5738065f482436307da8a2a to your computer and use it in GitHub Desktop.
DSSM_trainer_config.py
from paddle import v2 as paddle
from paddle.v2.attr import ParamAttr
from utils import TaskType, logger, ModelType, ModelArch, load_dic
from paddle.trainer_config_helpers import *
class DSSM(object):
def __init__(self,
dnn_dims=[],
vocab_sizes=[],
model_type=ModelType.create_classification(),
model_arch=ModelArch.create_cnn(),
share_semantic_generator=False,
class_num=None,
share_embed=False,
is_infer=False):
'''
@dnn_dims: list of int
dimentions of each layer in semantic vector generator.
@vocab_sizes: 2-d tuple
size of both left and right items.
@model_type: int
type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
@model_arch: int
model architecture
@share_semantic_generator: bool
whether to share the semantic vector generator for both left and right.
@share_embed: bool
whether to share the embeddings between left and right.
@class_num: int
number of categories.
'''
assert len(
vocab_sizes
) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
assert len(dnn_dims) > 1, "more than two layers is needed."
self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes
self.share_semantic_generator = share_semantic_generator
self.share_embed = share_embed
self.model_type = ModelType(model_type)
self.model_arch = ModelArch(model_arch)
self.class_num = class_num
self.is_infer = is_infer
logger.warning("build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch))
logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))
# bind model architecture
_model_arch = {
'cnn': self.create_cnn,
'fc': self.create_fc,
'rnn': self.create_rnn,
}
def _model_arch_creater(emb, prefix=''):
sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
dnn = self.create_dnn(sent_vec, prefix)
return dnn
self.model_arch_creater = _model_arch_creater
# build model type
_model_type = {
'classification': self._build_classification_model,
'rank': self._build_rank_model,
'regression': self._build_regression_model,
}
#print 'model type: ', str(self.model_type)
self.model_type_creater = _model_type[str(self.model_type)]
def __call__(self):
return self.model_type_creater()
def create_embedding(self, input, prefix=''):
'''
Create an embedding table whose name has a `prefix`.
'''
logger.info("create embedding table [%s] which dimention is %d" %
(prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding(
input=input,
size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix))
return emb
def create_fc(self, emb, prefix=''):
'''
A multi-layer fully connected neural networks.
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
return fc
def create_rnn(self, emb, prefix=''):
'''
A GRU sentence vector learner.
'''
gru = paddle.layer.gru_memory(
input=emb, )
sent_vec = paddle.layer.last_seq(gru)
return sent_vec
def create_cnn(self, emb, prefix=''):
'''
A multi-layer CNN.
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool(
input=emb,
context_len=context_len,
hidden_size=hidden_size,
# set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
fc_param_attr=ParamAttr(name=key + '_fc.w'),
fc_bias_attr=ParamAttr(name=key + '_fc.b'),
pool_bias_attr=ParamAttr(name=key + '_pool.b'))
return conv
logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4
def create_dnn(self, sent_vec, prefix):
# if more than three layers, than a fc layer will be added.
if len(self.dnn_dims) > 1:
_input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc(
name=name,
input=_input_layer,
size=dim,
act=paddle.activation.Tanh(),
param_attr=ParamAttr(name='%s.w' % name),
bias_attr=ParamAttr(name='%s.b' % name))
_input_layer = fc
return _input_layer
def _build_classification_model(self):
logger.info("build classification model")
assert self.model_type.is_classification()
return self._build_classification_or_regression_model(
is_classification=True)
def _build_regression_model(self):
logger.info("build regression model")
assert self.model_type.is_regression()
return self._build_classification_or_regression_model(
is_classification=False)
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
logger.info("build rank model")
assert self.model_type.is_rank()
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
if not self.is_infer:
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source left right'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
if not self.is_infer:
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, label
return None, [left_score, right_score], label
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
if is_classification:
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_vector(1))
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'left right'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'left right'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.mse_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return None, prediction, label
layer_dims = [256, 128, 64, 32]
source_dic_path = "term.dict.bigram.61.150"
target_dic_path = source_dic_path
model_type = ModelType(ModelType.CLASSIFICATION_MODE)
model_arch = ModelArch(ModelArch.FC_MODE)
share_network_between_source_target = False
class_num = 2
share_embed = False
cost, prediction, label = DSSM(
dnn_dims=layer_dims,
vocab_sizes=[
len(load_dic(path))
for path in [source_dic_path, target_dic_path]
],
model_type=model_type,
model_arch=model_arch,
share_semantic_generator=share_network_between_source_target,
class_num=class_num,
share_embed=share_embed)()
outputs(prediction)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment