Created
August 21, 2017 03:55
-
-
Save fty8788/313a9e94e5738065f482436307da8a2a to your computer and use it in GitHub Desktop.
DSSM_trainer_config.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from paddle import v2 as paddle | |
from paddle.v2.attr import ParamAttr | |
from utils import TaskType, logger, ModelType, ModelArch, load_dic | |
from paddle.trainer_config_helpers import * | |
class DSSM(object):
    '''
    Configurable builder of a DSSM-style network on top of PaddlePaddle v2.

    The constructor only records the configuration and selects the builder
    functions; the layers are created when the instance is called. Calling
    the instance returns a `(cost, prediction, label)` triple whose exact
    contents depend on the model type and on `is_infer` (see the
    `_build_*` methods).
    '''

    def __init__(self,
                 dnn_dims=[],
                 vocab_sizes=[],
                 model_type=ModelType.create_classification(),
                 model_arch=ModelArch.create_cnn(),
                 share_semantic_generator=False,
                 class_num=None,
                 share_embed=False,
                 is_infer=False):
        '''
        @dnn_dims: list of int
            dimensions of each layer in semantic vector generator; the first
            entry is the embedding size.
        @vocab_sizes: 2-d tuple
            size of both left and right items.
        @model_type: int
            type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2'
        @model_arch: int
            model architecture
        @share_semantic_generator: bool
            whether to share the semantic vector generator for both left and right.
        @share_embed: bool
            whether to share the embeddings between left and right.
        @class_num: int
            number of categories.
        @is_infer: bool
            whether the network is built for inference.

        NOTE: the list defaults are never mutated here, and both are rejected
        by the assertions below, so callers must always pass real values.
        '''
        assert len(
            vocab_sizes
        ) == 2, "vocab_sizes specify the sizes left and right inputs, and dim should be 2."
        assert len(dnn_dims) > 1, "more than two layers is needed."

        self.dnn_dims = dnn_dims
        self.vocab_sizes = vocab_sizes
        self.share_semantic_generator = share_semantic_generator
        self.share_embed = share_embed
        self.model_type = ModelType(model_type)
        self.model_arch = ModelArch(model_arch)
        self.class_num = class_num
        self.is_infer = is_infer

        logger.warning("build DSSM model with config of %s, %s" %
                       (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes))

        # bind model architecture
        _model_arch = {
            'cnn': self.create_cnn,
            'fc': self.create_fc,
            'rnn': self.create_rnn,
        }

        def _model_arch_creater(emb, prefix=''):
            # Dispatch on the normalized `self.model_arch` (not on the raw
            # constructor argument) so the lookup agrees with the model-type
            # lookup below. Assumes str(ModelArch) yields one of the keys
            # above -- TODO confirm against utils.ModelArch.
            build_sent_vec = _model_arch[str(self.model_arch)]
            sent_vec = build_sent_vec(emb, prefix)
            return self.create_dnn(sent_vec, prefix)

        self.model_arch_creater = _model_arch_creater

        # build model type
        _model_type = {
            'classification': self._build_classification_model,
            'rank': self._build_rank_model,
            'regression': self._build_regression_model,
        }
        self.model_type_creater = _model_type[str(self.model_type)]

    def __call__(self):
        '''
        Build the network selected in the constructor and return its result.
        '''
        return self.model_type_creater()

    def create_embedding(self, input, prefix=''):
        '''
        Create an embedding table whose name has a `prefix`.

        The table width is `self.dnn_dims[0]` and its parameter is named
        '<prefix>_emb.w', so two calls with the same prefix use the same
        parameter name.
        '''
        logger.info("create embedding table [%s] which dimention is %d" %
                    (prefix, self.dnn_dims[0]))
        emb = paddle.layer.embedding(
            input=input,
            size=self.dnn_dims[0],
            param_attr=ParamAttr(name='%s_emb.w' % prefix))
        return emb

    def create_fc(self, emb, prefix=''):
        '''
        Max-pool the embedded sequence and feed it to one fully connected
        layer of size `self.dnn_dims[1]`.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            accepted for a uniform builder signature; not used by this
            builder.
        '''
        _input_layer = paddle.layer.pooling(
            input=emb, pooling_type=paddle.pooling.Max())
        fc = paddle.layer.fc(input=_input_layer, size=self.dnn_dims[1])
        return fc

    def create_rnn(self, emb, prefix=''):
        '''
        A GRU sentence vector learner.

        Runs a GRU over the embedded sequence and returns the result of
        `last_seq` on it. `prefix` is accepted for a uniform builder
        signature and is not used.
        '''
        gru = paddle.layer.gru_memory(
            input=emb, )
        sent_vec = paddle.layer.last_seq(gru)
        return sent_vec

    def create_cnn(self, emb, prefix=''):
        '''
        Build two `sequence_conv_pool` blocks (context widths 3 and 4) over
        the embedded sequence and return both.

        @emb: paddle.layer
            output of the embedding layer
        @prefix: str
            accepted for a uniform builder signature; not used -- parameter
            names are always built from the fixed key prefix "cnn".

        NOTE(review): because the key prefix is fixed, every call produces
        the same parameter names regardless of `share_semantic_generator`;
        the names (including the 'contex_proj.w' spelling) are kept as they
        are so existing saved parameters keep matching.
        '''

        def create_conv(context_len, hidden_size, prefix):
            key = "%s_%d_%d" % (prefix, context_len, hidden_size)
            conv = paddle.networks.sequence_conv_pool(
                input=emb,
                context_len=context_len,
                hidden_size=hidden_size,
                # set parameter attr for parameter sharing
                context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'),
                fc_param_attr=ParamAttr(name=key + '_fc.w'),
                fc_bias_attr=ParamAttr(name=key + '_fc.b'),
                pool_bias_attr=ParamAttr(name=key + '_pool.b'))
            return conv

        logger.info('create a sequence_conv_pool which context width is 3')
        conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
        logger.info('create a sequence_conv_pool which context width is 4')
        conv_4 = create_conv(4, self.dnn_dims[1], "cnn")

        return conv_3, conv_4

    def create_dnn(self, sent_vec, prefix):
        '''
        Stack one Tanh fully connected layer per entry of
        `self.dnn_dims[1:]` on top of `sent_vec` and return the last layer.

        Layers and their parameters are named from `prefix`, the layer index
        and the layer size. When `self.dnn_dims` has no entry after the
        first one, `sent_vec` is returned unchanged.
        '''
        # Start from the sentence vector so the result is always bound, even
        # if the constructor assertion on dnn_dims was stripped (python -O).
        _input_layer = sent_vec
        for layer_id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, layer_id, dim)
            logger.info("create fc layer [%s] which dimention is %d" %
                        (name, dim))
            _input_layer = paddle.layer.fc(
                name=name,
                input=_input_layer,
                size=dim,
                act=paddle.activation.Tanh(),
                param_attr=ParamAttr(name='%s.w' % name),
                bias_attr=ParamAttr(name='%s.b' % name))
        return _input_layer

    def _build_classification_model(self):
        '''
        Build the classification variant of the two-tower model.
        '''
        logger.info("build classification model")
        assert self.model_type.is_classification()
        return self._build_classification_or_regression_model(
            is_classification=True)

    def _build_regression_model(self):
        '''
        Build the regression variant of the two-tower model.
        '''
        logger.info("build regression model")
        assert self.model_type.is_regression()
        return self._build_classification_or_regression_model(
            is_classification=False)

    def _build_rank_model(self):
        '''
        Build a pairwise rank model.

        A pairwise rank model has 4 inputs:
          - source sentence
          - left_target sentence
          - right_target sentence
          - label, 1 if left_target should be sorted in front of
            right_target, otherwise 0 (only created when not inferring).

        Returns `(cost, None, label)` when training and
        `(None, [left_score, right_score], None)` when `is_infer` is set.
        '''
        logger.info("build rank model")
        assert self.model_type.is_rank()

        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        left_target = paddle.layer.data(
            name='left_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        right_target = paddle.layer.data(
            name='right_target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))

        # The label layer only exists for training; keep the name bound so
        # the inference return below does not fail on an undefined local.
        label = None
        if not self.is_infer:
            label = paddle.layer.data(
                name='label_input', type=paddle.data_type.integer_value(1))

        if self.share_semantic_generator:
            prefixs = '_ _ _'.split()
        else:
            prefixs = 'source left right'.split()
        # One embedding prefix per input: three entries are required in both
        # branches, otherwise the third input has no prefix to use.
        if self.share_embed:
            embed_prefixs = '_ _ _'.split()
        else:
            embed_prefixs = 'source target target'.split()

        inputs = [source, left_target, right_target]
        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip(inputs, embed_prefixs)
        ]
        semantics = [
            self.model_arch_creater(word_vec, prefix=tower_prefix)
            for word_vec, tower_prefix in zip(word_vecs, prefixs)
        ]

        # cossim score of source and left_target
        left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
        # cossim score of source and right target
        right_score = paddle.layer.cos_sim(semantics[0], semantics[2])

        if not self.is_infer:
            # rank cost
            cost = paddle.layer.rank_cost(left_score, right_score, label=label)
            # prediction = left_score - right_score
            # but this operator is not supported currently.
            # so AUC will not used.
            return cost, None, label
        return None, [left_score, right_score], label

    def _build_classification_or_regression_model(self, is_classification):
        '''
        Build a classification/regression model.

        The model has 3 inputs:
          - source sentence
          - target sentence
          - label (an integer class id for classification, a 1-d dense
            vector for regression)

        Returns `(cost, prediction, label)` when training and
        `(None, prediction, label)` when `is_infer` is set.

        NOTE(review): the label data layer and the cost layer are created
        in inference mode as well; only the returned cost is dropped.
        '''
        if is_classification:
            # the softmax width and the label type both need class_num.
            assert self.class_num

        # prepare inputs.
        source = paddle.layer.data(
            name='source_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
        target = paddle.layer.data(
            name='target_input',
            type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
        label = paddle.layer.data(
            name='label_input',
            type=paddle.data_type.integer_value(self.class_num)
            if is_classification else paddle.data_type.dense_vector(1))

        if self.share_semantic_generator:
            prefixs = '_ _'.split()
        else:
            prefixs = 'left right'.split()
        if self.share_embed:
            embed_prefixs = '_ _'.split()
        else:
            embed_prefixs = 'left right'.split()

        inputs = [source, target]
        word_vecs = [
            self.create_embedding(layer, prefix=embed_prefix)
            for layer, embed_prefix in zip(inputs, embed_prefixs)
        ]
        semantics = [
            self.model_arch_creater(word_vec, prefix=tower_prefix)
            for word_vec, tower_prefix in zip(word_vecs, prefixs)
        ]

        if is_classification:
            # classify on the concatenation of both semantic vectors.
            concated_vector = paddle.layer.concat(semantics)
            prediction = paddle.layer.fc(
                input=concated_vector,
                size=self.class_num,
                act=paddle.activation.Softmax())
            cost = paddle.layer.classification_cost(
                input=prediction, label=label)
        else:
            # regress the cosine similarity of both semantic vectors.
            prediction = paddle.layer.cos_sim(*semantics)
            cost = paddle.layer.mse_cost(prediction, label)

        if not self.is_infer:
            return cost, prediction, label
        return None, prediction, label
# ---- trainer configuration -------------------------------------------------
# Embedding size followed by the sizes of the stacked fc layers.
layer_dims = [256, 128, 64, 32]

# Source and target sides read the same dictionary file.
source_dic_path = "term.dict.bigram.61.150"
target_dic_path = source_dic_path

# Task and architecture selection.
model_type = ModelType(ModelType.CLASSIFICATION_MODE)
model_arch = ModelArch(ModelArch.FC_MODE)
class_num = 2

# Parameter-sharing switches between the source and the target tower.
share_network_between_source_target = False
share_embed = False

# Configure the builder first, then call it to create the layers.
_dssm_builder = DSSM(
    dnn_dims=layer_dims,
    vocab_sizes=[
        len(load_dic(path)) for path in [source_dic_path, target_dic_path]
    ],
    model_type=model_type,
    model_arch=model_arch,
    share_semantic_generator=share_network_between_source_target,
    class_num=class_num,
    share_embed=share_embed)
cost, prediction, label = _dssm_builder()

# Expose the prediction layer as the output of this config.
outputs(prediction)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment