@marekgalovic
Created April 14, 2017 03:06
Quora question pairs - decomposable NLI
# Decomposable attention model for NLI
# https://arxiv.org/pdf/1606.01933v1.pdf
import tensorflow as tf
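
# The graph below follows the paper's attend / compare / aggregate steps
# (paraphrased; F and G are ReLU dense layers with weights shared across the
# two questions, H is the classification MLP at the end):
#   Attend:    e[i,j]  = F(a_i) . F(b_j)
#              beta_i  = sum_j softmax_j(e[i,:]) * b_j    (b soft-aligned to a_i)
#              alpha_j = sum_i softmax_i(e[:,j]) * a_i    (a soft-aligned to b_j)
#   Compare:   v1_i = G([a_i; beta_i]),  v2_j = G([b_j; alpha_j])
#   Aggregate: v = [sum_i v1_i; sum_j v2_j],  prediction = H(v)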
dnli_graph = tf.Graph()
with dnli_graph.as_default():
    embedding_matrix = tf.Variable(tf.zeros([DICTIONARY_SIZE, EMBEDDING_SIZE]), name='word_embeddings', trainable=False)
    embedding_placeholder = tf.placeholder(tf.float32, [DICTIONARY_SIZE, EMBEDDING_SIZE])
    embedding_init_op = embedding_matrix.assign(embedding_placeholder)

    X_Q1 = tf.placeholder(tf.int32, [None, None])  # [batch, question 1 length] token ids
    X_Q2 = tf.placeholder(tf.int32, [None, None])  # [batch, question 2 length] token ids
    y_ = tf.placeholder(tf.int32, [None, 2])       # one-hot duplicate / not-duplicate labels
    is_training = tf.placeholder(tf.bool)

    with tf.name_scope('embeddings_lookup'):
        Q1_embeddings = tf.nn.embedding_lookup(embedding_matrix, X_Q1)
        Q2_embeddings = tf.nn.embedding_lookup(embedding_matrix, X_Q2)

    with tf.name_scope('attention'):
        # Shared projection F, then word-to-word scores e: [batch, len_q1, len_q2]
        e_Q1 = tf.layers.dense(Q1_embeddings, EMBEDDING_SIZE, activation=tf.nn.relu, name='embedding_projection_nn')
        e_Q2 = tf.layers.dense(Q2_embeddings, EMBEDDING_SIZE, activation=tf.nn.relu, name='embedding_projection_nn', reuse=True)
        e = tf.matmul(e_Q1, tf.transpose(e_Q2, [0,2,1]))
        # Soft alignments: beta aligns Q2 to each Q1 word, alpha aligns Q1 to each Q2 word
        beta = tf.matmul(tf.nn.softmax(e), Q2_embeddings)
        alpha = tf.matmul(tf.nn.softmax(tf.transpose(e, [0,2,1])), Q1_embeddings)

    with tf.name_scope('comparison'):
        v_Q1 = tf.layers.dense(tf.concat([Q1_embeddings, beta], 2), EMBEDDING_SIZE, activation=tf.nn.relu, name='attention_nn')
        v_Q2 = tf.layers.dense(tf.concat([Q2_embeddings, alpha], 2), EMBEDDING_SIZE, activation=tf.nn.relu, name='attention_nn', reuse=True)
        # Sum over words and concatenate both questions: [batch, 2*EMBEDDING_SIZE]
        v = tf.concat([
            tf.reduce_sum(v_Q1, 1),
            tf.reduce_sum(v_Q2, 1)
        ], 1)
        tf.summary.histogram('v_activations', v)
        v_dropout = tf.layers.dropout(v, rate=0.3, training=is_training)

    with tf.name_scope('classification'):
        _L1 = tf.layers.dense(v_dropout, EMBEDDING_SIZE, activation=tf.nn.relu)
        L1 = tf.layers.dropout(_L1, rate=0.3, training=is_training)
        tf.summary.histogram('l1_activations', _L1)
        _L2 = tf.layers.dense(L1, EMBEDDING_SIZE, activation=tf.nn.relu)
        L2 = tf.layers.dropout(_L2, rate=0.2, training=is_training)
        tf.summary.histogram('l2_activations', _L2)
        y = tf.layers.dense(L2, 2, activation=tf.nn.sigmoid, name='classification_nn')

    loss = tf.losses.log_loss(y_, y)
    tf.summary.scalar('loss', loss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    train_op = tf.train.AdamOptimizer().minimize(loss)
    metrics_op = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=50)
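
# Hypothetical usage sketch (not part of the original gist): one training step on
# random dummy data, assuming DICTIONARY_SIZE and EMBEDDING_SIZE are defined
# elsewhere in the notebook; `dummy_embeddings` stands in for pretrained word
# vectors (e.g. GloVe).
import numpy as np

dummy_embeddings = np.random.randn(DICTIONARY_SIZE, EMBEDDING_SIZE).astype(np.float32)
q1_batch = np.random.randint(0, DICTIONARY_SIZE, size=(32, 20)).astype(np.int32)  # padded question 1 ids
q2_batch = np.random.randint(0, DICTIONARY_SIZE, size=(32, 25)).astype(np.int32)  # padded question 2 ids
labels_batch = np.eye(2, dtype=np.int32)[np.random.randint(0, 2, size=32)]        # one-hot labels

with dnli_graph.as_default():
    init_op = tf.global_variables_initializer()

with tf.Session(graph=dnli_graph) as sess:
    sess.run(init_op)
    sess.run(embedding_init_op, feed_dict={embedding_placeholder: dummy_embeddings})
    _, batch_loss = sess.run(
        [train_op, loss],
        feed_dict={X_Q1: q1_batch, X_Q2: q2_batch, y_: labels_batch, is_training: True})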
# Extended version of the vanilla decomposable NLI model proposed in the paper.
# The extension adds intra-sentence (self) attention to learn which words are
# important within each sentence, using a dense layer whose weights are shared
# across both sentences. A softmax over the resulting scores gives attention
# weights, which are then multiplied with the original embedded sentences.
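
# Toy numpy illustration (not from the gist) of the self-attention step just
# described, for a single sentence of 3 words with 2-dimensional embeddings and
# an arbitrary projection matrix standing in for the shared dense layer.
import numpy as np

a = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])            # [len, emb] embedded sentence
proj = np.maximum(a @ np.array([[0.5, 1.0], [1.0, 0.5]]), 0)  # shared dense + ReLU (made-up weights)
scores = proj @ proj.T                                         # [len, len] word-to-word scores
weights = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # row-wise softmax
gamma = weights @ a                                            # attention-weighted sentence, [len, emb]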
ednli_graph = tf.Graph()
with ednli_graph.as_default():
    embedding_matrix = tf.Variable(tf.zeros([DICTIONARY_SIZE, EMBEDDING_SIZE]), name='word_embeddings', trainable=False)
    embedding_placeholder = tf.placeholder(tf.float32, [DICTIONARY_SIZE, EMBEDDING_SIZE])
    embedding_init_op = embedding_matrix.assign(embedding_placeholder)

    X1 = tf.placeholder(tf.int32, [None, None])  # [batch, sentence 1 length] token ids
    X2 = tf.placeholder(tf.int32, [None, None])  # [batch, sentence 2 length] token ids
    y_ = tf.placeholder(tf.int32, [None, 2])     # one-hot duplicate / not-duplicate labels
    is_training = tf.placeholder(tf.bool)

    with tf.name_scope('embeddings_lookup'):
        X1_embedded = tf.nn.embedding_lookup(embedding_matrix, X1)
        X2_embedded = tf.nn.embedding_lookup(embedding_matrix, X2)

    # Sentence self-attention
    with tf.name_scope('self_attention'):
        X1_self_projection = tf.layers.dense(X1_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_self_attention')
        X2_self_projection = tf.layers.dense(X2_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_self_attention', reuse=True)
        # Word-to-word scores within each sentence: [batch, len, len]
        X1_self_e = tf.matmul(X1_self_projection, tf.transpose(X1_self_projection, [0,2,1]))
        X2_self_e = tf.matmul(X2_self_projection, tf.transpose(X2_self_projection, [0,2,1]))
        X1_gamma = tf.matmul(tf.nn.softmax(X1_self_e), X1_embedded)
        X2_gamma = tf.matmul(tf.nn.softmax(X2_self_e), X2_embedded)

    # Inter-sentence attention
    with tf.name_scope('attention'):
        X1_e = tf.layers.dense(X1_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_attention')
        X2_e = tf.layers.dense(X2_embedded, EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_attention', reuse=True)
        e = tf.matmul(X1_e, tf.transpose(X2_e, [0,2,1]))  # [batch, len_x1, len_x2]
        beta = tf.matmul(tf.nn.softmax(e), X2_e)
        alpha = tf.matmul(tf.nn.softmax(tf.transpose(e, [0,2,1])), X1_e)

    # Inter-sentence comparison
    with tf.name_scope('comparison'):
        X1_v = tf.layers.dense(tf.concat([X1_embedded, beta, X1_gamma], 2), 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_comparison')
        X2_v = tf.layers.dense(tf.concat([X2_embedded, alpha, X2_gamma], 2), 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='sentence_comparison', reuse=True)

    # Sum over words in each sentence and concatenate: [batch, 4*EMBEDDING_SIZE]
    with tf.name_scope('sum_reduction'):
        v = tf.concat([
            tf.reduce_sum(X1_v, 1),
            tf.reduce_sum(X2_v, 1),
        ], 1)
        v_dropout = tf.layers.dropout(v, rate=0.3, training=is_training)

    # Classification
    with tf.name_scope('classification'):
        L1 = tf.layers.dropout(
            tf.layers.dense(v_dropout, 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='L1'),
            rate=0.3, training=is_training
        )
        tf.summary.histogram('l1_activations', L1)
        L2 = tf.layers.dropout(
            tf.layers.dense(L1, 2*EMBEDDING_SIZE, activation=tf.nn.relu, name='L2'),
            rate=0.3, training=is_training
        )
        tf.summary.histogram('l2_activations', L2)
        L3 = tf.layers.dropout(
            tf.layers.dense(L2, EMBEDDING_SIZE, activation=tf.nn.relu, name='L3'),
            rate=0.2, training=is_training
        )
        tf.summary.histogram('l3_activations', L3)
        y = tf.layers.dense(L3, 2, activation=tf.nn.sigmoid, name='y')
        tf.summary.histogram('y_activations', y)

    loss = tf.losses.log_loss(y_, y)
    tf.summary.scalar('loss', loss)
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(y_, 1), tf.argmax(y, 1)), tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    train_op = tf.train.AdamOptimizer().minimize(loss)
    metrics_op = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=50)
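
# Hypothetical helper (not part of the original gist): both graphs expect
# rectangular int32 id matrices, so variable-length tokenised questions have to
# be padded to the longest sequence in the batch before being fed in; note that
# the graphs above do not mask these padded positions.
import numpy as np

def pad_batch(token_id_lists, pad_id=0):
    """Pad a list of token-id lists into a [batch, max_len] int32 matrix."""
    max_len = max(len(ids) for ids in token_id_lists)
    batch = np.full((len(token_id_lists), max_len), pad_id, dtype=np.int32)
    for i, ids in enumerate(token_id_lists):
        batch[i, :len(ids)] = ids
    return batch

# Example: two tokenised questions of different lengths -> a 2x4 matrix.
X1_batch = pad_batch([[12, 7, 55], [3, 9, 14, 2]])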