TensorFlow code for the Kaggle toxic comment classification competition (Python)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from collections import Counter

# Any results you write to the current directory are saved as output.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
MAXLEN = 1000
EMBED_SIZE = 50
MODEL_DEPTH = 6
UNKNOWN_CHAR = 'ⓤ'
PAD_CHAR = '℗'
BSIZE = 512
EPOCHS = 4
train_data = pd.read_csv("train.csv")
#train_data = train_data.sample(frac = 0.1)
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

sentences_train = train_data["comment_text"].fillna("_NAN_").values
sentences_test = test_data["comment_text"].fillna("_NAN_").values
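# The CSVs are the Kaggle Toxic Comment Classification Challenge files:
# train.csv carries comment_text plus the six binary labels listed above,
# while test.csv carries comment_text only.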
def create_char_vocabulary(texts, min_count_chars=100):
    counter = Counter()
    for text in texts:
        counter.update(text)
    print('%s characters found' % len(counter))
    print('keeping characters with count > %s' % min_count_chars)
    vocab = [char for char, count in counter.items() if count > min_count_chars]
    # Reserve index 0 for padding so zero-initialised sequences are already
    # padded, and give unknown characters their own index after the vocabulary.
    char2index = {char: (ind + 1) for ind, char in enumerate(vocab)}
    char2index[PAD_CHAR] = 0
    char2index[UNKNOWN_CHAR] = len(vocab) + 1
    index2char = {ind: char for char, ind in char2index.items()}
    print('%s remaining characters' % len(char2index))
    return char2index, index2char
def char2seq(texts, maxlen):
    res = np.zeros((len(texts), maxlen))  # zeros == padding with PAD_CHAR
    for k, text in enumerate(texts):
        seq = np.zeros(len(text))
        for l, char in enumerate(text):
            # Fall back to the unknown-character index for rare characters.
            seq[l] = char2index.get(char, char2index[UNKNOWN_CHAR])
        seq = seq[:maxlen]
        res[k, :len(seq)] = seq
    return res
char2index, index2char = create_char_vocabulary(sentences_train)
X_train = char2seq(sentences_train, MAXLEN)
X_test = char2seq(sentences_test, MAXLEN)
Y_train = train_data[list_classes].values
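# Both matrices are (num_texts, MAXLEN): short comments are zero-padded on the
# right and characters outside the vocabulary map to the UNKNOWN_CHAR index,
# e.g. X_train.shape == (len(sentences_train), MAXLEN).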
graph = tf.Graph()
with graph.as_default():
    x = tf.placeholder(dtype=tf.int32, shape=(None, MAXLEN))
    y = tf.placeholder(dtype=tf.float32, shape=(None, 6))
    # Declared for feed_dict compatibility; no layer in this graph reads it.
    is_training = tf.placeholder(tf.bool, [], name='is_training')
    embedding = tf.get_variable("embedding", [len(char2index), EMBED_SIZE], dtype=tf.float32)
    x2 = tf.nn.embedding_lookup(embedding, x, name="embedded_input")
    for i in range(3, 3 + MODEL_DEPTH):
        x2 = tf.layers.conv1d(x2, filters=2**i, kernel_size=3, strides=1)
        x2 = tf.layers.conv1d(x2, filters=2**i, kernel_size=3, strides=1)
        x2 = tf.layers.max_pooling1d(x2, pool_size=2, strides=2)
    x2 = tf.reduce_mean(x2, axis=1)
    x2 = tf.contrib.layers.fully_connected(x2, 64, activation_fn=tf.nn.relu)
    # The final layer already applies a sigmoid, so `logits` holds
    # probabilities and tf.losses.log_loss computes binary cross-entropy.
    logits = tf.contrib.layers.fully_connected(x2, 6, activation_fn=tf.nn.sigmoid)
    loss = tf.losses.log_loss(labels=y, predictions=logits)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    (_, auc_update_op) = tf.metrics.auc(labels=y, predictions=logits, curve='ROC')
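    # Shape walk-through, assuming MAXLEN = 1000: each valid conv1d with
    # kernel_size=3 trims two timesteps and each pool halves the length, so the
    # six blocks reduce (batch, 1000, 50) to (batch, 11, 256); reduce_mean then
    # averages over time before the 64-unit and 6-unit dense layers.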
train_iters = len(X_train) - BSIZE  # the last partial batch is dropped
with tf.Session(graph=graph) as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for epoch in range(EPOCHS):
        step = 0
        # Local variables back the streaming AUC; reset them every epoch so the
        # metric is accumulated per epoch rather than over the whole run.
        tf.local_variables_initializer().run(session=sess)
        while step * BSIZE < train_iters:
            batch_x = X_train[step * BSIZE:(step + 1) * BSIZE]
            batch_y = Y_train[step * BSIZE:(step + 1) * BSIZE]
            logloss, _, roc_auc = sess.run([loss, optimizer, auc_update_op],
                                           feed_dict={x: batch_x,
                                                      y: batch_y,
                                                      is_training: True})
            print('e%s -- s%s -- logloss: %s -- roc_auc: %s' % (epoch, step, logloss, roc_auc))
            step += 1
    num_batches = (len(X_test) // BSIZE) + 1
    res = np.zeros((len(X_test), 6))
    for s in range(num_batches):
        if s % 50 == 0:
            print(s)
        batch_x_test = X_test[s * BSIZE:(s + 1) * BSIZE]
        logits_ = sess.run(logits, feed_dict={x: batch_x_test,
                                              is_training: False})
        res[s * BSIZE:(s + 1) * BSIZE] = logits_

sample_submission[list_classes] = res
fn = 'submission.csv'
sample_submission.to_csv(fn, index=False)
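For newer TensorFlow versions, where tf.contrib and tf.layers no longer exist, the same character-level CNN can be expressed through tf.keras. The sketch below is an illustrative re-expression, not part of the original gist: build_char_cnn is a hypothetical helper name, and the layer sizes simply mirror the graph above.

from tensorflow import keras

def build_char_cnn(vocab_size, maxlen=MAXLEN, embed_size=EMBED_SIZE, depth=MODEL_DEPTH):
    # Same stack as above: embedding, `depth` conv/conv/pool blocks with
    # 2**i filters, global average pooling, then two dense layers.
    inp = keras.layers.Input(shape=(maxlen,))
    h = keras.layers.Embedding(vocab_size, embed_size)(inp)
    for i in range(3, 3 + depth):
        h = keras.layers.Conv1D(2 ** i, kernel_size=3)(h)
        h = keras.layers.Conv1D(2 ** i, kernel_size=3)(h)
        h = keras.layers.MaxPooling1D(pool_size=2)(h)
    h = keras.layers.GlobalAveragePooling1D()(h)  # == tf.reduce_mean over time
    h = keras.layers.Dense(64, activation='relu')(h)
    out = keras.layers.Dense(6, activation='sigmoid')(h)
    model = keras.Model(inp, out)
    model.compile(optimizer=keras.optimizers.Adam(0.001),
                  loss='binary_crossentropy',
                  metrics=[keras.metrics.AUC(curve='ROC')])
    return model

# Usage: model = build_char_cnn(len(char2index))
#        model.fit(X_train, Y_train, batch_size=BSIZE, epochs=EPOCHS)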