# TensorFlow character-level model for the Kaggle toxic comment classification competition (Python).
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.contrib.keras.api.keras.losses import binary_crossentropy
from collections import Counter
# Any results you write to the current directory are saved as output.
# Label columns read from train.csv and written to the submission file.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Every comment is truncated / zero-padded to this many characters by char2seq.
MAXLEN = 1000

# Single-character placeholder symbols registered in the character vocabulary
# (create_char_vocabulary maps UNKNOWN_CHAR to 0 and PAD_CHAR to -1).
# NOTE(review): UNKNOWN_CHAR was referenced but never defined in this file;
# the symbol below is an assumption -- confirm against the original kernel.
UNKNOWN_CHAR = '\u24e4'  # circled small u
# The previous literal was the 3-character mojibake of this UTF-8 symbol.
PAD_CHAR = '\u2117'  # sound recording copyright sign (circled P)

# Mini-batch size used for both training and prediction.
BSIZE = 512

# NOTE(review): the three hyper-parameters below are used by the graph and the
# training loop but were missing from this file. The values are placeholder
# assumptions so the script runs -- tune / confirm before relying on them.
EMDED_SIZE = 32   # width of each character embedding vector
# Number of conv-conv-pool stages. With MAXLEN = 1000 and unpadded
# kernel-size-3 convolutions the sequence is exhausted after 7 stages.
MODEL_DEPTH = 4
EPOCHS = 2        # the training loop iterates range(EPOCHS + 1)
# Read the three competition CSV files (paths are relative to the working directory).
train_data, test_data, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Optional: uncomment to prototype on a 10% subsample of the training rows.
# train_data = train_data.sample(frac=0.1)

# Raw comment strings as numpy arrays; missing comments become the literal "_NAN_".
sentences_train, sentences_test = (
    frame["comment_text"].fillna("_NAN_").values
    for frame in (train_data, test_data)
)
def create_char_vocabulary(texts, min_count_chars=100, unknown_char=None, pad_char=None):
    """Build character <-> index lookup tables from a collection of strings.

    Characters occurring strictly more than ``min_count_chars`` times across
    ``texts`` receive the indices 1, 2, ... in order of first appearance.
    The unknown-character symbol is mapped to 0 and the padding symbol to -1.

    Args:
        texts: iterable of strings to count characters in.
        min_count_chars: a character is kept only if its total count is
            strictly greater than this threshold.
        unknown_char: symbol stored at index 0; defaults to the module-level
            ``UNKNOWN_CHAR``.
        pad_char: symbol stored at index -1; defaults to the module-level
            ``PAD_CHAR``.

    Returns:
        A ``(char2index, index2char)`` tuple of dictionaries that are inverse
        to each other.
    """
    if unknown_char is None:
        unknown_char = UNKNOWN_CHAR
    if pad_char is None:
        pad_char = PAD_CHAR

    # Counter.update on a string adds one count per character.
    counter = Counter()
    for text in texts:
        counter.update(text)

    print('%s characters found' % len(counter))
    print('keepin characters with count > %s' % min_count_chars)
    vocab = [char for char, count in counter.items() if count > min_count_chars]

    # Index 0 is reserved for the unknown symbol, so real characters start at 1.
    char2index = {char: ind for ind, char in enumerate(vocab, start=1)}
    char2index[unknown_char] = 0
    char2index[pad_char] = -1
    index2char = {ind: char for char, ind in char2index.items()}
    print('%s remaining characters' % len(char2index))
    return char2index, index2char
def char2seq(texts, maxlen, vocab=None, unknown_char=None):
    """Encode strings as a fixed-width matrix of character indices.

    Each text is truncated to its first ``maxlen`` characters; shorter texts
    are right-padded with 0. Note that 0 is also the index that
    ``create_char_vocabulary`` assigns to the unknown-character symbol, so
    padding and unknown characters are indistinguishable in the output.

    Args:
        texts: sequence of strings (must support ``len``).
        maxlen: number of columns of the returned matrix.
        vocab: character -> index mapping; defaults to the module-level
            ``char2index``.
        unknown_char: key of ``vocab`` whose index is used for characters
            missing from the mapping; defaults to the module-level
            ``UNKNOWN_CHAR``.

    Returns:
        A float ``numpy`` array of shape ``(len(texts), maxlen)``.

    Raises:
        KeyError: if the unknown-character symbol is not a key of the mapping.
    """
    lookup = char2index if vocab is None else vocab
    unknown_id = lookup[UNKNOWN_CHAR if unknown_char is None else unknown_char]

    res = np.zeros((len(texts), maxlen))
    for row, text in enumerate(texts):
        # Truncate first so characters beyond maxlen are never looked up.
        clipped = text[:maxlen]
        res[row, :len(clipped)] = [lookup.get(char, unknown_id) for char in clipped]
    return res
# The vocabulary is fitted on the training comments only and reused for both splits.
char2index, index2char = create_char_vocabulary(sentences_train)
X_train, X_test = (
    char2seq(split, MAXLEN) for split in (sentences_train, sentences_test)
)
# Label matrix: one column per entry of list_classes.
Y_train = train_data[list_classes].values
# Build the character-level 1-D convolutional classifier in its own graph.
graph = tf.Graph()
with graph.as_default():
    num_classes = len(list_classes)

    # x: integer character indices, one row per comment; y: per-class targets.
    x = tf.placeholder(dtype=tf.int32, shape=(None, MAXLEN))
    y = tf.placeholder(dtype=tf.float32, shape=(None, num_classes))
    # NOTE(review): is_training is not consumed by any layer below; it is kept
    # because the session code may feed it.
    is_training = tf.placeholder(tf.bool, [], name='is_training')

    # One trainable vector of width EMDED_SIZE per vocabulary entry.
    embedding = tf.get_variable("embedding", [len(char2index), EMDED_SIZE], dtype=tf.float32)
    x2 = tf.nn.embedding_lookup(embedding, x, name="embedded_input")

    # MODEL_DEPTH stages of conv -> conv -> max-pool; the filter count doubles
    # each stage (8, 16, 32, ...).
    # NOTE(review): the convolutions are created without an activation
    # function -- confirm whether a nonlinearity was intended here.
    for i in range(3, 3 + MODEL_DEPTH):
        x2 = tf.layers.conv1d(x2, filters=2**i, kernel_size=3, strides=1)
        x2 = tf.layers.conv1d(x2, filters=2**i, kernel_size=3, strides=1)
        x2 = tf.layers.max_pooling1d(x2, pool_size=2, strides=2)

    # Average over the remaining sequence positions, then classify.
    x2 = tf.reduce_mean(x2, axis=1)
    x2 = tf.contrib.layers.fully_connected(x2, 64, activation_fn=tf.nn.relu)
    # Despite the name, `logits` holds sigmoid outputs, which is what
    # tf.losses.log_loss and tf.metrics.auc receive as predictions below.
    logits = tf.contrib.layers.fully_connected(x2, num_classes, activation_fn=tf.nn.sigmoid)

    loss = tf.losses.log_loss(labels=y, predictions=logits)

    # Tie any registered update ops to the training step; previously the
    # collection was fetched but never used.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)

    # Streaming ROC-AUC; only the update op is kept, the value tensor is discarded.
    (_, auc_update_op) = tf.metrics.auc(labels=y, predictions=logits, curve='ROC')
# Train the model, predict on the test set and write the submission file.
with tf.Session(graph=graph) as sess:
    # Global variables hold the model weights; tf.metrics.auc additionally
    # creates local variables, so both initializers must run before training.
    init = tf.global_variables_initializer()
    local_init = tf.local_variables_initializer()
    sess.run([init, local_init])

    for epoch in range(EPOCHS + 1):
        # Walk over the whole training set, including a final partial batch.
        # The earlier `len(X_train) - BSIZE` bound skipped the tail and ran no
        # step at all when there were at most BSIZE rows.
        for step, start in enumerate(range(0, len(X_train), BSIZE)):
            batch_x = X_train[start:start + BSIZE]
            batch_y = Y_train[start:start + BSIZE]
            logloss, _, roc_auc = sess.run(
                [loss, optimizer, auc_update_op],
                feed_dict={x: batch_x, y: batch_y, is_training: True})
            print('e%s -- s%s -- logloss: %s -- roc_auc: %s' % (epoch, step, logloss, roc_auc))

    # Predict in batches; ceil division avoids an empty trailing batch.
    num_batches = -(-len(X_test) // BSIZE)
    res = np.zeros((len(X_test), len(list_classes)))
    for s, start in enumerate(range(0, len(X_test), BSIZE)):
        if s % 50 == 0:
            print('predicting batch %s of %s' % (s, num_batches))
        batch_x_test = X_test[start:start + BSIZE]
        logits_ = sess.run(logits, feed_dict={x: batch_x_test, is_training: False})
        res[start:start + BSIZE] = logits_

# Overwrite the label columns of the sample submission with the predictions.
sample_submission[list_classes] = res
fn = 'submission.csv'
sample_submission.to_csv(fn, index=False)
