@Juancard · Created January 31, 2018
Sentiment analysis over the IMDB dataset using convolutional neural networks
# coding: utf-8
# Sentiment analysis over the IMDB dataset using convolutional neural networks.
# The ConvNet architecture follows the one described in Yoon Kim's paper:
# "Kim, Y. (2014). Convolutional Neural Networks for Sentence Classification.
# Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP 2014), 1746–1751."
# Link: http://arxiv.org/abs/1408.5882
# In[1]:
# sequence classification in the IMDB dataset
import numpy as np
import pandas as pd
import h5py
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
from keras.datasets import imdb
from keras.models import Sequential
from keras.models import load_model, Model
from keras.layers import Dense
from keras.layers import Convolution1D, GlobalMaxPooling1D, MaxPooling1D,Flatten, Dropout, Input
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from keras.callbacks import TensorBoard
from keras.layers.merge import concatenate
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
# In[2]:
print ("Loading imdb dataset")
top_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
num_words=top_words,
skip_top=0,
maxlen=None,
seed=113,
start_char=1,
oov_char=2,
index_from=3)
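# Sanity check (added for illustration, not in the original gist): decode the first
# training review back to words. Keras reserves 0 for padding, 1 for start_char and
# 2 for oov_char, and shifts real word ids by index_from=3, so we invert that shift.
tmp_index = imdb.get_word_index()
tmp_id_to_word = {i + 3: w for w, i in tmp_index.items()}
tmp_id_to_word.update({1: "<START>", 2: "<UNK>"})
print(" ".join(tmp_id_to_word.get(i, "<UNK>") for i in x_train[0][:20]))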
# In[3]:
print("Encoding sentences")
# Pad the sequence to the same length
max_review_length = 1600 # shorter than the longest review in either split, so longer reviews get truncated
x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
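# Quick shape check (illustrative addition): both splits now share one fixed length,
# with zeros padded on the left by pad_sequences' default 'pre' padding.
print("x_train: %s, x_test: %s" % (x_train.shape, x_test.shape))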
# In[4]:
embedding_vector_length = 128
FILTER_SIZES = [3, 4, 5]
FILTERS = 128
P_DROPOUT = 0.5
# In[5]:
print("Setting up Convolutional Network")
input_layer = Input(shape=(max_review_length,), dtype='int32', name='sentence')
embedding = Embedding(top_words, embedding_vector_length, input_length=max_review_length, name="embedding")(input_layer)
convs = []
for kernel_size in FILTER_SIZES:
    name = str(kernel_size) + "ks"
    conv = Convolution1D(
        filters=FILTERS,
        kernel_size=kernel_size,
        activation='relu',
        name="conv_" + name
    )(embedding)
    # pool over the entire feature map: one max activation per filter
    maxPooling = MaxPooling1D(
        pool_size=max_review_length - kernel_size + 1,
        name="maxpool_" + name
    )(conv)
    convs.append(maxPooling)
# merge the pooled features from all filter sizes
merged = concatenate(convs, name="concatenation")
# flatten to a single feature vector
flat = Flatten(name="flatten_layer")(merged)
drop = Dropout(P_DROPOUT, name="dropout_%.2f" % P_DROPOUT)(flat)
# interpretation
outputs = Dense(1, activation='sigmoid')(drop)
model = Model(inputs=input_layer, outputs=outputs, name="output")
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
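# Note (equivalent alternative, not used above): GlobalMaxPooling1D is imported but
# unused; max-pooling over the full feature-map length (pool_size = L - k + 1) and
# then flattening computes the same per-filter max, so each branch could instead read:
#
#   conv = Convolution1D(filters=FILTERS, kernel_size=kernel_size,
#                        activation='relu')(embedding)
#   pooled = GlobalMaxPooling1D()(conv)  # shape (batch, FILTERS), no Flatten needed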
# In[22]:
plot_model(model, show_shapes=True, to_file='plots/zhang_architecture.png')
# In[10]:
# checkpoint
filepath="za_weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
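# Optional addition (illustrative, not part of the original run): the TensorBoard
# callback imported above can log loss/accuracy curves; './logs' is an assumed directory.
tensorboard = TensorBoard(log_dir='./logs')
callbacks_list.append(tensorboard)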
# In[11]:
# fit model
print ("Fitting model")
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=16, callbacks=callbacks_list, verbose=2)
# In[12]:
# save the model
print ("Saving the model")
model.save('zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout')
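# The saved model can later be restored with load_model (imported above), e.g.:
# model = load_model('zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout')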
# ## Prediction over kaggle test data
# In[13]:
print("Starting predictions"
INDEX_FROM=3
INDEX_UNK=2
INDEX_START=1
def preprocess_imdb(review):
# clean and tokenize
words_list = text_to_word_sequence(review)
# init array
words_index_list = []
# INDEX_START as first element in sequence (keras convention)
words_index_list.append(INDEX_START)
#words_index_list[0] = INDEX_START
for word_pos in xrange(0,len(words_list)):
word = words_list[word_pos]
if word not in word_index:
words_index_list.append(INDEX_UNK)
#words_index_list[word_pos + 1] = INDEX_UNK
else:
words_index_list.append(word_index[word] + INDEX_FROM if word_index[word] < top_words - INDEX_FROM - 1 else INDEX_UNK)
return words_index_list
# In[14]:
print("Loadin test dataset to predict")
# DOWNLOAD COLLECTION HERE:
# https://www.kaggle.com/c/word2vec-nlp-tutorial/data
collection_path = "/path/to/imdb_reviews_sentim_analisys/"
# Read data from files
kaggle_test_df = pd.read_csv( collection_path + "testData.tsv", header=0, delimiter="\t", quoting=3, encoding="utf-8" )
# In[15]:
print "Imdb dataset: Loading map from word to index"
word_index = imdb.get_word_index()
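# Tiny usage example (added): encode a short string with the same conventions the
# model was trained on; 1 is the start token, 2 marks out-of-vocabulary words.
# The exact ids printed depend on the loaded word index.
print(preprocess_imdb("this movie was great"))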
# In[16]:
preprocess_test = kaggle_test_df['review'].apply(preprocess_imdb)
x_test_predict = sequence.pad_sequences(preprocess_test, maxlen=max_review_length)
print("Shape of dataset to predict: " + str(x_test_predict.shape))
# In[17]:
print "Predicting one review"
neg_rev = "With all the controversy back in 2016 over the #Oscarssowhite shambles, it seems that in 2017 the Academy has made a conscious effort to include as much diversity into the show as they possibly can. Unfortunately, the downside of that is that films like 'Moonlight', which are in reality very average, get recognition they don't deserve and people are fooled into thinking they are better than they actually are. 'Moonlight' is a simple film, in fact it's far too simple. There is almost nothing thought-provoking or interesting that happens for the entire 110 minute run time. Yet somehow it's up for a plethora of awards. Go figure. Mahershala Ali and Naomie Harris have each been nominated for Academy Awards in their respective Supporting categories. Ali is quite brilliant, in fact he's the highlight of the film. He's in nearly the entire first third of the film and I was starting to wonder how this was considered a 'Supporting' role, yet he soon drops away. I wish he had been in it for longer though, because he was quite superb. Harris was also quite good in her role. She has a more spread out performance in the film, reoccurring in each chapter. I wouldn't say she blew me away, but she was certainly solid in her role. I will predict Ali to win his category, and Harris to miss out. 'Moonlight' is one of those films that just kind of drifts along until the credits role. The question I kept asking myself as I watched it was, what is meant to be so extraordinary about these characters? What part of this story justifies making a film out of it? To me it appears that some impressive acting and some false award nominations have tricked people into thinking this film is better than it is. Very disappointing."
neg_rev_enc = sequence.pad_sequences([preprocess_imdb(neg_rev)], maxlen=max_review_length)
prediction = model.predict(neg_rev_enc)
score = float(prediction[0][0])
print("%.4f - %s" % (score, "Positive" if round(score) == 1 else "Negative"))
# In[18]:
print "Predicting all dataset"
model_predictions = model.predict(x_test_predict, verbose=0)
print(model_predictions[:5])
# In[21]:
mod_pred_round = [int(round(float(p))) for p in model_predictions]
print(mod_pred_round[:5])
# In[22]:
# Write the test results
print "Writing prediction results"
output = pd.DataFrame(data={"id": kaggle_test_df["id"], "sentiment": mod_pred_round})
output.to_csv("imdb_zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout.csv", index=False, quoting=3)