@Juancard · Created January 31, 2018
Sentiment analysis over the IMDB dataset using convolutional neural networks
# coding: utf-8
# Sentiment analysis over the IMDB dataset using convolutional neural networks.
# The ConvNet architecture follows the one described in Yoon Kim's paper:
# "Kim, Y. (2014). Convolutional Neural Networks for Sentence Classification.
# Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP 2014), 1746–1751."
# Link: http://arxiv.org/abs/1408.5882
# In[1]:
# sequence classification in the IMDB dataset
import numpy as np
import pandas as pd
import h5py
# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
from keras.datasets import imdb
from keras.models import Sequential
from keras.models import load_model, Model
from keras.layers import Dense
from keras.layers import Convolution1D, GlobalMaxPooling1D, MaxPooling1D,Flatten, Dropout, Input
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence
from keras.callbacks import TensorBoard
from keras.layers.merge import concatenate
from keras.utils.vis_utils import plot_model
from keras.callbacks import ModelCheckpoint
# In[2]:
print ("Loading imdb dataset")
top_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
num_words=top_words,
skip_top=0,
maxlen=None,
seed=113,
start_char=1,
oov_char=2,
index_from=3)
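# Sanity check (added for illustration, not in the original gist): decode the first
# training review back to words. Keras reserves 0 for padding, 1 for start_char and
# 2 for oov_char, and shifts real word ids by index_from=3, so we invert that shift.
tmp_index = imdb.get_word_index()
tmp_id_to_word = {i + 3: w for w, i in tmp_index.items()}
tmp_id_to_word.update({1: "<START>", 2: "<UNK>"})
print(" ".join(tmp_id_to_word.get(i, "<UNK>") for i in x_train[0][:20]))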
# In[3]:
print("Encoding sentences")
# Pad the sequence to the same length
max_review_length = 1600 # shorter than the longest review in either split, so longer reviews get truncated
x_train = sequence.pad_sequences(x_train, maxlen=max_review_length)
x_test = sequence.pad_sequences(x_test, maxlen=max_review_length)
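# Quick shape check (illustrative addition): both splits now share one fixed length,
# with zeros padded on the left by pad_sequences' default 'pre' padding.
print("x_train: %s, x_test: %s" % (x_train.shape, x_test.shape))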
# In[4]:
embedding_vector_length = 128
FILTER_SIZES = [3, 4, 5]
FILTERS = 128
P_DROPOUT = 0.5
# In[5]:
print("Setting up Convolutional Network")
input_layer = Input(shape=(max_review_length,), dtype='int32', name='sentence')
embedding = Embedding(top_words, embedding_vector_length, input_length=max_review_length, name="embedding")(input_layer)
convs = []
for kernel_size in FILTER_SIZES:
    name = str(kernel_size) + "ks"
    conv = Convolution1D(
        filters=FILTERS,
        kernel_size=kernel_size,
        activation='relu',
        name="conv_" + name
    )(embedding)
    # pool over the entire feature map: one max activation per filter
    maxPooling = MaxPooling1D(
        pool_size=max_review_length - kernel_size + 1,
        name="maxpool_" + name
    )(conv)
    convs.append(maxPooling)
# merge the pooled features from all filter sizes
merged = concatenate(convs, name="concatenation")
# flatten to a single feature vector
flat = Flatten(name="flatten_layer")(merged)
drop = Dropout(P_DROPOUT, name="dropout_%.2f" % P_DROPOUT)(flat)
# interpretation
outputs = Dense(1, activation='sigmoid')(drop)
model = Model(inputs=input_layer, outputs=outputs, name="output")
# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
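# Note (equivalent alternative, not used above): GlobalMaxPooling1D is imported but
# unused; max-pooling over the full feature-map length (pool_size = L - k + 1) and
# then flattening computes the same per-filter max, so each branch could instead read:
#
#   conv = Convolution1D(filters=FILTERS, kernel_size=kernel_size,
#                        activation='relu')(embedding)
#   pooled = GlobalMaxPooling1D()(conv)  # shape (batch, FILTERS), no Flatten needed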
# In[22]:
plot_model(model, show_shapes=True, to_file='plots/zhang_architecture.png')
# In[10]:
# checkpoint
filepath="za_weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
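# Optional addition (illustrative, not part of the original run): the TensorBoard
# callback imported above can log loss/accuracy curves; './logs' is an assumed directory.
tensorboard = TensorBoard(log_dir='./logs')
callbacks_list.append(tensorboard)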
# In[11]:
# fit model
print ("Fitting model")
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=2, batch_size=16, callbacks=callbacks_list, verbose=2)
# In[12]:
# save the model
print ("Saving the model")
model.save('zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout')
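# The saved model can later be restored with load_model (imported above), e.g.:
# model = load_model('zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout')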
# ## Prediction over kaggle test data
# In[13]:
print("Starting predictions"
INDEX_FROM=3
INDEX_UNK=2
INDEX_START=1
def preprocess_imdb(review):
# clean and tokenize
words_list = text_to_word_sequence(review)
# init array
words_index_list = []
# INDEX_START as first element in sequence (keras convention)
words_index_list.append(INDEX_START)
#words_index_list[0] = INDEX_START
for word_pos in xrange(0,len(words_list)):
word = words_list[word_pos]
if word not in word_index:
words_index_list.append(INDEX_UNK)
#words_index_list[word_pos + 1] = INDEX_UNK
else:
words_index_list.append(word_index[word] + INDEX_FROM if word_index[word] < top_words - INDEX_FROM - 1 else INDEX_UNK)
return words_index_list
# In[14]:
print("Loadin test dataset to predict")
# DOWNLOAD COLLECTION HERE:
# https://www.kaggle.com/c/word2vec-nlp-tutorial/data
collection_path = "/path/to/imdb_reviews_sentim_analisys/"
# Read data from files
kaggle_test_df = pd.read_csv( collection_path + "testData.tsv", header=0, delimiter="\t", quoting=3, encoding="utf-8" )
# In[15]:
print "Imdb dataset: Loading map from word to index"
word_index = imdb.get_word_index()
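# Tiny usage example (added): encode a short string with the same conventions the
# model was trained on; 1 is the start token, 2 marks out-of-vocabulary words.
# The exact ids printed depend on the loaded word index.
print(preprocess_imdb("this movie was great"))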
# In[16]:
preprocess_test = kaggle_test_df['review'].apply(preprocess_imdb)
x_test_predict = sequence.pad_sequences(preprocess_test, maxlen=max_review_length)
print("Shape of dataset to predict: " + str(x_test_predict.shape))
# In[17]:
print "Predicting one review"
neg_rev = "With all the controversy back in 2016 over the #Oscarssowhite shambles, it seems that in 2017 the Academy has made a conscious effort to include as much diversity into the show as they possibly can. Unfortunately, the downside of that is that films like 'Moonlight', which are in reality very average, get recognition they don't deserve and people are fooled into thinking they are better than they actually are. 'Moonlight' is a simple film, in fact it's far too simple. There is almost nothing thought-provoking or interesting that happens for the entire 110 minute run time. Yet somehow it's up for a plethora of awards. Go figure. Mahershala Ali and Naomie Harris have each been nominated for Academy Awards in their respective Supporting categories. Ali is quite brilliant, in fact he's the highlight of the film. He's in nearly the entire first third of the film and I was starting to wonder how this was considered a 'Supporting' role, yet he soon drops away. I wish he had been in it for longer though, because he was quite superb. Harris was also quite good in her role. She has a more spread out performance in the film, reoccurring in each chapter. I wouldn't say she blew me away, but she was certainly solid in her role. I will predict Ali to win his category, and Harris to miss out. 'Moonlight' is one of those films that just kind of drifts along until the credits role. The question I kept asking myself as I watched it was, what is meant to be so extraordinary about these characters? What part of this story justifies making a film out of it? To me it appears that some impressive acting and some false award nominations have tricked people into thinking this film is better than it is. Very disappointing."
neg_rev_enc = sequence.pad_sequences([preprocess_imdb(neg_rev)], maxlen=max_review_length)
prediction = model.predict(neg_rev_enc)
score = float(prediction[0][0])
print("%.4f - %s" % (score, "Positive" if round(score) == 1 else "Negative"))
# In[18]:
print "Predicting all dataset"
model_predictions = model.predict(x_test_predict, verbose=0)
print(model_predictions[:5])
# In[21]:
mod_pred_round = [int(round(float(p))) for p in model_predictions]
print(mod_pred_round[:5])
# In[22]:
# Write the test results
print "Writing prediction results"
output = pd.DataFrame(data={"id": kaggle_test_df["id"], "sentiment": mod_pred_round})
output.to_csv("imdb_zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout.csv", index=False, quoting=3)