# -*- coding: utf-8 -*-
"""
Sentiment analysis using fastText embeddings and Keras.
Created on Tue Aug 28 15:11:24 2018
@author: lenovo-pc
"""
import os
import numpy as np
from numpy import array
from gensim.models import FastText
from nltk import word_tokenize
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
maxlen=300
window_size=5
batch_size=32
n_epoch=2
input_length=600
MAX_SEQUENCE_LENGTH=600
# Reading tokenised reviews into memory
def init_lists(folder):
    """Read every file in `folder`, tokenise it, and drop HTML/markup leftovers."""
    var = 1
    a_list = []
    file_list = os.listdir(folder)
    for file in file_list:
        print(str(var) + '\n')  # progress counter
        var = var + 1
        with open(os.path.join(folder, file), encoding="utf-8") as f:
            text = f.read()
        words = word_tokenize(text)
        stopw = ['br', '<', '>', '(', ')']  # tokens left over from HTML markup
        wordsNew = []
        for word in words:
            if word not in stopw:
                wordsNew.append(word.lower())
        a_list.append(wordsNew)
    return a_list
pos_train=init_lists("pos_train")
neg_train=init_lists("neg_train")
pos_test=init_lists("pos_test")
neg_test=init_lists("neg_test")
unsup=init_lists("unsup")
total = pos_train + neg_train + pos_test + neg_test + unsup
train = pos_train + neg_train
test = pos_test + neg_test
print(pos_train[1])  # sanity check: inspect one tokenised review
# Labels follow the pos + neg concatenation order above:
# 0 for the first 12,500 (positive) reviews, 1 for the remaining 12,500 (negative) reviews.
label_train = array([0 for _ in range(12500)] + [1 for _ in range(12500)])
label_test = array([0 for _ in range(12500)] + [1 for _ in range(12500)])
#-----------------------------
#Converting words to numbers
tokenizer = Tokenizer(lower=True, split=' ') #nb_words = 20000
tokenizer.fit_on_texts(train)
sequences = tokenizer.texts_to_sequences(train)
word_index = tokenizer.word_index
data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding = 'post')
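# Illustrative only (not part of the original gist): a quick check of how
# texts_to_sequences + pad_sequences behave, assuming the toy two-word review below.
_demo_seq = tokenizer.texts_to_sequences([['great', 'movie']])
_demo_pad = sequence.pad_sequences(_demo_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(_demo_pad.shape)  # (1, 600): one review, padded with trailing zeros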
#-----------------------------
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
label_train = label_train[indices]
# Training a fastText model on the full labelled + unlabelled corpus
model = FastText(size=300, alpha=0.025, window=window_size, min_count=1, workers=4)
model.build_vocab(train + unsup)
model.train(train + unsup,
            total_examples=model.corpus_count, epochs=model.epochs)
model.save("FastText_file.bin")
model = FastText.load("FastText_file.bin")
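# A small sketch (purely illustrative, not in the original gist) of why fastText is
# used here: it composes vectors from character n-grams, so it can return an
# embedding even for a misspelled or unseen word, unlike plain word2vec.
print(model.wv['film'].shape)    # (300,) - in-vocabulary word
print(model.wv['filmmm'].shape)  # (300,) - OOV word, built from its subword n-grams
print(model.wv.similarity('film', 'filmmm'))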
#-----------------------------
# Building the embedding matrix: row i holds the fastText vector for the word
# with tokenizer index i; words without a vector keep their random initialisation.
vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding (observed: 114153)
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model.wv[word]
    except KeyError:
        print(word, 'not found')
        continue  # keep the random row rather than reusing the previous word's vector
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
#-----------------------------
# Note: this pre-built, frozen Embedding layer is never added to the Sequential
# model below, which constructs its own (trainable) Embedding from the same matrix.
embedding_layer = Embedding(vocab_size,
                            300,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Defining a simple Keras model: an Embedding layer initialised with the fastText
# matrix, a 1D convolution with global max pooling, and a sigmoid output.
print('Defining a Simple Keras Model...')
lstm_model = Sequential()
lstm_model.add(Embedding(output_dim=300, input_dim=vocab_size,
                         weights=[embedding_matrix], input_length=input_length))
lstm_model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
# we use max pooling:
lstm_model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
lstm_model.add(Dense(250))
lstm_model.add(Dropout(0.2))
lstm_model.add(Activation('relu'))
# We project onto a single-unit output layer and squash it with a sigmoid:
lstm_model.add(Dense(1))
lstm_model.add(Activation('sigmoid'))
rms_prop = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# Alternative optimiser (Adam is not imported above):
# adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
print('Compiling the Model...')
lstm_model.compile(loss='binary_crossentropy', optimizer=rms_prop, metrics=['accuracy'])
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0,
                              verbose=0, mode='auto')
#----------------------
print("Train...")
# Applying the same preprocessing to the test set: the test reviews must be indexed
# with the tokenizer fitted on the training data, otherwise the word ids would not
# line up with the embedding matrix.
encoded_docs2 = tokenizer.texts_to_sequences(test)
padded_docs2 = sequence.pad_sequences(encoded_docs2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(label_train.size)
lstm_model.fit(data, label_train, batch_size=batch_size, epochs=20,
               validation_data=(padded_docs2, label_test), callbacks=[earlyStopping])
print("Evaluate...")
score, acc = lstm_model.evaluate(padded_docs2, label_test, batch_size=batch_size)
#------------------
print('Test loss:', str(score))
print('Test accuracy:', str(acc * 100))
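# A minimal inference sketch (illustrative, not in the original gist): new text must
# go through the *same* tokenizer and padding as the training data before prediction.
def predict_sentiment(review_text):
    tokens = [w.lower() for w in word_tokenize(review_text)]
    seq = tokenizer.texts_to_sequences([tokens])
    padded = sequence.pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    prob = lstm_model.predict(padded)[0][0]
    # With the label ordering used above, 0 = positive and 1 = negative.
    return 'negative' if prob > 0.5 else 'positive'

print(predict_sentiment("One of the best movies I have seen in years."))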