Sentiment analysis using FastText and Keras
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 28 15:11:24 2018
@author: lenovo-pc
"""
import os
import re
import numpy as np
from numpy import array
from gensim.models import FastText
from gensim.corpora.dictionary import Dictionary
from nltk import word_tokenize
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Input
from keras.layers.core import Dense, Dropout
from keras.layers import Bidirectional, Flatten, RepeatVector, Activation
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
maxlen = 300
window_size = 5
batch_size = 32
n_epoch = 2
input_length = 600
MAX_SEQUENCE_LENGTH = 600
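# Of the constants above, batch_size, input_length and MAX_SEQUENCE_LENGTH are
# used below; maxlen, window_size and n_epoch are kept for reference, the
# FastText window (5) and the Keras training epochs (20) being passed as literals instead.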
# Reading tokens into memory
def init_lists(folder):
    """Read every file in `folder`, tokenize it and return a list of token lists."""
    var = 1
    a_list = []
    file_list = os.listdir(folder)
    for file in file_list:
        print(str(var) + '\n')
        var = var + 1
        f = open(os.path.join(folder, file), encoding="utf-8")
        text = f.read()
        words = word_tokenize(text)
        wordsOld = words
        # Drop leftover HTML markup tokens and lowercase everything else
        stopw = ['br', '<', '>', '(', ')']
        wordsNew = list()
        for word in words:
            if word not in stopw:
                word = word.lower()
                wordsNew.append(word)
        print(wordsNew == wordsOld)  # sanity check: was anything filtered or lowercased?
        a_list.append(wordsNew)
        f.close()
    return a_list
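# The five folders below hold one review per file (pos_train / neg_train /
# pos_test / neg_test / unsup); the hard-coded 12,500 counts further down
# assume the standard IMDB review split.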
pos_train = init_lists("pos_train")
neg_train = init_lists("neg_train")
pos_test = init_lists("pos_test")
neg_test = init_lists("neg_test")
unsup = init_lists("unsup")
total = pos_train + neg_train + pos_test + neg_test + unsup
train = pos_train + neg_train
test = pos_test + neg_test
print(pos_train[1])  # spot-check one tokenized review
mya1 = np.zeros(12500)
mya2 = np.ones(12500)
label_train = array([0 for _ in range(12500)] + [1 for _ in range(12500)])
label_test = array([0 for _ in range(12500)] + [1 for _ in range(12500)])
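# Label convention: train/test are built as positive + negative, so the first
# 12,500 labels (0) correspond to positive reviews and the last 12,500 (1) to
# negative ones.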
# -----------------------------
# Converting words to numbers
tokenizer = Tokenizer(lower=True, split=' ')  # nb_words = 20000
tokenizer.fit_on_texts(train)
sequences = tokenizer.texts_to_sequences(train)
word_index = tokenizer.word_index
data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# -----------------------------
# Shuffle the training reviews and their labels with the same permutation
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
label_train = label_train[indices]
# Training a FastText model
model = FastText(size=300, alpha=0.025, window=5, min_count=1, workers=4)
model.build_vocab(train + unsup)
model.train(train + unsup,
            total_examples=model.corpus_count, epochs=model.iter)
model.save("FastText_file.bin")
model = FastText.load("FastText_file.bin")
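# Optional sanity check of the learned vectors (illustrative addition): nearest
# neighbours of a common sentiment word should look reasonable.
print(model.wv.most_similar('good', topn=5))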
# -----------------------------
# Building the embedding matrix: row i holds the FastText vector of the word
# with Keras index i; rows for unknown words keep their random initialisation.
vocab_size = len(tokenizer.word_index) + 1  # 114153 for this corpus
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model.wv[word]
    except KeyError:
        print(word, 'not found')
        continue  # leave the random row for words FastText cannot vectorise
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
# -----------------------------
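# Note: the embedding_layer defined next is not wired into the Sequential model
# below, which builds its own Embedding layer from the same matrix.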
embedding_layer = Embedding(vocab_size,
                            300,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Defining a simple Keras model (despite the variable name, this is a
# Conv1D + global-max-pooling classifier, not an LSTM)
print('Defining a Simple Keras Model...')
lstm_model = Sequential()
lstm_model.add(Embedding(output_dim=300, input_dim=vocab_size,
                         weights=[embedding_matrix], input_length=input_length))
lstm_model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
# we use max pooling:
lstm_model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
lstm_model.add(Dense(250))
lstm_model.add(Dropout(0.2))
lstm_model.add(Activation('relu'))
# We project onto a single-unit output layer, and squash it with a sigmoid:
lstm_model.add(Dense(1))
lstm_model.add(Activation('sigmoid'))
rms_prop = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
# adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
print('Compiling the Model...')
lstm_model.compile(loss='binary_crossentropy', optimizer=rms_prop, metrics=['accuracy'])
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0,
                              verbose=0, mode='auto')
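# With patience=0, EarlyStopping halts training the first time val_loss fails
# to improve, so the 20-epoch budget below is an upper bound in practice.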
# ----------------------
print("Train...")
# Apply the same preprocessing to the test reviews. The training tokenizer is
# reused here: fitting a fresh tokenizer on the test set would assign different
# word indices and break the mapping into the embedding matrix.
encoded_docs2 = tokenizer.texts_to_sequences(test)
padded_docs2 = sequence.pad_sequences(encoded_docs2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(label_train.size)
lstm_model.fit(data, label_train, batch_size=batch_size, epochs=20,
               validation_data=(padded_docs2, label_test), callbacks=[earlyStopping])
print("Evaluate...")
score, acc = lstm_model.evaluate(padded_docs2, label_test, batch_size=batch_size)
# ------------------
print('Test score:', str(score * 100))
print('Test accuracy:', str(acc * 100))
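# ---------------------------------------------------------------
# Minimal usage sketch (illustrative; the review text is made up): score a new
# review with the trained model, reusing the training tokenizer and the label
# convention above (sigmoid output ~ probability that the review is negative).
new_review = "the film was surprisingly good and the acting was excellent"
new_seq = tokenizer.texts_to_sequences([word_tokenize(new_review)])
new_pad = sequence.pad_sequences(new_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print('P(negative) =', float(lstm_model.predict(new_pad)[0][0]))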