@hiteshn97
Created September 28, 2018 16:52
My Python code for sentiment analysis (FastText embeddings + a Keras CNN classifier)
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 28 15:11:24 2018
@author: lenovo-pc
"""
import os
import re
import numpy as np
from gensim.models import FastText
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, LSTM
from keras.layers.core import Dense, Dropout
from keras.layers import Input, Bidirectional, Flatten, RepeatVector, Activation
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
import matplotlib.pyplot as plt
maxlen=300
window_size=5
batch_size=32
n_epoch=2
input_length=600
MAX_SEQUENCE_LENGTH=600
#Reading tokens into memory
def init_lists(folder):
    var = 1
    a_list = []
    file_list = os.listdir(folder)
    for file in file_list:
        print(str(var) + '\n')
        var = var + 1
        f = open(os.path.join(folder, file), encoding="utf-8")
        text = f.read()
        words = re.split(r'\s+', re.sub(r'[,/\-!?.I?"\]\[<>]', ' ', text).strip())
        # Drop the leftover "br" tokens from <br /> tags and lowercase everything
        words = [word.lower() for word in words if word != "br"]
        a_list.append(words)
        f.close()
    return a_list
pos_train=init_lists("pos_train")
neg_train=init_lists("neg_train")
pos_test=init_lists("pos_test")
neg_test=init_lists("neg_test")
unsup=init_lists("unsup")
total = pos_train + neg_train + pos_test + neg_test + unsup
train = pos_train + neg_train
test = pos_test + neg_test
mya1 = np.zeros(12500)
mya2 = np.ones(12500)
label_train = np.array([0 for _ in range(12500)] + [1 for _ in range(12500)])
label_test = np.array([0 for _ in range(12500)] + [1 for _ in range(12500)])
#-----------------------------
#Converting words to numbers
tokenizer = Tokenizer(lower=True, split=' ') #nb_words = 20000
tokenizer.fit_on_texts(train)
sequences = tokenizer.texts_to_sequences(train)
word_index = tokenizer.word_index
padded_docs = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
#-----------------------------
indices = np.arange(padded_docs.shape[0])
np.random.shuffle(indices)
padded_docs = padded_docs[indices]
label_train = label_train[indices]
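# ---------------------------------------------------------------------------
# The script loads a ready-made "FastText.bin" below. As an assumption (the
# original gist only loads the file), this sketch shows one plausible way it
# could have been trained from the combined corpus built above, using the
# gensim 3.x FastText API (300-dimensional vectors to match the embedding
# layer, window_size as defined at the top).
if not os.path.exists("FastText.bin"):
    ft_model = FastText(total, size=300, window=window_size, min_count=1, workers=4)
    ft_model.save("FastText.bin")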
# Loading the pretrained FastText embedding model
model = FastText.load("FastText.bin")
#-----------------------------
# Building the embedding matrix
vocab_size = len(tokenizer.word_index) + 1 #114153
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = None
    try:
        embedding_vector = model.wv[word]
    except KeyError:
        print(word, 'not found')
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
#-----------------------------
# Embedding layer built from the matrix (unused below: the Sequential model adds its own Embedding)
embedding_layer = Embedding(vocab_size,
                            300,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Defining a simple keras model
print('Defining a Simple Keras Model...')
lstm_model = Sequential()  # or Graph
lstm_model.add(Embedding(output_dim=300,
                         input_dim=vocab_size,
                         weights=[embedding_matrix],
                         input_length=input_length))
# Adding Input Length
lstm_model.add(Conv1D(250,3,padding='valid', activation='relu',strides=1))
# we use max pooling:
lstm_model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
lstm_model.add(Dense(250))
lstm_model.add(Dropout(0.2))
lstm_model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
lstm_model.add(Dense(1))
lstm_model.add(Activation('sigmoid'))
rms_prop=RMSprop(lr=0.001,rho=0.9,epsilon=None,decay=0.0)
#adam = Adam(lr=0.001,beta_1=0.9,beta_2=0.999,epsilon=None,decay=0.0,amsgrad=False)
print('Compiling the Model...')
lstm_model.compile(loss='binary_crossentropy',optimizer=rms_prop,metrics=['accuracy'])
#class_mode='binary')
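# Printing the layer stack and parameter counts as a quick sanity check (not in
# the original gist).
lstm_model.summary()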
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=0, mode='auto')
#----------------------
print("Train...")
# Applying the same preprocessing to the test reviews. The test set has to be
# encoded with the tokenizer fitted on the training data, otherwise the word
# indices would not match the embedding matrix.
encoded_docs2 = tokenizer.texts_to_sequences(test)
padded_docs2 = sequence.pad_sequences(encoded_docs2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
print(padded_docs.size / 600)
print(label_train.size)
history = lstm_model.fit(padded_docs, label_train, batch_size=batch_size, epochs=20,
                         validation_data=(padded_docs2, label_test), callbacks=[earlyStopping])
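# ---------------------------------------------------------------------------
# matplotlib is imported above but never used; a minimal sketch (not in the
# original gist) of plotting the training curves captured in `history`.
# Keras 2.x records the accuracy metric under the key 'acc' ('accuracy' in
# later versions).
plt.plot(history.history['acc'], label='train acc')
plt.plot(history.history['val_acc'], label='val acc')
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.legend()
plt.savefig('training_curves.png')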
print("Evaluate...")
score,acc=lstm_model.evaluate(padded_docs2,label_test,batch_size=batch_size)
#------------------
print('Test score:',str(score*100))
print('Test accuracy:',str(acc*100))
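# ---------------------------------------------------------------------------
# A usage sketch (an assumption, not part of the original gist): scoring a new
# review with the trained model. The review text is made up. Note that with the
# label ordering above (pos_train first), 0 = positive and 1 = negative, so the
# sigmoid output is the probability of the negative class.
new_review = ["the plot was predictable but the acting made it worth watching"]
new_seq = tokenizer.texts_to_sequences(new_review)
new_pad = sequence.pad_sequences(new_seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
prob = lstm_model.predict(new_pad)[0][0]
print('Predicted:', 'negative' if prob >= 0.5 else 'positive', '(p(negative) =', prob, ')')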