Instantly share code, notes, and snippets.
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save hiteshn97/bb2f1c0ad60c427dbb2db122b4cae936 to your computer and use it in GitHub Desktop.
My Python code for sentiment analysis.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 28 15:11:24 2018

@author: lenovo-pc
"""
import os
import re

import matplotlib.pyplot as plt
import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import FastText
from keras.callbacks import EarlyStopping
from keras.layers import Bidirectional, Flatten, RepeatVector, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Embedding, LSTM
from keras.layers.core import Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
# Hyper-parameters and sequence-length settings for the whole script.
maxlen = 300               # not referenced below -- padding uses MAX_SEQUENCE_LENGTH
window_size = 5            # not referenced below (presumably a FastText window size)
batch_size = 32            # mini-batch size for fit()/evaluate()
n_epoch = 2                # not referenced below -- fit() hard-codes epochs=20
input_length = 600         # Embedding layer input length; must match the padding length
MAX_SEQUENCE_LENGTH = 600  # every review is padded/truncated to this many tokens
# Reading tokens into memory
def init_lists(folder):
    """Read every file in *folder* and return one list of tokens per file.

    Each file is read as UTF-8, punctuation/markup characters are replaced
    with spaces, and the text is split on whitespace.  Tokens are lower-cased
    and the "br" residue left behind by HTML ``<br>`` tags is dropped.

    Returns a list of lists of strings, one inner list per file, in
    ``os.listdir`` order.
    """
    a_list = []
    for count, file_name in enumerate(os.listdir(folder), start=1):
        # Progress indicator (one number per processed file).
        print(str(count) + '\n')
        # 'with' guarantees the handle is closed even if read() raises.
        with open(os.path.join(folder, file_name), encoding="utf-8") as f:
            text = f.read()
        # BUG FIX: the original character class contained a stray 'I', which
        # silently deleted the letter I from every review -- removed here.
        words = re.split(r'\s+', re.sub(r'[,/\-!?."\]\[<>]', ' ', text).strip())
        # BUG FIX: the original rebound the loop variable (a no-op on the
        # list) and appended the whole document once per token; build the
        # cleaned token list once and append it once per file instead.
        cleaned = [w.lower() for w in words if w != "br"]
        a_list.append(cleaned)
    return a_list
# Load the tokenised corpora (IMDB-style layout: 12500 files per folder).
pos_train = init_lists("pos_train")
neg_train = init_lists("neg_train")
pos_test = init_lists("pos_test")
neg_test = init_lists("neg_test")
unsup = init_lists("unsup")
total = pos_train + neg_train + pos_test + neg_test + unsup
train = pos_train + neg_train
test = pos_test + neg_test
# Labels follow the concatenation order above: the first 12500 reviews come
# from the positive folder (label 0), the next 12500 from the negative
# folder (label 1).
# BUG FIX: `array` was undefined (numpy is imported as `np`); the unused
# duplicates `mya1`/`mya2` were removed.
label_train = np.array([0] * 12500 + [1] * 12500)
label_test = np.array([0] * 12500 + [1] * 12500)
# -----------------------------
# Converting words to numbers: fit a Tokenizer on the training texts and
# turn every review into a fixed-length sequence of integer word ids.
tokenizer = Tokenizer(lower=True, split=' ')  # nb_words = 20000
tokenizer.fit_on_texts(train)
sequences = tokenizer.texts_to_sequences(train)
word_index = tokenizer.word_index
# BUG FIX: `encoded_docs` was never defined -- pad the `sequences` just
# produced above.  'post' padding appends zeros after the review tokens.
data = sequence.pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
# -----------------------------
# Shuffle samples and labels with the SAME permutation so they stay aligned.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
label_train = label_train[indices]
# Load a pre-trained FastText model from disk (trained elsewhere).
model = FastText.load("FastText.bin")
# -----------------------------
# Building the embedding matrix: one 300-d row per vocabulary word.
# Rows start from random values so out-of-vocabulary words keep a random
# (rather than zero) embedding.
vocab_size = len(tokenizer.word_index) + 1  # 114153
print(vocab_size)
embedding_matrix = np.random.random((vocab_size, 300))
# BUG FIX: the loop iterated `t.word_index` (undefined) -- use `tokenizer`.
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model.wv[word]
    except KeyError:
        # BUG FIX: the original fell through and reused the PREVIOUS word's
        # vector (or hit an unbound name on the first miss); skip instead so
        # the row keeps its random initialisation.
        print(word, 'not found')
        continue
    embedding_matrix[i] = embedding_vector
# -----------------------------
# BUG FIX: `false` is not a Python literal -- use `False`.
# NOTE(review): this layer is built but never added to the model below;
# the Sequential model constructs its own Embedding from the same matrix.
embedding_layer = Embedding(vocab_size,
                            300,
                            weights=[embedding_matrix],
                            mask_zero=False,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)
# Defining a simple keras model
print('Defining a Simple Keras Model...')
# NOTE(review): named `lstm_model` for historical reasons -- the topology is
# actually a 1-D convolutional network (Conv1D + global max pooling).
lstm_model = Sequential()
for layer in (
        # FastText-seeded word embeddings (trainable by default here).
        Embedding(input_dim=vocab_size,
                  output_dim=300,
                  weights=[embedding_matrix],
                  input_length=input_length),
        # 250 convolutional filters of width 3 over the embedded sequence.
        Conv1D(250, 3, padding='valid', activation='relu', strides=1),
        # Keep only the strongest response of each filter.
        GlobalMaxPooling1D(),
        # Vanilla hidden layer with dropout.
        Dense(250),
        Dropout(0.2),
        Activation('relu'),
        # Single sigmoid unit: probability of the positive class.
        Dense(1),
        Activation('sigmoid')):
    lstm_model.add(layer)
rms_prop = RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
print('Compiling the Model...')
lstm_model.compile(loss='binary_crossentropy',
                   optimizer=rms_prop,
                   metrics=['accuracy'])
# Halt training as soon as the validation loss stops improving.
earlyStopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0,
                              verbose=0, mode='auto')
# ----------------------
print("Train...")
# Apply the SAME preprocessing to the test texts.
# BUG FIX: the original fitted a brand-new Tokenizer on the test set, which
# assigns word ids that do not match the training vocabulary (and therefore
# the embedding matrix).  The test set must be encoded with the tokenizer
# fitted on the training data.
encoded_docs2 = tokenizer.texts_to_sequences(test)
padded_docs2 = sequence.pad_sequences(encoded_docs2, maxlen=600, padding='post')
# BUG FIX: `padded_docs` was undefined -- the shuffled training matrix is
# `data`.  The prints sanity-check that samples and labels line up.
print(data.size / 600)
print(label_train.size)
lstm_model.fit(data, label_train, batch_size=batch_size, epochs=20,
               validation_data=(padded_docs2, label_test),
               callbacks=[earlyStopping])
print("Evaluate...")
score, acc = lstm_model.evaluate(padded_docs2, label_test, batch_size=batch_size)
# ------------------
print('Test score:', str(score * 100))
print('Test accuracy:', str(acc * 100))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment