GitHub Gists by Ashok Chilakapati (ashokc)
ashokc / construct_text_corpus.py
Last active January 16, 2019 01:05
Construct a Text Corpus with Sequences
# Build the corpus and sequences
with open('words.txt', 'r') as f:
    words = sorted(list(set(f.read().lower().strip().split(','))))
X, labels = [], []
labelToName = {0: 'ordered', 1: 'unordered', 2: 'reversed'}
namesInLabelOrder = ['ordered', 'unordered', 'reversed']
nWords = len(words)
sequenceLength = 15
for i in range(0, nWords - sequenceLength):
    X.append(words[i:i+sequenceLength])
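The preview cuts off before labels is filled in. A minimal sketch of how the loop might continue, assuming each 15-word slice of the sorted word list is emitted three ways (kept sorted, shuffled, reversed) with the matching class index; the variable names here are illustrative, not the author's exact code:

import random
for i in range(0, nWords - sequenceLength):
    ordered = words[i:i+sequenceLength]            # class 0: sorted order
    shuffled = ordered[:]
    random.shuffle(shuffled)                       # class 1: random order
    X.extend([ordered, shuffled, ordered[::-1]])   # class 2: reversed order
    labels.extend([0, 1, 2])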
ashokc / lstm_model.py
Created January 16, 2019 01:22
Build a Simple LSTM Model
# Build the LSTM model
def getModel():
    units1, units2 = int(nWords / 4), int(nWords / 8)
    model = keras.models.Sequential()
    model.add(keras.layers.embeddings.Embedding(input_dim=len(kTokenizer.word_index)+1, output_dim=units1, input_length=sequenceLength, trainable=True))
    model.add(keras.layers.LSTM(units=units2, return_sequences=False))
    model.add(keras.layers.Dense(len(labelToName), activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
    return model
ashokc / train_test_lstm.py
Created January 16, 2019 01:24
Train and test the simple LSTM model
# Train and test over multiple train/validation sets
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=2, mode='auto', restore_best_weights=False)
sss2 = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=1).split(train_x, train_labels)
for i in range(10):
    train_indices_2, val_indices = next(sss2)
    model = getModel()
    model.fit(x=train_x[train_indices_2], y=train_labels[train_indices_2], epochs=50, batch_size=32, shuffle=True, validation_data=(train_x[val_indices], train_labels[val_indices]), verbose=2, callbacks=[early_stop])
    test_loss, test_accuracy = model.evaluate(test_x, test_labels, verbose=2)
    print(test_loss, test_accuracy)
    predicted = model.predict(test_x, verbose=2)
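The preview ends with predicted unused; presumably each split's predictions feed a confusion matrix and report, as in the Naive Bayes gist below. A hedged completion along those lines (assuming test_labels are one-hot):

from sklearn.metrics import classification_report, confusion_matrix
predicted_labels = predicted.argmax(axis=1)   # class index per test sample
actual_labels = test_labels.argmax(axis=1)    # back from one-hot to class index
print(confusion_matrix(actual_labels, predicted_labels))
print(classification_report(actual_labels, predicted_labels, digits=4, target_names=namesInLabelOrder))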
ashokc / nb_implementation.py
Created January 16, 2019 15:25
Naive Bayes Implementation with SciKit
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
model = MultinomialNB()
train_x = Xencoded[train_indices]
test_x = Xencoded[test_indices]
train_labels = labels[train_indices]
test_labels = labels[test_indices]
model.fit(train_x, train_labels)
predicted_labels = model.predict(test_x)
print(confusion_matrix(test_labels, predicted_labels))
print(classification_report(test_labels, predicted_labels, digits=4, target_names=namesInLabelOrder))
ashokc / movies_tokenize.py
Created January 26, 2019 18:02
Tokenize Movies
# Read the Text Corpus, Clean and Tokenize
import string
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
nltk_stopw = stopwords.words('english')
def tokenize(text):  # no punctuation, starts with a letter, 3-15 characters long
    tokens = [word.strip(string.punctuation) for word in RegexpTokenizer(r'\b[a-zA-Z][a-zA-Z0-9]{2,14}\b').tokenize(text)]
    return [f.lower() for f in tokens if f and f.lower() not in nltk_stopw]
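The otherwise unused fetch_20newsgroups import suggests the raw documents come from the 20-newsgroups set. A hedged sketch of how tokenize might produce the X and labels used in the following gists (the subset and remove arguments here are illustrative):

twenty = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
X = [tokenize(doc) for doc in twenty.data]   # one token list per document
labels = np.array(twenty.target)             # integer class per document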
ashokc / tf_idf_vectors.py
Created January 26, 2019 18:04
Tf-Idf vectors from tokens
# Build Tf-Idf Vectors
from sklearn.feature_extraction.text import TfidfVectorizer
X = np.array([np.array(xi) for xi in X])  # rows: docs, columns: words
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=1).fit(X)
word_index = vectorizer.vocabulary_
Xencoded = vectorizer.transform(X)
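Passing analyzer=lambda x: x tells TfidfVectorizer to skip its own tokenization and treat each row of X as an already-tokenized list, so the vocabulary is built directly from the tokens produced by tokenize above.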
ashokc / keras_padded_sequences.py
Created January 26, 2019 18:05
Padded sequences from Keras
# Turn text into 200-long integer sequences, padding with 0 if necessary to maintain the length at 200
import keras
sequenceLength = 200
kTokenizer = keras.preprocessing.text.Tokenizer()
kTokenizer.fit_on_texts(X)
encoded_docs = kTokenizer.texts_to_sequences(X)
Xencoded = keras.preprocessing.sequence.pad_sequences(encoded_docs, maxlen=sequenceLength, padding='post')
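As a quick sanity check (a toy example, not from the gist): pad_sequences with padding='post' appends zeros to short sequences and, by default, truncates long ones from the front:

demo = keras.preprocessing.sequence.pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=3, padding='post')
print(demo)  # [[1 2 0]
             #  [4 5 6]]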
ashokc / train_test_split.py
Created January 26, 2019 18:06
Train Test Split
# Test/Train Split
from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(Xencoded, labels)
train_indices, test_indices = next(sss)
train_x, test_x = Xencoded[train_indices], Xencoded[test_indices]
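The labels are split with the same indices in the later gists; spelled out (assuming labels is a NumPy array):

train_labels, test_labels = labels[train_indices], labels[test_indices]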
ashokc / lstm_model_2.py
Created January 26, 2019 18:07
A simple model for LSTM
# A Simple Model for LSTM
model = keras.models.Sequential()
embedding = keras.layers.embeddings.Embedding(input_dim=len(kTokenizer.word_index)+1, output_dim=300, input_length=sequenceLength, trainable=True, mask_zero=True)
model.add(embedding)
model.add(keras.layers.LSTM(units=150, dropout=0.2, recurrent_dropout=0.2, return_sequences=False))
model.add(keras.layers.Dense(numClasses, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()  # summary() prints the layer table itself and returns None
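With mask_zero=True the Embedding layer emits a mask that the downstream LSTM honors, so the zero padding appended by pad_sequences is skipped rather than processed as real timesteps.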
ashokc / train_predict_lstm.py
Created January 26, 2019 18:07
Train & Predict with LSTM
# Train and Predict with LSTM
train_labels = keras.utils.to_categorical(labels[train_indices], len(labelToName))
test_labels = keras.utils.to_categorical(labels[test_indices], len(labelToName))
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=2, mode='auto', restore_best_weights=False)
history = model.fit(x=train_x, y=train_labels, epochs=50, batch_size=32, shuffle=True, validation_data = (test_x, test_labels), verbose=2, callbacks=[early_stop])
predicted = model.predict(test_x, verbose=2)
predicted_labels = predicted.argmax(axis=1)
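The preview stops at the predicted class indices; a hedged final step, mirroring the Naive Bayes gist, to score them:

from sklearn.metrics import classification_report, confusion_matrix
actual_labels = labels[test_indices]   # integer classes, before one-hot encoding
print(confusion_matrix(actual_labels, predicted_labels))
print(classification_report(actual_labels, predicted_labels, digits=4, target_names=namesInLabelOrder))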