@pranavraikote
Created May 27, 2021 19:55
NLP Tutorials - Part 6: Text Classification
import nltk
import string
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPool1D
nltk.download('wordnet')
nltk.download('stopwords')
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB_Reviews.csv')
STOPWORDS = stopwords.words('english')
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def preprocessing(text):
    # Strip HTML tags first, then drop anything that is not a letter or whitespace
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join([word for word in text.split() if word not in STOPWORDS])
    text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])
    return text
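# Illustrative example of what the cleaning pipeline produces (output shown as a comment):
# preprocessing('<br />The movies were absolutely GREAT!!')  ->  'movie absolutely great'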
data['Processed_Review'] = data['review'].map(preprocessing)
print(data.head(10))
X = np.array(data['Processed_Review'])
print(X)
Y = np.array(data.sentiment.map({'positive':1,'negative':0}))
print(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)
print('Shape of Training Data')
print(X_train.shape, Y_train.shape)
print('Shape of Testing Data')
print(X_test.shape, Y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
count_vectorizer.fit(X_train)
X_train_count_vector = count_vectorizer.transform(X_train)
X_test_count_vector = count_vectorizer.transform(X_test)
print(X_train_count_vector)
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tf_idf_vectorizer.fit(X_train)
X_train_tfidf_vector = tf_idf_vectorizer.transform(X_train)
X_test_tfidf_vector = tf_idf_vectorizer.transform(X_test)
print(X_train_tfidf_vector)
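# Both vectorizers return sparse matrices: one row per review, one column per n-gram feature
print('CountVectorizer matrix shape:', X_train_count_vector.shape)
print('Tf-Idf matrix shape:', X_train_tfidf_vector.shape)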
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train_count_vector, Y_train)
score = classifier.score(X_test_count_vector, Y_test)
print('Logistic Regression - CountVector')
print("Accuracy:", score)
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train_count_vector, Y_train)
score = classifier.score(X_test_count_vector, Y_test)
print('Naive Bayes - CountVector')
print("Accuracy:", score)
classifier = LogisticRegression()
classifier.fit(X_train_tfidf_vector, Y_train)
score = classifier.score(X_test_tfidf_vector, Y_test)
print('Logistic Regression - Tf-Idf Vector')
print("Accuracy:", score)
classifier = MultinomialNB()
classifier.fit(X_train_tfidf_vector, Y_train)
score = classifier.score(X_test_tfidf_vector, Y_test)
print('Naive Bayes - Tf-Idf Vector')
print("Accuracy:", score)
# Data for Deep Learning models
X = np.array(data['Processed_Review'])
# One-hot encode the sentiment labels for the 2-unit output layer
Y = pd.get_dummies(data['sentiment'])[['positive', 'negative']].to_numpy(dtype = 'float32')
# Word-vector Size
embed_size = 300
# Unique Words
max_features = 10000
# No of words per document
max_len = 500
tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X))
list_tokenized_train = tokenizer.texts_to_sequences(X)
X = pad_sequences(list_tokenized_train, maxlen = max_len)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
print('Shape of Training Data')
print(X_train.shape, Y_train.shape)
print('Shape of Testing Data')
print(X_test.shape, Y_test.shape)
GLOVE_FILE = '/content/drive/MyDrive/Colab Notebooks/models/glove.6B.300d.txt'
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype = 'float32')
embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(GLOVE_FILE, encoding = 'utf-8'))
all_embs = np.hstack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
# Reserve row 0 for padding; the +1 keeps the tokenizer's 1-based indices in range
nb_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
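# Optional sanity check (illustrative): how much of the model vocabulary has a GloVe vector
hits = sum(1 for word, i in word_index.items() if i < nb_words and word in embeddings_index)
print('GloVe coverage: %.2f%%' % (100 * hits / (nb_words - 1)))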
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten
embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim = max_features,
                    output_dim = embedding_dim,
                    input_length = max_len))
model.add(Flatten())
model.add(Dense(10, activation = 'relu'))
model.add(Dense(2, activation = 'softmax'))
# categorical_crossentropy pairs with the 2-unit softmax output and one-hot labels
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()
epochs = 20
batch_size = 64
os.makedirs('Models', exist_ok = True)
checkpoint = ModelCheckpoint('Models/Text-Classify_Baseline.h5', save_best_only = True, monitor = 'val_loss', mode = 'min', verbose = 1)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta = 0.001, patience = 5, mode = 'min', verbose = 1, restore_best_weights = True)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, patience = 3, min_delta = 0.001, mode = 'min', verbose = 1)
history = model.fit(X_train, Y_train,
                    epochs = epochs,
                    batch_size = batch_size,
                    validation_split = 0.15,
                    callbacks = [checkpoint, early_stop, reduce_lr])
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.plot(history.history["accuracy"], label="Training Accuracy")
plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
plt.legend()
plt.grid()
plt.show()
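# Optional (illustrative): score the baseline model on the held-out test split
test_loss, test_acc = model.evaluate(X_test, Y_test, verbose = 0)
print('Baseline test accuracy:', test_acc)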
inp = Input(shape = (max_len,))
# input_dim must match the first dimension of the pre-built embedding matrix
x = Embedding(nb_words, embed_size, weights = [embedding_matrix], trainable = False)(inp)
x = LSTM(256, return_sequences = True, recurrent_dropout = 0.2)(x)
x = GlobalMaxPool1D()(x)
x = Dense(100, activation = 'relu')(x)
x = Dropout(0.2)(x)
# softmax + categorical_crossentropy for the 2-class one-hot targets, as in the baseline model
x = Dense(2, activation = 'softmax')(x)
model = Model(inputs = inp, outputs = x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()
epochs = 20
batch_size = 64
checkpoint = ModelCheckpoint('Models/Text-Classify_GloVe.h5', save_best_only = True, monitor = 'val_loss', mode = 'min', verbose = 1)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta = 0.001, patience = 5, mode = 'min', verbose = 1, restore_best_weights = True)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, patience = 3, min_delta = 0.001, mode = 'min', verbose = 1)
history = model.fit(X_train, Y_train,
                    epochs = epochs, batch_size = batch_size,
                    validation_split = 0.15,
                    callbacks = [checkpoint, early_stop, reduce_lr])
plt.plot(history.history["loss"], label="Training Loss")
plt.plot(history.history["val_loss"], label="Validation Loss")
plt.plot(history.history["accuracy"], label="Training Accuracy")
plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
plt.legend()
plt.grid()
plt.show()
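# Optional (illustrative): score the GloVe-LSTM model and classify a new review.
# The sample sentence is just a made-up demo input.
test_loss, test_acc = model.evaluate(X_test, Y_test, verbose = 0)
print('GloVe-LSTM test accuracy:', test_acc)
sample = preprocessing('The film was an absolute delight from start to finish!')
sample_seq = pad_sequences(tokenizer.texts_to_sequences([sample]), maxlen = max_len)
probs = model.predict(sample_seq)[0]
# Column order follows the one-hot encoding above: index 0 = positive, index 1 = negative
print('Predicted sentiment:', 'positive' if probs[0] > probs[1] else 'negative')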