NLP Tutorials - Part 6: Text Classification
import nltk
import string
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPool1D

nltk.download('wordnet')
nltk.download('stopwords')
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/IMDB_Reviews.csv')

STOPWORDS = stopwords.words('english')
ps = PorterStemmer()            # instantiated for reference; the pipeline below uses the lemmatizer instead
lemmatizer = WordNetLemmatizer()
def preprocessing(text):
    # Strip HTML tags first, while '<' and '>' are still present; the original
    # removed them with the non-letter filter first, so the tag regex never matched
    text = re.sub(r'<.*?>', '', text)
    # Keep only letters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower().strip()
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = " ".join(word for word in text.split() if word not in STOPWORDS)
    text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
    return text
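
# A quick, hedged demo of the cleaning pipeline on a made-up snippet
# (the string below is illustrative, not drawn from the IMDB data).
print(preprocessing('<br />This movie was NOT great... 3/10!'))
# expected output is roughly: 'movie great'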
data['Processed_Review'] = data['review'].map(preprocessing)
print(data.head(10))

X = np.array(data['Processed_Review'])
print(X)
Y = np.array(data.sentiment.map({'positive':1, 'negative':0}))
print(Y)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 42)
print('Shape of Training Data')
print(X_train.shape, Y_train.shape)
print('Shape of Testing Data')
print(X_test.shape, Y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer()
count_vectorizer.fit(X_train)
X_train_count_vector = count_vectorizer.transform(X_train)
X_test_count_vector = count_vectorizer.transform(X_test)
print(X_train_count_vector)
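
# Optional (not in the original gist): the printed sparse matrix is hard to read;
# its shape and the fitted vocabulary size are a clearer sanity check.
print('Count matrix shape:', X_train_count_vector.shape)
print('Vocabulary size:', len(count_vectorizer.vocabulary_))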
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vectorizer = TfidfVectorizer(ngram_range = (1, 3))
tf_idf_vectorizer.fit(X_train)
X_train_tfidf_vector = tf_idf_vectorizer.transform(X_train)
X_test_tfidf_vector = tf_idf_vectorizer.transform(X_test)
print(X_train_tfidf_vector)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

classifier = LogisticRegression()
classifier.fit(X_train_count_vector, Y_train)
score = classifier.score(X_test_count_vector, Y_test)
print('Logistic Regression - CountVector')
print("Accuracy:", score)

classifier = MultinomialNB()
classifier.fit(X_train_count_vector, Y_train)
score = classifier.score(X_test_count_vector, Y_test)
print('Naive Bayes - CountVector')
print("Accuracy:", score)

classifier = LogisticRegression()
classifier.fit(X_train_tfidf_vector, Y_train)
score = classifier.score(X_test_tfidf_vector, Y_test)
print('Logistic Regression - Tf-Idf Vector')
print("Accuracy:", score)

classifier = MultinomialNB()
classifier.fit(X_train_tfidf_vector, Y_train)
score = classifier.score(X_test_tfidf_vector, Y_test)
print('Naive Bayes - Tf-Idf Vector')
print("Accuracy:", score)
# Data for Deep Learning models
X = np.array(data['Processed_Review'])
# One-hot targets for the 2-unit output layers below; the original indexed
# columns 'positive'/'negative' that do not exist in the dataframe.
# pd.get_dummies orders the columns alphabetically: ['negative', 'positive']
Y = pd.get_dummies(data['sentiment']).values.astype('float32')

# Word-vector size
embed_size = 300
# Number of unique words kept by the tokenizer
max_features = 10000
# Maximum number of tokens per document
max_len = 500

tokenizer = Tokenizer(num_words = max_features)
tokenizer.fit_on_texts(list(X))
list_tokenized_train = tokenizer.texts_to_sequences(X)
X = pad_sequences(list_tokenized_train, maxlen = max_len)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
print('Shape of Training Data')
print(X_train.shape, Y_train.shape)
print('Shape of Testing Data')
print(X_test.shape, Y_test.shape)
GLOVE_FILE = '/content/drive/MyDrive/Colab Notebooks/models/glove.6B.300d.txt'

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype = 'float32')

embeddings_index = dict(get_coefs(*o.strip().split()) for o in open(GLOVE_FILE, encoding = 'utf-8'))

all_embs = np.hstack(embeddings_index.values())
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# The matrix needs max_features rows to match the Embedding layer below;
# rows for words missing from GloVe keep this random initialisation
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
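
# Optional sanity check (not in the original): report how much of the kept
# vocabulary actually received a pre-trained GloVe vector; the rest keep
# their random initialisation from above.
found = sum(1 for word, i in word_index.items()
            if i < max_features and word in embeddings_index)
print('GloVe coverage: %d / %d words' % (found, min(max_features, len(word_index))))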
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten

embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim = max_features,
                    output_dim = embedding_dim,
                    input_length = max_len))
model.add(Flatten())
model.add(Dense(10, activation = 'relu'))
model.add(Dense(2, activation = 'softmax'))
# categorical_crossentropy matches the 2-way softmax with one-hot targets
# (the original used binary_crossentropy, which treats the outputs as independent)
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()
epochs = 20
batch_size = 64

checkpoint = ModelCheckpoint('Models/Text-Classify_Baseline.h5', save_best_only = True, monitor = 'val_loss', mode = 'min', verbose = 1)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta = 0.001, patience = 5, mode = 'min', verbose = 1, restore_best_weights = True)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, patience = 3, min_delta = 0.001, mode = 'min', verbose = 1)

history = model.fit(X_train, Y_train,
                    epochs = epochs,
                    batch_size = batch_size,
                    validation_split = 0.15,
                    callbacks = [checkpoint, early_stop, reduce_lr])
plt.plot(history.history["loss"], label="Training Loss") | |
plt.plot(history.history["val_loss"], label="Validation Loss") | |
plt.plot(history.history["accuracy"], label="Training Accuracy") | |
plt.plot(history.history["val_accuracy"], label="Validation Accuracy") | |
plt.legend() | |
plt.grid() | |
plt.show() | |
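
# Optional (not in the original): evaluate the baseline on the held-out test
# split so the curves above can be compared against unseen data.
test_loss, test_acc = model.evaluate(X_test, Y_test, verbose = 0)
print('Baseline test accuracy:', test_acc)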
from tensorflow.keras.models import Model   # needed for the functional API; missing in the original

inp = Input(shape = (max_len,))
x = Embedding(max_features, embed_size, weights = [embedding_matrix], trainable = False)(inp)
x = LSTM(256, return_sequences = True, recurrent_dropout = 0.2)(x)
x = GlobalMaxPool1D()(x)
x = Dense(100, activation = 'relu')(x)
x = Dropout(0.2)(x)
# softmax + categorical_crossentropy for the two mutually exclusive classes,
# consistent with the baseline model above (the original used sigmoid here)
x = Dense(2, activation = 'softmax')(x)
model = Model(inputs = inp, outputs = x)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
model.summary()
epochs = 20
batch_size = 64

checkpoint = ModelCheckpoint('Models/Text-Classify_GloVe.h5', save_best_only = True, monitor = 'val_loss', mode = 'min', verbose = 1)
early_stop = EarlyStopping(monitor = 'val_loss', min_delta = 0.001, patience = 5, mode = 'min', verbose = 1, restore_best_weights = True)
reduce_lr = ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, patience = 3, min_delta = 0.001, mode = 'min', verbose = 1)

history = model.fit(X_train, Y_train,
                    epochs = epochs, batch_size = batch_size,
                    validation_split = 0.15,
                    callbacks = [checkpoint, early_stop, reduce_lr])
plt.plot(history.history["loss"], label="Training Loss") | |
plt.plot(history.history["val_loss"], label="Validation Loss") | |
plt.plot(history.history["accuracy"], label="Training Accuracy") | |
plt.plot(history.history["val_accuracy"], label="Validation Accuracy") | |
plt.legend() | |
plt.grid() | |
plt.show() |