Skip to content

Instantly share code, notes, and snippets.

View saimadhu-polamuri's full-sized avatar
💭
For the love of data.

saimadhu saimadhu-polamuri

💭
For the love of data.
View GitHub Profile
## Required packages
import random

import pandas as pd
import seaborn as sns
import spacy
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from spacy.util import minibatch
######## Main method ########
def main(path=None):
    """Load the labelled spam/ham dataset and print basic class statistics.

    Parameters
    ----------
    path : str, optional
        CSV file path to load. When omitted, falls back to the
        module-level ``data_path`` (NOTE(review): ``data_path`` is not
        defined in this chunk — confirm it exists elsewhere in the file).

    Returns
    -------
    pandas.DataFrame
        The loaded dataset. The original returned nothing; returning the
        frame is backward-compatible and makes the function testable.
    """
    csv_path = path if path is not None else data_path
    # Load dataset
    data = pd.read_csv(csv_path)
    observations = len(data.index)
    print("Dataset Size: {}".format(observations))
    # Absolute class counts, then class balance as a percentage —
    # a quick check for label imbalance before training.
    print(data['label'].value_counts())
    print(data['label'].value_counts() / len(data.index) * 100.0)
    return data
@saimadhu-polamuri
saimadhu-polamuri / email-spam-classifier-create-model.py
Created July 18, 2020 16:43
Create spaCy text categorizer model
######## Main method ########
def main():
    """Load the spam dataset, build a blank spaCy text-categorizer model,
    and prepare train/test data in spaCy's ``cats`` annotation format.

    NOTE(review): reads a module-level ``data_path`` — confirm it is
    defined elsewhere in the file.

    Returns
    -------
    tuple
        ``(nlp, train_data, test_data)`` so callers can continue with
        training/evaluation (the original returned nothing; this is
        backward-compatible).
    """
    # Load dataset
    data = pd.read_csv(data_path)
    observations = len(data.index)
    print("Dataset Size: {}".format(observations))

    # Create an empty spacy model with a text-categorizer pipe.
    # NOTE(review): the original gist was truncated here; this is the
    # standard spaCy v2 textcat setup that the rest of the code
    # (begin_training / get_pipe('textcat')) expects — confirm against
    # the full gist.
    nlp = spacy.blank("en")
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow"})
    nlp.add_pipe(textcat)
    textcat.add_label("ham")
    textcat.add_label("spam")

    # Split data into train and test datasets
    x_train, x_test, y_train, y_test = train_test_split(
        data['text'], data['label'], test_size=0.33, random_state=7)

    # Create the train and test labels in spaCy's "cats" format:
    # one boolean per class, exactly one of which is True per example.
    train_labels = [{'cats': {'ham': label == 'ham',
                              'spam': label == 'spam'}} for label in y_train]
    test_labels = [{'cats': {'ham': label == 'ham',
                             'spam': label == 'spam'}} for label in y_test]

    # Spacy model data: (text, annotations) pairs
    train_data = list(zip(x_train, train_labels))
    test_data = list(zip(x_test, test_labels))
    return nlp, train_data, test_data
def train_model(model, train_data, optimizer, batch_size, epochs=10):
    """Train a spaCy v2 textcat model on (text, annotations) pairs.

    Parameters
    ----------
    model : spacy Language pipeline with a 'textcat' pipe.
    train_data : list of (text, {'cats': {...}}) tuples. Shuffled
        in place each epoch (must be a list, not a generator).
    optimizer : the optimizer returned by ``model.begin_training()``.
    batch_size : minibatch size passed to ``spacy.util.minibatch``.
    epochs : number of passes over the training data.

    Returns
    -------
    dict
        Cumulative per-pipe losses (the original built ``losses`` but
        never returned it).
    """
    losses = {}
    # Fixed seed so the shuffle order — and therefore training — is
    # reproducible across runs.
    random.seed(1)
    for epoch in range(epochs):
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_size)
        for batch in batches:
            # NOTE(review): the original gist was truncated here; this
            # is the standard spaCy v2 update step — confirm against
            # the full gist.
            texts, labels = zip(*batch)
            model.update(texts, labels, sgd=optimizer, losses=losses)
        print("Epoch: {} Loss: {}".format(epoch + 1, losses))
    return losses
# Model configurations
# NOTE(review): this fragment relies on ``nlp`` (a spaCy pipeline) and
# ``train_data`` being defined earlier — in the original gist these
# lines sit inside main(); confirm scope before reuse.
optimizer = nlp.begin_training()
batch_size = 5
epochs = 10
# Training the model
train_model(nlp, train_data, optimizer, batch_size, epochs)
# Sample predictions
# train_data[0] is a (text, annotations) pair, so train_data[0][0]
# is the raw text fed through the pipeline.
print(train_data[0])
sample_test = nlp(train_data[0][0])
# .cats holds the per-label scores produced by the textcat pipe.
print(sample_test.cats)
def get_predictions(model, texts):
    """Predict a label string for each input text.

    Parameters
    ----------
    model : spaCy pipeline with a ``'textcat'`` pipe.
    texts : iterable of str.

    Returns
    -------
    list of str
        The highest-scoring textcat label for each text.
    """
    # Use the model's tokenizer to tokenize each input text
    docs = [model.tokenizer(text) for text in texts]
    # Use textcat to get the scores for each doc
    textcat = model.get_pipe('textcat')
    scores, _ = textcat.predict(docs)
    # From the scores, find the label with the highest score/probability.
    # NOTE(review): the original gist was truncated after the comment
    # above; this is the standard ending of this tutorial function —
    # confirm against the full gist.
    predicted_labels = scores.argmax(axis=1)
    return [textcat.labels[label] for label in predicted_labels]
# Train and test accuracy
# NOTE(review): relies on ``nlp``, ``x_train``/``x_test`` and
# ``y_train``/``y_test`` from the earlier train/test split — in the
# original gist these lines sit inside main(); confirm scope.
train_predictions = get_predictions(nlp, x_train)
test_predictions = get_predictions(nlp, x_test)
# Compare predicted label strings against the true labels; a large gap
# between train and test accuracy would indicate overfitting.
train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Train accuracy: {}".format(train_accuracy))
print("Test accuracy: {}".format(test_accuracy))