Skip to content

Instantly share code, notes, and snippets.

@AIAnytime
Created February 5, 2023 13:33
Show Gist options
  • Save AIAnytime/93c63c2ca1dc6f0c4b0b78416366ba1a to your computer and use it in GitHub Desktop.
Save AIAnytime/93c63c2ca1dc6f0c4b0b78416366ba1a to your computer and use it in GitHub Desktop.
Toxicity Classifier
# -*- coding: utf-8 -*-
"""Toxicity Classifier NLP.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1UUZzQgrRUcLujGxbmhE30AlQALMsYXCm
"""
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
# Load the tweet dataset and discard the "Unnamed: 0" column
# (presumably a leftover index column from when the CSV was written).
# The bare `data.head(5)` expressions are notebook previews; they have no
# effect when this file runs as a plain script.
data = pd.read_csv("FinalBalancedDataset.csv")
data.head(5)
data = data.drop(columns=["Unnamed: 0"])
data.head(5)
import re

import nltk

# Fetch the NLTK data packages used by the text preparation below
# (tokenizer, WordNet data, stop-word list, POS tagger).
for _resource in (
    'punkt',
    'omw-1.4',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
):
    nltk.download(_resource)

from nltk import WordNetLemmatizer, pos_tag, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import wordnet

# Single shared lemmatizer instance, reused for every tweet.
wordnet_lemmatizer = WordNetLemmatizer()
def prepare_text(text):
    """Clean a raw tweet and return its lemmas joined by single spaces.

    Every character that is not an ASCII letter or an apostrophe is replaced
    by a space. The result is tokenized and POS-tagged with NLTK, and each
    token is lemmatized with the WordNet part of speech derived from its tag.
    """
    # First letter of the Treebank tag -> WordNet POS. Any other tag
    # (including an empty one) is lemmatized as a noun.
    pos_by_prefix = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    letters_only = re.sub(r'[^a-zA-Z\']', ' ', text)
    # Collapse the runs of whitespace left behind by the substitution.
    normalized = ' '.join(letters_only.split())
    tagged_tokens = pos_tag(word_tokenize(normalized))

    return ' '.join(
        wordnet_lemmatizer.lemmatize(
            token, pos=pos_by_prefix.get(tag[:1], wordnet.NOUN)
        )
        for token, tag in tagged_tokens
    )
# Lemmatize every tweet into a new column, then preview the frame and the
# label distribution (the two bare expressions only show output in a notebook).
data['clean_tweets'] = data['tweet'].apply(prepare_text)
data.head(5)
data['Toxicity'].value_counts()
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Build the TF-IDF matrix for the cleaned tweets.
corpus = data['clean_tweets'].values.astype('U')
stopwords = set(nltk_stopwords.words('english'))
# scikit-learn >= 1.2 validates `stop_words` and rejects a set (it accepts
# only 'english', a list, or None). A sorted list works on every version and
# gives the stored parameter a stable order.
count_tf_idf = TfidfVectorizer(stop_words=sorted(stopwords))
tf_idf = count_tf_idf.fit_transform(corpus)

import pickle

# Persist the fitted vectorizer so new text can be transformed the same way
# later; the `with` block guarantees the file is flushed and closed.
with open("tf_idf.pkt", 'wb') as vectorizer_file:
    pickle.dump(count_tf_idf, vectorizer_file)

# NOTE(review): test_size=0.8 holds out 80% of the rows for evaluation and
# trains on only 20%. Confirm this is intentional and not a swapped
# train/test ratio.
tf_idf_train, tf_idf_test, target_train, target_test = train_test_split(
    tf_idf, data['Toxicity'], test_size=0.8, random_state=42, shuffle=True
)
"""## Train the model"""
# Fit a multinomial Naive Bayes classifier on the training split.
model_bayes = MultinomialNB()
model_bayes.fit(tf_idf_train, target_train)

# Second column of predict_proba, i.e. the probability assigned to
# model_bayes.classes_[1], used as the score for the ROC computations.
y_pred_proba = model_bayes.predict_proba(tf_idf_test)[:, 1]
y_pred_proba

fpr, tpr, _ = roc_curve(target_test, y_pred_proba)
final_roc_auc = roc_auc_score(target_test, y_pred_proba)
final_roc_auc

# Smoke-test the classifier on one hand-written sentence.
sample = "It was an amazing experience"
sample_tfidf = count_tf_idf.transform([sample])
# `display` is only defined inside IPython/Jupyter. Under plain `python` it
# raises NameError and aborts the script before the final model is saved.
# `print` works in both environments.
print(model_bayes.predict_proba(sample_tfidf))
print(model_bayes.predict(sample_tfidf))
# Refit on the full TF-IDF matrix and export the classifier.
# The target must be the 'Toxicity' label column. Fitting against
# data['tweet'] would treat every distinct raw tweet as its own class and
# produce a useless model.
model = MultinomialNB()
model.fit(tf_idf, data['Toxicity'])
# The `with` block guarantees the pickle file is flushed and closed.
with open("model.pkt", 'wb') as model_file:
    pickle.dump(model, model_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment