Skip to content

Instantly share code, notes, and snippets.

@JohnDeJesus22
JohnDeJesus22 / accuracy.py
Last active July 18, 2020 22:47
accuracy
# Accuracy = correctly classified samples / all samples.
# The diagonal of the confusion matrix holds the correct predictions, so its
# trace is the numerator and the sum of every cell is the denominator. For the
# 2x2 case this equals cm[0,0] + cm[1,1] over the four cells, and it also
# works unchanged for confusion matrices with more than two classes.
# NOTE(review): assumes cm is a NumPy array (as returned by sklearn's
# confusion_matrix) -- confirm if cm is built some other way.
numerator = cm.trace()
denominator = cm.sum()
accuracy = numerator/denominator
@JohnDeJesus22
JohnDeJesus22 / sklearn_cm.py
Last active July 19, 2020 03:39
sklearn cm
# build the confusion matrix with sklearn
cm = confusion_matrix(y_actual, y_pred)

# options shared by both scikit-plot charts
cm_plot_options = {'cmap': 'coolwarm'}

# plot the normalized confusion matrix with scikit-plot
skplt.metrics.plot_confusion_matrix(y_actual, y_pred, normalize=True, **cm_plot_options)

# plot the confusion matrix with scikit-plot (normalize left at its default)
skplt.metrics.plot_confusion_matrix(y_actual, y_pred, **cm_plot_options)
@JohnDeJesus22
JohnDeJesus22 / cm_setup.py
Created July 16, 2020 01:42
data setup for confusion matrix
# set up binary data "actual" and "predicted"
# Keep a reference to the seeded generator and draw from it: constructing
# np.random.RandomState(3) without using the returned object does not seed
# the global np.random functions, so the data would differ on every run.
rng = np.random.RandomState(3)
y_actual = rng.randint(0, 2, 500)  # 500 labels drawn from {0, 1}
y_pred = rng.randint(0, 2, 500)    # 500 independent "predictions" from {0, 1}
@JohnDeJesus22
JohnDeJesus22 / confusion_matrix_libraries.py
Last active July 19, 2020 03:40
import confusion matrix libraries
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt
from math import sqrt
from sklearn.metrics import confusion_matrix
@JohnDeJesus22
JohnDeJesus22 / loading_libraries_missingno.py
Last active June 13, 2020 11:57
loading libraries missingno
# import libraries
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
# apply seaborn's default chart styling (called with no arguments, so every
# setting is left at seaborn's own default)
sns.set()
@JohnDeJesus22
JohnDeJesus22 / remove_low_count_summaries
Created June 3, 2020 02:59
remove_low_count_summaries
# count the rows whose text has 0 words, i.e. articles that did not download
is_empty = text_data.text_len == 0
text_data[is_empty].shape[0]

# count the rows whose text is shorter than 400 words
is_short = text_data.text_len < 400
small_text = text_data[is_short]
small_text['text_no_nl'].shape[0]

# keep the remaining articles (at least 400 words) in a separate, filtered df
is_long_enough = text_data.text_len >= 400
td_filtered = text_data[is_long_enough]
# initiate ecdf function
def plot_ecdf(data, title='ECDF Plot', xlabel='Data Values', ylabel='Percentage'):
    """
    Plot the empirical cumulative distribution function (ECDF) of a column of data.

    Parameters
    ----------
    data : array-like
        Values to plot; assumed to be one-dimensional.
    title : str
        Title placed on the chart.
    xlabel : str
        Label for the x axis (the sorted data values).
    ylabel : str
        Label for the y axis (the cumulative fraction of observations).

    Returns
    -------
    None
        The points are drawn through pyplot on the current axes; the figure
        is not shown or saved here.
    """
    xaxis = np.sort(data)
    length = len(data)
    # the i-th smallest value is paired with the fraction i/length, so the
    # y values run from 1/length up to 1
    yaxis = np.arange(1, length + 1) / length
    plt.plot(xaxis, yaxis, linestyle='none', marker='.')
    # apply the caller-supplied title and axis labels to the chart
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
@JohnDeJesus22
JohnDeJesus22 / visualize_text_len
Created June 3, 2020 02:53
visualize_text_len
# store the number of tokens of each text in a new column
token_counts = text_data['tokenized'].apply(len)
text_data['text_len'] = token_counts

# visualize the distribution of the text lengths as an ECDF
text_lengths = text_data.text_len.values
plot_ecdf(
    text_lengths,
    title='ECDF Text Len Plot',
    xlabel='text lengths',
)