John DeJesus JohnDeJesus22

## accuracy.py
# Accuracy: the main-diagonal cells of the 2x2 confusion matrix (predictions
# that match the actual label) as a fraction of all four cells.
numerator = cm[0, 0] + cm[1, 1]
denominator = numerator + cm[0, 1] + cm[1, 0]
accuracy = numerator / denominator

## sklearn_cm.py
# Build the confusion matrix from the actual and predicted labels with sklearn.
cm = confusion_matrix(y_true=y_actual, y_pred=y_pred)

## cm_normalized.py
# Plot the normalized confusion matrix with scikit-plot.
skplt.metrics.plot_confusion_matrix(
    y_actual,
    y_pred,
    normalize=True,
    cmap='coolwarm',
)

## confusion_matrix.py
# Plot the (unnormalized) confusion matrix with scikit-plot.
skplt.metrics.plot_confusion_matrix(
    y_actual,
    y_pred,
    cmap='coolwarm',
)

## cm_setup.py
# Set up binary "actual" and "predicted" label arrays (500 values each, 0 or 1).
# Both arrays are drawn from one explicitly seeded generator so the data is
# reproducible: a bare `np.random.RandomState(3)` expression only constructs a
# generator and discards it, which leaves the module-level
# `np.random.randint` calls unseeded.
rng = np.random.RandomState(3)
y_actual = rng.randint(0, 2, 500)
y_pred = rng.randint(0, 2, 500)

## confusion_matrix_libraries.py
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import scikitplot as skplt

from math import sqrt
from sklearn.metrics import confusion_matrix

## loading_libraries_missingno.py
# import libraries
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns

# Call seaborn's set() with no arguments so that seaborn's default chart
# styling (including the chart background) is used for the plots that follow.
sns.set()

## remove_low_count_summaries
# Count the rows whose text length is 0, i.e. articles that did not download.
len(text_data.loc[text_data.text_len == 0])

# Count the rows whose text is shorter than 400 words.
small_text = text_data.loc[text_data.text_len < 400]
len(small_text['text_no_nl'])

# Keep only the articles with at least 400 words in a separate DataFrame,
# leaving the short ones out.
td_filtered = text_data.loc[text_data.text_len >= 400]

## ecdf_function
# initiate ecdf function
def plot_ecdf(data, title='ECDF Plot', xlabel='Data Values', ylabel='Percentage'):
    """
    Plot the empirical cumulative distribution function (ECDF) of `data`.

    The values are sorted and each one is drawn as a single dot whose height
    is its rank divided by the number of values (1/n for the smallest value,
    1.0 for the largest).

    Parameters
    ----------
    data : one-dimensional sequence or array of sortable values
        The column of data to plot.
    title : str
        Title placed on the plot.
    xlabel : str
        Label for the x axis (the data values).
    ylabel : str
        Label for the y axis (the cumulative fraction).

    Returns
    -------
    None
        The plot is drawn through `matplotlib.pyplot`.
    """
    xaxis = np.sort(data)
    length = len(data)
    # rank / n for ranks 1..n, so the largest value maps to exactly 1.0
    yaxis = np.arange(1, length + 1) / length
    # markers only: the ECDF is shown as individual points, not a joined line
    plt.plot(xaxis, yaxis, linestyle='none', marker='.')
    # apply the caller-supplied labels (previously accepted but never used)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

## visualize_text_len
# Record the length of each tokenized text in a new 'text_len' column.
text_data['text_len'] = text_data['tokenized'].apply(len)

# Draw the ECDF of those text lengths.
plot_ecdf(
    text_data['text_len'].values,
    title='ECDF Text Len Plot',
    xlabel='text lengths',
)
	# Accuracy: the main-diagonal cells of the 2x2 confusion matrix (predictions
	# that match the actual label) as a fraction of all four cells.
	numerator = cm[0, 0] + cm[1, 1]
	denominator = numerator + cm[0, 1] + cm[1, 0]
	accuracy = numerator / denominator
	# Build the confusion matrix from the actual and predicted labels with sklearn.
	cm = confusion_matrix(y_true=y_actual, y_pred=y_pred)
	# Plot the normalized confusion matrix with scikit-plot.
	skplt.metrics.plot_confusion_matrix(
		y_actual,
		y_pred,
		normalize=True,
		cmap='coolwarm',
	)
	# Plot the (unnormalized) confusion matrix with scikit-plot.
	skplt.metrics.plot_confusion_matrix(
		y_actual,
		y_pred,
		cmap='coolwarm',
	)
	# Set up binary "actual" and "predicted" label arrays (500 values each, 0 or 1).
	# Both arrays are drawn from one explicitly seeded generator so the data is
	# reproducible: a bare `np.random.RandomState(3)` expression only constructs
	# a generator and discards it, which leaves the module-level
	# `np.random.randint` calls unseeded.
	rng = np.random.RandomState(3)
	y_actual = rng.randint(0, 2, 500)
	y_pred = rng.randint(0, 2, 500)
	# import libraries
	import numpy as np
	import matplotlib.pyplot as plt
	import scikitplot as skplt

	from math import sqrt
	from sklearn.metrics import confusion_matrix
	# import libraries
	import pandas as pd
	import numpy as np
	import missingno as msno
	import matplotlib.pyplot as plt
	import seaborn as sns

	# Call seaborn's set() with no arguments so that seaborn's default chart
	# styling (including the chart background) is used for the plots that follow.
	sns.set()
	# Count the rows whose text length is 0, i.e. articles that did not download.
	len(text_data.loc[text_data.text_len == 0])

	# Count the rows whose text is shorter than 400 words.
	small_text = text_data.loc[text_data.text_len < 400]
	len(small_text['text_no_nl'])

	# Keep only the articles with at least 400 words in a separate DataFrame,
	# leaving the short ones out.
	td_filtered = text_data.loc[text_data.text_len >= 400]
	# initiate ecdf function
	def plot_ecdf(data, title='ECDF Plot', xlabel='Data Values', ylabel='Percentage'):
		"""
		Plot the empirical cumulative distribution function (ECDF) of `data`.

		The values are sorted and each one is drawn as a single dot whose
		height is its rank divided by the number of values (1/n for the
		smallest value, 1.0 for the largest).

		Parameters
		----------
		data : one-dimensional sequence or array of sortable values
			The column of data to plot.
		title : str
			Title placed on the plot.
		xlabel : str
			Label for the x axis (the data values).
		ylabel : str
			Label for the y axis (the cumulative fraction).

		Returns
		-------
		None
			The plot is drawn through `matplotlib.pyplot`.
		"""
		xaxis = np.sort(data)
		length = len(data)
		# rank / n for ranks 1..n, so the largest value maps to exactly 1.0
		yaxis = np.arange(1, length + 1) / length
		# markers only: the ECDF is shown as individual points, not a joined line
		plt.plot(xaxis, yaxis, linestyle='none', marker='.')
		# apply the caller-supplied labels to the plot
		plt.title(title)
		plt.xlabel(xlabel)
		plt.ylabel(ylabel)
	# Record the length of each tokenized text in a new 'text_len' column.
	text_data['text_len'] = text_data['tokenized'].apply(len)

	# Draw the ECDF of those text lengths.
	plot_ecdf(
		text_data['text_len'].values,
		title='ECDF Text Len Plot',
		xlabel='text lengths',
	)