Federalist papers classification
#!/usr/bin/env python
import numpy as np
import pandas as pd
from sklearn import svm
FILE_NAME = 'mosteller-wallace-federalist-papers.csv'
STOP_WORDS = ['a', 'an']
# Rows are samples, cols are [...word_count, AUTHOR, CODE_NUMBER].
# Discard CODE_NUMBER col because it's useless.
df = pd.read_csv(FILE_NAME).iloc[:, :-1]
# In an exciting twist, rather than stop words being omitted, they're
# the only words we _do_ include.
df = df.loc[:, STOP_WORDS + ['AUTHOR']]
# Partition the dataset into X_train and X_test in a special way:
# samples whose label is 'unknown' ('HAMILTON OR MADISON') will be
# X_test, and everything written by those two men is X_train.
# Filter using boolean indexing: https://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing
df_unknown = df[df['AUTHOR'] == 'HAMILTON OR MADISON']
df_known = df[(df['AUTHOR'] == 'HAMILTON') | (df['AUTHOR'] == 'MADISON')]
X_train, y_train = df_known.iloc[:, :-1], df_known.iloc[:, -1:].values.ravel()
X_test = df_unknown.iloc[:, :-1]
# Rando classifier, from https://scikit-learn.org/stable/tutorial/basic/tutorial.html#learning-and-predicting
clf_svm = svm.SVC(gamma=0.001, C=100.)
clf_svm.fit(X_train, y_train)
from sklearn import naive_bayes
# A given author has some 'true' probability of producing a given sample
# (AKA feature vector AKA word count histogram for a document), and the task
# is to try to estimate it. If we could, we could substitute that value
# directly into Bayes' theorem as the 'likelihood' term, p(x | C_k). Along
# with the easily estimated 'prior' term p(C_k) (how, exactly, is described
# below), the 'posterior' p(C_k | x) is computable. That sounds helpful, so
# let's try to get there.
#
# Main idea: each author (class, C_k) has their own personal multinomial
# distribution that characterizes the kinds of bags of words they're likely
# to produce when they sit down at their writing desk to write a document.
#
# A multinomial distribution says what the probability of producing a given
# sample is, but to construct it you need its special parameter sauce: the
# author's probability of using each word, (p1, ..., pn). To estimate this,
# each word's relative frequency in the author's training documents is used.
# This simple technique has a fancy name: maximum likelihood estimation.
#
# The pmf for that distribution _is_ the p(x | C_k) that appears in Bayes'
# theorem. This likelihood term, along with the prior term p(C_k) (itself
# estimated simply based on the relative frequency of C_k within y_train), allows
# the computation of that thicc posterior: p(C_k | x).
#
# Finally, add in a decision rule like 'For a given sample, compute the
# posterior and pick the class whose posterior is greatest', AKA maximum
# a posteriori. With that, a classifier is born.
#
# See
# - https://scikit-learn.org/stable/modules/naive_bayes.html#multinomial-naive-bayes
# - https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Multinomial_naive_Bayes
# - https://en.wikipedia.org/wiki/Naive_Bayes_classifier#Constructing_a_classifier_from_the_probability_model
# - https://en.wikipedia.org/wiki/Multinomial_distribution
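# A minimal hand-rolled sketch of the idea above (not part of the original
# gist; map_predict is a made-up helper name, and alpha=1.0 add-one smoothing
# is an assumption chosen to mirror sklearn's default): estimate each class's
# word probabilities from smoothed relative frequencies, estimate the priors
# from class frequencies, then pick the class with the largest log-posterior.
# Its predictions should agree with MultinomialNB's below.
def map_predict(X_train, y_train, X_test, alpha=1.0):
    classes = np.unique(y_train)
    log_prior = {}
    log_word_prob = {}
    for c in classes:
        X_c = X_train[y_train == c].values
        # Prior p(C_k): relative frequency of class c among the training labels.
        log_prior[c] = np.log(np.mean(y_train == c))
        # Likelihood parameters (p1, ..., pn): smoothed relative word frequencies.
        counts = X_c.sum(axis=0) + alpha
        log_word_prob[c] = np.log(counts / counts.sum())
    preds = []
    for x in X_test.values:
        # log p(C_k | x) is, up to a constant, log p(C_k) + sum_i x_i * log p(word_i | C_k).
        scores = {c: log_prior[c] + np.dot(x, log_word_prob[c]) for c in classes}
        preds.append(max(scores, key=scores.get))
    return np.array(preds)
# For example:
#   print(map_predict(X_train, y_train, X_test))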
clf_nb = naive_bayes.MultinomialNB()
clf_nb.fit(X_train, y_train)
print('(SVM, MultinomialNB)')
print(np.stack((clf_svm.predict(X_test), clf_nb.predict(X_test)), axis=1))
# TODO: Try cross-validation: https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation
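# A rough sketch of that TODO (not in the original gist; cv=5 folds is an
# arbitrary choice): cross-validate both classifiers on the papers of known
# authorship, since only those have labels to score against.
from sklearn.model_selection import cross_val_score
for name, clf in [('SVM', clf_svm), ('MultinomialNB', clf_nb)]:
    scores = cross_val_score(clf, X_train, y_train, cv=5)
    print(name, 'mean accuracy:', scores.mean())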