dkreider/nb-classifier.py

## nb-classifier.py
# Toy Gaussian naive-Bayes text classifier: fits TF-IDF features on a handful
# of labelled phrases and prints the predicted category for two new phrases.
#
# NOTE: `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and
# removed in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# Training phrases and their labels (words[i] is labelled authors[i]).
words = ["I am strong", "I am slender", "I am handsome", "I am pretty", "I am big", "I am beautiful", "strength", "pretty", "muscle", "looks"]
authors = ["man", "woman", "man", "woman", "man", "woman", "man", "woman", "man", "woman"]
# Unlabelled phrases to classify once the model is fitted.
word_test = ['beauty', 'big muscle']

# Split into training and testing data. The fixed random_state keeps the
# split reproducible between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    words, authors, test_size=0.1, random_state=42)

# Text vectorization - convert from strings to lists of numbers.
# The vocabulary is learned from the training phrases only; the held-out and
# new phrases are projected onto that same vocabulary with transform().
# .toarray() densifies the sparse TF-IDF matrix, which GaussianNB requires.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train).toarray()
features_test_transformed = vectorizer.transform(features_test).toarray()
word_test_transformed = vectorizer.transform(word_test).toarray()

clf = GaussianNB()
clf.fit(features_train_transformed, labels_train)

# Report the predicted category for each new phrase.
prediction = clf.predict(word_test_transformed)
for phrase, category in zip(word_test, prediction):
    print("Our classifier predicts that {} belongs to the {} category.".format(phrase, category))
	# Toy Gaussian naive-Bayes text classifier: fits TF-IDF features on a handful
	# of labelled phrases and prints the predicted category for two new phrases.
	#
	# NOTE: `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and
	# removed in 0.20; `train_test_split` now lives in `sklearn.model_selection`.
	from sklearn.model_selection import train_test_split
	from sklearn.naive_bayes import GaussianNB
	from sklearn.feature_selection import SelectPercentile, f_classif
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Training phrases and their labels (words[i] is labelled authors[i]).
	words = ["I am strong", "I am slender", "I am handsome", "I am pretty", "I am big", "I am beautiful", "strength", "pretty", "muscle", "looks"]
	authors = ["man", "woman", "man", "woman", "man", "woman", "man", "woman", "man", "woman"]
	# Unlabelled phrases to classify once the model is fitted.
	word_test = ['beauty', 'big muscle']

	# Split into training and testing data. The fixed random_state keeps the
	# split reproducible between runs.
	features_train, features_test, labels_train, labels_test = train_test_split(
		words, authors, test_size=0.1, random_state=42)

	# Text vectorization - convert from strings to lists of numbers.
	# The vocabulary is learned from the training phrases only; the held-out and
	# new phrases are projected onto that same vocabulary with transform().
	# .toarray() densifies the sparse TF-IDF matrix, which GaussianNB requires.
	vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
	features_train_transformed = vectorizer.fit_transform(features_train).toarray()
	features_test_transformed = vectorizer.transform(features_test).toarray()
	word_test_transformed = vectorizer.transform(word_test).toarray()

	clf = GaussianNB()
	clf.fit(features_train_transformed, labels_train)

	# Report the predicted category for each new phrase.
	prediction = clf.predict(word_test_transformed)
	for phrase, category in zip(word_test, prediction):
		print("Our classifier predicts that {} belongs to the {} category.".format(phrase, category))