Skip to content

Instantly share code, notes, and snippets.

@dkreider
Last active March 5, 2018 13:30
Show Gist options
  • Save dkreider/404123ce231a26221b65d8050531f1c4 to your computer and use it in GitHub Desktop.
Save dkreider/404123ce231a26221b65d8050531f1c4 to your computer and use it in GitHub Desktop.
Beginner's guide to building a Naive Bayes classifier (https://danielk.tech/blog/build-your-first-naive-bayes-classifier.html)
# Demo script: train a tiny Gaussian Naive Bayes text classifier on a handful
# of phrases labelled "man" / "woman", then predict the label of two phrases
# that were not part of the training data.
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20, so importing it crashes on any current install. The same
# `train_test_split` function now lives in `sklearn.model_selection`; the
# fallback only exists for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Training corpus: words[i] is labelled with authors[i].
words = [
    "I am strong", "I am slender", "I am handsome", "I am pretty",
    "I am big", "I am beautiful", "strength", "pretty", "muscle", "looks",
]
authors = [
    "man", "woman", "man", "woman", "man",
    "woman", "man", "woman", "man", "woman",
]

# Phrases to classify once the model has been trained.
word_test = ['beauty', 'big muscle']

# Split into training and testing data. With 10 samples and test_size=0.1,
# a single sample is held out; random_state pins the shuffle so every run
# produces the same split.
features_train, features_test, labels_train, labels_test = train_test_split(
    words, authors, test_size=0.1, random_state=42
)

# Text vectorization - convert from strings to lists of numbers.
# The vocabulary is learned from the training phrases only (fit_transform);
# the held-out and unseen phrases are merely projected onto it (transform).
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
# .toarray() densifies the sparse TF-IDF matrix before it is handed to
# GaussianNB.
features_train_transformed = vectorizer.fit_transform(features_train).toarray()
# NOTE: the held-out sample is vectorized but never scored in this demo.
features_test_transformed = vectorizer.transform(features_test).toarray()
word_test_transformed = vectorizer.transform(word_test).toarray()

# Train the classifier and predict a label for each unseen phrase.
clf = GaussianNB()
clf.fit(features_train_transformed, labels_train)
prediction = clf.predict(word_test_transformed)

# Report one line per phrase; predictions come back in the same order as
# the rows of word_test_transformed.
for phrase, category in zip(word_test, prediction):
    print("Our classifier predicts that " + phrase + " belongs to the " + category + " category.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment