Last active
March 5, 2018 13:30
-
-
Save dkreider/404123ce231a26221b65d8050531f1c4 to your computer and use it in GitHub Desktop.
Beginner's guide to building a Naive Bayes classifier (https://danielk.tech/blog/build-your-first-naive-bayes-classifier.html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Beginner's example: train a Gaussian Naive Bayes classifier on a handful of
# short phrases labelled "man"/"woman", then predict the label of two new
# phrases and print the result.

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Try the modern
# location first and fall back so very old installs keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import GaussianNB
# SelectPercentile / f_classif are not used by the script below; the import is
# kept unchanged from the original import list.
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.feature_extraction.text import TfidfVectorizer

# Training corpus: words[i] is labelled with authors[i].
words = [
    "I am strong", "I am slender", "I am handsome", "I am pretty", "I am big",
    "I am beautiful", "strength", "pretty", "muscle", "looks",
]
authors = [
    "man", "woman", "man", "woman", "man",
    "woman", "man", "woman", "man", "woman",
]

# Unseen phrases to classify. Note that "beauty" never appears in `words`
# (only "beautiful" does), so it vectorizes to an all-zero row.
word_test = ['beauty', 'big muscle']

# Split into training and testing data. random_state pins the shuffle so the
# split is reproducible; test_size=0.1 holds out 10% of the samples.
features_train, features_test, labels_train, labels_test = train_test_split(
    words, authors, test_size=0.1, random_state=42
)

# Text vectorization - convert from strings to lists of numbers.
# The vocabulary is learned from the training split only (fit_transform);
# the other inputs are projected onto that same vocabulary (transform).
# GaussianNB does not accept sparse matrices, hence the .toarray() calls.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
features_train_transformed = vectorizer.fit_transform(features_train).toarray()
# The held-out split is vectorized but not scored anywhere in this script.
features_test_transformed = vectorizer.transform(features_test).toarray()
word_test_transformed = vectorizer.transform(word_test).toarray()

clf = GaussianNB()
clf.fit(features_train_transformed, labels_train)

# One predicted label per entry of word_test, in the same order.
prediction = clf.predict(word_test_transformed)

for phrase, label in zip(word_test, prediction):
    print("Our classifier predicts that " + phrase + " belongs to the " + label + " category.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment