Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
'''
Created on Dec 4, 2016
@author: Elliot
'''
import pickle
import random
from collections import Counter
from statistics import mode

import nltk
from nltk.classify import ClassifierI
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC, LinearSVC, NuSVC
class VoteClassifier(ClassifierI):
    """Majority-vote ensemble over a fixed set of classifiers.

    Every member must provide ``classify(features)``. The ensemble's label is
    the one returned by the largest number of members.
    """

    def __init__(self, *classifiers):
        """Remember the member classifiers that are polled on each call."""
        self._classifiers = classifiers

    def _tally(self, features):
        """Poll every member and return ``(label, count, total)``.

        ``label`` is the most-voted label, ``count`` how many members chose
        it, and ``total`` the number of members polled.

        Raises:
            ValueError: if the ensemble was built without any classifier.
        """
        votes = [c.classify(features) for c in self._classifiers]
        if not votes:
            raise ValueError("VoteClassifier has no classifiers to poll")
        # Counter is used instead of statistics.mode because mode() raises
        # StatisticsError on a tied vote on Python < 3.8. On an
        # insertion-ordered Counter a tie goes to the label voted first.
        label, count = Counter(votes).most_common(1)[0]
        return label, count, len(votes)

    def classify(self, features):
        """Return the label chosen by the most member classifiers."""
        label, _, _ = self._tally(features)
        return label

    def confidence(self, features):
        """Return the fraction of members that voted for the winning label."""
        _, count, total = self._tally(features)
        return count / total
def createClassifier(posFileName, negFileName, *, train_fraction=0.85,
                     num_features=5000):
    """Train, evaluate and pickle sentiment classifiers from two review files.

    Every non-blank line of ``posFileName`` is taken as one positive review
    and every non-blank line of ``negFileName`` as one negative review. Each
    review is turned into a bag-of-words presence vector over the
    ``num_features`` most frequent lower-cased tokens of both files. The
    shuffled data is split into a training and a testing part, several
    classifiers are trained and their accuracy on the testing part is
    printed. The MultinomialNB, BernoulliNB, LogisticRegression, LinearSVC
    and NuSVC models are pickled to fixed file names (relative paths).

    Args:
        posFileName: path of the text file holding positive reviews.
        negFileName: path of the text file holding negative reviews.
        train_fraction: share of the shuffled data used for training; the
            remainder is used for testing. Must lie strictly between 0 and 1.
        num_features: number of most-common tokens used as features.

    Returns:
        ``-1`` when fewer than 100 reviews are available, otherwise ``None``.

    Raises:
        ValueError: if ``train_fraction`` is not strictly between 0 and 1.
    """
    if not 0 < train_fraction < 1:
        raise ValueError("train_fraction must be strictly between 0 and 1")

    # Context managers close the input files even if reading fails.
    with open(posFileName, "r") as pos_file:
        short_pos = pos_file.read()
    with open(negFileName, "r") as neg_file:
        short_neg = neg_file.read()

    # A trailing newline or blank separator line is not a review; keeping it
    # would add empty, feature-less documents to the labelled data.
    pos_reviews = [line for line in short_pos.split('\n') if line.strip()]
    neg_reviews = [line for line in short_neg.split('\n') if line.strip()]
    documents = [(rev, "pos") for rev in pos_reviews]
    documents.extend((rev, "neg") for rev in neg_reviews)
    print("positive reviews", len(pos_reviews))
    print("negative reviews", len(neg_reviews))

    # Fail fast, before the expensive tokenisation and feature extraction.
    lengthFeatureSet = len(documents)
    print("length feature set", lengthFeatureSet)
    if lengthFeatureSet < 100:
        print("feature set must have atleast a 100 reviews")
        return -1

    # Vocabulary: the most frequent lower-cased tokens across both files.
    tokens = word_tokenize(short_pos)
    tokens.extend(word_tokenize(short_neg))
    all_words = nltk.FreqDist(w.lower() for w in tokens)
    word_features = [w for (w, _) in all_words.most_common(num_features)]

    def find_features(document):
        """Map each vocabulary word to whether it occurs in ``document``."""
        # Lower-case the tokens: the vocabulary is lower-cased, so a
        # case-sensitive lookup would miss every capitalised occurrence.
        # A set keeps each of the many membership tests O(1).
        words = {w.lower() for w in word_tokenize(document)}
        return {w: (w in words) for w in word_features}

    featuresets = [(find_features(rev), category)
                   for (rev, category) in documents]
    random.shuffle(featuresets)

    split_at = int(lengthFeatureSet * train_fraction)
    training_set = featuresets[:split_at]
    testing_set = featuresets[split_at:]

    classifier = nltk.NaiveBayesClassifier.train(training_set)
    print("Original Naive Bayes Algo accuracy percent:",
          (nltk.classify.accuracy(classifier, testing_set)) * 100)
    classifier.show_most_informative_features(15)

    # (report name, scikit-learn estimator, pickle file or None if not saved)
    sklearn_specs = [
        ("MNB_classifier", MultinomialNB(), "23MNBCustom.pickle"),
        ("BernoulliNB_classifier", BernoulliNB(), "23BernoulliCustom.pickle"),
        ("LogisticRegression_classifier", LogisticRegression(),
         "23LogisticRegressionCustom.pickle"),
        ("SGDClassifier_classifier", SGDClassifier(), None),
        ("LinearSVC_classifier", LinearSVC(), "23LinearCustom.pickle"),
        ("NuSVC_classifier", NuSVC(), "23NuSVCCustom.pickle"),
    ]

    trained = {}
    for name, estimator, _ in sklearn_specs:
        model = SklearnClassifier(estimator)
        model.train(training_set)
        print(name + " accuracy percent:",
              (nltk.classify.accuracy(model, testing_set)) * 100)
        trained[name] = model

    voted_classifier = VoteClassifier(
        trained["NuSVC_classifier"],
        trained["LinearSVC_classifier"],
        trained["MNB_classifier"],
        trained["BernoulliNB_classifier"],
        trained["LogisticRegression_classifier"],
    )
    print("voted_classifier accuracy percent:",
          (nltk.classify.accuracy(voted_classifier, testing_set)) * 100)

    # Persist the models; "with" closes each file even if pickling fails.
    for name, _, pickle_path in sklearn_specs:
        if pickle_path is None:
            continue
        with open(pickle_path, "wb") as save_classifier:
            pickle.dump(trained[name], save_classifier)
    print("finished building and saving classifiers")
# Script entry point: build the classifiers from positive.txt / negative.txt.
createClassifier(posFileName="positive.txt", negFileName="negative.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.