Skip to content

Instantly share code, notes, and snippets.

@JeremyEnglert
Created June 22, 2018 01:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JeremyEnglert/3eda4a123244c37b669472d9e8166ea6 to your computer and use it in GitHub Desktop.
Python Machine Learning Sentiment Classifier using NLTK Twitter Corpus
import nltk
from nltk.corpus import twitter_samples
#####
##### SENTIMENT FUNCTION
#####
def naiveBayesSentimentCalculator(review):
    """Classify a raw tweet string and return its sentiment label.

    Tokenises the text on whitespace, converts it to the vocabulary
    feature dict, and asks the trained Naive Bayes model for a label
    ('positive' or 'negative').
    """
    tokens = review.split()
    featureDict = extract_features(tokens)
    return trainedNBClassifer.classify(featureDict)
#####
##### CREATE, TRAIN AND TEST CLASSIFIER
#####
# Load the labelled tweets bundled with the NLTK twitter_samples corpus
# (5000 positive and 5000 negative tweets).
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

# Index at which each class is split into a training and a test portion.
testTrainingSplitIndex = 2500

# Tweets [0, testTrainingSplitIndex) train the classifier.
trainingNegativeTweets = negativeTweets[:testTrainingSplitIndex]
trainingPositiveTweets = positiveTweets[:testTrainingSplitIndex]

# Tweets [testTrainingSplitIndex, end) are held out for evaluation.
# BUG FIX: the original sliced from testTrainingSplitIndex+1, so the tweet
# at index 2500 of each class landed in neither the training nor the test
# set. Slicing from testTrainingSplitIndex makes the two halves partition
# the data exactly.
testNegativeTweets = negativeTweets[testTrainingSplitIndex:]
testPositiveTweets = positiveTweets[testTrainingSplitIndex:]
# Build the vocabulary from the training tweets only.
# Nested-comprehension flattening explained here:
# https://www.reddit.com/r/learnpython/comments/8ro4aj/help_me_understand_multiple_for_loops_inside_of_a/
def getVocabulary():
    """Return a list of every distinct whitespace-delimited token that
    appears in the training tweets of either class."""
    negativeWords = [token for tweet in trainingNegativeTweets
                     for token in tweet.split()]
    positiveWords = [token for tweet in trainingPositiveTweets
                     for token in tweet.split()]
    # De-duplicate: positive words first, then negative, matching the
    # original concatenation order.
    return list(set(positiveWords + negativeWords))
vocabulary = getVocabulary()
def getTrainingData():
    """Pair every tokenised training tweet with its sentiment label.

    Returns a list of (token_list, label) tuples — all negative tweets
    first, then all positive tweets.
    """
    trainingData = []
    for label, tweets in (('negative', trainingNegativeTweets),
                          ('positive', trainingPositiveTweets)):
        for tweet in tweets:
            trainingData.append((tweet.split(), label))
    return trainingData
trainingData = getTrainingData()
# Feature extractor for the Naive Bayes classifier.
# Produces a dictionary keyed by every word in the global vocabulary;
# the value is True when that word occurs in the given review and
# False otherwise (bag-of-words presence features).
def extract_features(review):
    """Map a tokenised review to {vocabulary_word: present_in_review}."""
    wordsInReview = set(review)  # set gives O(1) membership tests
    return {word: word in wordsInReview for word in vocabulary}
def getTrainedNaiveBayesClassifer(extract_features, trainingData):
    """Train and return an NLTK Naive Bayes classifier.

    apply_features lazily maps the feature extractor over the
    (tokens, label) training pairs, avoiding one big in-memory list.
    """
    lazyFeatureSets = nltk.classify.apply_features(extract_features, trainingData)
    return nltk.NaiveBayesClassifier.train(lazyFeatureSets)
trainedNBClassifer = getTrainedNaiveBayesClassifer(extract_features, trainingData)
def getTesttReviewSentiments(naiveBayesSentimentCalculator):
    """Score every held-out tweet with the given classifier function.

    Returns a dict with numeric results per true class: +1 for a
    'positive' prediction, -1 for a 'negative' one.
    """
    labelToNum = {'positive': 1, 'negative': -1}
    numericNegResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in testNegativeTweets]
    numericPosResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in testPositiveTweets]
    return {'results-on-positive': numericPosResults,
            'results-on-negative': numericNegResults}
def runDiagnostics(reviewResult):
    """Print per-class and overall accuracy of the test-set predictions.

    reviewResult: dict with keys 'results-on-positive' and
    'results-on-negative', each a list of numeric predictions
    (+1 = predicted positive, -1 = predicted negative).
    """
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    # A prediction is correct when its sign matches the true class.
    numTruePositive = sum(x > 0 for x in positiveReviewsResult)
    numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
    pctTruePositive = float(numTruePositive) / len(positiveReviewsResult)
    pctTrueNegative = float(numTrueNegative) / len(negativeReviewsResult)
    totalAccurate = numTruePositive + numTrueNegative
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    print("Accuracy on positive tweets = " + "%.2f" % (pctTruePositive * 100) + "%")
    # BUG FIX: the original printed the typo "Accurance" on this line.
    print("Accuracy on negative tweets = " + "%.2f" % (pctTrueNegative * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (totalAccurate * 100 / total) + "%")
# Entry point: classify every held-out tweet and print accuracy figures.
runDiagnostics(getTesttReviewSentiments(naiveBayesSentimentCalculator))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment