Skip to content

Instantly share code, notes, and snippets.

@JeremyEnglert
Created June 22, 2018 01:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save JeremyEnglert/3eda4a123244c37b669472d9e8166ea6 to your computer and use it in GitHub Desktop.
Python Machine Learning Sentiment Classifier using NLTK Twitter Corpus
import nltk
from nltk.corpus import twitter_samples
#####
##### SENTIMENT FUNCTION
#####
def naiveBayesSentimentCalculator(review):
    """Classify a raw tweet string and return its sentiment label.

    Tokenises the text on whitespace, converts it to the vocabulary
    feature dict, and asks the trained Naive Bayes model for a label
    ('positive' or 'negative').
    """
    tokens = review.split()
    featureDict = extract_features(tokens)
    return trainedNBClassifer.classify(featureDict)
#####
##### CREATE, TRAIN AND TEST CLASSIFIER
#####
# Load the labelled tweets bundled with the NLTK twitter_samples corpus
# (5000 positive and 5000 negative tweets).
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

# Index at which each class is split into a training and a test portion.
testTrainingSplitIndex = 2500

# Tweets [0, testTrainingSplitIndex) train the classifier.
trainingNegativeTweets = negativeTweets[:testTrainingSplitIndex]
trainingPositiveTweets = positiveTweets[:testTrainingSplitIndex]

# Tweets [testTrainingSplitIndex, end) are held out for evaluation.
# BUG FIX: the original sliced from testTrainingSplitIndex+1, so the tweet
# at index 2500 of each class landed in neither the training nor the test
# set. Slicing from testTrainingSplitIndex makes the two halves partition
# the data exactly.
testNegativeTweets = negativeTweets[testTrainingSplitIndex:]
testPositiveTweets = positiveTweets[testTrainingSplitIndex:]
# Build the vocabulary from the training tweets only.
# Nested-comprehension flattening explained here:
# https://www.reddit.com/r/learnpython/comments/8ro4aj/help_me_understand_multiple_for_loops_inside_of_a/
def getVocabulary():
    """Return a list of every distinct whitespace-delimited token that
    appears in the training tweets of either class."""
    negativeWords = [token for tweet in trainingNegativeTweets
                     for token in tweet.split()]
    positiveWords = [token for tweet in trainingPositiveTweets
                     for token in tweet.split()]
    # De-duplicate: positive words first, then negative, matching the
    # original concatenation order.
    return list(set(positiveWords + negativeWords))
vocabulary = getVocabulary()
def getTrainingData():
    """Pair every tokenised training tweet with its sentiment label.

    Returns a list of (token_list, label) tuples — all negative tweets
    first, then all positive tweets.
    """
    trainingData = []
    for label, tweets in (('negative', trainingNegativeTweets),
                          ('positive', trainingPositiveTweets)):
        for tweet in tweets:
            trainingData.append((tweet.split(), label))
    return trainingData
trainingData = getTrainingData()
# Feature extractor for the Naive Bayes classifier.
# Produces a dictionary keyed by every word in the global vocabulary;
# the value is True when that word occurs in the given review and
# False otherwise (bag-of-words presence features).
def extract_features(review):
    """Map a tokenised review to {vocabulary_word: present_in_review}."""
    wordsInReview = set(review)  # set gives O(1) membership tests
    return {word: word in wordsInReview for word in vocabulary}
def getTrainedNaiveBayesClassifer(extract_features, trainingData):
    """Train and return an NLTK Naive Bayes classifier.

    apply_features lazily maps the feature extractor over the
    (tokens, label) training pairs, avoiding one big in-memory list.
    """
    lazyFeatureSets = nltk.classify.apply_features(extract_features, trainingData)
    return nltk.NaiveBayesClassifier.train(lazyFeatureSets)
trainedNBClassifer = getTrainedNaiveBayesClassifer(extract_features, trainingData)
def getTesttReviewSentiments(naiveBayesSentimentCalculator):
    """Score every held-out tweet with the given classifier function.

    Returns a dict with numeric results per true class: +1 for a
    'positive' prediction, -1 for a 'negative' one.
    """
    labelToNum = {'positive': 1, 'negative': -1}
    numericNegResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in testNegativeTweets]
    numericPosResults = [labelToNum[naiveBayesSentimentCalculator(tweet)]
                         for tweet in testPositiveTweets]
    return {'results-on-positive': numericPosResults,
            'results-on-negative': numericNegResults}
def runDiagnostics(reviewResult):
    """Print per-class and overall accuracy of the test-set predictions.

    reviewResult: dict with keys 'results-on-positive' and
    'results-on-negative', each a list of numeric predictions
    (+1 = predicted positive, -1 = predicted negative).
    """
    positiveReviewsResult = reviewResult['results-on-positive']
    negativeReviewsResult = reviewResult['results-on-negative']
    # A prediction is correct when its sign matches the true class.
    numTruePositive = sum(x > 0 for x in positiveReviewsResult)
    numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
    pctTruePositive = float(numTruePositive) / len(positiveReviewsResult)
    pctTrueNegative = float(numTrueNegative) / len(negativeReviewsResult)
    totalAccurate = numTruePositive + numTrueNegative
    total = len(positiveReviewsResult) + len(negativeReviewsResult)
    print("Accuracy on positive tweets = " + "%.2f" % (pctTruePositive * 100) + "%")
    # BUG FIX: the original printed the typo "Accurance" on this line.
    print("Accuracy on negative tweets = " + "%.2f" % (pctTrueNegative * 100) + "%")
    print("Overall accuracy = " + "%.2f" % (totalAccurate * 100 / total) + "%")
# Entry point: classify every held-out tweet and print accuracy figures.
runDiagnostics(getTesttReviewSentiments(naiveBayesSentimentCalculator))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment