# ugik/multinomialNaiveBayes_part3.py

## multinomialNaiveBayes_part3.py
# Collect word statistics over the whole training corpus.
#   corpus_words: stemmed word -> number of times it occurs in the corpus
#   class_words:  class label  -> stemmed words found in that class's sentences
corpus_words = {}
# collecting the labels into a set drops duplicates; turn it back into a list afterwards
classes = list({sample['class'] for sample in training_data})
# every class starts out with an empty list of words
class_words = {label: [] for label in classes}

# walk over every training sentence, one token at a time
for entry in training_data:
    for token in nltk.word_tokenize(entry['sentence']):
        # skip the tokens we do not want to count
        if token in ("?", "'s"):
            continue

        # normalise the token: lowercase first, then stem
        stem = stemmer.stem(token.lower())

        # bump the corpus-wide count (an unseen stem starts from zero)
        corpus_words[stem] = corpus_words.get(stem, 0) + 1

        # remember the stem under the class of the sentence it came from
        class_words[entry['class']].append(stem)

# corpus_words now maps each stemmed word to its number of occurrences in the
# training corpus (the word's commonality)
print("Corpus words and counts: %s \n" % (corpus_words,))
# class_words now holds all the stemmed words of each class
print("Class words: %s" % (class_words,))
	# capture unique stemmed words in the training corpus
	corpus_words = {}
	class_words = {}
	# turn a list into a set (of unique items) and then a list again (this removes duplicates)
	classes = list(set([a['class'] for a in training_data]))
	for c in classes:
	# prepare a list of words within each class
	class_words[c] = []

	# loop through each sentence in our training data
	for data in training_data:
	# tokenize each sentence into words
	for word in nltk.word_tokenize(data['sentence']):
	# ignore a some things
	if word not in ["?", "'s"]:
	# stem and lowercase each word
	stemmed_word = stemmer.stem(word.lower())
	# have we not seen this word already?
	if stemmed_word not in corpus_words:
	corpus_words[stemmed_word] = 1
	else:
	corpus_words[stemmed_word] += 1

	# add the word to our words in class list
	class_words[data['class']].extend([stemmed_word])

	# we now have each stemmed word and the number of occurances of the word in our training corpus (the word's commonality)
	print ("Corpus words and counts: %s \n" % corpus_words)
	# also we have all words in each class
	print ("Class words: %s" % class_words)