Skip to content

Instantly share code, notes, and snippets.

@ugik
Last active June 2, 2020 14:58
Show Gist options
  • Save ugik/0f911ad00de7cdd770d7ae31a275f27e to your computer and use it in GitHub Desktop.
an algorithm for text classification
# use natural language toolkit (third-party: nltk)
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# word stemmer -- Lancaster is an aggressive suffix-stripping stemmer,
# used below to collapse word variants (e.g. "having"/"have") to one stem.
# NOTE(review): `stopwords` is imported but never used in this snippet.
stemmer = LancasterStemmer()
# 3 classes of training data: greeting, goodbye, sandwich.
# Each sample is a dict with a "class" label and a raw "sentence".
_samples = [
    ("greeting", "how are you?"),
    ("greeting", "how is your day?"),
    ("greeting", "good day"),
    ("greeting", "how is it going today?"),
    ("goodbye", "have a nice day"),
    ("goodbye", "see you later"),
    ("goodbye", "have a nice day"),
    ("goodbye", "talk to you soon"),
    ("sandwich", "make me a sandwich"),
    ("sandwich", "can you make a sandwich?"),
    ("sandwich", "having a sandwich today?"),
    ("sandwich", "what's for lunch?"),
]
training_data = [{"class": label, "sentence": text} for label, text in _samples]
print ("%s sentences in training data" % len(training_data))
# capture unique stemmed words in the training corpus
# corpus_words maps each stemmed word -> occurrence count across all sentences;
# class_words maps each class label -> list of stemmed words seen in that class.
corpus_words = {}
class_words = {}  # BUG FIX: was never initialized before being indexed below
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore punctuation / possessive tokens that carry no meaning here
        if word not in ["?", "'s"]:
            # stem and lowercase each word so variants collapse to one entry
            stemmed_word = stemmer.stem(word.lower())
            corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1
            class_words[data['class']].append(stemmed_word)
# we now have each word and the number of occurrences of the word in our
# training corpus (the word's commonality)
print ("Corpus words and counts: %s" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)
@ugik
Copy link
Author

ugik commented Jan 11, 2017

an algorithm for text classification

@surya-vamsi2310
Copy link

Can we provide the sentences and classes from a text file or an Excel sheet?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment