Skip to content

Instantly share code, notes, and snippets.

@fabianobizarro
Created August 25, 2017 18:05
Show Gist options
  • Save fabianobizarro/648165d1a507b440506120b5b52dc340 to your computer and use it in GitHub Desktop.
Save fabianobizarro/648165d1a507b440506120b5b52dc340 to your computer and use it in GitHub Desktop.
import nltk
from nltk.stem.lancaster import LancasterStemmer
# word stemmer: Lancaster is an aggressive suffix-stripping stemmer; used below
# to normalise tokens before counting (NOTE(review): Lancaster targets English —
# the Portuguese training sentences below will pass through mostly unstemmed; confirm intent)
stemmer = LancasterStemmer()
# Labelled examples for the three intent classes: greeting, goodbye, sandwich.
_greeting_sentences = [
    "como vai você?",
    "como cê ta?",
    "como vai?",
    "tudo bem?",
    "e ai",
    "Tudo joia?",
    "tudo beleza?",
    "tudo blz?",
]
_goodbye_sentences = [
    "flw",
    "até mais",
    "thcau",
    "flw vlw",
    "adeus",
    "te mais",
]
_sandwich_sentences = [
    "make me a sandwich",
    "can you make a sandwich?",
    "having a sandwich today?",
    "what's for lunch?",
]
# Flatten the per-class lists into the original list-of-dicts shape, in the
# same order the sentences were originally appended.
training_data = [
    {"class": label, "sentence": text}
    for label, sentences in (
        ("greeting", _greeting_sentences),
        ("goodbye", _goodbye_sentences),
        ("sandwich", _sandwich_sentences),
    )
    for text in sentences
]
# Build the word statistics used by the scoring functions below:
#   corpus_words -> stemmed word -> occurrence count across ALL sentences
#   class_words  -> class name   -> list of stemmed words seen in that class
corpus_words = {}
classes = list(set(entry['class'] for entry in training_data))
class_words = {label: [] for label in classes}

for entry in training_data:
    label = entry['class']
    # tokenize each training sentence into words
    for token in nltk.word_tokenize(entry['sentence']):
        # skip punctuation / possessive-suffix tokens
        if token in ("?", "'s"):
            continue
        # stem and lowercase before counting
        stem = stemmer.stem(token.lower())
        # tally the stem's global frequency
        corpus_words[stem] = corpus_words.get(stem, 0) + 1
        # record the stem under its class
        class_words[label].append(stem)

print(corpus_words)
def calculate_class_score(sentence, class_name, show_details=True):
    """Score *sentence* against one class.

    Each token of the sentence that (after stemming and lowercasing) appears
    in ``class_words[class_name]`` contributes ``1 / corpus_words[stem]`` to
    the score, so words common across the whole corpus are discounted.

    Args:
        sentence: raw text to score.
        class_name: key into the module-level ``class_words`` dict.
        show_details: when True, print each matching stem and its weight.

    Returns:
        The accumulated float score (0 when nothing matches).
    """
    score = 0
    for word in nltk.word_tokenize(sentence):
        # Fix: stem each token ONCE — the original recomputed
        # stemmer.stem(word.lower()) up to three times per matching token.
        stemmed = stemmer.stem(word.lower())
        # check to see if the stem of the word is in this class
        if stemmed in class_words[class_name]:
            # treat each word with same weight, discounted by corpus frequency
            weight = 1 / corpus_words[stemmed]
            score += weight
            if show_details:
                print(" match: %s (%s)" % (stemmed, weight))
    return score
def classify(sentence):
    """Return ``(best_class, best_score)`` for *sentence*.

    Scores the sentence against every known class and keeps the strictly
    highest score; yields ``(None, 0)`` when no class scores above zero.
    """
    best = (None, 0)
    # score the sentence against each class, keeping the first strict maximum
    for candidate in class_words.keys():
        candidate_score = calculate_class_score(sentence, candidate)
        if candidate_score > best[1]:
            best = (candidate, candidate_score)
    return best
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment