Skip to content

Instantly share code, notes, and snippets.

@fabianobizarro
Created August 25, 2017 18:05
Show Gist options
  • Save fabianobizarro/648165d1a507b440506120b5b52dc340 to your computer and use it in GitHub Desktop.
Save fabianobizarro/648165d1a507b440506120b5b52dc340 to your computer and use it in GitHub Desktop.
import nltk
from nltk.stem.lancaster import LancasterStemmer
# word stemmer: Lancaster is an aggressive suffix-stripping stemmer; used below
# to normalise tokens before counting (NOTE(review): Lancaster targets English —
# the Portuguese training sentences below will pass through mostly unstemmed; confirm intent)
stemmer = LancasterStemmer()
# Labelled examples for the three intent classes: greeting, goodbye, sandwich.
_greeting_sentences = [
    "como vai você?",
    "como cê ta?",
    "como vai?",
    "tudo bem?",
    "e ai",
    "Tudo joia?",
    "tudo beleza?",
    "tudo blz?",
]
_goodbye_sentences = [
    "flw",
    "até mais",
    "thcau",
    "flw vlw",
    "adeus",
    "te mais",
]
_sandwich_sentences = [
    "make me a sandwich",
    "can you make a sandwich?",
    "having a sandwich today?",
    "what's for lunch?",
]
# Flatten the per-class lists into the original list-of-dicts shape, in the
# same order the sentences were originally appended.
training_data = [
    {"class": label, "sentence": text}
    for label, sentences in (
        ("greeting", _greeting_sentences),
        ("goodbye", _goodbye_sentences),
        ("sandwich", _sandwich_sentences),
    )
    for text in sentences
]
# Build the word statistics used by the scoring functions below:
#   corpus_words -> stemmed word -> occurrence count across ALL sentences
#   class_words  -> class name   -> list of stemmed words seen in that class
corpus_words = {}
classes = list(set(entry['class'] for entry in training_data))
class_words = {label: [] for label in classes}

for entry in training_data:
    label = entry['class']
    # tokenize each training sentence into words
    for token in nltk.word_tokenize(entry['sentence']):
        # skip punctuation / possessive-suffix tokens
        if token in ("?", "'s"):
            continue
        # stem and lowercase before counting
        stem = stemmer.stem(token.lower())
        # tally the stem's global frequency
        corpus_words[stem] = corpus_words.get(stem, 0) + 1
        # record the stem under its class
        class_words[label].append(stem)

print(corpus_words)
def calculate_class_score(sentence, class_name, show_details=True):
    """Score *sentence* against one class.

    Each token of the sentence that (after stemming and lowercasing) appears
    in ``class_words[class_name]`` contributes ``1 / corpus_words[stem]`` to
    the score, so words common across the whole corpus are discounted.

    Args:
        sentence: raw text to score.
        class_name: key into the module-level ``class_words`` dict.
        show_details: when True, print each matching stem and its weight.

    Returns:
        The accumulated float score (0 when nothing matches).
    """
    score = 0
    for word in nltk.word_tokenize(sentence):
        # Fix: stem each token ONCE — the original recomputed
        # stemmer.stem(word.lower()) up to three times per matching token.
        stemmed = stemmer.stem(word.lower())
        # check to see if the stem of the word is in this class
        if stemmed in class_words[class_name]:
            # treat each word with same weight, discounted by corpus frequency
            weight = 1 / corpus_words[stemmed]
            score += weight
            if show_details:
                print(" match: %s (%s)" % (stemmed, weight))
    return score
def classify(sentence):
    """Return ``(best_class, best_score)`` for *sentence*.

    Scores the sentence against every known class and keeps the strictly
    highest score; yields ``(None, 0)`` when no class scores above zero.
    """
    best = (None, 0)
    # score the sentence against each class, keeping the first strict maximum
    for candidate in class_words.keys():
        candidate_score = calculate_class_score(sentence, candidate)
        if candidate_score > best[1]:
            best = (candidate, candidate_score)
    return best
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment