Last active
June 2, 2020 14:58
-
-
Save ugik/0f911ad00de7cdd770d7ae31a275f27e to your computer and use it in GitHub Desktop.
an algorithm for text classification
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# use natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# word stemmer: one shared Lancaster stemmer instance, used further down to
# reduce every token to its stem before it is counted
stemmer = LancasterStemmer()
# 3 classes of training data.
# Each entry pairs a class label ("class") with one example sentence
# ("sentence"). Order and duplicates are significant for the word counts
# computed later, so the list is reproduced exactly.
training_data = [
    {"class": "greeting", "sentence": "how are you?"},
    {"class": "greeting", "sentence": "how is your day?"},
    {"class": "greeting", "sentence": "good day"},
    {"class": "greeting", "sentence": "how is it going today?"},
    {"class": "goodbye", "sentence": "have a nice day"},
    {"class": "goodbye", "sentence": "see you later"},
    # NOTE: "have a nice day" appears twice in the data; it is kept as-is so
    # the resulting word frequencies do not change.
    {"class": "goodbye", "sentence": "have a nice day"},
    {"class": "goodbye", "sentence": "talk to you soon"},
    {"class": "sandwich", "sentence": "make me a sandwich"},
    {"class": "sandwich", "sentence": "can you make a sandwich?"},
    {"class": "sandwich", "sentence": "having a sandwich today?"},
    {"class": "sandwich", "sentence": "what's for lunch?"},
]
print("%s sentences in training data" % len(training_data))
# capture unique stemmed words in the training corpus
# corpus_words: stem -> number of times it occurs across all sentences
# class_words:  class label -> list of stems seen in that class (with repeats)
corpus_words = {}
class_words = {}  # must exist before the per-class lists are assigned below
classes = list({a['class'] for a in training_data})
for c in classes:
    class_words[c] = []

# NOTE: nltk.word_tokenize relies on NLTK's tokenizer data being installed
# (see the NLTK documentation) -- confirm it is available in your environment.
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore a few things (checked on the raw token, before lowercasing)
        if word in ["?", "'s"]:
            continue
        # stem and lowercase each word
        stemmed_word = stemmer.stem(word.lower())
        # tally the stem across the whole corpus
        corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1
        # and record it under the sentence's class
        class_words[data['class']].append(stemmed_word)

# we now have each word and the number of occurrences of the word in our training corpus (the word's commonality)
print("Corpus words and counts: %s" % corpus_words)
# also we have all words in each class
print("Class words: %s" % class_words)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can we provide the sentences and classes from a text file or an Excel sheet?