Skip to content

Instantly share code, notes, and snippets.

@ugik
Last active June 2, 2020 14:58
Show Gist options
  • Save ugik/0f911ad00de7cdd770d7ae31a275f27e to your computer and use it in GitHub Desktop.
an algorithm for text classification
# use natural language toolkit (third-party: nltk)
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
# word stemmer -- Lancaster is an aggressive suffix-stripping stemmer,
# used below to collapse word variants (e.g. "having"/"have") to one stem.
# NOTE(review): `stopwords` is imported but never used in this snippet.
stemmer = LancasterStemmer()
# 3 classes of training data: greeting, goodbye, sandwich.
# Each sample is a dict with a "class" label and a raw "sentence".
_samples = [
    ("greeting", "how are you?"),
    ("greeting", "how is your day?"),
    ("greeting", "good day"),
    ("greeting", "how is it going today?"),
    ("goodbye", "have a nice day"),
    ("goodbye", "see you later"),
    ("goodbye", "have a nice day"),
    ("goodbye", "talk to you soon"),
    ("sandwich", "make me a sandwich"),
    ("sandwich", "can you make a sandwich?"),
    ("sandwich", "having a sandwich today?"),
    ("sandwich", "what's for lunch?"),
]
training_data = [{"class": label, "sentence": text} for label, text in _samples]
print ("%s sentences in training data" % len(training_data))
# capture unique stemmed words in the training corpus
# corpus_words maps each stemmed word -> occurrence count across all sentences;
# class_words maps each class label -> list of stemmed words seen in that class.
corpus_words = {}
class_words = {}  # BUG FIX: was never initialized before being indexed below
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    class_words[c] = []
for data in training_data:
    # tokenize each sentence into words
    for word in nltk.word_tokenize(data['sentence']):
        # ignore punctuation / possessive tokens that carry no meaning here
        if word not in ["?", "'s"]:
            # stem and lowercase each word so variants collapse to one entry
            stemmed_word = stemmer.stem(word.lower())
            corpus_words[stemmed_word] = corpus_words.get(stemmed_word, 0) + 1
            class_words[data['class']].append(stemmed_word)
# we now have each word and the number of occurrences of the word in our
# training corpus (the word's commonality)
print ("Corpus words and counts: %s" % corpus_words)
# also we have all words in each class
print ("Class words: %s" % class_words)
@ugik
Copy link
Author

ugik commented Jan 11, 2017

an algorithm for text classification

@surya-vamsi2310
Copy link

Can we provide the sentences and classes from a text file or an Excel sheet?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment