Skip to content

Instantly share code, notes, and snippets.

View ugik's full-sized avatar

GK ugik

View GitHub Profile
ugik /
Last active June 2, 2020 14:58
an algorithm for text classification
# use natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer

# word stemmer: reduces each token to its stem before it is compared
stemmer = LancasterStemmer()

# labelled training data: every entry pairs a class name with an example sentence
training_data = []
training_data.append({"class": "greeting", "sentence": "how are you?"})
# use natural language toolkit
import nltk
from nltk.stem.lancaster import LancasterStemmer

# word stemmer: reduces each token to its stem before it is compared
stemmer = LancasterStemmer()

# labelled training data: every entry pairs a class name with an example sentence.
# Only the "greeting" and "goodbye" classes appear in this excerpt.
training_data = [
    {"class": "greeting", "sentence": "how are you?"},
    {"class": "greeting", "sentence": "how is your day?"},
    {"class": "greeting", "sentence": "good day"},
    {"class": "greeting", "sentence": "how is it going today?"},
    {"class": "goodbye", "sentence": "have a nice day"},
    {"class": "goodbye", "sentence": "see you later"},
    # repeated on purpose: the entry is kept exactly as in the original data
    {"class": "goodbye", "sentence": "have a nice day"},
]
# capture unique stemmed words in the training corpus
# corpus_words: keyed by stem; the scoring code divides by its values
corpus_words = {}
# class_words: class name -> list of stems seen for that class
class_words = {}
# turn a list into a set (of unique items) and then a list again (this removes duplicates)
classes = list(set([a['class'] for a in training_data]))
for c in classes:
    # prepare a list of words within each class
    class_words[c] = []
# loop through each sentence in our training data
# NOTE(review): the loop that fills corpus_words and class_words from
# training_data is not present in this excerpt -- both stay empty here.
# calculate a score for a given class
def calculate_class_score(sentence, class_name, show_details=True):
    """Count the words of *sentence* whose stem is listed for *class_name*.

    Args:
        sentence: text to score; it is split with ``nltk.word_tokenize``.
        class_name: key into the module-level ``class_words`` mapping.
        show_details: accepted for interface compatibility; not used here.

    Returns:
        The number of tokens whose lowercased stem is found in
        ``class_words[class_name]`` (every match weighs 1).

    Raises:
        KeyError: if *class_name* is not a key of ``class_words``.
    """
    # look the class up once instead of once per token
    known_stems = class_words[class_name]
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # check to see if the stem of the word is in the requested class
        if stemmer.stem(word.lower()) in known_stems:
            # treat each word with same weight
            score += 1
    # callers print / compare the result, so it has to be handed back
    return score
# sample sentence to score against the training classes
sentence = "good day for us to have lunch?"
# report the score the sentence obtains for every known class
for c in class_words:
    class_score = calculate_class_score(sentence, c)
    print("Class: %s Score: %s \n" % (c, class_score))
# calculate a score for a given class taking into account word commonality
def calculate_class_score(sentence, class_name, show_details=True):
    """Score *sentence* against *class_name*, weighting matches by commonality.

    Each token whose lowercased stem is listed in ``class_words[class_name]``
    contributes ``1 / corpus_words[stem]``, so a stem with a larger
    ``corpus_words`` value adds less to the score.

    Args:
        sentence: text to score; it is split with ``nltk.word_tokenize``.
        class_name: key into the module-level ``class_words`` mapping.
        show_details: accepted for interface compatibility; not used here.

    Returns:
        The sum of the per-word weights (``0`` when nothing matches).

    Raises:
        KeyError: if *class_name* is not a key of ``class_words``, or a
            matched stem is missing from ``corpus_words``.
    """
    # look the class up once instead of once per token
    known_stems = class_words[class_name]
    score = 0
    # tokenize each word in our new sentence
    for word in nltk.word_tokenize(sentence):
        # stem once and reuse it for both the membership test and the weight
        stem = stemmer.stem(word.lower())
        # check to see if the stem of the word is in the requested class
        if stem in known_stems:
            # treat each word with relative weight; 1.0 keeps the division
            # fractional even where `/` on two ints would truncate
            score += 1.0 / corpus_words[stem]
    # callers compare the result, so it has to be handed back
    return score


# classify() calls this scorer as calculate_class_score_commonality, so the
# function is exposed under that name too.
calculate_class_score_commonality = calculate_class_score
# return the class with highest score for sentence
def classify(sentence):
    """Find the class whose commonality-weighted score for *sentence* is highest.

    Args:
        sentence: text to classify.

    Returns:
        A ``(high_class, high_score)`` tuple. When no class scores above
        zero the result is ``(None, 0)``.
    """
    high_class = None
    high_score = 0
    # loop through our classes
    for c in class_words.keys():
        # calculate score of sentence for each class
        score = calculate_class_score_commonality(sentence, c, show_details=False)
        # keep track of highest score; strict `>` keeps the first class on ties
        if score > high_score:
            high_class = c
            high_score = score
    return high_class, high_score
class NeuralNetwork():
    """A single neuron with 3 input connections and 1 output connection."""

    def __init__(self):
        """Create the 3 x 1 weight matrix with reproducible random values."""
        # `random` is called with a shape tuple, i.e. it is numpy.random;
        # importing it here keeps the class usable without a file-level import.
        from numpy import random

        # Seed the random number generator, so it generates the same numbers
        # every time the program runs. Any fixed seed would do; 1 is arbitrary.
        random.seed(1)

        # We model a single neuron, with 3 input connections and 1 output connection.
        # We assign random weights to a 3 x 1 matrix, with values in the range -1 to 1
        # and mean 0.
        self.synaptic_weights = 2 * random.random((3, 1)) - 1
ugik / simple_ANN_part0
Last active January 21, 2017 00:22
simple ANN
import numpy

# The training set. We have 4 examples, each consisting of 3 input values
# and 1 output value.
training_set_inputs = numpy.array([[0, 0, 1], [1, 1, 1], [1, 0, 1], [0, 1, 0]])
# .T turns the single row of outputs into a 4 x 1 column: one row per example
training_set_outputs = numpy.array([[0, 1, 1, 0]]).T