Created
January 24, 2017 20:48
-
-
Save ugik/bf71f2bdce301df30951a224392cd382 to your computer and use it in GitHub Desktop.
text ANN bag of words setup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Build the training data: one bag-of-words input vector and one one-hot
# output vector per document.
#
# Relies on names defined earlier in the script (not visible here):
#   documents - indexed below as (tokenized words, class tag) pairs
#   words     - vocabulary; its order fixes the column order of each bag
#   classes   - class tags; its order fixes the column order of each output row
#   stemmer   - object exposing .stem(str)
training = []
output = []
# template row of zeros, one slot per class; copied for every document
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # tokenized words of the pattern, lower-cased and stemmed
    pattern_words = [stemmer.stem(word.lower()) for word in doc[0]]
    # set copy so each vocabulary lookup is O(1) instead of scanning the list
    # (assumes stems and vocabulary entries are hashable strings)
    pattern_set = set(pattern_words)

    # 1 where the vocabulary word occurs in this pattern, 0 otherwise
    bag = [1 if w in pattern_set else 0 for w in words]
    training.append(bag)

    # output is a '0' for each tag and '1' for current tag;
    # classes.index raises ValueError if the tag is not in `classes`
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)

# sample training/output (skipped when no document was processed, so the
# debug print cannot raise IndexError on an empty corpus)
if training:
    i = 0
    w = documents[i][0]
    print([stemmer.stem(word.lower()) for word in w])
    print(training[i])
    print(output[i])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment