Skip to content

Instantly share code, notes, and snippets.

@bumie-e
Last active February 7, 2021 12:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bumie-e/12b3321b48133a57389bcafa7beb52f2 to your computer and use it in GitHub Desktop.
Save bumie-e/12b3321b48133a57389bcafa7beb52f2 to your computer and use it in GitHub Desktop.
import numpy as np
import nltk
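# One-time setup: uncomment the next line to download the Punkt tokenizer
# data used by nltk.word_tokenize.
# nltk.download('punkt')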
from nltk.stem.lancaster import LancasterStemmer
# Module-level Lancaster stemmer instance, used by stem() below.
stemmer = LancasterStemmer()
def tokenize(sentence):
    """Split ``sentence`` into tokens with ``nltk.word_tokenize``.

    Args:
        sentence: The text to tokenize.

    Returns:
        Whatever ``nltk.word_tokenize`` returns for ``sentence``.
    """
    tokens = nltk.word_tokenize(sentence)
    return tokens
def stem(word):
    """Return the stem of ``word``, lowercased before stemming.

    Args:
        word: The token to reduce to its stem.

    Returns:
        The result of the module-level ``stemmer`` applied to the
        lowercased word.
    """
    lowered = word.lower()
    return stemmer.stem(lowered)
def bag_of_words(tokenized_sentence, words):
    """Build a binary bag-of-words vector for a tokenized sentence.

    Each token is stemmed with ``stem``; position ``i`` of the result is 1
    when ``words[i]`` equals one of those stems, otherwise 0.

    Args:
        tokenized_sentence: Iterable of tokens from one sentence.
        words: Sequence of vocabulary entries. They are compared as-is
            against the stemmed tokens (presumably already stemmed by the
            caller -- verify), and must be hashable, e.g. strings.

    Returns:
        A 1-D ``numpy.float32`` array of length ``len(words)``.
    """
    # Stem once and keep the stems in a set so each membership test below is
    # O(1) instead of a linear scan of the sentence per vocabulary word.
    sentence_stems = {stem(token) for token in tokenized_sentence}

    # Start with 0 for every vocabulary word, then mark the ones present.
    bag = np.zeros(len(words), dtype=np.float32)
    for idx, w in enumerate(words):
        if w in sentence_stems:
            bag[idx] = 1
    return bag
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment