Skip to content

Instantly share code, notes, and snippets.

@oysters76
Created March 27, 2021 16:07
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save oysters76/69070c8b7eea5b7c53e34e4bf856dc3b to your computer and use it in GitHub Desktop.
Save oysters76/69070c8b7eea5b7c53e34e4bf856dc3b to your computer and use it in GitHub Desktop.
Trivial Bag of Words implementation without the Hashing Trick
#Trivial Word Bag implementation using Python
import string
# Sample corpus: the two sentences used to build the word bag and to demo
# feature extraction below.
documents = [
    "John likes to watch movies. Mary likes movies too.",
    "Mary also likes to watch football games.",
]
doc1, doc2 = documents
def transform(s):
    """Normalize text: drop ASCII punctuation, then lowercase.

    Args:
        s: Input string.

    Returns:
        A copy of ``s`` with every character listed in
        ``string.punctuation`` removed and the remaining text lowercased.
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    strip_punctuation = str.maketrans("", "", string.punctuation)
    without_punctuation = s.translate(strip_punctuation)
    return without_punctuation.lower()
def createWordBag(documents):
    """Build a vocabulary that maps each distinct word to a column index.

    Every document is normalized with ``transform``, the normalized texts
    are joined with single spaces, and the result is split on single spaces
    to obtain the tokens. Because the split is on single spaces, a run of
    consecutive spaces (or an empty corpus) yields an empty-string token.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        dict mapping each distinct token to a unique index in
        ``range(number_of_tokens)``. Indices follow the sorted order of the
        tokens, so the same documents always produce the same mapping.
    """
    normalized = " ".join(transform(document) for document in documents)
    # Iterating a set of strings directly gives an order that can change
    # between interpreter runs (string hashing is randomized), which would
    # shuffle the feature columns from run to run. Sorting pins the layout.
    vocabulary = sorted(set(normalized.split(" ")))
    return {word: index for index, word in enumerate(vocabulary)}
def getFeature(document, wordBag):
    """Return the bag-of-words count vector for ``document``.

    Args:
        document: Raw document string; it is normalized with ``transform``
            and split on single spaces before counting.
        wordBag: dict mapping word -> column index, with indices in
            ``range(len(wordBag))``.

    Returns:
        list of ints of length ``len(wordBag)``; entry ``i`` is the number
        of times the word mapped to index ``i`` occurs in ``document``.
        Words that are not keys of ``wordBag`` are ignored.
    """
    feature = [0] * len(wordBag)
    for word in transform(document).split(" "):
        index = wordBag.get(word)
        # Out-of-vocabulary words have no column; skip them instead of
        # raising KeyError so unseen documents can still be vectorized.
        if index is not None:
            feature[index] += 1
    return feature
# Demo: build the vocabulary from the sample corpus, then vectorize each
# document against it. Entry i of a vector is the count of the word that
# wordBag maps to index i, so the column layout follows wordBag's indices.
wordBag = createWordBag(documents)
feature1, feature2 = (getFeature(doc, wordBag) for doc in (doc1, doc2))
# The vectors in the comments are samples from the original listing; the
# exact positions depend on the index assignment in wordBag.
print(feature1)  # e.g. [0, 1, 1, 0, 2, 2, 1, 1, 0, 1]
print(feature2)  # e.g. [1, 1, 1, 1, 1, 0, 1, 0, 1, 0]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment