Skip to content

Instantly share code, notes, and snippets.

@Papaass
Last active January 11, 2019 17:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Papaass/dcd36365478d8e676294bf5d12e89837 to your computer and use it in GitHub Desktop.
Save Papaass/dcd36365478d8e676294bf5d12e89837 to your computer and use it in GitHub Desktop.
import numpy as np
from nltk import word_tokenize
# The two sentences that make up the corpus.
sentence_1 = "Life is short but life can seem long"
sentence_2 = "The night is near"
corpus = [sentence_1, sentence_2]
def vocabulary(corpus):
    """Return the distinct lowercase tokens of *corpus* in first-seen order.

    Args:
        corpus: Iterable of sentence strings. Each sentence is lowercased
            and then split into tokens with ``word_tokenize``.

    Returns:
        A list containing every token exactly once, ordered by the position
        of its first occurrence across the sentences.
    """
    # A dict keeps insertion order and ignores repeated keys, so it
    # de-duplicates in one pass instead of rescanning a list for every token.
    ordered_tokens = {}
    for sentence in corpus:
        ordered_tokens.update(dict.fromkeys(word_tokenize(sentence.lower())))
    return list(ordered_tokens)
def bagofwords(sentence, corpus):
    """Return the bag-of-words count vector of *sentence* over *corpus*.

    Args:
        sentence: Sentence string to encode. It is lowercased and then split
            into tokens with ``word_tokenize``.
        corpus: Iterable of sentence strings from which the vocabulary is
            built via ``vocabulary(corpus)``.

    Returns:
        A 1-D NumPy array of floats with one entry per vocabulary token, in
        vocabulary order. Each entry is the number of times that token occurs
        in *sentence*. Tokens absent from the vocabulary are ignored.
    """
    vocab = vocabulary(corpus)
    # Map each token to its column once, so every sentence token is a single
    # dict lookup rather than a scan over the whole vocabulary.
    position = {token: index for index, token in enumerate(vocab)}
    bag_of_words = np.zeros(len(vocab))
    for token in word_tokenize(sentence.lower()):
        index = position.get(token)
        if index is not None:
            bag_of_words[index] += 1
    return bag_of_words
# Print the bag-of-words vector of each corpus sentence, one per line.
print(
    bagofwords(sentence_1, corpus),
    bagofwords(sentence_2, corpus),
    sep="\n",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment