Last active
January 11, 2019 17:42
-
-
Save Papaass/dcd36365478d8e676294bf5d12e89837 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
from nltk import word_tokenize | |
# Define the two sentences once, then build the corpus from them so the
# corpus and the individual sentences cannot drift apart.
sentence_1 = "Life is short but life can seem long"
sentence_2 = "The night is near"
corpus = [sentence_1, sentence_2]
def vocabulary(corpus):
    """Return the distinct lowercased tokens of *corpus* in first-seen order.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list of unique tokens. Each sentence is lowercased and split with
        ``word_tokenize``; a token appears in the result at the position of
        its first occurrence across the corpus.
    """
    seen = set()
    voc_clean = []
    for sentence in corpus:
        for token in word_tokenize(sentence.lower()):
            # Set membership keeps deduplication linear; the list preserves
            # the order in which tokens were first encountered.
            if token not in seen:
                seen.add(token)
                voc_clean.append(token)
    return voc_clean
def bagofwords(sentence, corpus):
    """Return the bag-of-words count vector of *sentence* over *corpus*.

    Args:
        sentence: Sentence string to encode.
        corpus: Iterable of sentence strings; the list returned by
            ``vocabulary(corpus)`` fixes the length and ordering of the
            resulting vector.

    Returns:
        A NumPy float array (as created by ``np.zeros``) with one entry per
        vocabulary word, holding how many times that word occurs in the
        lowercased, tokenized sentence. Tokens absent from the vocabulary
        are ignored.
    """
    vocab = vocabulary(corpus)
    # Build the word -> position lookup once instead of scanning the whole
    # vocabulary for every token of the sentence.
    index_of = {word: position for position, word in enumerate(vocab)}
    bag_of_words = np.zeros(len(vocab))
    for token in word_tokenize(sentence.lower()):
        position = index_of.get(token)
        if position is not None:
            bag_of_words[position] += 1
    return bag_of_words
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display the bag-of-words vector of the first sentence.
print(bagofwords(sentence_1, corpus))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Display the bag-of-words vector of the second sentence.
print(bagofwords(sentence_2, corpus))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment