# Skip to content

# Instantly share code, notes, and snippets.

# @Papaass
# Created January 11, 2019 17:32
# Show Gist options
#   • Star 0 You must be signed in to star a gist
#   • Fork 0 You must be signed in to fork a gist
#   • Save Papaass/25cfd5c3bbfca0c5dd68137e60c1dfb2 to your computer and use it in GitHub Desktop.
# Save Papaass/25cfd5c3bbfca0c5dd68137e60c1dfb2 to your computer and use it in GitHub Desktop.
import numpy as np
from nltk import word_tokenize
# Define the two example sentences, then build the corpus from them so each
# sentence literal is written only once.
phrase_1 = "La vie est courte mais la vie peut paraître longue"
phrase_2 = "La nuit est proche"
corpus = [phrase_1, phrase_2]
def vocabulary(corpus):
    """Return the vocabulary of ``corpus``.

    Every sentence is lower-cased and tokenized with ``word_tokenize``; the
    resulting tokens are de-duplicated while keeping the order in which they
    first appear.

    Args:
        corpus: Iterable of sentences (strings).

    Returns:
        A list of unique lower-cased tokens in first-seen order.
    """
    tokens = (
        token
        for sentence in corpus
        for token in word_tokenize(sentence.lower())
    )
    # dict keys keep insertion order, which gives an order-preserving
    # de-duplication in linear time instead of a quadratic list-membership scan.
    return list(dict.fromkeys(tokens))
def bagofwords(sentence, corpus):
    """Return the bag-of-words vector of ``sentence`` over the corpus vocabulary.

    Args:
        sentence: The sentence (string) to encode; it is lower-cased and
            tokenized with ``word_tokenize``.
        corpus: Iterable of sentences from which the vocabulary is built via
            ``vocabulary``.

    Returns:
        A 1-D NumPy array with one entry per vocabulary word, holding the
        number of times that word occurs in ``sentence``. Tokens that are not
        in the vocabulary are ignored.
    """
    vocab = vocabulary(corpus)
    # The vocabulary has no duplicates, so each word maps to exactly one slot;
    # a dict lookup replaces scanning the whole vocabulary for every token.
    position_of = {word: position for position, word in enumerate(vocab)}
    bag_of_words = np.zeros(len(vocab))
    for token in word_tokenize(sentence.lower()):
        position = position_of.get(token)
        if position is not None:
            bag_of_words[position] += 1
    return bag_of_words
# Print the bag-of-words vector of each example sentence, one per line.
for phrase in (phrase_1, phrase_2):
    print(bagofwords(phrase, corpus))
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment