# Papaass/bag_of_words.py

## bag_of_words.py
import numpy as np
from nltk import word_tokenize

# The two example sentences; together they make up the corpus.
sentence_1 = "Life is short but life can seem long"
sentence_2 = "The night is near"

corpus = [sentence_1, sentence_2]

# Function returning a vocabulary
def vocabulary(corpus):
    """Return the distinct lower-cased tokens of *corpus* in first-seen order.

    Args:
        corpus: Iterable of sentence strings.

    Returns:
        A list of unique tokens. Each sentence is lower-cased and split with
        ``word_tokenize``; a token keeps the position of its first occurrence
        across the sentences.
    """
    seen = {}
    for sentence in corpus:
        # Dict keys keep insertion order and re-inserting an existing key does
        # not move it, so this de-duplicates without rescanning a list.
        seen.update(dict.fromkeys(word_tokenize(sentence.lower())))
    return list(seen)


# Function returning a bag of words
def bagofwords(sentence, corpus):
    """Return the bag-of-words count vector of *sentence* over *corpus*.

    Args:
        sentence: Sentence string to encode.
        corpus: Iterable of sentence strings; the vocabulary is built from it
            with ``vocabulary``.

    Returns:
        A 1-D NumPy array created with ``np.zeros`` (one entry per vocabulary
        word) holding how many times each word occurs in the lower-cased,
        tokenized sentence. Tokens that are not in the vocabulary are ignored.
    """
    vocab = vocabulary(corpus)
    # vocabulary() yields each word once, so a word maps to exactly one slot;
    # building the map once avoids scanning the vocabulary for every token.
    index_of = {word: i for i, word in enumerate(vocab)}
    counts = np.zeros(len(vocab))
    for token in word_tokenize(sentence.lower()):
        slot = index_of.get(token)
        if slot is not None:
            counts[slot] += 1
    return counts

## sent1.py
# Print the bag-of-words vector of the first sentence over the corpus.
print(bagofwords(sentence=sentence_1, corpus=corpus))

## sent2.py
# Print the bag-of-words vector of the second sentence over the corpus.
print(bagofwords(sentence=sentence_2, corpus=corpus))
	import numpy as np
	from nltk import word_tokenize

	# The two example sentences; together they make up the corpus.
	sentence_1 = "Life is short but life can seem long"
	sentence_2 = "The night is near"

	corpus = [sentence_1, sentence_2]

	# Function returning a vocabulary
	def vocabulary(corpus):
	    """Return the distinct lower-cased tokens of *corpus* in first-seen order.

	    Args:
	        corpus: Iterable of sentence strings.

	    Returns:
	        A list of unique tokens. Each sentence is lower-cased and split
	        with ``word_tokenize``; a token keeps the position of its first
	        occurrence across the sentences.
	    """
	    seen = {}
	    for sentence in corpus:
	        # Dict keys keep insertion order and re-inserting an existing key
	        # does not move it, so this de-duplicates without rescanning a list.
	        seen.update(dict.fromkeys(word_tokenize(sentence.lower())))
	    return list(seen)


	# Function returning a bag of words
	def bagofwords(sentence, corpus):
	    """Return the bag-of-words count vector of *sentence* over *corpus*.

	    Args:
	        sentence: Sentence string to encode.
	        corpus: Iterable of sentence strings; the vocabulary is built from
	            it with ``vocabulary``.

	    Returns:
	        A 1-D NumPy array created with ``np.zeros`` (one entry per
	        vocabulary word) holding how many times each word occurs in the
	        lower-cased, tokenized sentence. Tokens that are not in the
	        vocabulary are ignored.
	    """
	    vocab = vocabulary(corpus)
	    # vocabulary() yields each word once, so a word maps to exactly one
	    # slot; building the map once avoids scanning the vocabulary per token.
	    index_of = {word: i for i, word in enumerate(vocab)}
	    counts = np.zeros(len(vocab))
	    for token in word_tokenize(sentence.lower()):
	        slot = index_of.get(token)
	        if slot is not None:
	            counts[slot] += 1
	    return counts