# frenzy2106/tokenization.py

## tokenization.py
#check how many individual words present in the corpus
word_dict = {}
for doc in corpus:
    words = nltk.word_tokenize(doc)
    for word in words:
        if word not in word_dict:
            word_dict[word] = 1
        else:
            word_dict[word] += 1

len(word_dict)

#tokenising the texts
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus_tokens = tokenizer.texts_to_sequences(corpus)
	#check how many individual words present in the corpus
	word_dict = {}
	for doc in corpus:
	words = nltk.word_tokenize(doc)
	for word in words:
	if word not in word_dict:
	word_dict[word] = 1
	else:
	word_dict[word] += 1

	len(word_dict)

	#tokenising the texts
	tokenizer = keras.preprocessing.text.Tokenizer()
	tokenizer.fit_on_texts(corpus)
	corpus_tokens = tokenizer.texts_to_sequences(corpus)