## purva91/1_tokenize.py

## 1_tokenize.py
from nltk.tokenize import word_tokenize

# Tokenize each document into a list of lowercased word tokens.
# Assumes `sentences` is an iterable of raw document strings defined
# earlier -- TODO confirm against the cell/file that creates it.
# The loop variable and the tokenized value must be the same name; using a
# different, undefined name here would raise NameError on the first pass.
tokenized_sent = [word_tokenize(s.lower()) for s in sentences]
# Bare expression: echoes the result when run as a notebook/REPL cell.
tokenized_sent

## 2_tagged_document.py
# import
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# Wrap every token list in a TaggedDocument whose single tag is the
# document's position in `tokenized_sent`.
tagged_data = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_sent)
]
# Bare expression: echoes the result when run as a notebook/REPL cell.
tagged_data

## 3_train_save_load.py
## Train doc2vec model
model = Doc2Vec(
    tagged_data,
    vector_size=20,
    window=2,
    min_count=1,
    epochs=100,
)

'''
vector_size = Dimensionality of the feature vectors.
window = The maximum distance between the current and predicted word within a sentence.
min_count = Ignores all words with total frequency lower than this.
epochs = Number of passes over the corpus during training.
alpha = The initial learning rate (not passed above, so the library default applies).
'''

## Print model vocabulary
# Gensim 4.0 removed `wv.vocab` (accessing it raises AttributeError) in
# favour of the `wv.key_to_index` mapping; fall back to the old attribute
# so the line still works on Gensim 3.x.
model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab

## 4_find_similar.py
# Tokenize the query text the same way as the training documents
# (lowercase, then word_tokenize).
test_doc = word_tokenize("I had pizza and pasta".lower())
# Infer an embedding for this unseen document from the trained model.
test_doc_vector = model.infer_vector(test_doc)
# Gensim 4.0 renamed `docvecs` to `dv` and kept the old name only as a
# deprecated alias; prefer the new attribute and fall back on Gensim 3.x.
doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
doc_vectors.most_similar(positive=[test_doc_vector])

'''
positive = List of document tags or vectors that contribute positively
(here, the single vector inferred for the test document).
'''
	from nltk.tokenize import word_tokenize

	# Tokenize each document into a list of lowercased word tokens.
	# Assumes `sentences` is an iterable of raw document strings defined
	# earlier -- TODO confirm against the cell/file that creates it.
	# A single comprehension replaces the loop whose body was not indented
	# under the `for` and which tokenized an undefined name.
	tokenized_sent = [word_tokenize(s.lower()) for s in sentences]
	# Bare expression: echoes the result when run as a notebook/REPL cell.
	tokenized_sent
	# import
	from gensim.models.doc2vec import Doc2Vec, TaggedDocument
	# Wrap every token list in a TaggedDocument whose single tag is the
	# document's position in `tokenized_sent`.
	tagged_data = [
		TaggedDocument(words=tokens, tags=[doc_id])
		for doc_id, tokens in enumerate(tokenized_sent)
	]
	# Bare expression: echoes the result when run as a notebook/REPL cell.
	tagged_data
	## Train doc2vec model
	model = Doc2Vec(
		tagged_data,
		vector_size=20,
		window=2,
		min_count=1,
		epochs=100,
	)

	'''
	vector_size = Dimensionality of the feature vectors.
	window = The maximum distance between the current and predicted word within a sentence.
	min_count = Ignores all words with total frequency lower than this.
	epochs = Number of passes over the corpus during training.
	alpha = The initial learning rate (not passed above, so the library default applies).
	'''

	## Print model vocabulary
	# Gensim 4.0 removed `wv.vocab` (accessing it raises AttributeError) in
	# favour of the `wv.key_to_index` mapping; fall back to the old attribute
	# so the line still works on Gensim 3.x.
	model.wv.key_to_index if hasattr(model.wv, "key_to_index") else model.wv.vocab
	# Tokenize the query text the same way as the training documents
	# (lowercase, then word_tokenize).
	test_doc = word_tokenize("I had pizza and pasta".lower())
	# Infer an embedding for this unseen document from the trained model.
	test_doc_vector = model.infer_vector(test_doc)
	# Gensim 4.0 renamed `docvecs` to `dv` and kept the old name only as a
	# deprecated alias; prefer the new attribute and fall back on Gensim 3.x.
	doc_vectors = model.dv if hasattr(model, "dv") else model.docvecs
	doc_vectors.most_similar(positive=[test_doc_vector])

	'''
	positive = List of document tags or vectors that contribute positively
	(here, the single vector inferred for the test document).
	'''