Skip to content

Instantly share code, notes, and snippets.

@yashrsharma44
Last active Mar 20, 2019
Embed
What would you like to do?
Code to Run the doc2vec conversion

Code to run the doc2vec conversion

Taken from Gensim Tutorial

Steps to run the code -

  • Download the attachment
  • Make sure python is installed
  • Type pip install gensim nltk
  • If any library is missing, install them as pip install <library_name>
  • Run python code_runner.py <input folder>
  • Here the input folder is the folder that contains the case files
  • You can find the input folder's path by navigating into it in a terminal and typing pwd, which prints the folder's absolute path
  • The model will train itself; after it finishes, run python code_checker.py
  • You will be asked to enter the sample text for which the relevant documents will be returned.
# This code checks the summarizer for a given input: it loads the
# Doc2Vec model trained by code_runner.py and prints the training
# documents most similar to a user-supplied sentence.
#
# Usage: python code_checker.py <input folder>
import os
import sys

from gensim.models.doc2vec import Doc2Vec
from nltk.tokenize import word_tokenize

# Load the model saved by code_runner.py.
model = Doc2Vec.load("d2v.model")

text = input("Please enter the sentence that you want a relevant document for!")

# Infer a vector for a document that is not in the training data.
test_data = word_tokenize(text.lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# Name of input folder (first CLI argument).
folder_name = sys.argv[1]
# Stores the list of files in the given folder.
# NOTE(review): os.listdir order is OS-dependent; the tag -> filename
# mapping below is only valid if this matches the order seen at
# training time in code_runner.py — confirm on the target platform.
file_list = list(os.listdir(folder_name))

# Find the 5 most similar training documents for the given sentence.
similar_doc = model.docvecs.most_similar(
    positive=[model.infer_vector(test_data)], topn=5)
print(similar_doc)

print('Corresponding filenames for a given tag is - ')
for tag, _score in similar_doc:
    # Tags were assigned as stringified indices into the file list.
    print('Tag{0} - File{1}'.format(tag, file_list[int(tag)]))

# Vector of the training document tagged '1' (i.e. at index 1).
print(model.docvecs['1'])
# Trains a Doc2Vec model on up to 101 case files from the input folder
# and saves it as "d2v.model" for code_checker.py to load.
#
# Usage: python code_runner.py <input folder>
import os
import sys

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Name of input folder (first CLI argument).
folder_name = sys.argv[1]
# Stores the list of files in the given folder.
# NOTE(review): os.listdir order is OS-dependent; code_checker.py relies
# on the same order to map tags back to filenames — confirm this holds.
file_list = list(os.listdir(folder_name))

# Stores the text of each file; the first 6 header lines are skipped.
total_text = list()
for i, name in enumerate(file_list):
    if i > 100:  # only train on the first 101 files
        break
    # os.path.join inserts the path separator that plain string
    # concatenation (PATH + file) silently omitted when the folder
    # path had no trailing slash.
    with open(os.path.join(folder_name, name), 'r') as fh:
        lines = fh.readlines()
    total_text.append(''.join(lines[6:]))

# Runs the summarizer: one TaggedDocument per file, tagged with the
# file's index (as a string) so the checker can map tags to filenames.
data = total_text
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
               for i, _d in enumerate(data)]

# Train the model.
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # Decrease the learning rate between passes...
    model.alpha -= 0.0002
    # ...and fix it (no decay) within model.train itself.
    model.min_alpha = model.alpha

# Save under the filename code_checker.py loads. The original saved
# "d2222v.model", which broke the checker's Doc2Vec.load("d2v.model").
model.save("d2v.model")
print("Model Saved")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment