Skip to content

Instantly share code, notes, and snippets.

@yashrsharma44
Created March 17, 2019 09:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yashrsharma44/8434817594dbc0d072c59ab406f99dd7 to your computer and use it in GitHub Desktop.
Save yashrsharma44/8434817594dbc0d072c59ab406f99dd7 to your computer and use it in GitHub Desktop.
Instructions to run the Summarizer

Instructions to run the summarizer for Case files

  • Note that we use the original case files as input, not the already-summarized files, because the results obtained from the summarized files are gibberish.

Steps to run the summarizer

  • We are using Python 2.7
  • pip install gensim
  • python sum100.py <input folder> <output folder>
from gensim.summarization import summarize
import sys
import os
def load_file(filename):
    """Read a text file and return its contents as a list of lines.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line of the file. Line terminators are
        kept, so joining the list with ``''`` reproduces the file contents.
        An empty file yields an empty list.
    """
    # `handle` rather than `file`, so the Python 2 builtin is not shadowed.
    with open(filename, 'r') as handle:
        return handle.readlines()
# Module-level accumulator of error messages; main() appends to it and
# finally dumps it to ERRORLIST.txt.
errorList = []
def find_num_of_words(text):
    """Return the number of whitespace-separated words in ``text``.

    Args:
        text: The string whose words are counted.

    Returns:
        The word count as an int; ``0`` for an empty or all-whitespace
        string.
    """
    # split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines), so words separated by line breaks are counted and
    # repeated spaces do not inflate the result. The previous version
    # counted space characters using an identity test (`is ' '`), which
    # both relied on string interning and under-counted the words.
    return len(text.split())
def main():
    """Summarize every file in the input folder into the output folder.

    Command-line arguments (read from ``sys.argv``):
        1. input folder  -- every entry returned by ``os.listdir`` is loaded
           with ``load_file`` and summarized.
        2. output folder -- receives one ``<input name>.txt`` summary per
           input entry plus ``ERRORLIST.txt``; it is created when missing.

    Each text is summarized with ``summarize(..., word_count=200)``. A
    summary whose word count (per ``find_num_of_words``) is <= 50 or >= 200
    is still written, but a message naming the file is appended to the
    module-level ``errorList``. The collected messages are printed and
    written to ``ERRORLIST.txt``, one per line.

    Exits with a usage message when fewer than two arguments are given.
    """
    if len(sys.argv) < 3:
        sys.exit('usage: python sum100.py <input folder> <output folder>')

    # Named *_dir so the builtin `input` is not shadowed.
    input_dir = sys.argv[1]
    output_dir = sys.argv[2]

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Target length handed to the summarizer.
    word_count = 200

    # Files are processed one at a time so that only a single document is
    # held in memory, instead of loading the whole folder up front.
    for index, filename in enumerate(os.listdir(input_dir)):
        text = ''.join(load_file(os.path.join(input_dir, filename)))

        try:
            summarized_text = summarize(text, word_count=word_count)
        except ValueError as exc:
            # NOTE(review): gensim 3.x documents summarize() as raising
            # ValueError for single-sentence input -- confirm against the
            # installed version. Record the file and keep going rather than
            # aborting the whole batch.
            errorList.append(
                "{0} could not be summarized: {1}".format(filename, exc))
        else:
            len_of_words_summarized = find_num_of_words(summarized_text)
            if len_of_words_summarized <= 50 or len_of_words_summarized >= 200:
                errorList.append("{0} has the length {1}".format(
                    filename, len_of_words_summarized))

            summary_path = os.path.join(
                output_dir, '{0}.txt'.format(filename))
            # 'w' rather than 'a': re-running the script must replace an
            # earlier summary, not append a duplicate after it.
            with open(summary_path, 'w') as summary_file:
                summary_file.write(summarized_text)

        # Single-line progress indicator: index of the file just handled.
        sys.stdout.write('{0}\r'.format(index))
        sys.stdout.flush()

    print('--------- SUMMARIZATION COMPLETED!!------------\n')
    print('--------- FOLLOWING ARE THE LIST OF ERROR FILEs-----\n')
    print(errorList)

    with open(os.path.join(output_dir, 'ERRORLIST.txt'), 'w') as error_file:
        for error in errorList:
            # One message per line; previously they were written back to
            # back with no separator.
            error_file.write(error + '\n')

    print('----- END ------\n')
# Run the summarizer only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment