## narendraprasath/PDF COVID-19 FAQ parsing

## PDF COVID-19 FAQ parsing
## The extracted question/answer pairs are stored as a .csv file
def extract_QA_from_text_file(INPUT_DIR, text_file_name):
    """Extract numbered FAQ question/answer pairs from a text file into a CSV.

    The file ``text_file_name`` is read from ``INPUT_DIR`` (decoded as latin-1).
    A question is any span that begins on a new line with ``<digits>.`` and
    runs up to the next ``?``; its answer is the text between the end of that
    question and the start of the next one (or the end of the file for the
    last question). Runs of newlines inside questions and answers are
    collapsed to a single space. The number prefix stays part of the question.

    The pairs are written to ``covid_19faq.csv`` inside ``INPUT_DIR`` with the
    columns ``questions`` and ``answers``, one row per matched question.

    Args:
        INPUT_DIR: Directory that holds the input file; the CSV is written
            to the same directory.
        text_file_name: Name of the text file inside ``INPUT_DIR``.

    Returns:
        None. The result is the CSV file on disk; progress is printed.
    """
    output_file_name = 'covid_19faq.csv'
    with open(os.path.join(INPUT_DIR, text_file_name), 'r', encoding='latin') as obj:
        text = obj.read()

    # The pattern requires a newline in front of the question number.
    # Stripping alone removes it when the file opens directly with the first
    # question, which would then be skipped, so one newline is put back.
    text = "\n" + text.strip()

    # Raw string: "\s", "\d" and "\?" are not valid Python string escapes.
    # DOTALL lets a question span several lines before reaching its "?".
    question_pattern = re.compile(r'\n+\s*\d+[.](.*?)\?', re.DOTALL)
    matched_QA_positions = [m.span() for m in question_pattern.finditer(text)]
    print(f"Available no of question is {len(matched_QA_positions)}")

    # Each answer ends where the next question starts; the last answer runs
    # to the end of the text.
    answer_ends = [start for start, _ in matched_QA_positions[1:]] + [len(text)]

    # A list of rows (rather than a dict keyed by question) keeps every match,
    # so a repeated question does not overwrite an earlier one.
    rows = []
    for (faq_start_pos, faq_end_pos), answer_end in zip(matched_QA_positions, answer_ends):
        question = re.sub(r"\n+", " ", text[faq_start_pos:faq_end_pos].strip())
        answer = re.sub(r"\n+", " ", text[faq_end_pos:answer_end].strip())
        rows.append((question, answer))

    faq_df = pd.DataFrame(rows, columns=["questions", "answers"])
    faq_df.to_csv(os.path.join(INPUT_DIR, output_file_name), index=False)
    print(f"COVID QA file {output_file_name} created")
	## QA will be stored as .csv file
def extract_QA_from_text_file(INPUT_DIR, text_file_name):
    """Parse a numbered FAQ text file and save its question/answer pairs as CSV.

    Reads ``text_file_name`` from ``INPUT_DIR`` using latin-1 decoding. Every
    span that starts on a new line with ``<digits>.`` and ends at the next
    ``?`` is taken as a question (number prefix included). The text from the
    end of a question to the start of the following question, or to the end
    of the file for the last one, is its answer. Consecutive newlines inside
    either part are replaced by one space.

    Output goes to ``covid_19faq.csv`` in ``INPUT_DIR`` with the columns
    ``questions`` and ``answers``; every matched question produces one row.

    Args:
        INPUT_DIR: Directory containing the input file and receiving the CSV.
        text_file_name: File name of the FAQ text inside ``INPUT_DIR``.

    Returns:
        None. The CSV file is the result; two status lines are printed.
    """
    output_file_name = 'covid_19faq.csv'
    with open(os.path.join(INPUT_DIR, text_file_name), 'r', encoding='latin') as obj:
        text = obj.read()

    # A question is only recognised after a newline. strip() would remove
    # that newline when the very first line is already a question, so a
    # single newline is prepended to keep the first question matchable.
    text = "\n" + text.strip()

    ## extract the question by following pattern
    # Raw string keeps "\s", "\d" and "\?" as regex escapes; DOTALL allows a
    # question to continue over line breaks until its "?".
    pattern = r'\n+\s*\d+[.](.*?)\?'
    question_pattern = re.compile(pattern, re.DOTALL)
    matched_QA_positions = [m.span() for m in question_pattern.finditer(text)]
    print(f"Available no of question is {len(matched_QA_positions)}")

    ## store question and answer pair
    # The answer to question i stops at the start of question i + 1; the last
    # answer stops at the end of the text.
    next_starts = [start for start, _ in matched_QA_positions[1:]]
    next_starts.append(len(text))

    # Rows are collected in a list so that identical question texts are all
    # kept instead of overwriting each other.
    qa_rows = []
    for (faq_start_pos, faq_end_pos), next_faq_start_pos in zip(matched_QA_positions, next_starts):
        question = text[faq_start_pos:faq_end_pos].strip()
        answer = text[faq_end_pos:next_faq_start_pos].strip()
        ## replace multiple new lines to space in questions and answers
        qa_rows.append((re.sub(r"\n+", " ", question), re.sub(r"\n+", " ", answer)))

    ## create dataframe from the collected pairs
    faq_df = pd.DataFrame(qa_rows, columns=["questions", "answers"])
    faq_df.to_csv(os.path.join(INPUT_DIR, output_file_name), index=False)
    print(f"COVID QA file {output_file_name} created")