Skip to content

Instantly share code, notes, and snippets.

@narendraprasath
Created May 14, 2020 08:34
Show Gist options
  • Save narendraprasath/26d93fce4dd69dc1646ad3148295755e to your computer and use it in GitHub Desktop.
Save narendraprasath/26d93fce4dd69dc1646ad3148295755e to your computer and use it in GitHub Desktop.
## Extracted question/answer (QA) pairs will be stored as a .csv file
def extract_QA_from_text_file(INPUT_DIR, text_file_name):
    """Extract numbered question/answer pairs from a text file into a CSV.

    A question is any span that starts at the beginning of the text or
    after a newline, consists of optional whitespace, a number and a dot
    (e.g. ``12.``), and runs up to the first following ``?``.  The answer
    is everything between the end of that question and the start of the
    next question (or the end of the text for the last question).  Runs
    of newlines inside questions and answers are collapsed to one space.

    The result is written to ``covid_19faq.csv`` inside ``INPUT_DIR`` with
    the columns ``questions`` and ``answers``.  If the same question text
    occurs more than once, the answer of the last occurrence is kept.

    Args:
        INPUT_DIR: Directory containing the input file; the CSV is
            written to the same directory.
        text_file_name: Name of the text file (read as Latin-1).

    Returns:
        None.  Progress messages are printed to stdout.
    """
    output_file_name = 'covid_19faq.csv'
    with open(os.path.join(INPUT_DIR, text_file_name), 'r', encoding='latin') as obj:
        text = obj.read().strip()

    ## extract the question by following pattern
    ## NOTE: the text was stripped above, so a question on the very first
    ## line has no newline in front of it; `\A` makes sure it still matches.
    question_pattern = re.compile(r'(?:\A|\n)\s*\d+[.](.*?)\?', re.DOTALL)
    matched_QA_positions = [m.span() for m in question_pattern.finditer(text)]
    print(f"Available no of question is {len(matched_QA_positions)}")

    ## each answer ends where the next question starts; the last answer
    ## runs to the end of the text
    answer_end_positions = [start for start, _ in matched_QA_positions[1:]]
    answer_end_positions.append(len(text))

    ## store question and answer pair
    questions = {}
    for (faq_start_pos, faq_end_pos), answer_end_pos in zip(
            matched_QA_positions, answer_end_positions):
        question = text[faq_start_pos:faq_end_pos]
        answer = text[faq_end_pos:answer_end_pos]
        ## replace multiple new lines to space in questions and answers
        question = re.sub(r"\n+", " ", question.strip())
        answer = re.sub(r"\n+", " ", answer.strip())
        questions[question] = answer

    ## create dataframe from key-value pair
    faq_df = pd.DataFrame({
        "questions": list(questions.keys()),
        "answers": list(questions.values()),
    })
    faq_df.to_csv(os.path.join(INPUT_DIR, output_file_name), index=False)
    print(f"COVID QA file {output_file_name} created")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment