Created
May 14, 2020 08:34
-
-
Save narendraprasath/26d93fce4dd69dc1646ad3148295755e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## QA will be stored as .csv file
def extract_QA_from_text_file(INPUT_DIR, text_file_name):
    """Extract numbered question/answer pairs from a text file into a CSV.

    A question is any span that begins with a number followed by a dot
    (``1.``, ``23.``) at the start of the text or after a newline, and runs
    up to and including the first ``?``.  The answer is everything between
    the end of that question and the start of the next one (or the end of
    the text for the last question).  Any text before the first question is
    ignored.  Runs of newlines inside questions and answers are collapsed
    to a single space.

    The result is written to ``covid_19faq.csv`` inside ``INPUT_DIR`` with
    the columns ``questions`` and ``answers``.  If the same question text
    occurs more than once, only the last answer is kept.

    Args:
        INPUT_DIR: Directory that contains the input file and receives the
            output CSV.
        text_file_name: Name of the text file (inside ``INPUT_DIR``) to
            parse; it is decoded as Latin-1.

    Returns:
        None.  The CSV file is the only output (besides progress prints).

    Raises:
        OSError: If the input file cannot be read or the CSV cannot be
            written.
    """
    output_file_name = 'covid_19faq.csv'

    with open(os.path.join(INPUT_DIR, text_file_name), 'r', encoding='latin') as obj:
        text = obj.read().strip()

    ## extract the question by following pattern
    # `\A` lets the very first question match: the text has been stripped,
    # so a file that starts with "1. ...?" has no leading newline to anchor on.
    # DOTALL allows a question to span several lines.
    question_pattern = re.compile(r'(?:\A|\n+)\s*\d+[.](.*?)\?', re.DOTALL)
    matched_QA_positions = [m.span(0) for m in question_pattern.finditer(text)]
    print(f"Available no of question is {len(matched_QA_positions)}")

    # Each answer ends where the next question starts; the last answer runs
    # to the end of the text.
    next_starts = [start for start, _ in matched_QA_positions[1:]] + [len(text)]

    ## store question and answer pair
    questions = {}
    for (faq_start_pos, faq_end_pos), next_faq_start_pos in zip(matched_QA_positions, next_starts):
        question = text[faq_start_pos:faq_end_pos]
        answer = text[faq_end_pos:next_faq_start_pos]
        ## replace multiple new lines to space in questions and answers
        question = re.sub(r"\n+", " ", question.strip())
        answer = re.sub(r"\n+", " ", answer.strip())
        questions[question] = answer

    ## create dataframe from key-value pair
    faq_df = pd.DataFrame(
        {"questions": list(questions.keys()), "answers": list(questions.values())}
    )
    faq_df.to_csv(os.path.join(INPUT_DIR, output_file_name), index=False)
    print(f"COVID QA file {output_file_name} created")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment