Created
December 8, 2018 03:09
-
-
Save parksunwoo/b3981f2c8f24dc995308d6ae2ed06fde to your computer and use it in GitHub Desktop.
QA_baseline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
##!pip3 install summa
# Step 1. Use TextRank (via summa) to extract the most important sentences
# from the source document and save them to a separate file.
from summa.summarizer import summarize

# Context managers guarantee both handles are closed; without closing the
# output handle the written summary is not guaranteed to be flushed to disk.
with open("wiki_en/chosun.txt", 'r') as f:
    data = f.read()

# ratio=0.2 is passed straight through to summa's summarize().
summary = summarize(data, ratio=0.2)

with open("wiki_en/chosun_min.txt", "w") as f:
    f.write(summary)
# Step 2. Generate questions from the summarized sentences
# Automatic question generation by using NLP
# https://github.com/indrajithi/genquest
def genQuestion(line):
    """Print a "What ...?" question derived from one sentence.

    The sentence is part-of-speech tagged, the position of the first
    occurrence of every tag is recorded, and the first matching tag
    combination (checked in a fixed priority order) selects a question
    template. Nothing is printed when no template applies.

    Args:
        line: Either a plain ``str`` (wrapped in ``TextBlob``) or an object
            that already exposes ``tags`` (a sequence of ``(word, tag)``
            pairs) and ``words`` (an indexable sequence of words).

    Returns:
        None. The question, if any, is written to standard output.

    Notes:
        * ``TextBlob`` must be available at module level; its import is not
          part of this function.
        * Diagnostic output is controlled by an optional module-level
          ``verbose`` flag; it is treated as ``False`` when undefined.
        * Tag positions index into ``line.words``; this assumes ``tags`` and
          ``words`` are aligned -- TODO confirm for inputs with contractions.
    """
    if isinstance(line, str):
        line = TextBlob(line)

    # Map each part-of-speech tag to the index of its FIRST occurrence.
    bucket = {}
    for index, (_, tag) in enumerate(line.tags):
        bucket.setdefault(tag, index)

    # Read the flag defensively so a missing global does not raise NameError.
    if globals().get('verbose', False):
        print('\n', '-' * 20)
        print(line, '\n')
        print("TAGS:", line.tags, '\n')
        print(bucket)

    def has(*tags):
        """Return True when every given tag occurs in the sentence."""
        return all(tag in bucket for tag in tags)

    def word(tag):
        """Return the word at the first position carrying ``tag``."""
        return line.words[bucket[tag]]

    # Part-of-speech tags used by the templates below:
    #   NNP  proper noun, singular      NN   noun, singular or mass
    #   PRP  personal pronoun           JJ   adjective
    #   VBG  verb, gerund/participle    VBZ  verb, 3rd person singular present
    question = ''

    # The order of these checks is significant: the first match wins.
    if has('NNP', 'VBG', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NNP') + ' ' + word('VBG') + '?'
    elif has('PRP', 'VBG', 'VBZ'):
        # Same template with or without a preposition in the sentence
        # (previously the no-preposition case repeated the gerund twice).
        question = 'What ' + word('VBZ') + ' ' + word('PRP') + ' ' + word('VBG') + '?'
    elif has('NN', 'VBG', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NN') + ' ' + word('VBG') + '?'
    elif has('NNP', 'VBZ', 'JJ') or has('NNP', 'VBZ', 'NN'):
        question = 'What ' + word('VBZ') + ' ' + word('NNP') + '?'
    elif has('PRP', 'VBZ'):
        # Compare case-insensitively so a sentence-initial "He"/"She" matches.
        pronoun = word('PRP').lower()
        if pronoun in ('she', 'he'):
            question = 'What does ' + pronoun + ' ' + word('VBZ').singularize() + '?'
    elif has('NNP', 'VBZ'):
        question = 'What does ' + word('NNP') + ' ' + word('VBZ').singularize() + '?'
    elif has('NN', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NN') + '?'

    # When tags are generated, 's can be split into ’ and s; stitch it back.
    if 'VBZ' in bucket and word('VBZ') == "’":
        question = question.replace(" ’ ", "'s ")

    # Print the generated question as output.
    if question:
        print('\n', 'Question: ' + question)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment