Skip to content

Instantly share code, notes, and snippets.

@parksunwoo
Created December 8, 2018 03:09
Show Gist options
  • Save parksunwoo/b3981f2c8f24dc995308d6ae2ed06fde to your computer and use it in GitHub Desktop.
Save parksunwoo/b3981f2c8f24dc995308d6ae2ed06fde to your computer and use it in GitHub Desktop.
QA_baseline
##!pip3 install summa
# Step 1. Extract the key sentences from the document using TextRank
from summa.summarizer import summarize
# Read the source article, ask summa for a summary at ratio=0.2, and write the
# result to a companion file. The ``with`` blocks guarantee that both handles
# are closed (and the summary flushed to disk) even if a step raises; the
# explicit encoding keeps the result independent of the platform's default.
with open("wiki_en/chosun.txt", 'r', encoding="utf-8") as f:
    data = f.read()
summary = summarize(data, ratio=0.2)
with open("wiki_en/chosun_min.txt", "w", encoding="utf-8") as f:
    f.write(summary)
# Step 2. Generate questions from the summarized sentences
# Automatic question generation by using NLP
# https://github.com/indrajithi/genquest
def genQuestion(line, verbose=None):
    """Print a "What ...?" question built from a sentence's part-of-speech tags.

    Args:
        line: Either a plain string, which is wrapped in a ``TextBlob``, or an
            object that already exposes ``tags`` (a sequence of
            ``(word, tag)`` pairs) and ``words`` (a sequence indexed in the
            same order as ``tags``).
        verbose: When true, also print the sentence, its tags and the
            tag-to-index table. ``None`` (the default) falls back to a
            module-level ``verbose`` flag if one is defined, otherwise to
            ``False``.

    Returns:
        None. The question is printed as ``Question: ...``; nothing is
        printed when no tag pattern matches.
    """
    if verbose is None:
        # The flag used to be read from an undefined global, which raised
        # NameError; honour a module-level flag when present, else stay quiet.
        verbose = globals().get('verbose', False)

    if isinstance(line, str):
        # Imported lazily so the dependency is only needed for raw strings.
        from textblob import TextBlob
        line = TextBlob(line)

    # Remember the index of the FIRST occurrence of every tag.
    bucket = {}
    for index, tagged in enumerate(line.tags):
        bucket.setdefault(tagged[1], index)

    if verbose:
        print('\n', '-' * 20)
        print(line, '\n')
        print("TAGS:", line.tags, '\n')
        print(bucket)

    # English part-of-speech tags used by the rules below.
    # .....................................................................
    # NNP  Proper noun, singular
    # NN   Noun, singular or mass
    # PRP  Personal pronoun
    # JJ   Adjective
    # VBG  Verb, gerund or present participle
    # VBZ  Verb, 3rd person singular present
    # .....................................................................

    def has(*tags):
        """Return True when every tag occurs somewhere in the sentence."""
        return all(tag in bucket for tag in tags)

    def word(tag):
        """Return the first word of the sentence carrying *tag*."""
        return line.words[bucket[tag]]

    # Rules are tried in order; the first matching tag combination wins.
    question = ''
    if has('NNP', 'VBG', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NNP') + ' ' + word('VBG') + '?'
    elif has('PRP', 'VBG', 'VBZ'):
        # Same shape as the proper-noun rule. The variant without a
        # preposition used to emit the gerund twice ("What he   does eating
        # eating?"); it now matches its sibling rules.
        question = 'What ' + word('VBZ') + ' ' + word('PRP') + ' ' + word('VBG') + '?'
    elif has('NN', 'VBG', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NN') + ' ' + word('VBG') + '?'
    elif has('NNP', 'VBZ') and (has('JJ') or has('NN')):
        question = 'What ' + word('VBZ') + ' ' + word('NNP') + '?'
    elif has('PRP', 'VBZ'):
        # Compare case-insensitively so a sentence-initial "She"/"He" matches.
        # Other pronouns deliberately produce no question and do not fall
        # through to the later rules.
        pronoun = word('PRP').lower()
        if pronoun in ('she', 'he'):
            question = 'What does ' + pronoun + ' ' + word('VBZ').singularize() + '?'
    elif has('NNP', 'VBZ'):
        question = 'What does ' + word('NNP') + ' ' + word('VBZ').singularize() + '?'
    elif has('NN', 'VBZ'):
        question = 'What ' + word('VBZ') + ' ' + word('NN') + '?'

    # When the tags are generated, "’s" is split into "’" and "s"; glue the
    # possessive/contraction back together in the generated question.
    if 'VBZ' in bucket and word('VBZ') == "’":
        question = question.replace(" ’ ", "'s ")

    # Print the generated question as output.
    if question:
        print('\n', 'Question: ' + question)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment