# parksunwoo/QA_baseline.py

## QA_baseline.py
##!pip3 install summa

# Step 1. Extract the key sentences from the document using TextRank

from summa.summarizer import summarize
# Load the source article and ask summa for a summary; ratio=0.2 requests
# a summary sized at about one fifth of the input text.
with open("wiki_en/chosun.txt", 'r') as f:
    data = f.read()
summary = summarize(data, ratio=0.2)

# Write the summary inside a context manager so the handle is flushed and
# closed; a bare open()/write() pair leaves that to interpreter shutdown.
with open("wiki_en/chosun_min.txt", "w") as f:
    f.write(summary)

# Step 2. Generate questions from the summarized sentences
# Automatic question generation by using NLP
# https://github.com/indrajithi/genquest

def genQuestion(line, verbose=None):
    """Print a "What ...?" question derived from a single sentence.

    The sentence is part-of-speech tagged, the index of the first
    occurrence of every tag is recorded, and the first tag pattern that
    matches decides how the question is assembled from ``line.words``.
    Nothing is printed when no pattern matches.

    Args:
        line: Either a plain ``str`` or an object exposing ``tags`` (a
            sequence of ``(word, tag)`` pairs) and ``words`` (an indexable
            sequence of words), such as a ``TextBlob``. A ``str`` is
            wrapped in ``TextBlob``, which must be in scope at module level.
        verbose: When true, print the sentence, its tags and the tag index
            before the question. ``None`` (the default) defers to a
            module-level ``verbose`` flag when one exists, else ``False``.

    Returns:
        None. The generated question, if any, is written to stdout.
    """
    if verbose is None:
        # This function used to read a global ``verbose`` that is not
        # defined anywhere in the visible file; honour such a flag when it
        # exists instead of failing with NameError.
        verbose = globals().get('verbose', False)

    if isinstance(line, str):
        line = TextBlob(line)

    # Map each part-of-speech tag to the index of its first occurrence.
    bucket = {}
    for index, tagged in enumerate(line.tags):
        bucket.setdefault(tagged[1], index)

    if verbose:
        print('\n','-'*20)
        print(line ,'\n')
        print("TAGS:",line.tags, '\n')
        print(bucket)

    def has(*tags):
        # True when every listed tag occurs somewhere in the sentence.
        return all(tag in bucket for tag in tags)

    def word(tag):
        # First word that carries the given tag.
        return line.words[bucket[tag]]

    def ask(*parts):
        # Assemble "What <parts...>?" with single spaces between parts.
        return 'What ' + ' '.join(parts) + '?'

    # Tags used below: NNP proper noun, NN noun, PRP personal pronoun,
    # VBZ verb (3rd person singular present), VBG gerund, JJ adjective.
    # Patterns are tried from most to least specific; the first hit wins.
    question = ''

    if has('NNP', 'VBG', 'VBZ'):
        # Also covers the pattern with an extra 'IN', which produced the
        # same question.
        question = ask(word('VBZ'), word('NNP'), word('VBG'))

    elif has('PRP', 'VBG', 'VBZ'):
        # With or without 'IN'. The variant without 'IN' used to emit the
        # gerund twice ("What he  does playing playing?").
        question = ask(word('VBZ'), word('PRP'), word('VBG'))

    elif has('NN', 'VBG', 'VBZ'):
        question = ask(word('VBZ'), word('NN'), word('VBG'))

    elif has('NNP', 'VBZ', 'JJ') or has('NNP', 'VBZ', 'NN'):
        question = ask(word('VBZ'), word('NNP'))

    elif has('PRP', 'VBZ'):
        # Compare case-insensitively so a sentence-initial "He"/"She"
        # matches as well.
        pronoun = word('PRP').lower()
        if pronoun in ('she', 'he'):
            # singularize() is used to strip the 3rd-person "s" of the verb.
            question = ask('does', pronoun, word('VBZ').singularize())

    elif has('NNP', 'VBZ'):
        question = ask('does', word('NNP'), word('VBZ').singularize())

    elif has('NN', 'VBZ'):
        question = ask(word('VBZ'), word('NN'))

    # When the tags are generated, 's is split into ’ and s; stitch the
    # possessive back together in the question.
    if 'VBZ' in bucket and word('VBZ') == "’":
        question = question.replace(" ’ ","'s ")

    if question:
        print('\n', 'Question: ' + question )
	##!pip3 install summa

	# Step 1. Extract the key sentences from the document using TextRank

	from summa.summarizer import summarize
	# Load the source article and ask summa for a summary; ratio=0.2 requests
	# a summary sized at about one fifth of the input text.
	with open("wiki_en/chosun.txt", 'r') as f:
	    data = f.read()
	summary = summarize(data, ratio=0.2)

	# Write the summary inside a context manager so the handle is flushed and
	# closed; a bare open()/write() pair leaves that to interpreter shutdown.
	with open("wiki_en/chosun_min.txt", "w") as f:
	    f.write(summary)

	# Step 2. Generate questions from the summarized sentences
	# Automatic question generation by using NLP
	# https://github.com/indrajithi/genquest

	def genQuestion(line, verbose=None):
	    """Print a "What ...?" question derived from a single sentence.

	    The sentence is part-of-speech tagged, the index of the first
	    occurrence of every tag is recorded, and the first tag pattern that
	    matches decides how the question is assembled from ``line.words``.
	    Nothing is printed when no pattern matches.

	    Args:
	        line: Either a plain ``str`` or an object exposing ``tags`` (a
	            sequence of ``(word, tag)`` pairs) and ``words`` (an indexable
	            sequence of words), such as a ``TextBlob``. A ``str`` is
	            wrapped in ``TextBlob``, which must be in scope at module level.
	        verbose: When true, print the sentence, its tags and the tag index
	            before the question. ``None`` (the default) defers to a
	            module-level ``verbose`` flag when one exists, else ``False``.

	    Returns:
	        None. The generated question, if any, is written to stdout.
	    """
	    if verbose is None:
	        # This function used to read a global ``verbose`` that is not
	        # defined anywhere in the visible file; honour such a flag when it
	        # exists instead of failing with NameError.
	        verbose = globals().get('verbose', False)

	    if isinstance(line, str):
	        line = TextBlob(line)

	    # Map each part-of-speech tag to the index of its first occurrence.
	    bucket = {}
	    for index, tagged in enumerate(line.tags):
	        bucket.setdefault(tagged[1], index)

	    if verbose:
	        print('\n','-'*20)
	        print(line ,'\n')
	        print("TAGS:",line.tags, '\n')
	        print(bucket)

	    def has(*tags):
	        # True when every listed tag occurs somewhere in the sentence.
	        return all(tag in bucket for tag in tags)

	    def word(tag):
	        # First word that carries the given tag.
	        return line.words[bucket[tag]]

	    def ask(*parts):
	        # Assemble "What <parts...>?" with single spaces between parts.
	        return 'What ' + ' '.join(parts) + '?'

	    # Tags used below: NNP proper noun, NN noun, PRP personal pronoun,
	    # VBZ verb (3rd person singular present), VBG gerund, JJ adjective.
	    # Patterns are tried from most to least specific; the first hit wins.
	    question = ''

	    if has('NNP', 'VBG', 'VBZ'):
	        # Also covers the pattern with an extra 'IN', which produced the
	        # same question.
	        question = ask(word('VBZ'), word('NNP'), word('VBG'))

	    elif has('PRP', 'VBG', 'VBZ'):
	        # With or without 'IN'. The variant without 'IN' used to emit the
	        # gerund twice ("What he  does playing playing?").
	        question = ask(word('VBZ'), word('PRP'), word('VBG'))

	    elif has('NN', 'VBG', 'VBZ'):
	        question = ask(word('VBZ'), word('NN'), word('VBG'))

	    elif has('NNP', 'VBZ', 'JJ') or has('NNP', 'VBZ', 'NN'):
	        question = ask(word('VBZ'), word('NNP'))

	    elif has('PRP', 'VBZ'):
	        # Compare case-insensitively so a sentence-initial "He"/"She"
	        # matches as well.
	        pronoun = word('PRP').lower()
	        if pronoun in ('she', 'he'):
	            # singularize() is used to strip the 3rd-person "s" of the verb.
	            question = ask('does', pronoun, word('VBZ').singularize())

	    elif has('NNP', 'VBZ'):
	        question = ask('does', word('NNP'), word('VBZ').singularize())

	    elif has('NN', 'VBZ'):
	        question = ask(word('VBZ'), word('NN'))

	    # When the tags are generated, 's is split into ’ and s; stitch the
	    # possessive back together in the question.
	    if 'VBZ' in bucket and word('VBZ') == "’":
	        question = question.replace(" ’ ","'s ")

	    if question:
	        print('\n', 'Question: ' + question )