# dkoslicki/BioASQ_with_identifiers.py

## BioASQ_with_identifiers.py
import sys
# Directories of the RTX/ARAX checkout that must be importable, in the order
# they are appended to sys.path. The kg-construction directory used to be
# appended twice; it is listed once here.
_RTX_IMPORT_PATHS = (
    "/home/dkoslicki/Desktop/RTX/code",
    "/home/dkoslicki/Desktop/RTX/code/ARAX",
    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery",
    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay",
    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay/predictor",
    "/home/dkoslicki/Desktop/RTX/code/UI/Feedback",
    "/home/dkoslicki/Desktop/RTX/code/reasoningtool",
    "/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction",
    "/home/dkoslicki/Desktop/RTX/code/UI/OpenAPI/python-flask-server",
    "/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction/tests",
    "/home/dkoslicki/Desktop/RTX/code/ARAX/test",
    "/home/dkoslicki/Desktop/RTX/code/ARAX/NodeSynonymizer",
)
for _import_path in _RTX_IMPORT_PATHS:
    # Skip entries that are already present so sys.path gets no duplicates.
    if _import_path not in sys.path:
        sys.path.append(_import_path)

from node_synonymizer import NodeSynonymizer
import json
import nltk
import regex
import string
# Module-level NodeSynonymizer instance; find_node_name() below queries it for every lookup.
synonymizer = NodeSynonymizer()

def find_node_name(block):
    """Look up a block of text in the KG2 node synonymizer.

    Args:
        block: Candidate text (one or more space-joined words) to look up.

    Returns:
        A 3-tuple ``(identifier, synonym_names, equivalent_ids)`` taken from the
        synonymizer's entry for ``block``: the ``['id']['identifier']`` value,
        the ``'label'`` of every item under ``'synonyms'`` and the
        ``'identifier'`` of every item under ``'equivalent_identifiers'``.
        If the synonymizer has no entry (or an empty one) for ``block``, the
        result is ``(None, None, None)``.
    """
    res = synonymizer.get_normalizer_results(block, kg_name="KG2")
    # .get() treats a result that lacks the key like "not found" instead of
    # raising KeyError.
    entry = res.get(block)
    if not entry:
        return (None, None, None)
    identifier = entry['id']['identifier']
    synonyms_names = [synonym['label'] for synonym in entry['synonyms']]
    equiv_ids = [equivalent['identifier'] for equivalent in entry['equivalent_identifiers']]
    return (identifier, synonyms_names, equiv_ids)

# Tokens consisting of exactly one of these characters are discarded.
_PUNCTUATION = frozenset(string.punctuation)


def _clean_tokens(question):
    """Split a question on single spaces and tidy the resulting tokens.

    Tokens wrapped in single quotes (BioASQ quotes some terms) are unwrapped.
    Empty tokens and tokens that are a single punctuation character are
    dropped. Punctuation attached to a word (e.g. "malaria,") is left as is.
    """
    tokens = []
    for token in question.split(' '):
        # Require two characters so a lone "'" is not sliced down to "".
        if len(token) >= 2 and token[0] == "'" and token[-1] == "'":
            token = token[1:-1]
        if not token or token in _PUNCTUATION:
            continue
        tokens.append(token)
    return tokens


def _candidate_blocks(tokens):
    """Return every run of consecutive tokens, space-joined, longest first.

    Runs of equal length are ordered from the end of the question to the
    start. The full token sequence is included as the first candidate.
    """
    blocks = []
    for block_size in range(len(tokens), 0, -1):
        for start in range(len(tokens) - block_size, -1, -1):
            blocks.append(" ".join(tokens[start:start + block_size]))
    return blocks


def _identify_terms(tokens):
    """Return the term records for the most specific blocks found in the KG.

    Each record is a dict with the keys 'term', 'id', 'syn_names' and
    'equiv_ids', filled from find_node_name().
    """
    terms = []
    # keep track of the already found blocks TODO: this will cause problems when
    # you ask something like "how are malaria and mixed malaria different?"
    found_blocks = []
    for block in _candidate_blocks(tokens):
        # Only consider a block that is not contained in an already found
        # (longer or equal) block, i.e. take the most specific match first.
        # Checking this before the lookup avoids querying the synonymizer for
        # blocks that would be discarded anyway.
        # NOTE(review): this is a substring test on the joined text, so a short
        # block such as "a" is also skipped when it merely occurs inside a
        # found word.
        if any(block in found for found in found_blocks):
            continue
        identifier, synonyms_names, equiv_ids = find_node_name(block)
        if identifier:
            found_blocks.append(block)
            terms.append({
                'term': block,
                'id': identifier,
                'syn_names': synonyms_names,
                'equiv_ids': equiv_ids,
            })
    return terms


# Hold the results here
identified_questions = {'questions': []}

# Load the BioASQ questions
with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq.json', 'r', encoding='utf-8') as fp:
    j_ob = json.load(fp)

# loop over all the questions
for item in j_ob['questions']:
    body = item['body']
    input_question = body.replace("?", "")  # removes every question mark, not only a trailing one

    # Right here, I could use a pos tagger, and then ignore some parts of speech (like I, is, an, a, all)
    # but let's just do everything without thinking about it first
    question_tokenized = _clean_tokens(input_question)
    print(question_tokenized)

    # from largest to smallest shingle/k-mer/list of consecutive subwords,
    # see if identifiers can be found for them
    identified_question = {
        'body': body,
        'terms': _identify_terms(question_tokenized),
    }
    identified_questions['questions'].append(identified_question)

# write to file
with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq_with_IDs.json', 'w', encoding='utf-8') as fid:
    json.dump(identified_questions, fid, indent=1)
	import sys
	# Directories of the RTX/ARAX checkout that must be importable, in the order
	# they are appended to sys.path. The kg-construction directory used to be
	# appended twice; it is listed once here.
	_RTX_IMPORT_PATHS = (
	    "/home/dkoslicki/Desktop/RTX/code",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay/predictor",
	    "/home/dkoslicki/Desktop/RTX/code/UI/Feedback",
	    "/home/dkoslicki/Desktop/RTX/code/reasoningtool",
	    "/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction",
	    "/home/dkoslicki/Desktop/RTX/code/UI/OpenAPI/python-flask-server",
	    "/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction/tests",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX/test",
	    "/home/dkoslicki/Desktop/RTX/code/ARAX/NodeSynonymizer",
	)
	for _import_path in _RTX_IMPORT_PATHS:
	    # Skip entries that are already present so sys.path gets no duplicates.
	    if _import_path not in sys.path:
	        sys.path.append(_import_path)

	from node_synonymizer import NodeSynonymizer
	import json
	import nltk
	import regex
	import string
	# Module-level NodeSynonymizer instance; find_node_name() below queries it for every lookup.
	synonymizer = NodeSynonymizer()

	def find_node_name(block):
	    """Look up a block of text in the KG2 node synonymizer.

	    Args:
	        block: Candidate text (one or more space-joined words) to look up.

	    Returns:
	        A 3-tuple ``(identifier, synonym_names, equivalent_ids)`` taken from the
	        synonymizer's entry for ``block``: the ``['id']['identifier']`` value,
	        the ``'label'`` of every item under ``'synonyms'`` and the
	        ``'identifier'`` of every item under ``'equivalent_identifiers'``.
	        If the synonymizer has no entry (or an empty one) for ``block``, the
	        result is ``(None, None, None)``.
	    """
	    res = synonymizer.get_normalizer_results(block, kg_name="KG2")
	    # .get() treats a result that lacks the key like "not found" instead of
	    # raising KeyError.
	    entry = res.get(block)
	    if not entry:
	        return (None, None, None)
	    identifier = entry['id']['identifier']
	    synonyms_names = [synonym['label'] for synonym in entry['synonyms']]
	    equiv_ids = [equivalent['identifier'] for equivalent in entry['equivalent_identifiers']]
	    return (identifier, synonyms_names, equiv_ids)

	# Tokens consisting of exactly one of these characters are discarded.
	_PUNCTUATION = frozenset(string.punctuation)


	def _clean_tokens(question):
	    """Split a question on single spaces and tidy the resulting tokens.

	    Tokens wrapped in single quotes (BioASQ quotes some terms) are unwrapped.
	    Empty tokens and tokens that are a single punctuation character are
	    dropped. Punctuation attached to a word (e.g. "malaria,") is left as is.
	    """
	    tokens = []
	    for token in question.split(' '):
	        # Require two characters so a lone "'" is not sliced down to "".
	        if len(token) >= 2 and token[0] == "'" and token[-1] == "'":
	            token = token[1:-1]
	        if not token or token in _PUNCTUATION:
	            continue
	        tokens.append(token)
	    return tokens


	def _candidate_blocks(tokens):
	    """Return every run of consecutive tokens, space-joined, longest first.

	    Runs of equal length are ordered from the end of the question to the
	    start. The full token sequence is included as the first candidate.
	    """
	    blocks = []
	    for block_size in range(len(tokens), 0, -1):
	        for start in range(len(tokens) - block_size, -1, -1):
	            blocks.append(" ".join(tokens[start:start + block_size]))
	    return blocks


	def _identify_terms(tokens):
	    """Return the term records for the most specific blocks found in the KG.

	    Each record is a dict with the keys 'term', 'id', 'syn_names' and
	    'equiv_ids', filled from find_node_name().
	    """
	    terms = []
	    # keep track of the already found blocks TODO: this will cause problems when
	    # you ask something like "how are malaria and mixed malaria different?"
	    found_blocks = []
	    for block in _candidate_blocks(tokens):
	        # Only consider a block that is not contained in an already found
	        # (longer or equal) block, i.e. take the most specific match first.
	        # Checking this before the lookup avoids querying the synonymizer for
	        # blocks that would be discarded anyway.
	        # NOTE(review): this is a substring test on the joined text, so a short
	        # block such as "a" is also skipped when it merely occurs inside a
	        # found word.
	        if any(block in found for found in found_blocks):
	            continue
	        identifier, synonyms_names, equiv_ids = find_node_name(block)
	        if identifier:
	            found_blocks.append(block)
	            terms.append({
	                'term': block,
	                'id': identifier,
	                'syn_names': synonyms_names,
	                'equiv_ids': equiv_ids,
	            })
	    return terms


	# Hold the results here
	identified_questions = {'questions': []}

	# Load the BioASQ questions
	with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq.json', 'r', encoding='utf-8') as fp:
	    j_ob = json.load(fp)

	# loop over all the questions
	for item in j_ob['questions']:
	    body = item['body']
	    input_question = body.replace("?", "")  # removes every question mark, not only a trailing one

	    # Right here, I could use a pos tagger, and then ignore some parts of speech (like I, is, an, a, all)
	    # but let's just do everything without thinking about it first
	    question_tokenized = _clean_tokens(input_question)
	    print(question_tokenized)

	    # from largest to smallest shingle/k-mer/list of consecutive subwords,
	    # see if identifiers can be found for them
	    identified_question = {
	        'body': body,
	        'terms': _identify_terms(question_tokenized),
	    }
	    identified_questions['questions'].append(identified_question)

	# write to file
	with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq_with_IDs.json', 'w', encoding='utf-8') as fid:
	    json.dump(identified_questions, fid, indent=1)