Skip to content

Instantly share code, notes, and snippets.

@dkoslicki
Created February 3, 2021 21:54
Show Gist options
  • Save dkoslicki/9d4b87ef81fd4f60ededb028a9554861 to your computer and use it in GitHub Desktop.
Save dkoslicki/9d4b87ef81fd4f60ededb028a9554861 to your computer and use it in GitHub Desktop.
Simple code to get identifiers from BioASQ questions
import sys
sys.path.append("/home/dkoslicki/Desktop/RTX/code")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/Overlay/predictor")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/UI/Feedback")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/reasoningtool")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/UI/OpenAPI/python-flask-server")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/reasoningtool/kg-construction/tests")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX/test")
sys.path.append("/home/dkoslicki/Desktop/RTX/code/ARAX/NodeSynonymizer")
from node_synonymizer import NodeSynonymizer
import json
import nltk
import regex
import string
synonymizer = NodeSynonymizer()
def find_node_name(block):
    """Look up a block of text with the node synonymizer (KG2).

    Args:
        block: The text to look up, e.g. one word or several space-joined
            consecutive words of a question.

    Returns:
        A 3-tuple ``(identifier, synonym_names, equivalent_ids)`` when the
        synonymizer returns a non-empty entry for ``block``:

        * ``identifier`` -- ``entry['id']['identifier']``
        * ``synonym_names`` -- the ``'label'`` of every item in
          ``entry['synonyms']``
        * ``equivalent_ids`` -- the ``'identifier'`` of every item in
          ``entry['equivalent_identifiers']``

        Otherwise ``(None, None, None)`` (a tuple, not a bare ``None``), so
        callers can always unpack the result.
    """
    res = synonymizer.get_normalizer_results(block, kg_name="KG2")
    # Use .get() so that a result which omits the key is treated as
    # "not found" instead of raising KeyError.
    # NOTE(review): assumes the synonymizer returns a dict keyed by the
    # queried text -- confirm against NodeSynonymizer.
    entry = res.get(block)
    if not entry:
        return (None, None, None)
    node_id = entry['id']['identifier']
    synonyms_names = [synonym['label'] for synonym in entry['synonyms']]
    equiv_ids = [equiv['identifier'] for equiv in entry['equivalent_identifiers']]
    return (node_id, synonyms_names, equiv_ids)
# Input file with the BioASQ questions and output file for the annotated copy.
BIOASQ_QUESTIONS_PATH = '/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq.json'
BIOASQ_WITH_IDS_PATH = '/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq_with_IDs.json'

# Single punctuation characters; a token that is exactly one of these is dropped.
# Built once instead of once per token.
_PUNCTUATION = frozenset(string.punctuation)


def _clean_tokens(question):
    """Split a question body on spaces and tidy up the resulting tokens.

    Question marks are removed everywhere, commas attached to a word are
    stripped (splitting on spaces leaves them glued on, e.g. "malaria,"),
    a pair of single quotes wrapping a token is removed (BioASQ quotes some
    terms that way), and empty or punctuation-only tokens are dropped.

    Args:
        question: The raw question text.

    Returns:
        The list of cleaned, non-empty tokens in their original order.
    """
    tokens = []
    for token in question.replace("?", "").split(' '):
        token = token.strip(",")
        # The length check keeps a lone "'" from turning into an empty token.
        if len(token) >= 2 and token[0] == "'" and token[-1] == "'":
            token = token[1:-1]
        if not token or token in _PUNCTUATION:
            continue
        tokens.append(token)
    return tokens


def _candidate_blocks(tokens):
    """Return every run of consecutive tokens, joined by spaces, longest first.

    Block sizes go from ``len(tokens)`` (the whole question) down to 1, so a
    one-word question still produces one candidate. Within a size the runs
    are listed from the rightmost start position to the leftmost.
    """
    count = len(tokens)
    blocks = []
    for size in range(count, 0, -1):
        for start in range(count - size, -1, -1):
            blocks.append(" ".join(tokens[start:start + size]))
    return blocks


def _is_covered(block, found_blocks):
    """Return True if ``block`` is a whole-token run inside a found block.

    Both sides are padded with spaces so that only complete tokens match:
    "cell" is covered by "stem cell" but not by "stem cells".
    """
    padded = " " + block + " "
    return any(padded in " " + found + " " for found in found_blocks)


def _identify_terms(tokens):
    """Find identifiers for the most specific blocks of a tokenized question.

    Candidates are tried from largest to smallest; a candidate is skipped
    when it is already contained in a block that was identified earlier, so
    the most specific match wins.

    Returns:
        A list of dicts with the keys 'term', 'id', 'syn_names', 'equiv_ids'.
    """
    terms = []
    # TODO: text-based containment will cause problems when you ask something
    # like "how are malaria and mixed malaria different?" (the standalone
    # "malaria" is suppressed by "mixed malaria").
    found_blocks = []
    for block in _candidate_blocks(tokens):
        # Check containment first: it is cheap and saves a synonymizer lookup.
        if _is_covered(block, found_blocks):
            continue
        node_id, synonyms_names, equiv_ids = find_node_name(block)
        if not node_id:
            continue
        found_blocks.append(block)
        terms.append({
            'term': block,
            'id': node_id,
            'syn_names': synonyms_names,
            'equiv_ids': equiv_ids,
        })
    return terms


# Hold the results here
identified_questions = {'questions': []}

# Load the BioASQ questions
with open(BIOASQ_QUESTIONS_PATH, 'r', encoding='utf-8') as fp:
    j_ob = json.load(fp)

# loop over all the questions
for item in j_ob['questions']:
    body = item['body']
    # Plain split on spaces; nltk.word_tokenize would only be needed if
    # part-of-speech tagging were added.
    question_tokenized = _clean_tokens(body)
    print(question_tokenized)
    # Right here, I could use a pos tagger, and then ignore some parts of
    # speech (like I, is, an, a, all), but let's just do everything without
    # thinking about it first.
    identified_questions['questions'].append({
        'body': body,
        'terms': _identify_terms(question_tokenized),
    })

# write to file
with open(BIOASQ_WITH_IDS_PATH, 'w', encoding='utf-8') as fid:
    json.dump(identified_questions, fid, indent=1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment