Created
February 3, 2021 21:54
-
-
Save dkoslicki/9d4b87ef81fd4f60ededb028a9554861 to your computer and use it in GitHub Desktop.
Simple code to get identifiers from BioASQ questions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import string
import sys

# Root of the local RTX checkout. Every directory below is appended to
# sys.path so that the RTX/ARAX modules can be imported as top-level modules.
RTX_CODE_DIR = "/home/dkoslicki/Desktop/RTX/code"

# Sub-directories of RTX_CODE_DIR to make importable ("" is the root itself).
# The order matches the original sequence of sys.path.append calls; the
# "reasoningtool/kg-construction" entry used to be listed twice.
_RTX_SUBDIRS = (
    "",
    "ARAX",
    "ARAX/ARAXQuery",
    "ARAX/ARAXQuery/Overlay",
    "ARAX/ARAXQuery/Overlay/predictor",
    "UI/Feedback",
    "reasoningtool",
    "reasoningtool/kg-construction",
    "UI/OpenAPI/python-flask-server",
    "reasoningtool/kg-construction/tests",
    "ARAX/test",
    "ARAX/NodeSynonymizer",
)

for _subdir in _RTX_SUBDIRS:
    _path = RTX_CODE_DIR + "/" + _subdir if _subdir else RTX_CODE_DIR
    # Skip entries that are already present so sys.path gets no duplicates.
    if _path not in sys.path:
        sys.path.append(_path)

# These imports stay below the path setup on purpose: node_synonymizer is not
# an installed package and is only importable once the directories above are
# on sys.path (presumably it lives in ARAX/NodeSynonymizer -- confirm).
import nltk
import regex
from node_synonymizer import NodeSynonymizer

# Shared synonymizer instance used by find_node_name().
synonymizer = NodeSynonymizer()
def find_node_name(block):
    """Look up a block of text with the node synonymizer (KG2).

    Args:
        block: Candidate term, i.e. one or more question words joined by
            single spaces.

    Returns:
        A 3-tuple ``(identifier, synonym_names, equivalent_ids)`` taken from
        the normalizer entry for ``block``, or ``(None, None, None)`` when the
        result holds no (or an empty) entry for ``block``.
    """
    res = synonymizer.get_normalizer_results(block, kg_name="KG2")
    # Use .get() so that a result without this key means "not found" rather
    # than raising KeyError.
    entry = res.get(block)
    if not entry:
        return (None, None, None)
    # Named node_id rather than id to avoid shadowing the builtin.
    node_id = entry['id']['identifier']
    synonyms_names = [x['label'] for x in entry['synonyms']]
    equiv_ids = [x['identifier'] for x in entry['equivalent_identifiers']]
    return (node_id, synonyms_names, equiv_ids)
# Single punctuation characters; a token equal to one of these is dropped.
_PUNCTUATION = set(string.punctuation)


def _clean_tokens(question_body):
    """Split a question body into the word tokens used for term lookup."""
    # Drop every question mark, then split on single spaces.
    words = question_body.replace("?", "").split(' ')
    cleaned = []
    for word in words:
        # A comma glued to a word ("malaria,") would keep the word from
        # matching, so strip commas from both ends.
        word = word.strip(",")
        # BioASQ sometimes wraps terms in single quotes; unwrap them. The
        # length check keeps a lone "'" from turning into an empty token.
        if len(word) >= 2 and word[0] == "'" and word[-1] == "'":
            word = word[1:-1]
        # Skip empties (e.g. from double spaces) and lone punctuation marks.
        if not word or word in _PUNCTUATION:
            continue
        cleaned.append(word)
    return cleaned


def _candidate_spans(token_count):
    """Yield (start, end) token spans, longest first.

    Within one length the right-most span comes first, which is the order the
    original "build ascending, then reverse" list produced. The full-length
    span is included, so a one-word question still yields its single word.
    """
    for size in range(token_count, 0, -1):
        for start in range(token_count - size, -1, -1):
            yield (start, start + size)


def _identify_terms(tokens):
    """Return the identified terms of a tokenized question, most specific first.

    Each term is a dict with the keys 'term', 'id', 'syn_names' and
    'equiv_ids', filled from find_node_name().
    """
    terms = []
    found_spans = []     # token spans that already produced a term
    found_texts = set()  # term texts already reported for this question
    for start, end in _candidate_spans(len(tokens)):
        # Most specific wins: ignore any span lying inside an identified one.
        # Comparing token positions (not characters) keeps "rat" from being
        # hidden by "hydration", and keeps a standalone "malaria" even when
        # "mixed malaria" was identified elsewhere in the question. Checking
        # this before the lookup also avoids queries whose result would be
        # thrown away.
        if any(fs <= start and end <= fe for (fs, fe) in found_spans):
            continue
        block = " ".join(tokens[start:end])
        if block in found_texts:
            continue  # same term repeated later in the question
        (node_id, synonyms_names, equiv_ids) = find_node_name(block)
        if not node_id:
            continue
        found_spans.append((start, end))
        found_texts.add(block)
        terms.append({
            'term': block,
            'id': node_id,
            'syn_names': synonyms_names,
            'equiv_ids': equiv_ids,
        })
    return terms


# Hold the results here
identified_questions = {'questions': []}

# Load the BioASQ questions
with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq.json', 'r', encoding='utf-8') as fp:
    j_ob = json.load(fp)

# Loop over all the questions and try to find identifiers for every run of
# consecutive words, from the longest run down to single words.
for item in j_ob['questions']:
    body = item['body']
    question_tokenized = _clean_tokens(body)
    print(question_tokenized)
    # A part-of-speech tagger could be used here to drop filler words
    # (I, is, an, a, all); for now every block is looked up.
    identified_questions['questions'].append({
        'body': body,
        'terms': _identify_terms(question_tokenized),
    })

# write to file
with open('/home/dkoslicki/Desktop/RTX/code/ARAX/ARAXQuery/bioasq_with_IDs.json', 'w', encoding='utf-8') as fid:
    json.dump(identified_questions, fid, indent=1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment