Created: December 7, 2023 22:22
Save dkoslicki/034832c5e6a8f8fa226ca780bd002b8a to your computer and use it in GitHub Desktop.
score_ars_by_g_score.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import json | |
import glob | |
import re | |
# Known query-class and ARA identifiers; used to recognize them in file paths.
query_classes = [
    'ameliorates', 'CPIC', 'drug_treats_rare_disease', 'DrugCentral_creative',
    'GTRx', 'OOPD', 'RareDisease', 'three_hop', 'treats',
]
aras = ["aragorn", "arax", "bte", 'improving']

# Pre-compiled alternation patterns that capture whichever identifier matches.
ara_pattern = re.compile("(" + "|".join(aras) + ")")
query_pattern = re.compile("(" + "|".join(query_classes) + ")")

# Node-normalization endpoint (currently unused):
# url = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"

# Every normalized ARS response file, searched recursively.
response_files = glob.glob('normalized_results/**/*.json', recursive=True)

counter = 1                  # progress counter printed per processed file
failed_to_get_results = []   # files whose JSON lacked the expected structure
# Old g-score | |
def old_get_confidence(result):
    """
    Legacy g-score: combine the per-ARA analysis scores of one result.

    All non-None analysis scores are summed. A result supported by exactly
    one non-zero score is capped at 1 - eps (it can never reach a perfect
    1.0); a result supported by several non-zero scores is capped at 1.
    eps is 0.001.
    """
    eps = 0.001
    total = 0.0
    supporting = 0  # number of analyses contributing a score > 0
    for analysis in result.get("analyses") or []:
        score = analysis.get("score")
        if score is None:
            continue
        total += score
        if score > 0:
            supporting += 1
    if supporting == 0:
        return total
    # Single-source results get the slightly lower ceiling.
    cap = 1 if supporting > 1 else 1 - eps
    return cap if total > cap else total
def new_get_confidence(result):
    """
    New g-score: noisy-OR combination of the per-ARA analysis scores,
    i.e. 1 - prod(1 - s_i) over all non-None scores. Returns 0 when no
    analysis contributes a positive score.

    via: https://ncatstranslator.slack.com/archives/C0442D7N7J9/p1694416813304959
    """
    scores = [a["score"] for a in result.get("analyses") or []
              if a.get("score") is not None]
    if not any(s > 0 for s in scores):
        return 0
    complement = 1
    for s in scores:
        complement *= (1 - s)
    return 1 - complement
def _write_ranked(json_response, score_key, prefix, json_file):
    """Sort the response's results by `score_key` (descending, in place on the
    response) and write the whole response as indented JSON under `prefix`."""
    message = json_response['fields']['data']['message']
    message['results'] = sorted(message['results'],
                                key=lambda r: r[score_key], reverse=True)
    # NOTE(review): plain string concatenation reproduces the original layout
    # ('old_gscore' + 'normalized_results/...' -> 'old_gscorenormalized_results/...').
    # os.path.join('old_gscore', json_file) may have been intended — confirm
    # before changing, as it would relocate the output tree.
    new_file_path = prefix + json_file
    os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
    with open(new_file_path, 'w') as fp:
        json.dump(json_response, fp, indent=4)


for json_file in response_files:
    # Only rescore ARS response files: their path mentions 'ars' at least twice.
    if json_file.count('ars') < 2:
        continue
    print(counter)  # progress indicator
    counter += 1
    with open(json_file) as fp:
        json_response = json.load(fp)
    try:
        results = json_response['fields']['data']['message']['results']
    except (KeyError, TypeError):
        # File lacks the expected fields/data/message/results structure;
        # record it and move on instead of silently swallowing every error.
        failed_to_get_results.append(json_file)
        continue
    if not results:
        continue
    # Annotate every result with both scoring schemes before ranking.
    for result in results:
        result["old_score"] = old_get_confidence(result)
        result["new_score"] = new_get_confidence(result)
    # Emit one ranked copy per scheme under separate output trees.
    _write_ranked(json_response, 'old_score', 'old_gscore', json_file)
    _write_ranked(json_response, 'new_score', 'new_gscore', json_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment