Created: December 7, 2023 22:22
Save dkoslicki/034832c5e6a8f8fa226ca780bd002b8a to your computer and use it in GitHub Desktop.
score_ars_by_g_score.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import os | |
import json | |
import glob | |
import re | |
# Known query-class and ARA identifiers; used to recognize them in file paths.
query_classes = [
    'ameliorates', 'CPIC', 'drug_treats_rare_disease', 'DrugCentral_creative',
    'GTRx', 'OOPD', 'RareDisease', 'three_hop', 'treats',
]
aras = ["aragorn", "arax", "bte", 'improving']

# Pre-compiled alternation patterns that capture whichever identifier matches.
ara_pattern = re.compile("(" + "|".join(aras) + ")")
query_pattern = re.compile("(" + "|".join(query_classes) + ")")

# Node-normalization endpoint (currently unused):
# url = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes"

# Every normalized ARS response file, searched recursively.
response_files = glob.glob('normalized_results/**/*.json', recursive=True)

counter = 1                  # progress counter printed per processed file
failed_to_get_results = []   # files whose JSON lacked the expected structure
# Old g-score | |
def old_get_confidence(result):
    """
    Legacy g-score: combine the per-ARA analysis scores of one result.

    All non-None analysis scores are summed. A result supported by exactly
    one non-zero score is capped at 1 - eps (it can never reach a perfect
    1.0); a result supported by several non-zero scores is capped at 1.
    eps is 0.001.
    """
    eps = 0.001
    total = 0.0
    supporting = 0  # number of analyses contributing a score > 0
    for analysis in result.get("analyses") or []:
        score = analysis.get("score")
        if score is None:
            continue
        total += score
        if score > 0:
            supporting += 1
    if supporting == 0:
        return total
    # Single-source results get the slightly lower ceiling.
    cap = 1 if supporting > 1 else 1 - eps
    return cap if total > cap else total
def new_get_confidence(result):
    """
    New g-score: noisy-OR combination of the per-ARA analysis scores,
    i.e. 1 - prod(1 - s_i) over all non-None scores. Returns 0 when no
    analysis contributes a positive score.

    via: https://ncatstranslator.slack.com/archives/C0442D7N7J9/p1694416813304959
    """
    scores = [a["score"] for a in result.get("analyses") or []
              if a.get("score") is not None]
    if not any(s > 0 for s in scores):
        return 0
    complement = 1
    for s in scores:
        complement *= (1 - s)
    return 1 - complement
def _write_ranked(json_response, score_key, prefix, json_file):
    """Sort the response's results by `score_key` (descending, in place on the
    response) and write the whole response as indented JSON under `prefix`."""
    message = json_response['fields']['data']['message']
    message['results'] = sorted(message['results'],
                                key=lambda r: r[score_key], reverse=True)
    # NOTE(review): plain string concatenation reproduces the original layout
    # ('old_gscore' + 'normalized_results/...' -> 'old_gscorenormalized_results/...').
    # os.path.join('old_gscore', json_file) may have been intended — confirm
    # before changing, as it would relocate the output tree.
    new_file_path = prefix + json_file
    os.makedirs(os.path.dirname(new_file_path), exist_ok=True)
    with open(new_file_path, 'w') as fp:
        json.dump(json_response, fp, indent=4)


for json_file in response_files:
    # Only rescore ARS response files: their path mentions 'ars' at least twice.
    if json_file.count('ars') < 2:
        continue
    print(counter)  # progress indicator
    counter += 1
    with open(json_file) as fp:
        json_response = json.load(fp)
    try:
        results = json_response['fields']['data']['message']['results']
    except (KeyError, TypeError):
        # File lacks the expected fields/data/message/results structure;
        # record it and move on instead of silently swallowing every error.
        failed_to_get_results.append(json_file)
        continue
    if not results:
        continue
    # Annotate every result with both scoring schemes before ranking.
    for result in results:
        result["old_score"] = old_get_confidence(result)
        result["new_score"] = new_get_confidence(result)
    # Emit one ranked copy per scheme under separate output trees.
    _write_ranked(json_response, 'old_score', 'old_gscore', json_file)
    _write_ranked(json_response, 'new_score', 'new_gscore', json_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment