Get PD Biomarker AUPR scores by task. Requires the original scoring code `LDopaScorer.py`
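"""Get PD Biomarker AUPR scores by task, per submission.

Usage sketch (the file name and bootstrap count below are illustrative,
not part of the original gist; assumes this file is saved as
score_by_task.py):

    python score_by_task.py                        # point estimates
    python score_by_task.py --num-bootstraps 1000  # bootstrap scoring

Requires `LDopaScorer.py` to be importable and a configured Synapse
login for `synapseclient`.
"""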
from __future__ import division, print_function
import synapseclient as sc
import pandas as pd
import numpy as np
import argparse
import LDopaScorer
TRAINING_TABLE = 'syn10495809'
TESTING_TABLE = 'syn10701954'
TESTING_WITH_SCORES_TABLE = 'syn10518012'
SUBMISSION_TABLES = {
    'tremor': 'syn11559815',
    'dyskinesia': 'syn11559820',
    'bradykinesia': 'syn11559823'
}
# Repeat and left/right recordings of the same activity are grouped and
# scored together (e.g. the four 'ftn' recordings form one task group).
TASK_GROUPINGS = {
    'drnkg': ('drnkg',),
    'fldng': ('fldng',),
    'ftn': ('ftnr1', 'ftnr2', 'ftnl1', 'ftnl2'),
    'ntblt': ('ntblt',),
    'orgpa': ('orgpa',),
    'ram': ('ramr1', 'ramr2', 'raml1', 'raml2')}
CATEGORY_WEIGHTS = {
    'tremor': [896, 381, 214, 9, 0],
    'dyskinesia': [531, 129],
    'bradykinesia': [990, 419]}
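# CATEGORY_WEIGHTS appears to hold one count per severity class (five
# classes for tremor, two otherwise). Only its length is actually used
# below, in get_bootstrap_weights, to zero-fill missing classes; reading
# the values as overall test-set class counts is an assumption, not
# something stated in the original gist.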

def read_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-bootstraps", type=int)
    args = parser.parse_args()
    return args

def fetch_task_dfhis(syn):
    task_dfhis = {}
    for group in TASK_GROUPINGS:
        train_dfhi, test_dfhi = fetch_group_dfhis(syn, TASK_GROUPINGS[group])
        dfhis = np.append(train_dfhi, test_dfhi)
        task_dfhis[group] = dfhis
    return task_dfhis

def get_table(syn, synId):
    q = syn.tableQuery("select * from {}".format(synId))
    df = q.asDataFrame()
    df = df.set_index("dataFileHandleId", drop=True)
    return df

def count_cases(syn, df, phenotype, group, isTestDataset):
    if isTestDataset:
        test = get_table(syn, TESTING_TABLE)[
            ["tremorScore", "dyskinesiaScore", "bradykinesiaScore"]]
        df = df.merge(test, left_index=True, right_index=True)
        # After the merge, the _x columns hold the actual scores and the
        # _y columns come from TESTING_TABLE, whose score columns mark
        # scored records with the literal string "Score".
        df = df[df['{}Score_y'.format(phenotype)] == "Score"]
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score_x'.format(phenotype)].values
    else:
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score'.format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    return dict(zip(unique, counts))
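# count_cases returns a {class: count} mapping for one task group, e.g.
# (hypothetical numbers) {0.0: 120, 1.0: 35, 2.0: 12} for tremor.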

def count_test_cases_for_each_task(syn):
    task_counts = pd.DataFrame(columns=['taskGroup', 'phenotype',
                                        'class', 'trainCases', 'testCases'])
    train = get_table(syn, TRAINING_TABLE)
    test = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for group in TASK_GROUPINGS:
        for phenotype in ['tremor', 'dyskinesia', 'bradykinesia']:
            train_counts = count_cases(syn, train, phenotype, group, False)
            test_counts = count_cases(syn, test, phenotype, group, True)
            classes = [0, 1, 2, 3, 4] if phenotype == "tremor" else [0, 1]
            for c in classes:
                task_counts = task_counts.append({
                    'taskGroup': group,
                    'phenotype': phenotype,
                    'class': c,
                    'testCases': test_counts.setdefault(c, 0),
                    'trainCases': train_counts.setdefault(c, 0)},
                    ignore_index=True)
    return task_counts

def fetch_submissions(syn, phenotype):
    tableId = SUBMISSION_TABLES[phenotype]
    q_submission = syn.tableQuery(
        "select submissionId, dataFileHandleId from {}".format(tableId))
    submission_paths = syn.downloadTableColumns(
        q_submission, "dataFileHandleId")
    return q_submission.asDataFrame(), submission_paths

def fetch_group_dfhis(syn, group):
    query_string = "select dataFileHandleId from {} where task in {}"
    group_string = "('" + "', '".join(group) + "')"
    train_query = query_string.format(TRAINING_TABLE, group_string)
    test_query = query_string.format(TESTING_TABLE, group_string)
    q_train = syn.tableQuery(train_query).asDataFrame()
    q_test = syn.tableQuery(test_query).asDataFrame()
    return q_train.dataFileHandleId.values, q_test.dataFileHandleId.values
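# For example, the 'ftn' group produces the training query:
#   select dataFileHandleId from syn10495809
#   where task in ('ftnr1', 'ftnr2', 'ftnl1', 'ftnl2')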

def get_weighted_mean(scores, weights):
    numer = 0
    denom = sum(weights)
    for w, s in zip(weights, scores):
        if pd.notnull(s):
            numer += w * s
    return numer / denom
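# Worked example (hypothetical numbers): scores [0.9, 0.4, None] with
# weights [2, 1, 1] give (2 * 0.9 + 1 * 0.4) / (2 + 1 + 1) = 0.55.
# A NaN score is dropped from the numerator while its weight stays in
# the denominator, pulling the mean toward zero.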

def get_weights(phenotype, taskGroup, task_counts, forColumn="testCases"):
    weights = task_counts.query(
        "taskGroup == '{}' and phenotype == '{}'".format(
            taskGroup, phenotype))
    return weights[forColumn].values

def get_bootstrap_weights(dfhis, test_table_with_scores, phenotype):
    test_table_with_scores_filtered = test_table_with_scores.loc[dfhis]
    scores = test_table_with_scores_filtered["{}Score".format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    class_counts = dict(zip(unique, counts))
    for i in range(len(CATEGORY_WEIGHTS[phenotype])):
        if i not in class_counts:
            class_counts[i] = 0
    return class_counts
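# Because every class key is zero-filled here, the returned dict always
# contains the key 1; the `while 1 not in weights_test` loop in
# score_bootstrap therefore exits after a single resample.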

def score_all(phenotype, submissions, task_dfhis, task_counts):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        # score() takes four arguments; the original call also passed an
        # undefined `bootstrap` variable, removed here.
        auprc_by_task_group = score(
            phenotype, path, task_dfhis, task_counts)
        results[dfhi] = auprc_by_task_group
    return results

def score_all_bootstrap(phenotype, submissions, task_dfhis, task_counts,
                        num_bootstraps, test_table_with_scores):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        print(dfhi)
        score_info = score_bootstrap(
            phenotype, path, task_dfhis, task_counts,
            num_bootstraps, test_table_with_scores)
        results[dfhi] = score_info
    return results

def score_bootstrap(phenotype, path, task_dfhis, task_counts,
                    num_bootstraps, test_table_with_scores):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(path, phenotype)
    score_info = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
            for i in range(num_bootstraps):
                # Resample the test records with replacement. The loop is
                # seeded with a dummy dict so it runs at least once; since
                # get_bootstrap_weights zero-fills every class key, the
                # `1 not in weights_test` check passes after one draw.
                weights_test = {0: None}
                while 1 not in weights_test:
                    test_relevant_dfhis_bootstrap = np.random.choice(
                        np.array([j for j in test_relevant_dfhis]),
                        size=len(test_relevant_dfhis), replace=True)
                    weights_test = get_bootstrap_weights(
                        test_relevant_dfhis_bootstrap, test_table_with_scores, phenotype)
                test_X_task = test_X.loc[test_relevant_dfhis_bootstrap]
                test_y_task = test_y.loc[test_relevant_dfhis_bootstrap]
                results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                    test_X_task, test_y_task, classes, ensemble)
                if phenotype == 'tremor':
                    weighted_aupr = get_weighted_mean(
                        results,
                        [weights_test.setdefault(k, 0) for k in range(5)])
                else:
                    weighted_aupr = results[0]
                score_info[(task, i)] = {
                    'aupr': weighted_aupr, 'weights': weights_test}
    return score_info

def score(phenotype, path, task_dfhis, task_counts):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(path, phenotype)
    task_scores = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        test_X_task = test_X.loc[test_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        test_y_task = test_y.loc[test_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        weights_test = get_weights(phenotype, task, task_counts, "testCases")
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task) and len(test_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                test_X_task, test_y_task, classes, ensemble)
            print("results", results)
            if phenotype == 'tremor':
                weighted_aupr = get_weighted_mean(results, weights_test)
            else:
                weighted_aupr = results[0]
            task_scores[task] = weighted_aupr
    return task_scores
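# score() returns one weighted AUPR per task group, keyed by group name,
# e.g. (hypothetical values) {'drnkg': 0.61, 'ftn': 0.72, 'ram': 0.65}.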

def write_scores_to_file(scores, phenotype, submissions):
    scores_as_list = []
    tasks = None
    for dfhi in scores:
        tasks = scores[dfhi].keys()
        break
    for dfhi in scores:
        score_row = np.append([dfhi], [scores[dfhi][t] for t in tasks])
        scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=np.append(['dataFileHandleId'], tasks))
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def write_bootstrap_scores_to_file(scores, phenotype, submissions):
    scores_as_list = []
    for dfhi in scores:
        for score_info in scores[dfhi]:
            task, bootstrap_num = score_info
            aupr = scores[dfhi][score_info]['aupr']
            weights = scores[dfhi][score_info]['weights']
            class_weights = [weights.setdefault(i, 0) for i in range(5)]
            score_row = [dfhi, phenotype, task, bootstrap_num, aupr] + class_weights
            scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=['dataFileHandleId', 'phenotype', 'task', 'bootstrap', 'AUPR',
                 'testClassCount_0', 'testClassCount_1',
                 'testClassCount_2', 'testClassCount_3',
                 'testClassCount_4'])
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_bootstrap_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def main():
    args = read_args()
    syn = sc.login()
    task_dfhis = fetch_task_dfhis(syn)
    task_counts = count_test_cases_for_each_task(syn)
    test_table_with_scores = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for phenotype in SUBMISSION_TABLES:
        submissions, submission_paths = fetch_submissions(syn, phenotype)
        if args.num_bootstraps:
            scores = score_all_bootstrap(
                phenotype, submission_paths, task_dfhis, task_counts,
                args.num_bootstraps, test_table_with_scores)
            write_bootstrap_scores_to_file(scores, phenotype, submissions)
        else:
            scores = score_all(phenotype, submission_paths, task_dfhis, task_counts)
            write_scores_to_file(scores, phenotype, submissions)


if __name__ == "__main__":
    main()