@philerooski
Last active June 7, 2018 18:46
Get PD Biomarker AUPR scores by task. Requires the original scoring code `LDopaScorer.py`.
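To reproduce the scores, save `LDopaScorer.py` alongside this script and make sure you can log in to Synapse (the script calls `sc.login()`). A minimal invocation sketch (the filename `score_by_task.py` and the bootstrap count are placeholders; the gist does not specify them):

    python score_by_task.py                         # point-estimate AUPR per task group
    python score_by_task.py --num-bootstraps 1000   # bootstrapped AUPR per task group

For each phenotype the script writes either `{phenotype}_scores_by_task.csv` or, when `--num-bootstraps` is given, `{phenotype}_bootstrap_scores_by_task.csv` to the working directory.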
from __future__ import division, print_function
import synapseclient as sc
import pandas as pd
import numpy as np
import argparse
import LDopaScorer
TRAINING_TABLE = 'syn10495809'
TESTING_TABLE = 'syn10701954'
TESTING_WITH_SCORES_TABLE = 'syn10518012'

SUBMISSION_TABLES = {
    'tremor': 'syn11559815',
    'dyskinesia': 'syn11559820',
    'bradykinesia': 'syn11559823'
}

TASK_GROUPINGS = {
    'drnkg': ('drnkg',),
    'fldng': ('fldng',),
    'ftn': ('ftnr1', 'ftnr2', 'ftnl1', 'ftnl2'),
    'ntblt': ('ntblt',),
    'orgpa': ('orgpa',),
    'ram': ('ramr1', 'ramr2', 'raml1', 'raml2')}

CATEGORY_WEIGHTS = {
    'tremor': [896, 381, 214, 9, 0],
    'dyskinesia': [531, 129],
    'bradykinesia': [990, 419]}

def read_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-bootstraps", type=int)
    args = parser.parse_args()
    return args

def fetch_task_dfhis(syn):
    task_dfhis = {}
    for group in TASK_GROUPINGS:
        train_dfhi, test_dfhi = fetch_group_dfhis(syn, TASK_GROUPINGS[group])
        dfhis = np.append(train_dfhi, test_dfhi)
        task_dfhis[group] = dfhis
    return task_dfhis

def get_table(syn, synId):
    q = syn.tableQuery("select * from {}".format(synId))
    df = q.asDataFrame()
    df = df.set_index("dataFileHandleId", drop=True)
    return df

def count_cases(syn, df, phenotype, group, isTestDataset):
    if isTestDataset:
        test = get_table(syn, TESTING_TABLE)[
            ["tremorScore", "dyskinesiaScore", "bradykinesiaScore"]]
        df = df.merge(test, left_index=True, right_index=True)
        df = df[df['{}Score_y'.format(phenotype)] == "Score"]
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score_x'.format(phenotype)].values
    else:
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score'.format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    return dict(zip(unique, counts))

def count_test_cases_for_each_task(syn):
    task_counts = pd.DataFrame(columns=['taskGroup', 'phenotype',
                                        'class', 'trainCases', 'testCases'])
    train = get_table(syn, TRAINING_TABLE)
    test = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for group in TASK_GROUPINGS:
        train_dfhi, test_dfhi = fetch_group_dfhis(syn, TASK_GROUPINGS[group])
        train_classes = train.loc[train_dfhi]
        test_classes = test.loc[test_dfhi]
        for phenotype in ['tremor', 'dyskinesia', 'bradykinesia']:
            train_counts = count_cases(syn, train, phenotype, group, False)
            test_counts = count_cases(syn, test, phenotype, group, True)
            classes = [0, 1, 2, 3, 4] if phenotype == "tremor" else [0, 1]
            for c in classes:
                task_counts = task_counts.append({
                    'taskGroup': group,
                    'phenotype': phenotype,
                    'class': c,
                    'testCases': test_counts.setdefault(c, 0),
                    'trainCases': train_counts.setdefault(c, 0)},
                    ignore_index=True)
    return task_counts

def fetch_submissions(syn, phenotype):
    tableId = SUBMISSION_TABLES[phenotype]
    q_submission = syn.tableQuery(
        "select submissionId, dataFileHandleId from {}".format(tableId))
    submission_paths = syn.downloadTableColumns(
        q_submission, "dataFileHandleId")
    return q_submission.asDataFrame(), submission_paths

def fetch_group_dfhis(syn, group):
    query_string = "select dataFileHandleId from {} where task in {}"
    group_string = "('" + "', '".join(group) + "')"
    train_query = query_string.format(TRAINING_TABLE, group_string)
    test_query = query_string.format(TESTING_TABLE, group_string)
    q_train = syn.tableQuery(train_query).asDataFrame()
    q_test = syn.tableQuery(test_query).asDataFrame()
    return q_train.dataFileHandleId.values, q_test.dataFileHandleId.values

def get_weighted_mean(scores, weights):
    # NA scores contribute nothing to the numerator, but their weights
    # still count toward the denominator
    numer = 0
    denom = sum(weights)
    for w, s in zip(weights, scores):
        if pd.notnull(s):
            numer += w * s
    return numer / denom

def get_weights(phenotype, taskGroup, task_counts, forColumn="testCases"):
    weights = task_counts.query(
        "taskGroup == '{}' and phenotype == '{}'".format(
            taskGroup, phenotype))
    return weights[forColumn].values

def get_bootstrap_weights(dfhis, test_table_with_scores, phenotype):
    test_table_with_scores_filtered = test_table_with_scores.loc[dfhis]
    scores = test_table_with_scores_filtered["{}Score".format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    class_counts = dict(zip(unique, counts))
    for i in range(len(CATEGORY_WEIGHTS[phenotype])):
        if i not in class_counts:
            class_counts[i] = 0
    return class_counts

def score_all(phenotype, submissions, task_dfhis, task_counts):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        auprc_by_task_group = score(
            phenotype, path, task_dfhis, task_counts)
        results[dfhi] = auprc_by_task_group
    return results

def score_all_bootstrap(phenotype, submissions, task_dfhis, task_counts,
                        num_bootstraps, test_table_with_scores):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        print(dfhi)
        score_info = score_bootstrap(
            phenotype, path, task_dfhis, task_counts,
            num_bootstraps, test_table_with_scores)
        results[dfhi] = score_info
    return results

def score_bootstrap(phenotype, path, task_dfhis, task_counts,
                    num_bootstraps, test_table_with_scores):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(path, phenotype)
    score_info = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
            for i in range(num_bootstraps):
                # resample the test recordings for this task with replacement
                # and tally the class counts of the resampled set
                weights_test = {0: None}
                while 1 not in weights_test:
                    test_relevant_dfhis_bootstrap = np.random.choice(
                        np.array([j for j in test_relevant_dfhis]),
                        size=len(test_relevant_dfhis), replace=True)
                    weights_test = get_bootstrap_weights(
                        test_relevant_dfhis_bootstrap, test_table_with_scores, phenotype)
                test_X_task = test_X.loc[test_relevant_dfhis_bootstrap]
                test_y_task = test_y.loc[test_relevant_dfhis_bootstrap]
                results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                    test_X_task, test_y_task, classes, ensemble)
                if phenotype == 'tremor':
                    weighted_aupr = get_weighted_mean(
                        results,
                        [weights_test.setdefault(k, 0) for k in range(5)])
                else:
                    weighted_aupr = results[0]
                score_info[(task, i)] = {
                    'aupr': weighted_aupr, 'weights': weights_test}
    return score_info

def score(phenotype, path, task_dfhis, task_counts):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(path, phenotype)
    task_scores = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        test_X_task = test_X.loc[test_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        test_y_task = test_y.loc[test_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        weights_test = get_weights(phenotype, task, task_counts, "testCases")
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task) and len(test_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                test_X_task, test_y_task, classes, ensemble)
            print("results", results)
            if phenotype == 'tremor':
                weighted_aupr = get_weighted_mean(results, weights_test)
            else:
                weighted_aupr = results[0]
            task_scores[task] = weighted_aupr
    return task_scores

def write_scores_to_file(scores, phenotype, submissions):
    scores_as_list = []
    tasks = None
    for dfhi in scores:
        tasks = list(scores[dfhi].keys())
        break
    for dfhi in scores:
        score_row = np.append([dfhi], [scores[dfhi][t] for t in tasks])
        scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=np.append(['dataFileHandleId'], tasks))
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def write_bootstrap_scores_to_file(scores, phenotype, submissions):
    scores_as_list = []
    tasks = None
    for dfhi in scores:
        tasks = list(np.unique([i[0] for i in scores[dfhi].keys()]))
        break
    for dfhi in scores:
        for score_info in scores[dfhi]:
            task, bootstrap_num = score_info
            aupr, weights = scores[dfhi][score_info]['aupr'], \
                scores[dfhi][score_info]['weights']
            class_weights = [weights.setdefault(i, 0) for i in range(5)]
            score_row = [dfhi, phenotype, task, bootstrap_num, aupr] + class_weights
            scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=['dataFileHandleId', 'phenotype', 'task', 'bootstrap', 'AUPR',
                 'testClassCount_0', 'testClassCount_1',
                 'testClassCount_2', 'testClassCount_3',
                 'testClassCount_4'])
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_bootstrap_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def main():
    args = read_args()
    syn = sc.login()
    task_dfhis = fetch_task_dfhis(syn)
    task_counts = count_test_cases_for_each_task(syn)
    test_table_with_scores = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for phenotype in SUBMISSION_TABLES:
        submissions, submission_paths = fetch_submissions(syn, phenotype)
        if args.num_bootstraps:
            scores = score_all_bootstrap(
                phenotype, submission_paths, task_dfhis, task_counts,
                args.num_bootstraps, test_table_with_scores)
            write_bootstrap_scores_to_file(scores, phenotype, submissions)
        else:
            scores = score_all(phenotype, submission_paths, task_dfhis, task_counts)
            write_scores_to_file(scores, phenotype, submissions)


if __name__ == "__main__":
    main()
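
For reference, the non-bootstrap CSV produced by write_scores_to_file has one row per submission and one AUPR column per task group; a hypothetical header line (column order follows dict iteration order, and no score values are shown here):

    submissionId,drnkg,fldng,ftn,ntblt,orgpa,ram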