Get PD Biomarker AUPR scores by task. Requires the original scoring code `LDopaScorer.py`
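Typical usage, assuming the script is saved as e.g. `score_by_task.py` next to `LDopaScorer.py` and Synapse credentials are cached for `synapseclient.login()`: run `python score_by_task.py` for point estimates, or `python score_by_task.py --num-bootstraps 1000` to bootstrap the test set. Results are written to `{phenotype}_scores_by_task.csv` (or `{phenotype}_bootstrap_scores_by_task.csv`) for each of tremor, dyskinesia, and bradykinesia.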
from __future__ import division, print_function

import argparse

import numpy as np
import pandas as pd
import synapseclient as sc

import LDopaScorer
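
# Score each challenge submission on the held-out test set, broken down by
# task group, optionally with bootstrap resampling of the test set to get a
# distribution of weighted AUPR values per task group.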

TRAINING_TABLE = 'syn10495809'
TESTING_TABLE = 'syn10701954'
TESTING_WITH_SCORES_TABLE = 'syn10518012'
SUBMISSION_TABLES = {
    'tremor': 'syn11559815',
    'dyskinesia': 'syn11559820',
    'bradykinesia': 'syn11559823'
}
TASK_GROUPINGS = {
    'drnkg': ('drnkg',),
    'fldng': ('fldng',),
    'ftn': ('ftnr1', 'ftnr2', 'ftnl1', 'ftnl2'),
    'ntblt': ('ntblt',),
    'orgpa': ('orgpa',),
    'ram': ('ramr1', 'ramr2', 'raml1', 'raml2')}
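# Only the lengths of these lists are used below (five severity classes for
# tremor, two for dyskinesia and bradykinesia); the values themselves look
# like per-class case counts but are never read by this script.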
CATEGORY_WEIGHTS = {
    'tremor': [896, 381, 214, 9, 0],
    'dyskinesia': [531, 129],
    'bradykinesia': [990, 419]}

def read_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-bootstraps", type=int)
    args = parser.parse_args()
    return args

def fetch_task_dfhis(syn):
    task_dfhis = {}
    for group in TASK_GROUPINGS:
        train_dfhi, test_dfhi = fetch_group_dfhis(syn, TASK_GROUPINGS[group])
        dfhis = np.append(train_dfhi, test_dfhi)
        task_dfhis[group] = dfhis
    return task_dfhis

def get_table(syn, synId):
    q = syn.tableQuery("select * from {}".format(synId))
    df = q.asDataFrame()
    df = df.set_index("dataFileHandleId", drop=True)
    return df

def count_cases(syn, df, phenotype, group, isTestDataset):
    if isTestDataset:
        # The public test table appears to flag scoreable recordings with the
        # literal string "Score"; after merging on dataFileHandleId, _x holds
        # the actual scores and _y holds that flag.
        test = get_table(syn, TESTING_TABLE)[
            ["tremorScore", "dyskinesiaScore", "bradykinesiaScore"]]
        df = df.merge(test, left_index=True, right_index=True)
        df = df[df['{}Score_y'.format(phenotype)] == "Score"]
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score_x'.format(phenotype)].values
    else:
        scores_for_task = df.query("task in {}".format(TASK_GROUPINGS[group]))
        scores = scores_for_task['{}Score'.format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    return dict(zip(unique, counts))

def count_test_cases_for_each_task(syn):
    rows = []
    train = get_table(syn, TRAINING_TABLE)
    test = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for group in TASK_GROUPINGS:
        for phenotype in ['tremor', 'dyskinesia', 'bradykinesia']:
            train_counts = count_cases(syn, train, phenotype, group, False)
            test_counts = count_cases(syn, test, phenotype, group, True)
            classes = [0, 1, 2, 3, 4] if phenotype == "tremor" else [0, 1]
            for c in classes:
                rows.append({
                    'taskGroup': group,
                    'phenotype': phenotype,
                    'class': c,
                    'testCases': test_counts.get(c, 0),
                    'trainCases': train_counts.get(c, 0)})
    # Building from a list of dicts avoids repeated DataFrame.append calls.
    task_counts = pd.DataFrame(rows, columns=[
        'taskGroup', 'phenotype', 'class', 'trainCases', 'testCases'])
    return task_counts

def fetch_submissions(syn, phenotype):
    tableId = SUBMISSION_TABLES[phenotype]
    q_submission = syn.tableQuery(
        "select submissionId, dataFileHandleId from {}".format(tableId))
    submission_paths = syn.downloadTableColumns(
        q_submission, "dataFileHandleId")
    return q_submission.asDataFrame(), submission_paths

def fetch_group_dfhis(syn, group):
    query_string = "select dataFileHandleId from {} where task in {}"
    group_string = "('" + "', '".join(group) + "')"
    train_query = query_string.format(TRAINING_TABLE, group_string)
    test_query = query_string.format(TESTING_TABLE, group_string)
    q_train = syn.tableQuery(train_query).asDataFrame()
    q_test = syn.tableQuery(test_query).asDataFrame()
    return q_train.dataFileHandleId.values, q_test.dataFileHandleId.values

def get_weighted_mean(scores, weights):
    # NaN scores contribute nothing to the numerator, but their weights stay
    # in the denominator, so missing per-class AUPRs pull the mean toward 0.
    numer = 0
    denom = sum(weights)
    for w, s in zip(weights, scores):
        if pd.notnull(s):
            numer += w * s
    return numer / denom

def get_weights(phenotype, taskGroup, task_counts, forColumn="testCases"):
    weights = task_counts.query(
        "taskGroup == '{}' and phenotype == '{}'".format(
            taskGroup, phenotype))
    return weights[forColumn].values

def get_bootstrap_weights(dfhis, test_table_with_scores, phenotype):
    # Count how often each severity class appears in the (possibly
    # duplicated) bootstrap sample, filling absent classes with 0.
    test_table_with_scores_filtered = test_table_with_scores.loc[dfhis]
    scores = test_table_with_scores_filtered[
        "{}Score".format(phenotype)].values
    scores_without_na = scores[pd.notnull(scores)]
    unique, counts = np.unique(scores_without_na, return_counts=True)
    class_counts = dict(zip(unique, counts))
    for i in range(len(CATEGORY_WEIGHTS[phenotype])):
        if i not in class_counts:
            class_counts[i] = 0
    return class_counts

def score_all(phenotype, submissions, task_dfhis, task_counts):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        auprc_by_task_group = score(phenotype, path, task_dfhis, task_counts)
        results[dfhi] = auprc_by_task_group
    return results

def score_all_bootstrap(phenotype, submissions, task_dfhis, task_counts,
                        num_bootstraps, test_table_with_scores):
    results = {}
    for dfhi in submissions:
        dfhi, path = str(dfhi), str(submissions[dfhi])
        print(dfhi)
        score_info = score_bootstrap(
            phenotype, path, task_dfhis, task_counts,
            num_bootstraps, test_table_with_scores)
        results[dfhi] = score_info
    return results

def score_bootstrap(phenotype, path, task_dfhis, task_counts,
                    num_bootstraps, test_table_with_scores):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(
        path, phenotype)
    score_info = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        # Only score classes that actually occur in the training data.
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
            for i in range(num_bootstraps):
                # Resample (with replacement) until the bootstrap sample
                # contains at least one positive (class 1) case.
                weights_test = {}
                while not weights_test.get(1):
                    test_relevant_dfhis_bootstrap = np.random.choice(
                        np.asarray(test_relevant_dfhis),
                        size=len(test_relevant_dfhis), replace=True)
                    weights_test = get_bootstrap_weights(
                        test_relevant_dfhis_bootstrap,
                        test_table_with_scores, phenotype)
                test_X_task = test_X.loc[test_relevant_dfhis_bootstrap]
                test_y_task = test_y.loc[test_relevant_dfhis_bootstrap]
                results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                    test_X_task, test_y_task, classes, ensemble)
                if phenotype == 'tremor':
                    # `results` aligns with `classes`; expand to all five
                    # severity classes (NaN where untrained) so the pairing
                    # with the weights doesn't shift when a class is skipped.
                    results_by_class = dict(zip(classes, results))
                    weighted_aupr = get_weighted_mean(
                        [results_by_class.get(k, np.nan) for k in range(5)],
                        [weights_test.get(k, 0) for k in range(5)])
                else:
                    weighted_aupr = results[0]
                score_info[(task, i)] = {
                    'aupr': weighted_aupr, 'weights': weights_test}
    return score_info

def score(phenotype, path, task_dfhis, task_counts):
    train_X, test_X, train_y, test_y, index = LDopaScorer.read_data(
        path, phenotype)
    task_scores = {}
    for task in task_dfhis:
        relevant_dfhis = task_dfhis[task]
        train_relevant_dfhis = train_X.index.intersection(relevant_dfhis)
        test_relevant_dfhis = test_X.index.intersection(relevant_dfhis)
        train_X_task = train_X.loc[train_relevant_dfhis]
        test_X_task = test_X.loc[test_relevant_dfhis]
        train_y_task = train_y.loc[train_relevant_dfhis]
        test_y_task = test_y.loc[test_relevant_dfhis]
        weights_train = get_weights(phenotype, task, task_counts, "trainCases")
        weights_test = get_weights(phenotype, task, task_counts, "testCases")
        # Only score classes that actually occur in the training data.
        classes = [i for i in range(len(weights_train))
                   if weights_train[i] > 0]
        if len(train_X_task) and len(test_X_task):
            ensemble = LDopaScorer.train_ensemble(
                train_X_task.values, train_y_task.values)
            results, y_score, y_true = LDopaScorer.getNonLinearInterpAupr(
                test_X_task, test_y_task, classes, ensemble)
            print("results", results)
            if phenotype == 'tremor':
                # `results` aligns with `classes`; expand to all five severity
                # classes (NaN where untrained) so the pairing with
                # weights_test doesn't shift when a class is skipped.
                results_by_class = dict(zip(classes, results))
                weighted_aupr = get_weighted_mean(
                    [results_by_class.get(k, np.nan) for k in range(5)],
                    weights_test)
            else:
                weighted_aupr = results[0]
            task_scores[task] = weighted_aupr
    return task_scores

def write_scores_to_file(scores, phenotype, submissions):
    # Every submission is scored on the same task groups; grab the column
    # order from an arbitrary entry.
    tasks = list(next(iter(scores.values())).keys())
    scores_as_list = []
    for dfhi in scores:
        score_row = np.append([dfhi], [scores[dfhi][t] for t in tasks])
        scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=np.append(['dataFileHandleId'], tasks))
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def write_bootstrap_scores_to_file(scores, phenotype, submissions):
    scores_as_list = []
    for dfhi in scores:
        for score_info in scores[dfhi]:
            task, bootstrap_num = score_info
            aupr = scores[dfhi][score_info]['aupr']
            weights = scores[dfhi][score_info]['weights']
            class_weights = [weights.get(i, 0) for i in range(5)]
            score_row = ([dfhi, phenotype, task, bootstrap_num, aupr]
                         + class_weights)
            scores_as_list.append(score_row)
    scores = pd.DataFrame(
        scores_as_list,
        columns=['dataFileHandleId', 'phenotype', 'task', 'bootstrap', 'AUPR',
                 'testClassCount_0', 'testClassCount_1',
                 'testClassCount_2', 'testClassCount_3',
                 'testClassCount_4'])
    # two hours later, dtypes must match...
    submissions = submissions.dropna()
    scores.dataFileHandleId = scores.dataFileHandleId.astype(int)
    submissions.dataFileHandleId = submissions.dataFileHandleId.astype(int)
    submissions.submissionId = submissions.submissionId.astype(int)
    submissions = submissions.merge(scores, on="dataFileHandleId")
    submissions = submissions.drop("dataFileHandleId", axis=1)
    submissions.to_csv("{}_bootstrap_scores_by_task.csv".format(phenotype),
                       index=False, header=True)
    return scores

def main():
    args = read_args()
    syn = sc.login()
    task_dfhis = fetch_task_dfhis(syn)
    task_counts = count_test_cases_for_each_task(syn)
    test_table_with_scores = get_table(syn, TESTING_WITH_SCORES_TABLE)
    for phenotype in SUBMISSION_TABLES:
        submissions, submission_paths = fetch_submissions(syn, phenotype)
        if args.num_bootstraps:
            scores = score_all_bootstrap(
                phenotype, submission_paths, task_dfhis, task_counts,
                args.num_bootstraps, test_table_with_scores)
            write_bootstrap_scores_to_file(scores, phenotype, submissions)
        else:
            scores = score_all(
                phenotype, submission_paths, task_dfhis, task_counts)
            write_scores_to_file(scores, phenotype, submissions)


if __name__ == "__main__":
    main()