Last active
November 19, 2019 17:34
-
-
Save liangyy/6d4314dbc238236731e134abef2484f4 to your computer and use it in GitHub Desktop.
rlib on generating ROC/PR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
compute_auc = function(curve) { | |
# f = curve %>% filter(nTP > 0) %>% group_by(nTP) %>% summarize(y = mean(tpr), x = fpr[1] - fpr[2]) | |
o1 = rbind(c(1, 1), curve %>% select(tpr, fpr)) | |
o2 = rbind(curve %>% select(tpr, fpr), c(0, 0)) | |
# message('e') | |
y = data.frame(x1 = o1$fpr, x2 = o2$fpr, y1 = o1$tpr, y2 = o2$tpr) | |
e = y %>% mutate(s = (x1 - x2) * (y1 + y2) / 2) | |
data.frame(roc_auc = sum(e$s)) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
compute_power_and_fdr = function(discovered_genes, true_genes) { | |
true_positive_genes = true_genes[ true_genes %in% discovered_genes ] | |
nTP = length(true_positive_genes) | |
nT = length(true_genes) | |
nP = length(discovered_genes) | |
if(nP == 0) { | |
nP = 1 | |
} | |
data.frame(power = nTP / nT, fdr = 1 - nTP / nP, nTP = nTP, nP = nP, nT = nT) | |
} | |
compute_tpr_and_fpr = function(tested_genes, discovered_genes, true_genes) { | |
true_positive_genes = true_genes[ true_genes %in% discovered_genes ] | |
nTP = length(true_positive_genes) | |
nT = length(true_genes) | |
all_genes = union(tested_genes, true_genes) | |
not_true_genes = all_genes[!all_genes %in% true_genes] | |
nNT = length(not_true_genes) | |
false_positive_genes = not_true_genes[ not_true_genes %in% discovered_genes ] | |
nFP = length(false_positive_genes) | |
if(nNT == 0) { | |
nNT = 1 | |
} | |
data.frame(tpr = nTP / nT, fpr = nFP / nNT, nTP = nTP, nT = nT, nFP = nFP, nNT = nNT) | |
} | |
is_sig = function(score, threshold, method, tie = T) { | |
if(method == 'gt') { | |
if(tie == T) { | |
return(score >= threshold) | |
} else { | |
return(score > threshold) | |
} | |
} else if(method == 'lt') { | |
if(tie == T) { | |
return(score <= threshold) | |
} else { | |
return(score < threshold) | |
} | |
} else { | |
return(NA) | |
} | |
} | |
gen_fdr_power_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) { | |
if(is.null(cutoff)) { | |
true_cutoffs = sort(unique(score[gene %in% true_genes])) | |
} else { | |
true_cutoffs = cutoff | |
} | |
if(method == 'lt') { | |
true_cutoffs = sort(true_cutoffs, decreasing = T) | |
} | |
df_curve = data.frame() | |
for(i in true_cutoffs) { | |
positive_genes = gene[is_sig(score, i, method)] | |
sub = compute_power_and_fdr(positive_genes, true_genes) | |
sub$cutoff = i | |
sub$include_tie = T | |
df_curve = rbind(df_curve, sub) | |
positive_genes = gene[is_sig(score, i, method, tie = F)] | |
sub = compute_power_and_fdr(positive_genes, true_genes) | |
sub$cutoff = i | |
sub$include_tie = F | |
df_curve = rbind(df_curve, sub) | |
} | |
df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power) | |
df_curve$precision[df_curve$nP == 0] = 1 | |
df_curve | |
} | |
gen_roc_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) { | |
if(is.null(cutoff)) { | |
true_cutoffs = sort(unique(score[gene %in% true_genes])) | |
} else { | |
true_cutoffs = cutoff | |
} | |
if(method == 'lt') { | |
true_cutoffs = sort(true_cutoffs, decreasing = T) | |
} | |
df_curve = data.frame() | |
for(i in true_cutoffs) { | |
positive_genes = gene[is_sig(score, i, method)] | |
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes) | |
sub$cutoff = i | |
sub$include_tie = T | |
df_curve = rbind(df_curve, sub) | |
positive_genes = gene[is_sig(score, i, method, tie = F)] | |
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes) | |
sub$cutoff = i | |
sub$include_tie = F | |
df_curve = rbind(df_curve, sub) | |
} | |
# sub = compute_tpr_and_fpr(gene, union(gene, true_genes), true_genes) | |
# sub$cutoff = NA | |
# df_curve = rbind(df_curve, sub) | |
df_curve | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(ggplot2) | |
library(ROCR) | |
data(ROCR.simple) | |
source('rlib_roc_pr.R') | |
# add names for each sample | |
ROCR.simple$gene_name = paste('gene', 1 : length(ROCR.simple$predictions)) | |
# extract gene names for real signal | |
true_genes = ROCR.simple$gene_name[which(ROCR.simple$labels == 1)] | |
# PR curve | |
## gen_fdr_power_curve | |
## true_genes: a vector of gene name of the real signal | |
## gene: a vector of gene name for all candidate genes | |
## score: a vector of numeric score for all candidate genes | |
## method: 'gt' if the higher score means more likely to be signal; otherwise use 'lt' | |
## cutoff: a vector of numeric score cutoffs. If it is NULL, the function will return the whole curve. Otherwise, it will return FDR/Precision and Power/Recall at the cutoffs | |
df = gen_fdr_power_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL) | |
df[-nrow(df), ] %>% ggplot() + geom_path(aes(x = recall, y = precision)) | |
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels) | |
perf1 <- performance(pred, "prec", "rec") | |
plot(perf1) | |
# ROC curve | |
## gen_roc_curve | |
## same input | |
df = gen_roc_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL) | |
df %>% ggplot() + geom_path(aes(x = fpr, y = tpr)) | |
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels) | |
perf1 <- performance(pred, "tpr", "fpr") | |
plot(perf1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment