Skip to content

Instantly share code, notes, and snippets.

@liangyy
Last active November 19, 2019 17:34
Show Gist options
  • Save liangyy/6d4314dbc238236731e134abef2484f4 to your computer and use it in GitHub Desktop.
Save liangyy/6d4314dbc238236731e134abef2484f4 to your computer and use it in GitHub Desktop.
rlib on generating ROC/PR
compute_auc = function(curve) {
# f = curve %>% filter(nTP > 0) %>% group_by(nTP) %>% summarize(y = mean(tpr), x = fpr[1] - fpr[2])
o1 = rbind(c(1, 1), curve %>% select(tpr, fpr))
o2 = rbind(curve %>% select(tpr, fpr), c(0, 0))
# message('e')
y = data.frame(x1 = o1$fpr, x2 = o2$fpr, y1 = o1$tpr, y2 = o2$tpr)
e = y %>% mutate(s = (x1 - x2) * (y1 + y2) / 2)
data.frame(roc_auc = sum(e$s))
}
compute_power_and_fdr = function(discovered_genes, true_genes) {
true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
nTP = length(true_positive_genes)
nT = length(true_genes)
nP = length(discovered_genes)
if(nP == 0) {
nP = 1
}
data.frame(power = nTP / nT, fdr = 1 - nTP / nP, nTP = nTP, nP = nP, nT = nT)
}
compute_tpr_and_fpr = function(tested_genes, discovered_genes, true_genes) {
true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
nTP = length(true_positive_genes)
nT = length(true_genes)
all_genes = union(tested_genes, true_genes)
not_true_genes = all_genes[!all_genes %in% true_genes]
nNT = length(not_true_genes)
false_positive_genes = not_true_genes[ not_true_genes %in% discovered_genes ]
nFP = length(false_positive_genes)
if(nNT == 0) {
nNT = 1
}
data.frame(tpr = nTP / nT, fpr = nFP / nNT, nTP = nTP, nT = nT, nFP = nFP, nNT = nNT)
}
is_sig = function(score, threshold, method, tie = T) {
if(method == 'gt') {
if(tie == T) {
return(score >= threshold)
} else {
return(score > threshold)
}
} else if(method == 'lt') {
if(tie == T) {
return(score <= threshold)
} else {
return(score < threshold)
}
} else {
return(NA)
}
}
gen_fdr_power_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
if(is.null(cutoff)) {
true_cutoffs = sort(unique(score[gene %in% true_genes]))
} else {
true_cutoffs = cutoff
}
if(method == 'lt') {
true_cutoffs = sort(true_cutoffs, decreasing = T)
}
df_curve = data.frame()
for(i in true_cutoffs) {
positive_genes = gene[is_sig(score, i, method)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = T
df_curve = rbind(df_curve, sub)
positive_genes = gene[is_sig(score, i, method, tie = F)]
sub = compute_power_and_fdr(positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = F
df_curve = rbind(df_curve, sub)
}
df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power)
df_curve$precision[df_curve$nP == 0] = 1
df_curve
}
gen_roc_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
if(is.null(cutoff)) {
true_cutoffs = sort(unique(score[gene %in% true_genes]))
} else {
true_cutoffs = cutoff
}
if(method == 'lt') {
true_cutoffs = sort(true_cutoffs, decreasing = T)
}
df_curve = data.frame()
for(i in true_cutoffs) {
positive_genes = gene[is_sig(score, i, method)]
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = T
df_curve = rbind(df_curve, sub)
positive_genes = gene[is_sig(score, i, method, tie = F)]
sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
sub$cutoff = i
sub$include_tie = F
df_curve = rbind(df_curve, sub)
}
# sub = compute_tpr_and_fpr(gene, union(gene, true_genes), true_genes)
# sub$cutoff = NA
# df_curve = rbind(df_curve, sub)
df_curve
}
library(dplyr)
library(ggplot2)
library(ROCR)
data(ROCR.simple)
source('rlib_roc_pr.R')
# add names for each sample
ROCR.simple$gene_name = paste('gene', 1 : length(ROCR.simple$predictions))
# extract gene names for real signal
true_genes = ROCR.simple$gene_name[which(ROCR.simple$labels == 1)]
# PR curve
## gen_fdr_power_curve
## true_genes: a vector of gene name of the real signal
## gene: a vector of gene name for all candidate genes
## score: a vector of numeric score for all candidate genes
## method: 'gt' if the higher score means more likely to be signal; otherwise use 'lt'
## cutoff: a vector of numeric score cutoffs. If it is NULL, the function will return the whole curve. Otherwise, it will return FDR/Precision and Power/Recall at the cutoffs
df = gen_fdr_power_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
df[-nrow(df), ] %>% ggplot() + geom_path(aes(x = recall, y = precision))
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
perf1 <- performance(pred, "prec", "rec")
plot(perf1)
# ROC curve
## gen_roc_curve
## same input
df = gen_roc_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
df %>% ggplot() + geom_path(aes(x = fpr, y = tpr))
pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
perf1 <- performance(pred, "tpr", "fpr")
plot(perf1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment