liangyy/compute_auc.R

## compute_auc.R
compute_auc = function(curve) {
  # f = curve %>% filter(nTP > 0) %>% group_by(nTP) %>% summarize(y = mean(tpr), x = fpr[1] - fpr[2])
  o1 = rbind(c(1, 1), curve %>% select(tpr, fpr))
  o2 = rbind(curve %>% select(tpr, fpr), c(0, 0))
  # message('e')
  y = data.frame(x1 = o1$fpr, x2 = o2$fpr, y1 = o1$tpr, y2 = o2$tpr)
  e = y %>% mutate(s = (x1 - x2) * (y1 + y2) / 2)
  data.frame(roc_auc = sum(e$s))
}

## rlib_roc_pr.R
compute_power_and_fdr = function(discovered_genes, true_genes) {
  true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
  nTP = length(true_positive_genes)
  nT = length(true_genes)
  nP = length(discovered_genes)
  if(nP == 0) {
    nP = 1
  }
  data.frame(power = nTP / nT, fdr = 1 - nTP / nP, nTP = nTP, nP = nP, nT = nT)
}

compute_tpr_and_fpr = function(tested_genes, discovered_genes, true_genes) {
  true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
  nTP = length(true_positive_genes)
  nT = length(true_genes)

  all_genes = union(tested_genes, true_genes)
  not_true_genes = all_genes[!all_genes %in% true_genes]
  nNT = length(not_true_genes)
  false_positive_genes = not_true_genes[ not_true_genes %in% discovered_genes ]
  nFP = length(false_positive_genes)
  if(nNT == 0) {
    nNT = 1
  }
  data.frame(tpr = nTP / nT, fpr = nFP / nNT, nTP = nTP, nT = nT, nFP = nFP, nNT = nNT)
}


is_sig = function(score, threshold, method, tie = T) {
  if(method == 'gt') {
    if(tie == T) {
      return(score >= threshold)
    } else {
      return(score > threshold)
    }
  } else if(method == 'lt') {
    if(tie == T) {
      return(score <= threshold)
    } else {
      return(score < threshold)
    }
  } else {
    return(NA)
  }
}

gen_fdr_power_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
  if(is.null(cutoff)) {
    true_cutoffs = sort(unique(score[gene %in% true_genes]))
  } else {
    true_cutoffs = cutoff
  }
  if(method == 'lt') {
    true_cutoffs = sort(true_cutoffs, decreasing = T)
  }
  df_curve = data.frame()
  for(i in true_cutoffs) {
    positive_genes = gene[is_sig(score, i, method)]
    sub = compute_power_and_fdr(positive_genes, true_genes)
    sub$cutoff = i
    sub$include_tie = T
    df_curve = rbind(df_curve, sub)
    positive_genes = gene[is_sig(score, i, method, tie = F)]
    sub = compute_power_and_fdr(positive_genes, true_genes)
    sub$cutoff = i
    sub$include_tie = F
    df_curve = rbind(df_curve, sub)
  }
  df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power)
  df_curve$precision[df_curve$nP == 0] = 1
  df_curve
}


gen_roc_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
  if(is.null(cutoff)) {
    true_cutoffs = sort(unique(score[gene %in% true_genes]))
  } else {
    true_cutoffs = cutoff
  }
  if(method == 'lt') {
    true_cutoffs = sort(true_cutoffs, decreasing = T)
  }
  df_curve = data.frame()
  for(i in true_cutoffs) {
    positive_genes = gene[is_sig(score, i, method)]
    sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
    sub$cutoff = i
    sub$include_tie = T
    df_curve = rbind(df_curve, sub)
    positive_genes = gene[is_sig(score, i, method, tie = F)]
    sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
    sub$cutoff = i
    sub$include_tie = F
    df_curve = rbind(df_curve, sub)
  }
  # sub = compute_tpr_and_fpr(gene, union(gene, true_genes), true_genes)
  # sub$cutoff = NA
  # df_curve = rbind(df_curve, sub)
  df_curve
}


## test_roc_pr.R
library(dplyr)
library(ggplot2)
library(ROCR)
data(ROCR.simple)

source('rlib_roc_pr.R')

# add names for each sample
ROCR.simple$gene_name = paste('gene', 1 : length(ROCR.simple$predictions))

# extract gene names for real signal
true_genes = ROCR.simple$gene_name[which(ROCR.simple$labels == 1)]

# PR curve
## gen_fdr_power_curve
## true_genes: a vector of gene name of the real signal
## gene: a vector of gene name for all candidate genes
## score: a vector of numeric score for all candidate genes
## method: 'gt' if the higher score means more likely to be signal; otherwise use 'lt'
## cutoff: a vector of numeric score cutoffs. If it is NULL, the function will return the whole curve. Otherwise, it will return FDR/Precision and Power/Recall at the cutoffs

df = gen_fdr_power_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
df[-nrow(df), ] %>% ggplot() + geom_path(aes(x = recall, y = precision))

pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
perf1 <- performance(pred, "prec", "rec")
plot(perf1)


# ROC curve
## gen_roc_curve
## same input

df = gen_roc_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
df %>% ggplot() + geom_path(aes(x = fpr, y = tpr))

pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
perf1 <- performance(pred, "tpr", "fpr")
plot(perf1)
	compute_auc = function(curve) {
	# f = curve %>% filter(nTP > 0) %>% group_by(nTP) %>% summarize(y = mean(tpr), x = fpr[1] - fpr[2])
	o1 = rbind(c(1, 1), curve %>% select(tpr, fpr))
	o2 = rbind(curve %>% select(tpr, fpr), c(0, 0))
	# message('e')
	y = data.frame(x1 = o1$fpr, x2 = o2$fpr, y1 = o1$tpr, y2 = o2$tpr)
	e = y %>% mutate(s = (x1 - x2) * (y1 + y2) / 2)
	data.frame(roc_auc = sum(e$s))
	}
	compute_power_and_fdr = function(discovered_genes, true_genes) {
	true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
	nTP = length(true_positive_genes)
	nT = length(true_genes)
	nP = length(discovered_genes)
	if(nP == 0) {
	nP = 1
	}
	data.frame(power = nTP / nT, fdr = 1 - nTP / nP, nTP = nTP, nP = nP, nT = nT)
	}

	compute_tpr_and_fpr = function(tested_genes, discovered_genes, true_genes) {
	true_positive_genes = true_genes[ true_genes %in% discovered_genes ]
	nTP = length(true_positive_genes)
	nT = length(true_genes)

	all_genes = union(tested_genes, true_genes)
	not_true_genes = all_genes[!all_genes %in% true_genes]
	nNT = length(not_true_genes)
	false_positive_genes = not_true_genes[ not_true_genes %in% discovered_genes ]
	nFP = length(false_positive_genes)
	if(nNT == 0) {
	nNT = 1
	}
	data.frame(tpr = nTP / nT, fpr = nFP / nNT, nTP = nTP, nT = nT, nFP = nFP, nNT = nNT)
	}


	is_sig = function(score, threshold, method, tie = T) {
	if(method == 'gt') {
	if(tie == T) {
	return(score >= threshold)
	} else {
	return(score > threshold)
	}
	} else if(method == 'lt') {
	if(tie == T) {
	return(score <= threshold)
	} else {
	return(score < threshold)
	}
	} else {
	return(NA)
	}
	}

	gen_fdr_power_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
	if(is.null(cutoff)) {
	true_cutoffs = sort(unique(score[gene %in% true_genes]))
	} else {
	true_cutoffs = cutoff
	}
	if(method == 'lt') {
	true_cutoffs = sort(true_cutoffs, decreasing = T)
	}
	df_curve = data.frame()
	for(i in true_cutoffs) {
	positive_genes = gene[is_sig(score, i, method)]
	sub = compute_power_and_fdr(positive_genes, true_genes)
	sub$cutoff = i
	sub$include_tie = T
	df_curve = rbind(df_curve, sub)
	positive_genes = gene[is_sig(score, i, method, tie = F)]
	sub = compute_power_and_fdr(positive_genes, true_genes)
	sub$cutoff = i
	sub$include_tie = F
	df_curve = rbind(df_curve, sub)
	}
	df_curve = df_curve %>% mutate(precision = 1 - fdr, recall = power)
	df_curve$precision[df_curve$nP == 0] = 1
	df_curve
	}


	gen_roc_curve = function(true_genes, gene, score, method = 'gt', cutoff = NULL) {
	if(is.null(cutoff)) {
	true_cutoffs = sort(unique(score[gene %in% true_genes]))
	} else {
	true_cutoffs = cutoff
	}
	if(method == 'lt') {
	true_cutoffs = sort(true_cutoffs, decreasing = T)
	}
	df_curve = data.frame()
	for(i in true_cutoffs) {
	positive_genes = gene[is_sig(score, i, method)]
	sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
	sub$cutoff = i
	sub$include_tie = T
	df_curve = rbind(df_curve, sub)
	positive_genes = gene[is_sig(score, i, method, tie = F)]
	sub = compute_tpr_and_fpr(gene, positive_genes, true_genes)
	sub$cutoff = i
	sub$include_tie = F
	df_curve = rbind(df_curve, sub)
	}
	# sub = compute_tpr_and_fpr(gene, union(gene, true_genes), true_genes)
	# sub$cutoff = NA
	# df_curve = rbind(df_curve, sub)
	df_curve
	}
	library(dplyr)
	library(ggplot2)
	library(ROCR)
	data(ROCR.simple)

	source('rlib_roc_pr.R')

	# add names for each sample
	ROCR.simple$gene_name = paste('gene', 1 : length(ROCR.simple$predictions))

	# extract gene names for real signal
	true_genes = ROCR.simple$gene_name[which(ROCR.simple$labels == 1)]

	# PR curve
	## gen_fdr_power_curve
	## true_genes: a vector of gene name of the real signal
	## gene: a vector of gene name for all candidate genes
	## score: a vector of numeric score for all candidate genes
	## method: 'gt' if the higher score means more likely to be signal; otherwise use 'lt'
	## cutoff: a vector of numeric score cutoffs. If it is NULL, the function will return the whole curve. Otherwise, it will return FDR/Precision and Power/Recall at the cutoffs

	df = gen_fdr_power_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
	df[-nrow(df), ] %>% ggplot() + geom_path(aes(x = recall, y = precision))

	pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
	perf1 <- performance(pred, "prec", "rec")
	plot(perf1)


	# ROC curve
	## gen_roc_curve
	## same input

	df = gen_roc_curve(true_genes = true_genes, gene = ROCR.simple$gene_name, score = ROCR.simple$predictions, method = 'gt', cutoff = NULL)
	df %>% ggplot() + geom_path(aes(x = fpr, y = tpr))

	pred <- prediction( ROCR.simple$predictions, ROCR.simple$labels)
	perf1 <- performance(pred, "tpr", "fpr")
	plot(perf1)