hakyim/eFDR.R

## eFDR.R
## May 7, 2012
## Hae Kyung Im
## haky@uchicago.edu
##


## Please cite:
## Eric R Gamazon, R. Stephanie Huang, Eileen Dolan, Nancy Cox, and Hae Kyung Im, (2012)
## Integrative Genomics: Quantifying significance of phenotype-genotype
## relationships from multiple sources of high-throughput data
## Frontiers of Genetics - under review

## eFDR
## compute empirical FDR based on observed vector of p values
## and null p values computed using simulated phenotype

## usage
## eFDR(pobs,pnulllist,nmax=20)

## arguments:

## pobs: vector of observed p values
## names(pobs) should be the names of the SNPs
## if no names, they will be labelled SNP1...SNPn

## pnulllist: list with simulated null pvalues

## nmax is the maximum number of top snps for which FDR will be computed
## the larger this number the slower the computation will be

## value: will return pvalue

library(qvalue)

eFDR = function(pobs,plist,nmax=100,qval=T)
{
  if(is.matrix(plist)) pnullmat = plist else
  if(is.data.frame(plist)) pnullmat = as.matrix(plist) else
  if(is.list(plist)) pnullmat = plist2pmat(pobs,plist)
  if(is.null(names(pobs)))
  {
    warning("Observed p vector, pobs, is not labelled - labelling SNP1..n")
    names(pobs) = "SNP" %&% (1:length(pobs))
  }
  fdrdata = eFDR.mat(pobs,pnullmat,nmax,qval)
  return(fdrdata)
}

## turns list of pnull to matrix pnullmat

plist2pmat = function(pobs,pnulllist)
{
  maxnrow = max(unlist(lapply(pnulllist,length)))
  maxnrow = max(maxnrow,pobs)
  nsim = length(pnulllist)
  pnullmat = matrix(NA,maxnrow,nsim)
  for(cc in 1:nsim)
  {
    pvec = pveclist[[cc]]
    pnullmat[1:length(pvec),cc] = pvec
  }
  return(pnullmat)
}

##

eFDR.mat = function(pobs,pnullmat,nmax,qval)
{
  npobs = sum(!is.na(pobs))
  nnull = sum(!is.na(pnullmat))
  lambda = median(pobs,na.rm=T) ## lambda is used to estimate pi0
  pi0 = ( sum(pobs>lambda,na.rm=T) / npobs ) /
    ( sum(pnullmat>lambda,na.rm=T) / nnull )
  pi0 = min(pi0,1)
  print(pi0)
  fdrvec = rep(NA,nmax)
  for(rr in 1:nmax)
    {
      gamma = pobs[rr]
      npless.null = sum(pnullmat<=gamma,na.rm=T)
      npless.obs = sum(pobs<=gamma,na.rm=T)
      fdr.cons = (npless.null/nnull) / (npless.obs/npobs)
      fdrvec[rr] = fdr.cons
    }

  fdrvec[fdrvec>1]=1
  for(rr in nmax:2)
  {
    if(fdrvec[rr]<fdrvec[rr-1]) fdrvec[rr-1] = fdrvec[rr]
  }

  fdrdata = data.frame(pobs=pobs[1:nmax])
  if(qval) fdrdata$FDR = qvalue(pobs)$qval[1:nmax]
  fdrdata = data.frame(fdrdata,eFDR = fdrvec, eFDR.pi0 = fdrvec * pi0)
  rownames(fdrdata) = names(pobs)[1:nmax]
  return(fdrdata)
}


##eFDR(pveclist[[1]],pveclist[-1],nmax=20)
	## May 7, 2012
	## Hae Kyung Im
	## haky@uchicago.edu
	##


	## Please cite:
	## Eric R Gamazon, R. Stephanie Huang, Eileen Dolan, Nancy Cox, and Hae Kyung Im, (2012)
	## Integrative Genomics: Quantifying significance of phenotype-genotype
	## relationships from multiple sources of high-throughput data
	## Frontiers of Genetics - under review

	## eFDR
	## compute empirical FDR based on observed vector of p values
	## and null p values computed using simulated phenotype

	## usage
	## eFDR(pobs,pnulllist,nmax=20)

	## arguments:

	## pobs: vector of observed p values
	## names(pobs) should be the names of the SNPs
	## if no names, they will be labelled SNP1...SNPn

	## pnulllist: list with simulated null pvalues

	## nmax is the maximum number of top snps for which FDR will be computed
	## the larger this number the slower the computation will be

	## value: will return pvalue

	library(qvalue)

	eFDR = function(pobs,plist,nmax=100,qval=T)
	{
	if(is.matrix(plist)) pnullmat = plist else
	if(is.data.frame(plist)) pnullmat = as.matrix(plist) else
	if(is.list(plist)) pnullmat = plist2pmat(pobs,plist)
	if(is.null(names(pobs)))
	{
	warning("Observed p vector, pobs, is not labelled - labelling SNP1..n")
	names(pobs) = "SNP" %&% (1:length(pobs))
	}
	fdrdata = eFDR.mat(pobs,pnullmat,nmax,qval)
	return(fdrdata)
	}

	## turns list of pnull to matrix pnullmat

	plist2pmat = function(pobs,pnulllist)
	{
	maxnrow = max(unlist(lapply(pnulllist,length)))
	maxnrow = max(maxnrow,pobs)
	nsim = length(pnulllist)
	pnullmat = matrix(NA,maxnrow,nsim)
	for(cc in 1:nsim)
	{
	pvec = pveclist[[cc]]
	pnullmat[1:length(pvec),cc] = pvec
	}
	return(pnullmat)
	}

	##

	eFDR.mat = function(pobs,pnullmat,nmax,qval)
	{
	npobs = sum(!is.na(pobs))
	nnull = sum(!is.na(pnullmat))
	lambda = median(pobs,na.rm=T) ## lambda is used to estimate pi0
	pi0 = ( sum(pobs>lambda,na.rm=T) / npobs ) /
	( sum(pnullmat>lambda,na.rm=T) / nnull )
	pi0 = min(pi0,1)
	print(pi0)
	fdrvec = rep(NA,nmax)
	for(rr in 1:nmax)
	{
	gamma = pobs[rr]
	npless.null = sum(pnullmat<=gamma,na.rm=T)
	npless.obs = sum(pobs<=gamma,na.rm=T)
	fdr.cons = (npless.null/nnull) / (npless.obs/npobs)
	fdrvec[rr] = fdr.cons
	}

	fdrvec[fdrvec>1]=1
	for(rr in nmax:2)
	{
	if(fdrvec[rr]<fdrvec[rr-1]) fdrvec[rr-1] = fdrvec[rr]
	}

	fdrdata = data.frame(pobs=pobs[1:nmax])
	if(qval) fdrdata$FDR = qvalue(pobs)$qval[1:nmax]
	fdrdata = data.frame(fdrdata,eFDR = fdrvec, eFDR.pi0 = fdrvec * pi0)
	rownames(fdrdata) = names(pobs)[1:nmax]
	return(fdrdata)
	}





	##eFDR(pveclist[[1]],pveclist[-1],nmax=20)