Hae Kyung Im hakyim

## fn_ratxcan.R
# calculate ptrs function
calc_ptrs <- function(expr, weights)
{
  fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
}

fast_generate_trait <- function(expr, weights){
  # convert the predicted df into a matrix
  if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
    as.matrix() else test <- expr

## metaanalysis.R
## meta analysis, sample size based
metaZn =  function(zdisc,zrepl,ndisc,nrepl)
  {
  ## calculate meta analysis Zscore
  ## zdisc and zrepl are zscores in the discovery and replication sets respectively
  ## ndisc and nrepl are sample sizes in the discovery and replication sets
    wdisc = sqrt(ndisc)
    wrepl = sqrt(nrepl)
    ( zdisc * wdisc  + zrepl * wrepl )/sqrt( wdisc^2 + wrepl^2 )
}

## qqunif.R
## pvalue vs uniform

qqunif =
  function(p,BH=T,CI=T,mlog10_p_thres=30,...)
  {
    ## thresholded by default at 1e-30
    p=na.omit(p)
    nn = length(p)
    xx =  -log10((1:nn)/(nn+1))


## read_write_GRM_gz
X = GXM
filename = paste0(data.dir,"tempo/gcta-run/gxm.grm")
ids = rnadata$sidno
write_GRMgz = function(X,filename,ids)
{
 #X[upper.tri(X,diag=TRUE)]
 rmat = row(X)
 cmat = col(X)
 omat = cbind(cmat[upper.tri(cmat,diag=TRUE)],rmat[upper.tri(rmat,diag=TRUE)],ncol(rnamat),X[upper.tri(X,diag=TRUE)])
 write_tsv(data.frame(omat),path=filename,col_names = FALSE)

## miscel.R
## loglik test
logLik.test = function(fit0,fit)
{
  l0 = logLik(fit0)
  l1 = logLik(fit)
  lratio = abs(as.numeric(l0)-as.numeric(l1))
  df = abs( attr(l1,'df') - attr(l0,'df') )
  pval = 1-pchisq( 2*lratio, df)
  return(pval)
}

## qqR2.R
qqR2 <- function(corvec,nn,pad_neg_with_0 = FALSE,...)
{
## nn is the sample size, number of individuals used to compute correlation.
## needs correlation vector as input.
## nullcorvec generates a random sample from correlation distributions, under the null hypothesis of 0 correlation using Fisher's approximation.
  if(pad_neg_with_0) corvec[corvec < 0 | is.na(corvec) ]=0
  mm <- length(corvec)
  nullcorvec = tanh(rnorm(mm)/sqrt(nn-3)) ## null correlation vector
  qqplot(nullcorvec^2,corvec^2,...); abline(0,1); grid()
}

## eFDR.R
## May 7, 2012
## Hae Kyung Im
## haky@uchicago.edu
##


## Please cite:
## Eric R Gamazon, R. Stephanie Huang, Eileen Dolan, Nancy Cox, and Hae Kyung Im, (2012)
## Integrative Genomics: Quantifying significance of phenotype-genotype
## relationships from multiple sources of high-throughput data

## my.scatter.rug.R
require(ggplot2); require(grid)

my.scatter.rug = function(df,xname,yname,main='myplot',round.digits=2)
{ ## df is data frame, xname is name of the column to be ploted in x axis
  ## http://stackoverflow.com/questions/8545035/scatterplot-with-marginal-histograms-in-ggplot2
  xy = mutate_(df, "x"=xname, "y"=yname)
  pp <- ggplot(xy, aes(x, y)) +
  # set the locations of the x-axis labels as Tukey's five numbers
  scale_x_continuous(limit=c(min(xy$x), max(xy$x)),
                     breaks=round(fivenum(xy$x),round.digits)) +

## tileplot.R
require(ggplot2)

tileplot <- function(mat)
{
  mat = data.frame(mat)
  mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
  melted_mat <- gather(mat,key=Var2,value=value,-Var1)
  melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
  rango = range(melted_mat$value)
  pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=value)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1))

## ternary_plot_coloc_ps.R
##install.packages("ggtern")
## library(ggtern)
## library(ggplot2)

plot_coloc_tern <- function(df) {

## function expects a data frame df with column names as follows
##      coloc_p0     coloc_p1      coloc_p2     coloc_p3     coloc_p4
##  3.776851e-75 1.173353e-08  3.218853e-67 0.9999999326 5.563409e-08
##  2.020855e-107 5.216090e-01 1.633054e-107 0.4214557130 5.693525e-02
	# calculate ptrs function
	calc_ptrs <- function(expr, weights)
	{
	fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
	}

	fast_generate_trait <- function(expr, weights){
	# convert the predicted df into a matrix
	if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
	as.matrix() else test <- expr
	## meta analysis, sample size based
	metaZn = function(zdisc,zrepl,ndisc,nrepl)
	{
	## calculate meta analysis Zscore
	## zdisc and zrepl are zscores in the discovery and replication sets respectively
	## ndisc and nrepl are sample sizes in the discovery and replication sets
	wdisc = sqrt(ndisc)
	wrepl = sqrt(nrepl)
	( zdisc * wdisc + zrepl * wrepl )/sqrt( wdisc^2 + wrepl^2 )
	}
	## pvalue vs uniform

	qqunif =
	function(p,BH=T,CI=T,mlog10_p_thres=30,...)
	{
	## thresholded by default at 1e-30
	p=na.omit(p)
	nn = length(p)
	xx = -log10((1:nn)/(nn+1))
	X = GXM
	filename = paste0(data.dir,"tempo/gcta-run/gxm.grm")
	ids = rnadata$sidno
	write_GRMgz = function(X,filename,ids)
	{
	#X[upper.tri(X,diag=TRUE)]
	rmat = row(X)
	cmat = col(X)
	omat = cbind(cmat[upper.tri(cmat,diag=TRUE)],rmat[upper.tri(rmat,diag=TRUE)],ncol(rnamat),X[upper.tri(X,diag=TRUE)])
	write_tsv(data.frame(omat),path=filename,col_names = FALSE)
	## loglik test
	logLik.test = function(fit0,fit)
	{
	l0 = logLik(fit0)
	l1 = logLik(fit)
	lratio = abs(as.numeric(l0)-as.numeric(l1))
	df = abs( attr(l1,'df') - attr(l0,'df') )
	pval = 1-pchisq( 2*lratio, df)
	return(pval)
	}
	qqR2 <- function(corvec,nn,pad_neg_with_0 = FALSE,...)
	{
	## nn is the sample size, number of individuals used to compute correlation.
	## needs correlation vector as input.
	## nullcorvec generates a random sample from correlation distributions, under the null hypothesis of 0 correlation using Fisher's approximation.
	if(pad_neg_with_0) corvec[corvec < 0 \| is.na(corvec) ]=0
	mm <- length(corvec)
	nullcorvec = tanh(rnorm(mm)/sqrt(nn-3)) ## null correlation vector
	qqplot(nullcorvec^2,corvec^2,...); abline(0,1); grid()
	}
	## May 7, 2012
	## Hae Kyung Im
	## haky@uchicago.edu
	##


	## Please cite:
	## Eric R Gamazon, R. Stephanie Huang, Eileen Dolan, Nancy Cox, and Hae Kyung Im, (2012)
	## Integrative Genomics: Quantifying significance of phenotype-genotype
	## relationships from multiple sources of high-throughput data
	require(ggplot2); require(grid)

	my.scatter.rug = function(df,xname,yname,main='myplot',round.digits=2)
	{ ## df is data frame, xname is name of the column to be ploted in x axis
	## http://stackoverflow.com/questions/8545035/scatterplot-with-marginal-histograms-in-ggplot2
	xy = mutate_(df, "x"=xname, "y"=yname)
	pp <- ggplot(xy, aes(x, y)) +
	# set the locations of the x-axis labels as Tukey's five numbers
	scale_x_continuous(limit=c(min(xy$x), max(xy$x)),
	breaks=round(fivenum(xy$x),round.digits)) +
	require(ggplot2)

	tileplot <- function(mat)
	{
	mat = data.frame(mat)
	mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
	melted_mat <- gather(mat,key=Var2,value=value,-Var1)
	melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
	rango = range(melted_mat$value)
	pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=value)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1))
	##install.packages("ggtern")
	## library(ggtern)
	## library(ggplot2)

	plot_coloc_tern <- function(df) {

	## function expects a data frame df with column names as follows
	## coloc_p0 coloc_p1 coloc_p2 coloc_p3 coloc_p4
	## 3.776851e-75 1.173353e-08 3.218853e-67 0.9999999326 5.563409e-08
	## 2.020855e-107 5.216090e-01 1.633054e-107 0.4214557130 5.693525e-02