Hae Kyung Im hakyim

## fn_ratxcan.R
# calculate ptrs function
calc_ptrs <- function(expr, weights)
{
  fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
}

fast_generate_trait <- function(expr, weights){
  # convert the predicted df into a matrix
  if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
    as.matrix() else test <- expr

## fastlm.R
## Relatively fast linear regression

fastlm = function(xx,yy)
{
  ## compute betahat (regression coef) and pvalue with Ftest
  ## for now it does not take covariates

  df1 = 2
  df0 = 1
  ind = !is.na(xx) & !is.na(yy)

## read_write_GRM_gz
X = GXM
filename = paste0(data.dir,"tempo/gcta-run/gxm.grm")
ids = rnadata$sidno
write_GRMgz = function(X,filename,ids)
{
 #X[upper.tri(X,diag=TRUE)]
 rmat = row(X)
 cmat = col(X)
 omat = cbind(cmat[upper.tri(cmat,diag=TRUE)],rmat[upper.tri(rmat,diag=TRUE)],ncol(rnamat),X[upper.tri(X,diag=TRUE)])
 write_tsv(data.frame(omat),path=filename,col_names = FALSE)

## power-calculation.R
library(pwr)
pwr.r.test(n=90,sig.level=0.05,power=0.8)

##     approximate correlation power calculation (arctangh transformation)
##
##              n = 90
##              r = 0.2913625
##      sig.level = 0.05
##          power = 0.8
##    alternative = two.sided

## addband2gene.R
## add cytoband to gene
addband2gene = function(df,geneid = "ensembl_gene_id")
{
  if(!exists("anno_gene"))
  {
    ensembl <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl")
    anno_gene <<- biomaRt::getBM(attributes = c("ensembl_gene_id","external_gene_name","chromosome_name","start_position","end_position","band", "gene_biotype"),mart=ensembl )
  print("defined anno_gene")
  }
  if(geneid != "ensembl_gene_id") names(df)[names(df)==geneid] = "ensembl_gene_id" ## this is an ugly workaround - need to find a way to use rename_ for this but don't know how to specify a string instead of name in rename(geneid = "ensembl_gene_id")

## ternary_plot_coloc_ps.R
##install.packages("ggtern")
## library(ggtern)
## library(ggplot2)

plot_coloc_tern <- function(df) {

## function expects a data frame df with column names as follows
##      coloc_p0     coloc_p1      coloc_p2     coloc_p3     coloc_p4
##  3.776851e-75 1.173353e-08  3.218853e-67 0.9999999326 5.563409e-08
##  2.020855e-107 5.216090e-01 1.633054e-107 0.4214557130 5.693525e-02

## metaanalysis.R
## meta analysis, sample size based
metaZn =  function(zdisc,zrepl,ndisc,nrepl)
  {
  ## calculate meta analysis Zscore
  ## zdisc and zrepl are zscores in the discovery and replication sets respectively
  ## ndisc and nrepl are sample sizes in the discovery and replication sets
    wdisc = sqrt(ndisc)
    wrepl = sqrt(nrepl)
    ( zdisc * wdisc  + zrepl * wrepl )/sqrt( wdisc^2 + wrepl^2 )
}

## miscel.R
## loglik test
logLik.test = function(fit0,fit)
{
  l0 = logLik(fit0)
  l1 = logLik(fit)
  lratio = abs(as.numeric(l0)-as.numeric(l1))
  df = abs( attr(l1,'df') - attr(l0,'df') )
  pval = 1-pchisq( 2*lratio, df)
  return(pval)
}

## tileplot.R
require(ggplot2)

tileplot <- function(mat)
{
  mat = data.frame(mat)
  mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
  melted_mat <- gather(mat,key=Var2,value=value,-Var1)
  melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
  rango = range(melted_mat$value)
  pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=value)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1))

## my.scatter.rug.R
require(ggplot2); require(grid)

my.scatter.rug = function(df,xname,yname,main='myplot',round.digits=2)
{ ## df is data frame, xname is name of the column to be ploted in x axis
  ## http://stackoverflow.com/questions/8545035/scatterplot-with-marginal-histograms-in-ggplot2
  xy = mutate_(df, "x"=xname, "y"=yname)
  pp <- ggplot(xy, aes(x, y)) +
  # set the locations of the x-axis labels as Tukey's five numbers
  scale_x_continuous(limit=c(min(xy$x), max(xy$x)),
                     breaks=round(fivenum(xy$x),round.digits)) +
	# calculate ptrs function
	calc_ptrs <- function(expr, weights)
	{
	fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
	}

	fast_generate_trait <- function(expr, weights){
	# convert the predicted df into a matrix
	if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
	as.matrix() else test <- expr
	## Relatively fast linear regression

	fastlm = function(xx,yy)
	{
	## compute betahat (regression coef) and pvalue with Ftest
	## for now it does not take covariates

	df1 = 2
	df0 = 1
	ind = !is.na(xx) & !is.na(yy)
	X = GXM
	filename = paste0(data.dir,"tempo/gcta-run/gxm.grm")
	ids = rnadata$sidno
	write_GRMgz = function(X,filename,ids)
	{
	#X[upper.tri(X,diag=TRUE)]
	rmat = row(X)
	cmat = col(X)
	omat = cbind(cmat[upper.tri(cmat,diag=TRUE)],rmat[upper.tri(rmat,diag=TRUE)],ncol(rnamat),X[upper.tri(X,diag=TRUE)])
	write_tsv(data.frame(omat),path=filename,col_names = FALSE)
	library(pwr)
	pwr.r.test(n=90,sig.level=0.05,power=0.8)

	## approximate correlation power calculation (arctangh transformation)
	##
	## n = 90
	## r = 0.2913625
	## sig.level = 0.05
	## power = 0.8
	## alternative = two.sided
	## add cytoband to gene
	addband2gene = function(df,geneid = "ensembl_gene_id")
	{
	if(!exists("anno_gene"))
	{
	ensembl <- biomaRt::useMart(biomart="ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl")
	anno_gene <<- biomaRt::getBM(attributes = c("ensembl_gene_id","external_gene_name","chromosome_name","start_position","end_position","band", "gene_biotype"),mart=ensembl )
	print("defined anno_gene")
	}
	if(geneid != "ensembl_gene_id") names(df)[names(df)==geneid] = "ensembl_gene_id" ## this is an ugly workaround - need to find a way to use rename_ for this but don't know how to specify a string instead of name in rename(geneid = "ensembl_gene_id")
	##install.packages("ggtern")
	## library(ggtern)
	## library(ggplot2)

	plot_coloc_tern <- function(df) {

	## function expects a data frame df with column names as follows
	## coloc_p0 coloc_p1 coloc_p2 coloc_p3 coloc_p4
	## 3.776851e-75 1.173353e-08 3.218853e-67 0.9999999326 5.563409e-08
	## 2.020855e-107 5.216090e-01 1.633054e-107 0.4214557130 5.693525e-02
	## meta analysis, sample size based
	metaZn = function(zdisc,zrepl,ndisc,nrepl)
	{
	## calculate meta analysis Zscore
	## zdisc and zrepl are zscores in the discovery and replication sets respectively
	## ndisc and nrepl are sample sizes in the discovery and replication sets
	wdisc = sqrt(ndisc)
	wrepl = sqrt(nrepl)
	( zdisc * wdisc + zrepl * wrepl )/sqrt( wdisc^2 + wrepl^2 )
	}
	## loglik test
	logLik.test = function(fit0,fit)
	{
	l0 = logLik(fit0)
	l1 = logLik(fit)
	lratio = abs(as.numeric(l0)-as.numeric(l1))
	df = abs( attr(l1,'df') - attr(l0,'df') )
	pval = 1-pchisq( 2*lratio, df)
	return(pval)
	}
	require(ggplot2)

	tileplot <- function(mat)
	{
	mat = data.frame(mat)
	mat$Var1 = factor(rownames(mat), levels=rownames(mat)) ## preserve rowname order
	melted_mat <- gather(mat,key=Var2,value=value,-Var1)
	melted_mat$Var2 = factor(melted_mat$Var2, levels=colnames(mat)) ## preserve colname order
	rango = range(melted_mat$value)
	pp <- ggplot(melted_mat,aes(x=Var1,y=Var2,fill=value)) + geom_tile() ##+scale_fill_gradientn(colours = c("#C00000", "#FF0000", "#FF8080", "#FFC0C0", "#FFFFFF", "#C0C0FF", "#8080FF", "#0000FF", "#0000C0"), limit = c(-1,1))
	require(ggplot2); require(grid)

	my.scatter.rug = function(df,xname,yname,main='myplot',round.digits=2)
	{ ## df is data frame, xname is name of the column to be ploted in x axis
	## http://stackoverflow.com/questions/8545035/scatterplot-with-marginal-histograms-in-ggplot2
	xy = mutate_(df, "x"=xname, "y"=yname)
	pp <- ggplot(xy, aes(x, y)) +
	# set the locations of the x-axis labels as Tukey's five numbers
	scale_x_continuous(limit=c(min(xy$x), max(xy$x)),
	breaks=round(fivenum(xy$x),round.digits)) +