hakyim/fn_ratxcan.R

## fn_ratxcan.R
# calculate ptrs function
calc_ptrs <- function(expr, weights)
{
  fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
}

fast_generate_trait <- function(expr, weights){
  # convert the predicted df into a matrix
  if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
    as.matrix() else test <- expr

  if(!is.null(rownames(weights))) rownames(weights) = NULL
  weights = weights %>% data.frame() %>%
    column_to_rownames(var = "gene_name") %>% as.matrix()

  # find the intersect
  ww <- weights[intersect(colnames(test), rownames(weights)),]
  if(!is.matrix(ww)) rname_ww = names(ww) else rname_ww = rownames(ww) ## this fixes the problem with base R that turns column matrices to vectors
  test <- test[, intersect(colnames(test), rname_ww)]

  aa = test %*% ww
  return(aa)
}

## calculate p-value from correlation
cor2pval = function(cc,nn)
{
  zz = atanh(cc) * sqrt(nn-3)
  pnorm(-abs(zz))*2
}
p2chi2 <- function(pmat)
{
  qnorm(pmat/2)^2
}
cor2chi2 = function(cc,nn)
{
  pval = cor2pval(cc,nn)
  p2chi2(pval)
}

## calc correlation between matched columns
calc_cor_matched_cols = function(xx_df, yy_df)
{
  ## calc correlation between matched columns of two df
  ## first column should be IID
  ## remaining columns have rectangular data
  ## it can also be a matrix with IID as rownames
  if(!is.matrix(xx_df)) xx <- xx_df %>% column_to_rownames(var = "IID") %>% as.matrix() else xx<-xx_df
  if(!is.matrix(yy_df)) yy <- yy_df %>% column_to_rownames(var = "IID") %>% as.matrix() else yy<-yy_df

  indrow = intersect(rownames(xx), rownames(yy))
  indcol = intersect(colnames(xx), colnames(yy))

  xx <- xx[indrow,indcol] %>% scale()
  yy <- yy[indrow,indcol] %>% scale()

  print(glue("n rows = {length(indrow)}"))
  print(glue("n cols = {length(indcol)}"))

  res <- list()
  res$cor <- apply(xx * yy,2,mean)
  res$nsamp <- length(indrow)
  res
}


## order df with idlist order
align2idlist = function(df, idlist)
{
  ## takes df with id's in the first column, returns df ordered by idlist
  ## requirements
  ## df has id in the first column
  ## idlist must be a subset df$IID
  idname <- names(df)[1]
  names(df)[1]="_IID"
  if(sum(!idlist %in% df$`_IID`)>0)
    stop("Some ids in the idlist are missing. Make sure idlist is a subset of the ids present in the df")
  df <- df[match(idlist,df$`_IID`),]
  names(df)[1]=idname
  df
}

fast_predixcan_assoc = function(expr_df, trait_df, idlist)
{
  ## requirements
  ## expr_df has the predicted expression levels
  ## expr_df has id in the first column
  ## trait_df has id in the first column
  ## ids don't need to be in the same order
  ## no NA's

  idname1 <- names(expr_df)[1]
  idname2 <- names(trait_df)[1]
  names(expr_df)[1] = "_IID"
  names(trait_df)[1] = "_IID"

  nsamp <- length(idlist)
  idlist = intersect(idlist,intersect(expr_df$`_IID`,trait_df$`_IID`))
  if(nsamp > length(idlist)) warning("some of the ids were not present in the expr or trait df")

  ## build expression matrix ordered by idlist
  expr_df = align2idlist(expr_df,idlist)
  expr_mat <- expr_df %>% select(-`_IID`) %>% as.matrix
  expr_mat <- scale(expr_mat)
  print(dim(expr_mat))

  ## build trait vector ordered by idlist
  trait_df = align2idlist(trait_df,idlist)
  trait_mat <- trait_df %>% select(-`_IID`) %>% as.matrix
  trait_mat <- scale(trait_mat) ## this should be nsamp x 1 matrix
  print(dim(trait_mat))

  nsamp=length(idlist)
  cormat = t(trait_mat) %*% expr_mat / nsamp
  cormat = t(cormat)

  names(expr_df)[1] = idname1
  names(trait_df)[1] = idname2

  print(glue("A sample size of {nsamp} was used"))
  tibble(geneid=colnames(expr_mat),cor=c(cormat),pval=cor2pval(c(cormat),nsamp)) %>% arrange(pval)
}

fast_cor_ptrs_trait = function(ptrs_df, trait_df, idlist) fast_predixcan_assoc(ptrs_df,trait_df, idlist) %>% rename(model=geneid)
## correlate ptrs for multiple models (diff hyperparam) vs observed trait

## matrix linear regression
matrix_lm <- function(ymat, exp_mat)
{
  ## input
  ## ymat: phenotype matrix nsam by number of phenotypes
  ## exp_mat: expression matrix nsam by n_genes
  ## assumes matching of IDs, row names of exp_mat and ymat need to match
  nsam = nrow(ymat)
  if(!identical(rownames(ymat),rownames(exp_mat))) stop("ids don't match")
  yy = scale(ymat)
  XX = scale(exp_mat)
  cormat = t(yy) %*% XX /nsam
  cormat = t(cormat)
}

## read GRM from GCTA bin format
## function in OmicKriging modified from original fro GCTA documentation
read_GRMBin <-
function (prefix, size = 4)
{
    sum_i <- function(i) {
        return(sum(1:i))
    }
    BinFileName <- paste(prefix, ".bin", sep = "")
    NFileName <- paste(prefix, ".N.bin", sep = "")
    IDFileName <- paste(prefix, ".id", sep = "")
    id <- read.table(IDFileName)
    n <- dim(id)[1]
    BinFile <- file(BinFileName, "rb")
    grm <- readBin(BinFile, n = n * (n + 1)/2, what = numeric(0),
        size = size)
    NFile <- file(NFileName, "rb")
    N <- readBin(NFile, n = 1, what = numeric(0), size = size)
    i <- sapply(1:n, sum_i)
    close(BinFile)
    close(NFile)
    diag.elem <- grm[i]
    off.diag.elem <- grm[-i]
    X <- diag(diag.elem)
    X[upper.tri(X, diag = FALSE)] <- off.diag.elem
    X <- X + t(X) - diag(diag(X))
    rownames(X) <- id$V2
    colnames(X) <- id$V2
    return(X)
}
	# calculate ptrs function
	calc_ptrs <- function(expr, weights)
	{
	fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
	}

	fast_generate_trait <- function(expr, weights){
	# convert the predicted df into a matrix
	if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
	as.matrix() else test <- expr

	if(!is.null(rownames(weights))) rownames(weights) = NULL
	weights = weights %>% data.frame() %>%
	column_to_rownames(var = "gene_name") %>% as.matrix()

	# find the intersect
	ww <- weights[intersect(colnames(test), rownames(weights)),]
	if(!is.matrix(ww)) rname_ww = names(ww) else rname_ww = rownames(ww) ## this fixes the problem with base R that turns column matrices to vectors
	test <- test[, intersect(colnames(test), rname_ww)]

	aa = test %*% ww
	return(aa)
	}

	## calculate p-value from correlation
	cor2pval = function(cc,nn)
	{
	zz = atanh(cc) * sqrt(nn-3)
	pnorm(-abs(zz))*2
	}
	p2chi2 <- function(pmat)
	{
	qnorm(pmat/2)^2
	}
	cor2chi2 = function(cc,nn)
	{
	pval = cor2pval(cc,nn)
	p2chi2(pval)
	}

	## calc correlation between matched columns
	calc_cor_matched_cols = function(xx_df, yy_df)
	{
	## calc correlation between matched columns of two df
	## first column should be IID
	## remaining columns have rectangular data
	## it can also be a matrix with IID as rownames
	if(!is.matrix(xx_df)) xx <- xx_df %>% column_to_rownames(var = "IID") %>% as.matrix() else xx<-xx_df
	if(!is.matrix(yy_df)) yy <- yy_df %>% column_to_rownames(var = "IID") %>% as.matrix() else yy<-yy_df

	indrow = intersect(rownames(xx), rownames(yy))
	indcol = intersect(colnames(xx), colnames(yy))

	xx <- xx[indrow,indcol] %>% scale()
	yy <- yy[indrow,indcol] %>% scale()

	print(glue("n rows = {length(indrow)}"))
	print(glue("n cols = {length(indcol)}"))

	res <- list()
	res$cor <- apply(xx * yy,2,mean)
	res$nsamp <- length(indrow)
	res
	}


	## order df with idlist order
	align2idlist = function(df, idlist)
	{
	## takes df with id's in the first column, returns df ordered by idlist
	## requirements
	## df has id in the first column
	## idlist must be a subset df$IID
	idname <- names(df)[1]
	names(df)[1]="_IID"
	if(sum(!idlist %in% df$`_IID`)>0)
	stop("Some ids in the idlist are missing. Make sure idlist is a subset of the ids present in the df")
	df <- df[match(idlist,df$`_IID`),]
	names(df)[1]=idname
	df
	}

	fast_predixcan_assoc = function(expr_df, trait_df, idlist)
	{
	## requirements
	## expr_df has the predicted expression levels
	## expr_df has id in the first column
	## trait_df has id in the first column
	## ids don't need to be in the same order
	## no NA's

	idname1 <- names(expr_df)[1]
	idname2 <- names(trait_df)[1]
	names(expr_df)[1] = "_IID"
	names(trait_df)[1] = "_IID"

	nsamp <- length(idlist)
	idlist = intersect(idlist,intersect(expr_df$`_IID`,trait_df$`_IID`))
	if(nsamp > length(idlist)) warning("some of the ids were not present in the expr or trait df")

	## build expression matrix ordered by idlist
	expr_df = align2idlist(expr_df,idlist)
	expr_mat <- expr_df %>% select(-`_IID`) %>% as.matrix
	expr_mat <- scale(expr_mat)
	print(dim(expr_mat))

	## build trait vector ordered by idlist
	trait_df = align2idlist(trait_df,idlist)
	trait_mat <- trait_df %>% select(-`_IID`) %>% as.matrix
	trait_mat <- scale(trait_mat) ## this should be nsamp x 1 matrix
	print(dim(trait_mat))

	nsamp=length(idlist)
	cormat = t(trait_mat) %*% expr_mat / nsamp
	cormat = t(cormat)

	names(expr_df)[1] = idname1
	names(trait_df)[1] = idname2

	print(glue("A sample size of {nsamp} was used"))
	tibble(geneid=colnames(expr_mat),cor=c(cormat),pval=cor2pval(c(cormat),nsamp)) %>% arrange(pval)
	}

	fast_cor_ptrs_trait = function(ptrs_df, trait_df, idlist) fast_predixcan_assoc(ptrs_df,trait_df, idlist) %>% rename(model=geneid)
	## correlate ptrs for multiple models (diff hyperparam) vs observed trait

	## matrix linear regression
	matrix_lm <- function(ymat, exp_mat)
	{
	## input
	## ymat: phenotype matrix nsam by number of phenotypes
	## exp_mat: expression matrix nsam by n_genes
	## assumes matching of IDs, row names of exp_mat and ymat need to match
	nsam = nrow(ymat)
	if(!identical(rownames(ymat),rownames(exp_mat))) stop("ids don't match")
	yy = scale(ymat)
	XX = scale(exp_mat)
	cormat = t(yy) %*% XX /nsam
	cormat = t(cormat)
	}

	## read GRM from GCTA bin format
	## function in OmicKriging modified from original fro GCTA documentation
	read_GRMBin <-
	function (prefix, size = 4)
	{
	sum_i <- function(i) {
	return(sum(1:i))
	}
	BinFileName <- paste(prefix, ".bin", sep = "")
	NFileName <- paste(prefix, ".N.bin", sep = "")
	IDFileName <- paste(prefix, ".id", sep = "")
	id <- read.table(IDFileName)
	n <- dim(id)[1]
	BinFile <- file(BinFileName, "rb")
	grm <- readBin(BinFile, n = n * (n + 1)/2, what = numeric(0),
	size = size)
	NFile <- file(NFileName, "rb")
	N <- readBin(NFile, n = 1, what = numeric(0), size = size)
	i <- sapply(1:n, sum_i)
	close(BinFile)
	close(NFile)
	diag.elem <- grm[i]
	off.diag.elem <- grm[-i]
	X <- diag(diag.elem)
	X[upper.tri(X, diag = FALSE)] <- off.diag.elem
	X <- X + t(X) - diag(diag(X))
	rownames(X) <- id$V2
	colnames(X) <- id$V2
	return(X)
	}