Skip to content

Instantly share code, notes, and snippets.

@hakyim
Last active May 1, 2024 01:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hakyim/115403f16bec0a0e871f3616d552ce9b to your computer and use it in GitHub Desktop.
Save hakyim/115403f16bec0a0e871f3616d552ce9b to your computer and use it in GitHub Desktop.
RatXcan functions
# calculate ptrs function
calc_ptrs <- function(expr, weights)
{
fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble()
}
fast_generate_trait <- function(expr, weights){
# convert the predicted df into a matrix
if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>%
as.matrix() else test <- expr
if(!is.null(rownames(weights))) rownames(weights) = NULL
weights = weights %>% data.frame() %>%
column_to_rownames(var = "gene_name") %>% as.matrix()
# find the intersect
ww <- weights[intersect(colnames(test), rownames(weights)),]
if(!is.matrix(ww)) rname_ww = names(ww) else rname_ww = rownames(ww) ## this fixes the problem with base R that turns column matrices to vectors
test <- test[, intersect(colnames(test), rname_ww)]
aa = test %*% ww
return(aa)
}
## calculate p-value from correlation
cor2pval = function(cc,nn)
{
zz = atanh(cc) * sqrt(nn-3)
pnorm(-abs(zz))*2
}
p2chi2 <- function(pmat)
{
qnorm(pmat/2)^2
}
cor2chi2 = function(cc,nn)
{
pval = cor2pval(cc,nn)
p2chi2(pval)
}
## calc correlation between matched columns
calc_cor_matched_cols = function(xx_df, yy_df)
{
## calc correlation between matched columns of two df
## first column should be IID
## remaining columns have rectangular data
## it can also be a matrix with IID as rownames
if(!is.matrix(xx_df)) xx <- xx_df %>% column_to_rownames(var = "IID") %>% as.matrix() else xx<-xx_df
if(!is.matrix(yy_df)) yy <- yy_df %>% column_to_rownames(var = "IID") %>% as.matrix() else yy<-yy_df
indrow = intersect(rownames(xx), rownames(yy))
indcol = intersect(colnames(xx), colnames(yy))
xx <- xx[indrow,indcol] %>% scale()
yy <- yy[indrow,indcol] %>% scale()
print(glue("n rows = {length(indrow)}"))
print(glue("n cols = {length(indcol)}"))
res <- list()
res$cor <- apply(xx * yy,2,mean)
res$nsamp <- length(indrow)
res
}
## order df with idlist order
align2idlist = function(df, idlist)
{
## takes df with id's in the first column, returns df ordered by idlist
## requirements
## df has id in the first column
## idlist must be a subset df$IID
idname <- names(df)[1]
names(df)[1]="_IID"
if(sum(!idlist %in% df$`_IID`)>0)
stop("Some ids in the idlist are missing. Make sure idlist is a subset of the ids present in the df")
df <- df[match(idlist,df$`_IID`),]
names(df)[1]=idname
df
}
fast_predixcan_assoc = function(expr_df, trait_df, idlist)
{
## requirements
## expr_df has the predicted expression levels
## expr_df has id in the first column
## trait_df has id in the first column
## ids don't need to be in the same order
## no NA's
idname1 <- names(expr_df)[1]
idname2 <- names(trait_df)[1]
names(expr_df)[1] = "_IID"
names(trait_df)[1] = "_IID"
nsamp <- length(idlist)
idlist = intersect(idlist,intersect(expr_df$`_IID`,trait_df$`_IID`))
if(nsamp > length(idlist)) warning("some of the ids were not present in the expr or trait df")
## build expression matrix ordered by idlist
expr_df = align2idlist(expr_df,idlist)
expr_mat <- expr_df %>% select(-`_IID`) %>% as.matrix
expr_mat <- scale(expr_mat)
print(dim(expr_mat))
## build trait vector ordered by idlist
trait_df = align2idlist(trait_df,idlist)
trait_mat <- trait_df %>% select(-`_IID`) %>% as.matrix
trait_mat <- scale(trait_mat) ## this should be nsamp x 1 matrix
print(dim(trait_mat))
nsamp=length(idlist)
cormat = t(trait_mat) %*% expr_mat / nsamp
cormat = t(cormat)
names(expr_df)[1] = idname1
names(trait_df)[1] = idname2
print(glue("A sample size of {nsamp} was used"))
tibble(geneid=colnames(expr_mat),cor=c(cormat),pval=cor2pval(c(cormat),nsamp)) %>% arrange(pval)
}
fast_cor_ptrs_trait = function(ptrs_df, trait_df, idlist) fast_predixcan_assoc(ptrs_df,trait_df, idlist) %>% rename(model=geneid)
## correlate ptrs for multiple models (diff hyperparam) vs observed trait
## matrix linear regression
matrix_lm <- function(ymat, exp_mat)
{
## input
## ymat: phenotype matrix nsam by number of phenotypes
## exp_mat: expression matrix nsam by n_genes
## assumes matching of IDs, row names of exp_mat and ymat need to match
nsam = nrow(ymat)
if(!identical(rownames(ymat),rownames(exp_mat))) stop("ids don't match")
yy = scale(ymat)
XX = scale(exp_mat)
cormat = t(yy) %*% XX /nsam
cormat = t(cormat)
}
## read GRM from GCTA bin format
## function in OmicKriging modified from original fro GCTA documentation
read_GRMBin <-
function (prefix, size = 4)
{
sum_i <- function(i) {
return(sum(1:i))
}
BinFileName <- paste(prefix, ".bin", sep = "")
NFileName <- paste(prefix, ".N.bin", sep = "")
IDFileName <- paste(prefix, ".id", sep = "")
id <- read.table(IDFileName)
n <- dim(id)[1]
BinFile <- file(BinFileName, "rb")
grm <- readBin(BinFile, n = n * (n + 1)/2, what = numeric(0),
size = size)
NFile <- file(NFileName, "rb")
N <- readBin(NFile, n = 1, what = numeric(0), size = size)
i <- sapply(1:n, sum_i)
close(BinFile)
close(NFile)
diag.elem <- grm[i]
off.diag.elem <- grm[-i]
X <- diag(diag.elem)
X[upper.tri(X, diag = FALSE)] <- off.diag.elem
X <- X + t(X) - diag(diag(X))
rownames(X) <- id$V2
colnames(X) <- id$V2
return(X)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment