Last active
May 1, 2024 01:35
-
-
Save hakyim/115403f16bec0a0e871f3616d552ce9b to your computer and use it in GitHub Desktop.
RatXcan functions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# calculate ptrs function | |
calc_ptrs <- function(expr, weights) | |
{ | |
fast_generate_trait(expr, weights) %>% as.data.frame() %>% rownames_to_column(var = "IID") %>% tibble() | |
} | |
fast_generate_trait <- function(expr, weights){ | |
# convert the predicted df into a matrix | |
if(!is.matrix(expr)) test <- expr %>% column_to_rownames(var = "IID") %>% | |
as.matrix() else test <- expr | |
if(!is.null(rownames(weights))) rownames(weights) = NULL | |
weights = weights %>% data.frame() %>% | |
column_to_rownames(var = "gene_name") %>% as.matrix() | |
# find the intersect | |
ww <- weights[intersect(colnames(test), rownames(weights)),] | |
if(!is.matrix(ww)) rname_ww = names(ww) else rname_ww = rownames(ww) ## this fixes the problem with base R that turns column matrices to vectors | |
test <- test[, intersect(colnames(test), rname_ww)] | |
aa = test %*% ww | |
return(aa) | |
} | |
## calculate p-value from correlation | |
cor2pval = function(cc,nn) | |
{ | |
zz = atanh(cc) * sqrt(nn-3) | |
pnorm(-abs(zz))*2 | |
} | |
p2chi2 <- function(pmat) | |
{ | |
qnorm(pmat/2)^2 | |
} | |
cor2chi2 = function(cc,nn) | |
{ | |
pval = cor2pval(cc,nn) | |
p2chi2(pval) | |
} | |
## calc correlation between matched columns | |
calc_cor_matched_cols = function(xx_df, yy_df) | |
{ | |
## calc correlation between matched columns of two df | |
## first column should be IID | |
## remaining columns have rectangular data | |
## it can also be a matrix with IID as rownames | |
if(!is.matrix(xx_df)) xx <- xx_df %>% column_to_rownames(var = "IID") %>% as.matrix() else xx<-xx_df | |
if(!is.matrix(yy_df)) yy <- yy_df %>% column_to_rownames(var = "IID") %>% as.matrix() else yy<-yy_df | |
indrow = intersect(rownames(xx), rownames(yy)) | |
indcol = intersect(colnames(xx), colnames(yy)) | |
xx <- xx[indrow,indcol] %>% scale() | |
yy <- yy[indrow,indcol] %>% scale() | |
print(glue("n rows = {length(indrow)}")) | |
print(glue("n cols = {length(indcol)}")) | |
res <- list() | |
res$cor <- apply(xx * yy,2,mean) | |
res$nsamp <- length(indrow) | |
res | |
} | |
## order df with idlist order | |
align2idlist = function(df, idlist) | |
{ | |
## takes df with id's in the first column, returns df ordered by idlist | |
## requirements | |
## df has id in the first column | |
## idlist must be a subset df$IID | |
idname <- names(df)[1] | |
names(df)[1]="_IID" | |
if(sum(!idlist %in% df$`_IID`)>0) | |
stop("Some ids in the idlist are missing. Make sure idlist is a subset of the ids present in the df") | |
df <- df[match(idlist,df$`_IID`),] | |
names(df)[1]=idname | |
df | |
} | |
fast_predixcan_assoc = function(expr_df, trait_df, idlist) | |
{ | |
## requirements | |
## expr_df has the predicted expression levels | |
## expr_df has id in the first column | |
## trait_df has id in the first column | |
## ids don't need to be in the same order | |
## no NA's | |
idname1 <- names(expr_df)[1] | |
idname2 <- names(trait_df)[1] | |
names(expr_df)[1] = "_IID" | |
names(trait_df)[1] = "_IID" | |
nsamp <- length(idlist) | |
idlist = intersect(idlist,intersect(expr_df$`_IID`,trait_df$`_IID`)) | |
if(nsamp > length(idlist)) warning("some of the ids were not present in the expr or trait df") | |
## build expression matrix ordered by idlist | |
expr_df = align2idlist(expr_df,idlist) | |
expr_mat <- expr_df %>% select(-`_IID`) %>% as.matrix | |
expr_mat <- scale(expr_mat) | |
print(dim(expr_mat)) | |
## build trait vector ordered by idlist | |
trait_df = align2idlist(trait_df,idlist) | |
trait_mat <- trait_df %>% select(-`_IID`) %>% as.matrix | |
trait_mat <- scale(trait_mat) ## this should be nsamp x 1 matrix | |
print(dim(trait_mat)) | |
nsamp=length(idlist) | |
cormat = t(trait_mat) %*% expr_mat / nsamp | |
cormat = t(cormat) | |
names(expr_df)[1] = idname1 | |
names(trait_df)[1] = idname2 | |
print(glue("A sample size of {nsamp} was used")) | |
tibble(geneid=colnames(expr_mat),cor=c(cormat),pval=cor2pval(c(cormat),nsamp)) %>% arrange(pval) | |
} | |
fast_cor_ptrs_trait = function(ptrs_df, trait_df, idlist) fast_predixcan_assoc(ptrs_df,trait_df, idlist) %>% rename(model=geneid) | |
## correlate ptrs for multiple models (diff hyperparam) vs observed trait | |
## matrix linear regression | |
matrix_lm <- function(ymat, exp_mat) | |
{ | |
## input | |
## ymat: phenotype matrix nsam by number of phenotypes | |
## exp_mat: expression matrix nsam by n_genes | |
## assumes matching of IDs, row names of exp_mat and ymat need to match | |
nsam = nrow(ymat) | |
if(!identical(rownames(ymat),rownames(exp_mat))) stop("ids don't match") | |
yy = scale(ymat) | |
XX = scale(exp_mat) | |
cormat = t(yy) %*% XX /nsam | |
cormat = t(cormat) | |
} | |
## read GRM from GCTA bin format | |
## function in OmicKriging modified from original fro GCTA documentation | |
read_GRMBin <- | |
function (prefix, size = 4) | |
{ | |
sum_i <- function(i) { | |
return(sum(1:i)) | |
} | |
BinFileName <- paste(prefix, ".bin", sep = "") | |
NFileName <- paste(prefix, ".N.bin", sep = "") | |
IDFileName <- paste(prefix, ".id", sep = "") | |
id <- read.table(IDFileName) | |
n <- dim(id)[1] | |
BinFile <- file(BinFileName, "rb") | |
grm <- readBin(BinFile, n = n * (n + 1)/2, what = numeric(0), | |
size = size) | |
NFile <- file(NFileName, "rb") | |
N <- readBin(NFile, n = 1, what = numeric(0), size = size) | |
i <- sapply(1:n, sum_i) | |
close(BinFile) | |
close(NFile) | |
diag.elem <- grm[i] | |
off.diag.elem <- grm[-i] | |
X <- diag(diag.elem) | |
X[upper.tri(X, diag = FALSE)] <- off.diag.elem | |
X <- X + t(X) - diag(diag(X)) | |
rownames(X) <- id$V2 | |
colnames(X) <- id$V2 | |
return(X) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment