Skip to content

Instantly share code, notes, and snippets.

@nievergeltlab
Last active May 25, 2021 20:03
Show Gist options
  • Save nievergeltlab/5cbaa65b550b0b44f48b7a42126d1e26 to your computer and use it in GitHub Desktop.
Save nievergeltlab/5cbaa65b550b0b44f48b7a42126d1e26 to your computer and use it in GitHub Desktop.
args <- commandArgs(trailingOnly = TRUE)
famfile <- args[1]
phenofile <- args[2]
covfile <- args[3]
genofile <- args[4]
genofile_path <- args[5]
outfile <- args[6]
phenoname <- args[7]
#Right now: assuming covariates C1-C5 are used, pheno is coded such that minimum score is 0
#Model will give false positives for monomorphic markers, some filtering must be applied!
#Load libraries
library(data.table)
library(MASS)
library(fastglm)
famfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr15_093_103.out.dosage.fam'
phenofile='1_MRSC_C_V2_2.5_PCL4.txt'
covfile='pts_mrsc_mix_am-qc-eur_pca.menv.mds_cov'
genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
genofile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz'
genofile_path='genotypes'
phenoname='pcl4_c_12_future'
outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz.results.txt'
outfile='dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz.results.txt'
# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_016_039.out.dosage.doscnt.gz
# dos_pts_mrsc_mix_am-qc.hg19.ch.fl.chr22_039_052.out.dosage.doscnt.gz
#Load famfile
fam <- read.table(famfile,header=F,na.strings=c("NA","-9")) #PLINK phenotype
names(fam) <- c("FID","IID","F","M","G","P")
#Make a variable to index subjects, because this data will be merged with phenotype data and cause data to be resorted
fam$order <- c(1:nrow(fam))
#Load phenotypes.
pheno <- read.table(phenofile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
#If the phenotype ranges from 0-4, you MUST make sure the mininum value is 0!
#Load covariates
covar <- read.table(covfile,header=T,na.strings=c("NA","-9")) #PLINK phenotype
covar$intercept <- 1 # It is necessary to add an intercept
#Notice that all.x=T, because the family file length must be preserved!
d1 <- merge(fam,pheno,by=c("FID","IID"),all.x=T,suffixes=c("_fam",""))
dm0 <- merge(d1,covar,by=c("FID","IID"),all.x=T,suffixes=c("_cov",""))
dm <- dm0[order(dm0$order),] #This will order the data according to the geotypes
dm$pheno <- dm[,phenoname] -1 #Make generic column 'pheno' that takes the data of the phenotype
#have -1 for now because of my crummy coding
#Load (stream?) genotypes. Currently a CSV file where columns 1,2,3 are rsID, A1, and A2. Map file is therefore optional. Possibly just stack into this file intiially!!
genotypes <- fread(paste('zcat ', genofile_path,'/', genofile,sep=''),data.table=F,sep=",")[-1,] #remove row 1: that is just going to be SNP A1 A2... columns
#Remove genotypes and data with missing phenotype or PCs
remove <- is.na(dm$pcl4_c_12_future) | is.na(dm$C1)
dm2 <- dm[!remove,]
genotypes_rs <- genotypes[,c(1:3)] #SNP A1 A2 columns
genotypes <- as.matrix(genotypes[,-c(1:3)][,!remove]) #note, because the indexing for 'remove' is based on the famfile, which has no extra columns, must remove columns 1-3 in the genotype matrix to align indexing
row.names(genotypes) <- genotypes_rs[,1]
#Make covariate matrix
covmat <- model.matrix(~C1+C2+C3+C4+C5,data=dm2)
#Get estimate for theta assuming null genetic effect
theta_baseline <- summary(glm.nb(pheno ~ C1 + C2 + C3 +C4+C5,data=dm2))$theta
#apply function right now, could also do some splitting based approach
nbglm <- function(genovector,datamatrix,theta)
{
#it's faster to save the variable and concatenate the relevant stuff than to use SUMMARY
m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,family=negative.binomial(theta),method=3)
#Contrast with a gaussian...
#m <- fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=gaussian())
# print(summary(m))
c(m$coefficients,m$se)
#if you just save c(m$coefficients[1],m$se[1]) you ignore the covariates but makes indexing easy. if you do this , set zvalues <- results[,1]/results[,2]
# summary((fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)))$coefficients
}
#everycolumn is a SNP. Results are reported as estimate, SE, tvalue, and pr>t.Currently genovector,
results <- t(apply(genotypes,1,nbglm,datamatrix=covmat,theta=theta_baseline))
zvalues <- results[,1]/results[,8] #this indexing is precariuous and needs fixing
pvalues <- 2*pnorm(abs(zvalues),lower.tail=F)
total_results <- cbind(genotypes_rs,results[,c(1,8)],zvalues,pvalues)
names(total_results) <- c("SNP","A1","A2","Beta","SE","Z","P")
write.table(total_results,file=paste(outfile,'/',phenoname,'.',genofile,'.txt',sep=''),quote=F,row.names=F)
#cat outputs/*.txt | awk '{if (NR==1 || $1 != "SNP") print}' > adam.results.total.txt
# library(data.table)
# total_results <- fread('adam.results.total.txt',data.table=F)
# liz_snps <- fread('zcat liz_mrsc.pcl4_c_12_future.assoc.linear_fixed.gz',data.table=F)
# data <- total_results[which(total_results$SNP %in% liz_snps$ID) ,]
# data <- liz_snps[which( liz_snps$ID %in% total_results$SNP ) ,]
# outfilename=paste(outfile,'.qq.png',sep='')
# outfilename="chr22_liz.png"
# unadj_filtered <- sort(data[,"P"])
# UNADJ <- -log(unadj_filtered,10)
# QQ <- -log(ppoints(length(UNADJ)),10)
# GCfactor= round(median(qchisq(data[,"P"],1,lower.tail=F),na.rm=T)/.455,3)
# png(paste(outfilename,'.png', sep=''))
# par(bty='l')
# plot(c(0,max(QQ)), c(0,max(UNADJ)+ 1), xlab='Expected -log10(p)', ylab='Observed -log10(p)', col='white', cex=1.3, cex.axis=1.2, cex.lab=1.5,pch=20)
# if(errorbars == 1)
# {
# #code for error bars
# ranks <- c(1:length(QQ))
# CIlower <- qbeta(.025, ranks, length(QQ)-ranks +1)
# CIupper <- qbeta(.975, ranks, length(QQ)-ranks +1)
# plotCIlower <- -log(CIlower,10)
# plotCIupper <- -log(CIupper,10)
# segments(x0=QQ,x1=QQ, y0=plotCIlower,y1=plotCIupper,lwd=2,col='grey')
# }
# abline(0,1,col='red', lwd=2)
# points(QQ, UNADJ, ,pch=20,col='blue')
# legend('topleft', paste('GC Lambda =', GCfactor), bty='n', cex=1.5, xjust=1)
# dev.off()
# # Error checking
# which(pvalues==0)
# #odd things going on which.min(pvalues)
# summary(glm.nb(pheno ~ genotypes[503,] + C1 + C2 + C3 +C4+C5,data=dm2)) #won't even run under this model
# summary(fastglm(cbind(genotypes[503,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
# summary(glm.nb(pheno ~ genotypes[2,] + C1 + C2 + C3 +C4+C5,data=dm2))
# which(total_results$P < 0.00001)
# fastglm(cbind(genovector,datamatrix),dm2$pheno, ,data=dm,family=negative.binomial(theta),method=3)
# total_results[which(total_results$P < 5e-8 & total_results$P > 8.312274e-83),]
# data[which(data$P < 5e-7),]
# which(genotypes_rs[,1]== "rs190121825")
# which(genotypes_rs[,1]== "rs16991916")
# summary(fastglm(cbind(genotypes[84565,],covmat),dm2$pheno, family=negative.binomial(theta_baseline),method=3))
# summary(glm.nb(pheno ~ genotypes[84565,] + C1 + C2 + C3 +C4+C5,data=dm2))
# #Convergence isn't sufficient, fastglm tends to converge where glm does not!
# 84565
# liz
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment