Created
April 10, 2017 16:00
-
-
Save nievergeltlab/94b89dd810baa7794b0dfcd83c230781 to your computer and use it in GitHub Desktop.
Logistic Mixed Models with GMMAT
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Script to convert dosage data to GMMAT format. Requires also a list of usuable SNPs as input, but this list does not need to be filtered. | |
#Create a valid SNPlist | |
info=0.9 | |
maf=0.01 | |
zcat /home/maihofer/vets/qc/imputation/distribution/vets_eur_analysis_run2/daner_vets_eur_analysis_run2.gz | awk -v info=$info -v maf=$maf '{if (($8 > info) && ($6 > maf) && ($7 > maf) && ($6 < 1-maf) && ($7 < 1-maf) && ($11 != "NA")) print $2}' > european_valid_snps.snplist | |
echo "SNP" > snp.txt | |
cat snp.txt european_valid_snps.snplist | LC_ALL=C sort -k 1b,1 > european_valid_snps.sorted.snplist | |
#Specify where probability format genotypes are stored | |
probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1 | |
#Specify where output will be stored | |
outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose | |
#Specify working directory (error logs will go here) | |
workingdir=/home/maihofer/vets/qc/imputation/ | |
#Get the number of subjects (assuming that it is the same across all files) | |
nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam | awk '{print $1}') | |
#Specify list of valid SNPs | |
valid_snps=/home/maihofer/vets/qc/european_valid_snps.sorted.snplist | |
ls $probdir | grep .gz$ > files_To_make_dose.txt | |
#Number of commands to run is a function of the number of files | |
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' ) | |
#Make a job code, where 'nodesize' processes will run on each node simultaneously | |
nodesize=16 | |
nodeuse=$(($nodesize - 1)) | |
#Total number of jobs = Number of commands / number of commands used per job, rounded up | |
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse )) | |
qsub make_dose_gmmat.pbs -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -v $valid_snps -o files_To_make_dose.txt" | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#PBS -lnodes=1 | |
#PBS -lwalltime=0:05:00 | |
#!/bin/bash | |
#Script to run GMMAT | |
while getopts d:e:p:c:g:n: option | |
do | |
case "${option}" | |
in | |
d) dosages=${OPTARG};; | |
e) dosages_dir=${OPTARG};; | |
p) phenotype=${OPTARG};; | |
c) covariate=${OPTARG};; | |
g) grm=${OPTARG};; | |
n) nodeuse=${OPTARG};; | |
esac | |
done | |
module load R | |
#Write the start and stop points of the file | |
jstart=$((($PBS_ARRAYID-1)*$nodeuse +1)) | |
jstop=$(($PBS_ARRAYID*$nodeuse)) | |
for j in $(seq -w $jstart 1 $jstop) | |
do | |
file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $dosages) | |
Rscript gmmat.rscript "$dosages_dir"/"$file_use" "$grm" "$phenotype" "$covariate" gmmat_output/"$file_use".gmmat_score & | |
done | |
wait |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
args <- commandArgs(trailingOnly = TRUE) | |
dosfilename <- args[1] | |
grm <- args[2] | |
phenotype <- args[3] | |
covar <- args[4] | |
outfilename <- args[5] | |
#Function script for GMMAT | |
library('GMMAT') | |
phen <- read.table(phenotype,header=F,na.strings=c("NA","-9")) | |
names(phen) <- c("FID","IID","PTSD") | |
phen$order <- 1:dim(phen)[1] | |
european <- read.table(covar,header=T) | |
dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID")) | |
dat <- dat0[order(dat0$order),] | |
grm <- as.matrix(read.table(grm)) | |
model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit")) | |
scoret <- glmm.score(model0,infile=dosfilename,outfile=outfilename,infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F) | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Note: Input dosage data MUST be tab delmited!! | |
module load c/intel | |
module load fortran/intel | |
R | |
# install.packages(c('Rcpp','RcppArmadillo') | |
# install.packages('/home/maihofer/vets/qc/GMMAT_0.7.tar.gz',repos=NULL,type='source') | |
library('GMMAT') | |
phen <- read.table('pts_vets_mix_am-qc.pheno',header=F,na.strings=c("NA","-9")) #PLINK phenotype | |
names(phen) <- c("FID","IID","PTSD") | |
phen$order <- 1:dim(phen)[1] | |
european <- read.table('/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov',header=T) | |
dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID")) | |
dat <- dat0[order(dat0$order),] | |
grm <- as.matrix(read.table("/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt")) # GRM from GEMMA | |
#Fit GRM | |
model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit")) | |
#Derive score test pvalue | |
scoret <- glmm.score(model0,infile="/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz",outfile='testout.txt',infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F) | |
#Alternative version: Wald test pvalue. Gives beta and SE, but incredibly slow! | |
snplist <- read.table("testsnplist.txt", header=F,stringsAsFactors=F) | |
system.time (results <- glmm.wald(fixed = as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins = grm, | |
family = binomial(link = "logit"), snps=snplist$V1[1:5], method.optim = "Brent" infile = "/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz", infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2")) | |
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#PBS -lnodes=1 | |
#PBS -lwalltime=0:05:00 | |
#!/bin/bash | |
#Convert data to GMMAT format | |
while getopts n:o:p:d:s:v: option | |
do | |
case "${option}" | |
in | |
o) outputfile=${OPTARG};; | |
n) nodeuse=${OPTARG};; | |
p) probdir=${OPTARG};; | |
d) outdir=${OPTARG};; | |
s) nsub=${OPTARG};; | |
v) valid_snplist=${OPTARG};; | |
esac | |
done | |
#This version of the script is made for GMMAT format files. GMMAT only works with non-monomorphic SNPS. requires a SNPLIST to be provided | |
#Write the start and stop points of the file | |
jstart=$((($PBS_ARRAYID-1)*$nodeuse +1)) | |
jstop=$(($PBS_ARRAYID*$nodeuse)) | |
for j in $(seq -w $jstart 1 $jstop) | |
do | |
file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $outputfile) | |
LC_ALL=C join $valid_snplist <(zcat "$probdir"/"$file_use" | awk -v s=$nsub 'NR>1{ printf $1 " " $2 " " $3; for(i=1; i<=s; i++) printf " " $(i*2+2)*2+$(i*2+3); printf "\n" }' | LC_ALL=C sort -k1b,1) | sed 's/ /\t/g' | gzip > "$outdir"/"$file_use".doscnt.gmmat.gz & | |
done | |
wait |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Job script mode for GMMAT | |
#Specify where dosages are stored | |
dosdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose | |
#Specify working directory (error logs will go here) | |
workingdir=/home/maihofer/vets/qc/ | |
phenotype=pts_vets_mix_am-qc.pheno | |
covariate=/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov | |
grm=/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt | |
ls $dosdir | grep gmmat > files_to_gmmat.txt | |
#Number of commands to run is a function of the number of files | |
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' ) | |
#Make a job code, where 'nodesize' processes will run on each node simultaneously | |
nodesize=16 | |
nodeuse=$(($nodesize - 1)) | |
#Total number of jobs = Number of commands / number of commands used per job, rounded up | |
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse )) | |
workingdir=/home/maihofer/vets/qc | |
qsub gmmat.pbs -V -t 1-$totjobs -d $workingdir -F "-d files_to_gmmat.txt -e $dosdir -p $phenotype -c $covariate -g $grm -n $nodeuse " | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment