nievergeltlab/convert_dosage_to_gmmat_format.sh

## convert_dosage_to_gmmat_format.sh
#Script to convert dosage data to GMMAT format. Requires also a list of usuable SNPs as input, but this list does not need to be filtered.

#Create a valid SNPlist
info=0.9
maf=0.01
zcat /home/maihofer/vets/qc/imputation/distribution/vets_eur_analysis_run2/daner_vets_eur_analysis_run2.gz | awk -v info=$info -v maf=$maf '{if (($8 > info) &&  ($6 > maf) && ($7 > maf) &&  ($6 < 1-maf) && ($7 < 1-maf) && ($11 != "NA")) print $2}' > european_valid_snps.snplist
echo "SNP" > snp.txt

cat snp.txt european_valid_snps.snplist | LC_ALL=C sort -k 1b,1 > european_valid_snps.sorted.snplist

#Specify where probability format genotypes are stored
probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1

#Specify where output will be stored
outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

#Specify working directory (error logs will go here)
workingdir=/home/maihofer/vets/qc/imputation/

#Get the number of subjects (assuming that it is the same across all files)
nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam  | awk '{print $1}')

#Specify list of valid SNPs
valid_snps=/home/maihofer/vets/qc/european_valid_snps.sorted.snplist

ls $probdir | grep .gz$ > files_To_make_dose.txt

#Number of commands to run is a function of the number of files
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' )

#Make a job code, where 'nodesize' processes will run on each node simultaneously
 nodesize=16
 nodeuse=$(($nodesize - 1))

#Total number of jobs = Number of commands / number of commands used per job, rounded up
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))


qsub make_dose_gmmat.pbs  -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -v $valid_snps -o files_To_make_dose.txt"


## gmmat.pbs
#PBS -lnodes=1
#PBS -lwalltime=0:05:00

#!/bin/bash

#Script to run GMMAT
while getopts d:e:p:c:g:n: option
do
  case "${option}"
    in
      d) dosages=${OPTARG};;
      e) dosages_dir=${OPTARG};;
      p) phenotype=${OPTARG};;
      c) covariate=${OPTARG};;
      g) grm=${OPTARG};;
      n) nodeuse=${OPTARG};;
    esac
done

 module load R
 #Write the start and stop points of the file
 jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
 jstop=$(($PBS_ARRAYID*$nodeuse))

 for j in $(seq -w $jstart 1 $jstop)
 do
  file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $dosages)
   Rscript gmmat.rscript "$dosages_dir"/"$file_use" "$grm" "$phenotype" "$covariate" gmmat_output/"$file_use".gmmat_score &
 done
wait

## gmmat.r
args <- commandArgs(trailingOnly = TRUE)

dosfilename <- args[1]
grm <- args[2]
phenotype <- args[3]
covar <- args[4]
outfilename <- args[5]

#Function script for GMMAT
library('GMMAT')

phen <- read.table(phenotype,header=F,na.strings=c("NA","-9"))
names(phen) <- c("FID","IID","PTSD")
phen$order <- 1:dim(phen)[1]
european <- read.table(covar,header=T)
dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID"))
dat <- dat0[order(dat0$order),]

grm <- as.matrix(read.table(grm))

model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit"))

scoret <- glmm.score(model0,infile=dosfilename,outfile=outfilename,infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F)


## installation_and_code_sample.r
#Note: Input dosage data MUST be tab delmited!!
module load c/intel
module load fortran/intel
R
# install.packages(c('Rcpp','RcppArmadillo')
# install.packages('/home/maihofer/vets/qc/GMMAT_0.7.tar.gz',repos=NULL,type='source')

library('GMMAT')

phen <- read.table('pts_vets_mix_am-qc.pheno',header=F,na.strings=c("NA","-9")) #PLINK phenotype
names(phen) <- c("FID","IID","PTSD")
phen$order <- 1:dim(phen)[1]
european <- read.table('/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov',header=T)
dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID"))
dat <- dat0[order(dat0$order),]

grm <- as.matrix(read.table("/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt")) # GRM from GEMMA

#Fit GRM
model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit"))

#Derive score test pvalue
scoret <- glmm.score(model0,infile="/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz",outfile='testout.txt',infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F)


#Alternative version: Wald test pvalue. Gives beta and SE, but incredibly slow!
snplist <- read.table("testsnplist.txt", header=F,stringsAsFactors=F)

system.time (results <- glmm.wald(fixed = as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins = grm,
        family = binomial(link = "logit"), snps=snplist$V1[1:5], method.optim = "Brent" infile = "/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz", infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"))
)

## make_dose_gmmat.pbs
#PBS -lnodes=1
#PBS -lwalltime=0:05:00

#!/bin/bash

#Convert data to GMMAT format

while getopts n:o:p:d:s:v: option
do
  case "${option}"
    in
      o) outputfile=${OPTARG};;
      n) nodeuse=${OPTARG};;
      p) probdir=${OPTARG};;
      d) outdir=${OPTARG};;
      s) nsub=${OPTARG};;
      v) valid_snplist=${OPTARG};;
    esac
done

 #This version of the script is made for GMMAT format files. GMMAT only works with non-monomorphic SNPS. requires a SNPLIST to be provided

 #Write the start and stop points of the file
 jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
 jstop=$(($PBS_ARRAYID*$nodeuse))

 for j in $(seq -w $jstart 1 $jstop)
 do
  file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $outputfile)

  LC_ALL=C join $valid_snplist <(zcat "$probdir"/"$file_use" | awk -v s=$nsub 'NR>1{ printf $1 " " $2 " " $3; for(i=1; i<=s; i++) printf " " $(i*2+2)*2+$(i*2+3); printf "\n" }' | LC_ALL=C sort -k1b,1) | sed 's/ /\t/g' | gzip > "$outdir"/"$file_use".doscnt.gmmat.gz &
 done
wait

## run_gmmat.sh
#Job script mode for GMMAT


#Specify where dosages are stored
dosdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

#Specify working directory (error logs will go here)
workingdir=/home/maihofer/vets/qc/

phenotype=pts_vets_mix_am-qc.pheno
covariate=/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov
grm=/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt

ls $dosdir | grep gmmat > files_to_gmmat.txt

#Number of commands to run is a function of the number of files
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' )

#Make a job code, where 'nodesize' processes will run on each node simultaneously
 nodesize=16
 nodeuse=$(($nodesize - 1))

#Total number of jobs = Number of commands / number of commands used per job, rounded up
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))

workingdir=/home/maihofer/vets/qc

qsub gmmat.pbs  -V -t 1-$totjobs -d $workingdir -F "-d files_to_gmmat.txt -e $dosdir -p $phenotype -c $covariate -g $grm -n $nodeuse "
	#Script to convert dosage data to GMMAT format. Requires also a list of usuable SNPs as input, but this list does not need to be filtered.

	#Create a valid SNPlist
	info=0.9
	maf=0.01
	zcat /home/maihofer/vets/qc/imputation/distribution/vets_eur_analysis_run2/daner_vets_eur_analysis_run2.gz \| awk -v info=$info -v maf=$maf '{if (($8 > info) && ($6 > maf) && ($7 > maf) && ($6 < 1-maf) && ($7 < 1-maf) && ($11 != "NA")) print $2}' > european_valid_snps.snplist
	echo "SNP" > snp.txt

	cat snp.txt european_valid_snps.snplist \| LC_ALL=C sort -k 1b,1 > european_valid_snps.sorted.snplist

	#Specify where probability format genotypes are stored
	probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1

	#Specify where output will be stored
	outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

	#Specify working directory (error logs will go here)
	workingdir=/home/maihofer/vets/qc/imputation/

	#Get the number of subjects (assuming that it is the same across all files)
	nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam \| awk '{print $1}')

	#Specify list of valid SNPs
	valid_snps=/home/maihofer/vets/qc/european_valid_snps.sorted.snplist

	ls $probdir \| grep .gz$ > files_To_make_dose.txt

	#Number of commands to run is a function of the number of files
	ncommands=$(wc -l files_To_make_dose.txt \| awk '{print $1}' )

	#Make a job code, where 'nodesize' processes will run on each node simultaneously
	nodesize=16
	nodeuse=$(($nodesize - 1))

	#Total number of jobs = Number of commands / number of commands used per job, rounded up
	totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))


	qsub make_dose_gmmat.pbs -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -v $valid_snps -o files_To_make_dose.txt"
	#PBS -lnodes=1
	#PBS -lwalltime=0:05:00

	#!/bin/bash

	#Script to run GMMAT
	while getopts d:e:p:c:g:n: option
	do
	case "${option}"
	in
	d) dosages=${OPTARG};;
	e) dosages_dir=${OPTARG};;
	p) phenotype=${OPTARG};;
	c) covariate=${OPTARG};;
	g) grm=${OPTARG};;
	n) nodeuse=${OPTARG};;
	esac
	done

	module load R
	#Write the start and stop points of the file
	jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
	jstop=$(($PBS_ARRAYID*$nodeuse))

	for j in $(seq -w $jstart 1 $jstop)
	do
	file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $dosages)
	Rscript gmmat.rscript "$dosages_dir"/"$file_use" "$grm" "$phenotype" "$covariate" gmmat_output/"$file_use".gmmat_score &
	done
	wait
	args <- commandArgs(trailingOnly = TRUE)

	dosfilename <- args[1]
	grm <- args[2]
	phenotype <- args[3]
	covar <- args[4]
	outfilename <- args[5]

	#Function script for GMMAT
	library('GMMAT')

	phen <- read.table(phenotype,header=F,na.strings=c("NA","-9"))
	names(phen) <- c("FID","IID","PTSD")
	phen$order <- 1:dim(phen)[1]
	european <- read.table(covar,header=T)
	dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID"))
	dat <- dat0[order(dat0$order),]

	grm <- as.matrix(read.table(grm))

	model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit"))

	scoret <- glmm.score(model0,infile=dosfilename,outfile=outfilename,infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F)
	#Note: Input dosage data MUST be tab delmited!!
	module load c/intel
	module load fortran/intel
	R
	# install.packages(c('Rcpp','RcppArmadillo')
	# install.packages('/home/maihofer/vets/qc/GMMAT_0.7.tar.gz',repos=NULL,type='source')

	library('GMMAT')

	phen <- read.table('pts_vets_mix_am-qc.pheno',header=F,na.strings=c("NA","-9")) #PLINK phenotype
	names(phen) <- c("FID","IID","PTSD")
	phen$order <- 1:dim(phen)[1]
	european <- read.table('/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov',header=T)
	dat0 <- merge(phen,european,all.x=TRUE,by=c("FID","IID"))
	dat <- dat0[order(dat0$order),]

	grm <- as.matrix(read.table("/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt")) # GRM from GEMMA

	#Fit GRM
	model0 <- glmmkin(as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins =grm, family = binomial(link = "logit"))

	#Derive score test pvalue
	scoret <- glmm.score(model0,infile="/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz",outfile='testout.txt',infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"),center=F)


	#Alternative version: Wald test pvalue. Gives beta and SE, but incredibly slow!
	snplist <- read.table("testsnplist.txt", header=F,stringsAsFactors=F)

	system.time (results <- glmm.wald(fixed = as.factor(PTSD) ~ C1 + C2 + C3 +C4 + C5, data = dat, kins = grm,
	family = binomial(link = "logit"), snps=snplist$V1[1:5], method.optim = "Brent" infile = "/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr10_015_018.out.dosage.gz.doscnt.gmmat.gz", infile.ncol.skip=3,infile.nrow.skip=0, infile.ncol.print = 1:3,infile.header.print = c("SNP","Allele1", "Allele2"))
	)
	#Job script mode for GMMAT



	#Specify where dosages are stored
	dosdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

	#Specify working directory (error logs will go here)
	workingdir=/home/maihofer/vets/qc/

	phenotype=pts_vets_mix_am-qc.pheno
	covariate=/home/maihofer/vets/qc/pca/pts_vets_mix_am-qc-eur_pca.menv.mds_cov
	grm=/home/maihofer/vets/qc/output/grm/grm_pts_vets_mix_am-qc.cXX.txt

	ls $dosdir \| grep gmmat > files_to_gmmat.txt

	#Number of commands to run is a function of the number of files
	ncommands=$(wc -l files_To_make_dose.txt \| awk '{print $1}' )

	#Make a job code, where 'nodesize' processes will run on each node simultaneously
	nodesize=16
	nodeuse=$(($nodesize - 1))

	#Total number of jobs = Number of commands / number of commands used per job, rounded up
	totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))

	workingdir=/home/maihofer/vets/qc

	qsub gmmat.pbs -V -t 1-$totjobs -d $workingdir -F "-d files_to_gmmat.txt -e $dosdir -p $phenotype -c $covariate -g $grm -n $nodeuse "