nievergeltlab/make_dos.pbs

## make_dos.pbs
#PBS -lnodes=1
#PBS -lwalltime=0:05:00

#!/bin/bash

while getopts n:o:p:d:s: option
do
  case "${option}"
    in
      o) outputfile=${OPTARG};;
      n) nodeuse=${OPTARG};;
      p) probdir=${OPTARG};;
      d) outdir=${OPTARG};;
      s) nsub=${OPTARG};;
    esac
done

 #This version of the script is made for GEMMA format files

 #Write the start and stop points of the file
 jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
 jstop=$(($PBS_ARRAYID*$nodeuse))

 for j in $(seq -w $jstart 1 $jstop)
 do
  file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $outputfile)
  zcat "$probdir"/"$file_use" | awk -v s=$nsub '{ printf $1 "," $2 "," $3; for(i=1; i<=s; i++) printf "," $(i*2+2)*2+$(i*2+3); printf "\n" }' | gzip > "$outdir"/"$file_use".doscnt.gz &
 done
wait


## make_dos_gmmat.pbs
#PBS -lnodes=1
#PBS -lwalltime=0:05:00

#!/bin/bash

while getopts n:o:p:d:s:v: option
do
  case "${option}"
    in
      o) outputfile=${OPTARG};;
      n) nodeuse=${OPTARG};;
      p) probdir=${OPTARG};;
      d) outdir=${OPTARG};;
      s) nsub=${OPTARG};;
      v) valid_snplist=${OPTARG};;
    esac
done

 #This version of the script is made for GMMAT format files. GMMAT only works with non-monomorphic SNPS. requires a SNPLIST to be provided

 #Write the start and stop points of the file
 jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
 jstop=$(($PBS_ARRAYID*$nodeuse))

 for j in $(seq -w $jstart 1 $jstop)
 do
  file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $outputfile)

  LC_ALL=C join $valid_snplist <(zcat "$probdir"/"$file_use" | awk -v s=$nsub '{ printf $1 " " $2 " " $3; for(i=1; i<=s; i++) printf " " $(i*2+2)*2+$(i*2+3); printf "\n" }' | LC_ALL=C sort -k1b,1) | gzip > "$outdir"/"$file_use".doscnt.gmmat.gz &
 done
wait


## probability_to_dosage
#Assumes that data is in 2 column probability format (probability of homozygote monor, heterozygote), with 3 preceding values (SNP, A1, A2)

nsub=$(wc -l probfile.fam | awk '{print $1}')

zcat probfile.gz | awk -v s=$nsub '{ printf $1 "," $2 "," $3; for(i=1; i<=s; i++) printf "," $(i*2+2)*2+$(i*2+3); printf "\n" }' | gzip > dosagefile.gz

## repeated_jobs.sh
#Specify where probability format genotypes are stored
probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1

#Specify where output will be stored
outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

#Specify working directory (error logs will go here)
workingdir=/home/maihofer/vets/qc/imputation/

#Get the number of subjects (assuming that it is the same across all files)
nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam  | awk '{print $1}')

ls $probdir | grep .gz$ > files_To_make_dose.txt

#Number of commands to run is a function of the number of files
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' )

#Make a job code, where 'nodesize' processes will run on each node simultaneously
 nodesize=16
 nodeuse=$(($nodesize - 1))

#Total number of jobs = Number of commands / number of commands used per job, rounded up
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))


qsub make_dose.pbs  -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -o files_To_make_dose.txt"

## repeated_jobs_gmmat.pbs
#Specify where probability format genotypes are stored
probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1

#Specify where output will be stored
outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

#Specify working directory (error logs will go here)
workingdir=/home/maihofer/vets/qc/imputation/

#Get the number of subjects (assuming that it is the same across all files)
nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam  | awk '{print $1}')

#Specify list of valid SNPs
valid_snps=/home/maihofer/vets/qc/european_valid_snps.sorted.snplist

ls $probdir | grep .gz$ > files_To_make_dose.txt

#Number of commands to run is a function of the number of files
ncommands=$(wc -l files_To_make_dose.txt | awk '{print $1}' )

#Make a job code, where 'nodesize' processes will run on each node simultaneously
 nodesize=16
 nodeuse=$(($nodesize - 1))

#Total number of jobs = Number of commands / number of commands used per job, rounded up
totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))


qsub make_dose_gmmat.pbs  -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -v $valid_snps -o files_To_make_dose.txt"
	#PBS -lnodes=1
	#PBS -lwalltime=0:05:00

	#!/bin/bash

	while getopts n:o:p:d:s: option
	do
	case "${option}"
	in
	o) outputfile=${OPTARG};;
	n) nodeuse=${OPTARG};;
	p) probdir=${OPTARG};;
	d) outdir=${OPTARG};;
	s) nsub=${OPTARG};;
	esac
	done

	#This version of the script is made for GEMMA format files

	#Write the start and stop points of the file
	jstart=$((($PBS_ARRAYID-1)*$nodeuse +1))
	jstop=$(($PBS_ARRAYID*$nodeuse))

	for j in $(seq -w $jstart 1 $jstop)
	do
	file_use=$(awk -v lineno=$j '{if(NR==lineno) print}' $outputfile)
	zcat "$probdir"/"$file_use" \| awk -v s=$nsub '{ printf $1 "," $2 "," $3; for(i=1; i<=s; i++) printf "," $(i2+2)2+$(i*2+3); printf "\n" }' \| gzip > "$outdir"/"$file_use".doscnt.gz &
	done
	wait
	#Assumes that data is in 2 column probability format (probability of homozygote monor, heterozygote), with 3 preceding values (SNP, A1, A2)

	nsub=$(wc -l probfile.fam \| awk '{print $1}')

	zcat probfile.gz \| awk -v s=$nsub '{ printf $1 "," $2 "," $3; for(i=1; i<=s; i++) printf "," $(i2+2)2+$(i*2+3); printf "\n" }' \| gzip > dosagefile.gz
	#Specify where probability format genotypes are stored
	probdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1

	#Specify where output will be stored
	outdir=/home/maihofer/vets/qc/imputation/dasuqc1_pts_vets_mix_am-qc.hg19.ch.fl/qc1_dose

	#Specify working directory (error logs will go here)
	workingdir=/home/maihofer/vets/qc/imputation/

	#Get the number of subjects (assuming that it is the same across all files)
	nsub=$(wc -l "$probdir"/dos_pts_vets_mix_am-qc.hg19.ch.fl.chr1_234_237.out.dosage.fam \| awk '{print $1}')

	ls $probdir \| grep .gz$ > files_To_make_dose.txt

	#Number of commands to run is a function of the number of files
	ncommands=$(wc -l files_To_make_dose.txt \| awk '{print $1}' )

	#Make a job code, where 'nodesize' processes will run on each node simultaneously
	nodesize=16
	nodeuse=$(($nodesize - 1))

	#Total number of jobs = Number of commands / number of commands used per job, rounded up
	totjobs=$(( ($ncommands + $nodeuse - 1 ) / $nodeuse ))


	qsub make_dose.pbs -t 1-$totjobs -d $workingdir -F "-s $nsub -d $outdir -p $probdir -n $nodeuse -o files_To_make_dose.txt"