Skip to content

Instantly share code, notes, and snippets.

@arq5x
Created February 7, 2011 18:20
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save arq5x/814879 to your computer and use it in GitHub Desktop.
Save arq5x/814879 to your computer and use it in GitHub Desktop.
RS Exome analysis on the PBS environment
export BATCH1="1094PC0005 1094PC0009 1094PC0012 1094PC0013 "
export BATCH2="1094PC0016 1094PC0017 1094PC0018 1094PC0019 \
1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025 "
export BATCH3="1478PC0001B 1478PC0002 1478PC0003 1478PC0004 \
1478PC0005 1478PC0006B 1478PC0007B 1478PC0008B \
1478PC0009B 1478PC0010 1478PC0011 1478PC0012 \
1478PC0013B 1478PC0014B 1478PC0015B 1478PC0016 \
1478PC0017B 1478PC0018 1478PC0019 1478PC0020 \
1478PC0021 1478PC0022B 1478PC0023B 1478PC0024B"
export BATCH4="1719PC0001 1719PC0002 1719PC0003 1719PC0004 \
1719PC0005 1719PC0006 1719PC0007 1719PC0008 \
1719PC0009 1719PC0010 1719PC0011 1719PC0012 \
1719PC0013 1719PC0014 1719PC0015 1719PC0016 \
1719PC0017 1719PC0018 1719PC0019 1719PC0020 \
1719PC0021 1719PC0022"
export ALL="1094PC0005 1094PC0009 1094PC0012 1094PC0013 \
1094PC0016 1094PC0017 1094PC0018 1094PC0019 \
1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025 \
1478PC0001B 1478PC0002 1478PC0003 1478PC0004 \
1478PC0005 1478PC0006B 1478PC0007B 1478PC0008B \
1478PC0009B 1478PC0010 1478PC0011 1478PC0012 \
1478PC0013B 1478PC0014B 1478PC0015B 1478PC0016 \
1478PC0017B 1478PC0018 1478PC0019 1478PC0020 \
1478PC0021 1478PC0022B 1478PC0023B 1478PC0024B \
1478PC0025 \
1719PC0001 1719PC0002 1719PC0003 1719PC0004 \
1719PC0005 1719PC0006 1719PC0007 1719PC0008 \
1719PC0009 1719PC0010 1719PC0011 1719PC0012 \
1719PC0013 1719PC0014 1719PC0015 1719PC0016 \
1719PC0017 1719PC0018 1719PC0019 1719PC0020 \
1719PC0021 1719PC0022"
export LAST="1478PC0025"
############################################################
# Convert latest batch.
############################################################
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_2_SL16528.fastq C0UPMACXX_s2_1_GSL48index_2_SL16528.fastq > 1719-PC-0001.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_2_SL16528.fastq C0UPMACXX_s2_2_GSL48index_2_SL16528.fastq > 1719-PC-0001.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_3_SL16529.fastq C0UPMACXX_s2_1_GSL48index_3_SL16529.fastq > 1719-PC-0002.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_3_SL16529.fastq C0UPMACXX_s2_2_GSL48index_3_SL16529.fastq > 1719-PC-0002.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_4_SL16530.fastq C0UPMACXX_s2_1_GSL48index_4_SL16530.fastq > 1719-PC-0003.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_4_SL16530.fastq C0UPMACXX_s2_2_GSL48index_4_SL16530.fastq > 1719-PC-0003.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_5_SL16531.fastq C0UPMACXX_s2_1_GSL48index_5_SL16531.fastq > 1719-PC-0004.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_5_SL16531.fastq C0UPMACXX_s2_2_GSL48index_5_SL16531.fastq > 1719-PC-0004.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_6_SL16532.fastq C0UPMACXX_s2_1_GSL48index_6_SL16532.fastq > 1719-PC-0005.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_6_SL16532.fastq C0UPMACXX_s2_2_GSL48index_6_SL16532.fastq > 1719-PC-0005.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_7_SL16533.fastq C0UPMACXX_s2_1_GSL48index_7_SL16533.fastq > 1719-PC-0006.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_7_SL16533.fastq C0UPMACXX_s2_2_GSL48index_7_SL16533.fastq > 1719-PC-0006.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_8_SL16534.fastq C0UPMACXX_s2_1_GSL48index_8_SL16534.fastq > 1719-PC-0007.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_8_SL16534.fastq C0UPMACXX_s2_2_GSL48index_8_SL16534.fastq > 1719-PC-0007.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_1_GSL48index_9_SL16535.fastq C0UPMACXX_s2_1_GSL48index_9_SL16535.fastq > 1719-PC-0008.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s1_2_GSL48index_9_SL16535.fastq C0UPMACXX_s2_2_GSL48index_9_SL16535.fastq > 1719-PC-0008.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_10_SL16536.fastq C0UPMACXX_s4_1_GSL48index_10_SL16536.fastq > 1719-PC-0009.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_10_SL16536.fastq C0UPMACXX_s4_2_GSL48index_10_SL16536.fastq > 1719-PC-0009.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_11_SL16537.fastq C0UPMACXX_s4_1_GSL48index_11_SL16537.fastq > 1719-PC-0010.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_11_SL16537.fastq C0UPMACXX_s4_2_GSL48index_11_SL16537.fastq > 1719-PC-0010.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_12_SL16538.fastq C0UPMACXX_s4_1_GSL48index_12_SL16538.fastq > 1719-PC-0011.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_12_SL16538.fastq C0UPMACXX_s4_2_GSL48index_12_SL16538.fastq > 1719-PC-0011.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_1_SL16539.fastq C0UPMACXX_s4_1_GSL48index_1_SL16539.fastq > 1719-PC-0012.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_1_SL16539.fastq C0UPMACXX_s4_2_GSL48index_1_SL16539.fastq > 1719-PC-0012.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_2_SL16540.fastq C0UPMACXX_s4_1_GSL48index_2_SL16540.fastq > 1719-PC-0013.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_2_SL16540.fastq C0UPMACXX_s4_2_GSL48index_2_SL16540.fastq > 1719-PC-0013.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_3_SL16541.fastq C0UPMACXX_s4_1_GSL48index_3_SL16541.fastq > 1719-PC-0014.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_3_SL16541.fastq C0UPMACXX_s4_2_GSL48index_3_SL16541.fastq > 1719-PC-0014.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_4_SL16542.fastq C0UPMACXX_s4_1_GSL48index_4_SL16542.fastq > 1719-PC-0015.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_4_SL16542.fastq C0UPMACXX_s4_2_GSL48index_4_SL16542.fastq > 1719-PC-0015.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_1_GSL48index_5_SL16543.fastq C0UPMACXX_s4_1_GSL48index_5_SL16543.fastq > 1719-PC-0016.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s3_2_GSL48index_5_SL16543.fastq C0UPMACXX_s4_2_GSL48index_5_SL16543.fastq > 1719-PC-0016.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_6_SL16544.fastq C0UPMACXX_s6_1_GSL48index_6_SL16544.fastq > 1719-PC-0017.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_6_SL16544.fastq C0UPMACXX_s6_2_GSL48index_6_SL16544.fastq > 1719-PC-0017.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_7_SL16545.fastq C0UPMACXX_s6_1_GSL48index_7_SL16545.fastq > 1719-PC-0018.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_7_SL16545.fastq C0UPMACXX_s6_2_GSL48index_7_SL16545.fastq > 1719-PC-0018.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_8_SL16546.fastq C0UPMACXX_s6_1_GSL48index_8_SL16546.fastq > 1719-PC-0019.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_8_SL16546.fastq C0UPMACXX_s6_2_GSL48index_8_SL16546.fastq > 1719-PC-0019.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_9_SL16547.fastq C0UPMACXX_s6_1_GSL48index_9_SL16547.fastq > 1719-PC-0020.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_9_SL16547.fastq C0UPMACXX_s6_2_GSL48index_9_SL16547.fastq > 1719-PC-0020.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_10_SL16548.fastq C0UPMACXX_s6_1_GSL48index_10_SL16548.fastq > 1719-PC-0021.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_10_SL16548.fastq C0UPMACXX_s6_2_GSL48index_10_SL16548.fastq > 1719-PC-0021.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_1_GSL48index_11_SL16549.fastq C0UPMACXX_s6_1_GSL48index_11_SL16549.fastq > 1719-PC-0022.1.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
echo "cd /home/arq5x/cphg-home/projects/rs-exome/origFromHudsonAlpha/2012-Sept; cat C0UPMACXX_s5_2_GSL48index_11_SL16549.fastq C0UPMACXX_s6_2_GSL48index_11_SL16549.fastq > 1719-PC-0022.2.fq" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=1000m:ncpus=1
############################################################
# Quick hack to ensure no sample mix-ups.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-proper
for sample in `echo $BATCH3`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=2 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools view -q 20 -f 2 -u origFromHudsonAlpha/$sample.bam \
| intersectBed -abam stdin -b bed/sureselect.liftedTohg19.slop500.numchroms.bed \
> bam/$sample.conc.on.bam" | $QSUB
done
######################################
# Sort the original BAM files by name:
######################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-nmsrt
for sample in `echo $BATCH4`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=6000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools sort -n -m 2000000000 origFromHudsonAlpha/$sample.bam bam/$sample.namesorted" | $QSUB
done
############################################################
# Create new FASTQ files for every sample
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-b2fq
for sample in `echo $BATCH4`;
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar /home/arq5x/cphg-home/bin/SamToFastq.jar \
INPUT=bam/$sample.namesorted.bam \
VALIDATION_STRINGENCY=LENIENT \
F=fastq/$sample.1.fq \
F2=fastq/$sample.2.fq" | $QSUB;
done
############################################################
# Gzip the FASTQs
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-fqzip
for sample in `echo $BATCH4`;
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; gzip fastq/$sample.1.fq" | $QSUB;
echo "cd $RSHOME; gzip fastq/$sample.2.fq" | $QSUB;
done
##########################################
# Delete the name-sorted BAMs
##########################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
for sample in `echo $BATCH4`
do
cd $RSHOME; rm bam/$sample.namesorted.bam
done
##########################################
# Align the raw FASTQ with BWA
##########################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-bwa-aln
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
for sample in `echo $BATCH4`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=8 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.1.fq > bam/$sample.1.sai" | $QSUB
echo "cd $RSHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.2.fq > bam/$sample.2.sai" | $QSUB
done
############################################################
# Pair the alignments.
# Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs.
# Require mapping quality >= 20
# Add the sample ID as a RG
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-bwa-pair
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
for sample in `echo $BATCH4`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; bwa sampe -r '@RG\tID:$sample\tSM:$sample' $GENOME bam/$sample.1.sai bam/$sample.2.sai fastq/$sample.1.fq fastq/$sample.2.fq \
| samtools view -q 20 -f 2 -Su - \
| intersectBed -abam stdin -b bed/sureselect.liftedTohg19.slop500.bed \
> bam/$sample.conc.on.bam" | $QSUB
done
######################################
# Data quality assessment - flagstat:
######################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-flagstat
for sample in `echo $ALL`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools flagstat origFromHudsonAlpha/$sample.bam > origFromHudsonAlpha/$sample.bam.flagstat" | $QSUB
done
##########################################################
# Data quality assessment - flagstat for conc, on-target:
##########################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-conc-fs
for sample in `echo $ALL`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools flagstat bam/$sample.conc.on.bam > bam/$sample.conc.on.bam.flagstat" | $QSUB
done
######################################
# Data quality assessment - coverage:
######################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-cov
export SAMPLES="1094PC0005 1094PC0009 1094PC0012 1094PC0013 1094PC0016 1094PC0017 1094PC0018 1094PC0019 1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025"
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; coverageBed -abam bam/$sample.conc.on.bam -b bed/sureselect.liftedTohg19.slop500.bed > bam/$sample.conc.on.bam.cov" | $QSUB
done
######################################
# Data quality assessment - err. rate:
######################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-err
export SAMPLES="1094PC0005 1094PC0009 1094PC0012 1094PC0013 1094PC0016 1094PC0017 1094PC0018 1094PC0019 1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025"
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; bamToBed -i bam/$sample.conc.on.bam -tag NM | head -1000000 | cut -f 5 | stats > bam/$sample.conc.on.bam.err" | $QSUB
done
############################################################
# Sort the on-target concordants by genome position.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-possrt
for sample in `echo $BATCH4`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=12000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools sort -m 4000000000 bam/$sample.conc.on.bam bam/$sample.conc.on.pos" | $QSUB
done
############################################################
# Index the position-sorted BAM files.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-index
for sample in `echo $BATCH4`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools index bam/$sample.conc.on.pos.bam" | $QSUB
done
############################################################
# Use mpileup to get a snese of any sample mix-ups
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-mpile
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export SAMPLES="bam/1094PC0005.conc.on.pos.bam \
bam/1094PC0009.conc.on.pos.bam \
bam/1094PC0012.conc.on.pos.bam \
bam/1094PC0013.conc.on.pos.bam \
bam/1094PC0016.conc.on.pos.bam \
bam/1094PC0017.conc.on.pos.bam \
bam/1094PC0018.conc.on.pos.bam \
bam/1094PC0019.conc.on.pos.bam \
bam/1094PC0020.conc.on.pos.bam \
bam/1094PC0021.conc.on.pos.bam \
bam/1094PC0022.conc.on.pos.bam \
bam/1094PC0023.conc.on.pos.bam \
bam/1094PC0025.conc.on.pos.bam \
bam/1478PC0017B.conc.on.bam \
bam/1478PC0012.conc.on.bam \
bam/1478PC0024B.conc.on.bam \
bam/1478PC0004.conc.on.bam \
bam/1478PC0003.conc.on.bam \
bam/1478PC0018.conc.on.bam \
bam/1478PC0009B.conc.on.bam \
bam/1478PC0010.conc.on.bam \
bam/1478PC0013B.conc.on.bam \
bam/1478PC0011.conc.on.bam \
bam/1478PC0019.conc.on.bam \
bam/1478PC0007B.conc.on.bam \
bam/1478PC0022B.conc.on.bam \
bam/1478PC0015B.conc.on.bam \
bam/1478PC0021.conc.on.bam \
bam/1478PC0014B.conc.on.bam \
bam/1478PC0005.conc.on.bam \
bam/1478PC0001B.conc.on.bam \
bam/1478PC0006B.conc.on.bam \
bam/1478PC0008B.conc.on.bam \
bam/1478PC0016.conc.on.bam \
bam/1478PC0002.conc.on.bam \
bam/1478PC0023B.conc.on.bam \
bam/1478PC0020.conc.on.bam"
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools mpileup -uf $GENOME $SAMPLES | bcftools view -bvcg - > varCalling/2012-Jan/kinship-quick/var.mpileup.raw.bcf" | $QSUB
############################################################
# Mark duplicates
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-dupe
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60
for sample in `echo $BATCH4`;
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=2 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar $BIN/MarkDuplicates.jar \
INPUT=bam/$sample.conc.on.pos.bam \
OUTPUT=bam/$sample.conc.on.pos.dedup.bam \
TMP_DIR=$RSHOME/bam \
VALIDATION_STRINGENCY=LENIENT \
ASSUME_SORTED=true \
METRICS_FILE=bam/$sample.markdup_metrics" | $QSUB;
done
############################################################
# Index the position-sorted, deduped BAM files.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-index
for sample in `echo $BATCH4`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools index bam/$sample.conc.on.pos.dedup.bam" | $QSUB
done
############################################################
# Merge BAM files
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-merge
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=4 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar $BIN/MergeSamFiles.jar \
O=bam/all.conc.on.pos.dedup.bam \
I=bam/1094PC0005.conc.on.pos.dedup.bam \
I=bam/1094PC0009.conc.on.pos.dedup.bam \
I=bam/1094PC0012.conc.on.pos.dedup.bam \
I=bam/1094PC0013.conc.on.pos.dedup.bam \
I=bam/1094PC0016.conc.on.pos.dedup.bam \
I=bam/1094PC0017.conc.on.pos.dedup.bam \
I=bam/1094PC0018.conc.on.pos.dedup.bam \
I=bam/1094PC0019.conc.on.pos.dedup.bam \
I=bam/1094PC0020.conc.on.pos.dedup.bam \
I=bam/1094PC0021.conc.on.pos.dedup.bam \
I=bam/1094PC0022.conc.on.pos.dedup.bam \
I=bam/1094PC0023.conc.on.pos.dedup.bam \
I=bam/1094PC0025.conc.on.pos.dedup.bam \
I=bam/1478PC0001B.conc.on.pos.dedup.bam \
I=bam/1478PC0002.conc.on.pos.dedup.bam \
I=bam/1478PC0003.conc.on.pos.dedup.bam \
I=bam/1478PC0004.conc.on.pos.dedup.bam \
I=bam/1478PC0005.conc.on.pos.dedup.bam \
I=bam/1478PC0006B.conc.on.pos.dedup.bam \
I=bam/1478PC0007B.conc.on.pos.dedup.bam \
I=bam/1478PC0008B.conc.on.pos.dedup.bam \
I=bam/1478PC0009B.conc.on.pos.dedup.bam \
I=bam/1478PC0010.conc.on.pos.dedup.bam \
I=bam/1478PC0011.conc.on.pos.dedup.bam \
I=bam/1478PC0012.conc.on.pos.dedup.bam \
I=bam/1478PC0013B.conc.on.pos.dedup.bam \
I=bam/1478PC0014B.conc.on.pos.dedup.bam \
I=bam/1478PC0015B.conc.on.pos.dedup.bam \
I=bam/1478PC0016.conc.on.pos.dedup.bam \
I=bam/1478PC0017B.conc.on.pos.dedup.bam \
I=bam/1478PC0018.conc.on.pos.dedup.bam \
I=bam/1478PC0019.conc.on.pos.dedup.bam \
I=bam/1478PC0020.conc.on.pos.dedup.bam \
I=bam/1478PC0021.conc.on.pos.dedup.bam \
I=bam/1478PC0022B.conc.on.pos.dedup.bam \
I=bam/1478PC0023B.conc.on.pos.dedup.bam \
I=bam/1478PC0024B.conc.on.pos.dedup.bam \
I=bam/1478PC0025.conc.on.pos.dedup.bam \
I=bam/1719PC0001.conc.on.pos.dedup.bam \
I=bam/1719PC0002.conc.on.pos.dedup.bam \
I=bam/1719PC0003.conc.on.pos.dedup.bam \
I=bam/1719PC0004.conc.on.pos.dedup.bam \
I=bam/1719PC0005.conc.on.pos.dedup.bam \
I=bam/1719PC0006.conc.on.pos.dedup.bam \
I=bam/1719PC0007.conc.on.pos.dedup.bam \
I=bam/1719PC0008.conc.on.pos.dedup.bam \
I=bam/1719PC0009.conc.on.pos.dedup.bam \
I=bam/1719PC0010.conc.on.pos.dedup.bam \
I=bam/1719PC0011.conc.on.pos.dedup.bam \
I=bam/1719PC0012.conc.on.pos.dedup.bam \
I=bam/1719PC0013.conc.on.pos.dedup.bam \
I=bam/1719PC0014.conc.on.pos.dedup.bam \
I=bam/1719PC0015.conc.on.pos.dedup.bam \
I=bam/1719PC0016.conc.on.pos.dedup.bam \
I=bam/1719PC0017.conc.on.pos.dedup.bam \
I=bam/1719PC0018.conc.on.pos.dedup.bam \
I=bam/1719PC0019.conc.on.pos.dedup.bam \
I=bam/1719PC0020.conc.on.pos.dedup.bam \
I=bam/1719PC0021.conc.on.pos.dedup.bam \
I=bam/1719PC0022.conc.on.pos.dedup.bam \
TMP_DIR=$RSHOME/bam \
USE_THREADING=true" | $QSUB;
############################################################
# Identify realignment targets.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-realtgts
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R $GENOME \
-I bam/all.conc.on.pos.dedup.bam \
-o bam/all.conc.on.pos.dedup.bam.intervals" | $QSUB;
############################################################
# Use GATK to realign the BAMs at the suspicious targets.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-realtgts
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=4 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -Djava.io.tmpdir=$RSHOME/bam -jar $BIN/GenomeAnalysisTK.jar \
-T IndelRealigner \
--num_threads 1 \
-R $GENOME \
-targetIntervals bam/all.conc.on.pos.dedup.bam.intervals \
-I bam/all.conc.on.pos.dedup.bam \
-o bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB;
############################################################
# Index the merged, deduped BAM file as well as the
# merged, deduped, realigned file.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-index
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=4 -N $STEPNAME -m bea -M arq5x@virginia.edu"
echo "cd $RSHOME; samtools index bam/all.conc.on.pos.dedup.bam" | $QSUB
echo "cd $RSHOME; samtools index bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB
############################################################
# Call SNPs and INDELs with GATK using BAQ.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-varbaq
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=12 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-glm BOTH \
--num_threads 10 \
-baq CALCULATE_AS_NECESSARY \
-R $GENOME \
-I bam/all.conc.on.pos.dedup.realigned.bam \
-o varCalling/2012-Nov-09/all.raw.baq.vcf" | $QSUB;
############################################################
# Call SNPs and INDELs with GATK W/O using BAQ.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-varbaq
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=12 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-glm BOTH \
--num_threads 10 \
-baq OFF \
-R $GENOME \
-I bam/all.conc.on.pos.dedup.realigned.bam \
-o varCalling/2012-Nov-09/all.raw.nobaq.vcf" | $QSUB;
############################################################
# Call INDELs with GATK.
############################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-callsnps
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export SAMPLES="1094PC0005 1094PC0009 1094PC0012 1094PC0013 1094PC0016 1094PC0017 1094PC0018 1094PC0019 1094PC0020 1094PC0021 1094PC0022 1094PC0023 1094PC0025"
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=4 -N $STEPNAME -m bea -M arq5x@virginia.edu";
for sample in `echo $SAMPLES`;
do
echo "cd $RSHOME; java -Xmx4g -jar /home/arq5x/cphg-home/bin/GenomeAnalysisTK.jar \
-T IndelGenotyperV2 \
-l INFO \
-R $GENOME \
-I bam/$sample.conc.on.pos.realigned.dedup.baq.bam \
-o varCalling/$sample.indel.vcf" | $QSUB;
done
########################################################################
# Combine the INDEL call from each sample into a single file with GATK.
########################################################################
export RSHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-callsnps
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=2 -N $STEPNAME -m bea -M arq5x@virginia.edu";
echo "cd $RSHOME; java -Xmx10g -jar /home/arq5x/cphg-home/bin/GenomeAnalysisTK.jar \
-T CombineVariants \
-R $GENOME \
-o varCalling/indels.raw.vcf \
-variantMergeOptions UNION \
-genotypeMergeOptions UNIQUIFY \
-B:1094PC0005,VCF varCalling/1094PC0005.indel.vcf \
-B:1094PC0009,VCF varCalling/1094PC0009.indel.vcf \
-B:1094PC0012,VCF varCalling/1094PC0012.indel.vcf \
-B:1094PC0013,VCF varCalling/1094PC0013.indel.vcf \
-B:1094PC0016,VCF varCalling/1094PC0016.indel.vcf \
-B:1094PC0017,VCF varCalling/1094PC0017.indel.vcf \
-B:1094PC0018,VCF varCalling/1094PC0018.indel.vcf \
-B:1094PC0019,VCF varCalling/1094PC0019.indel.vcf \
-B:1094PC0020,VCF varCalling/1094PC0020.indel.vcf \
-B:1094PC0021,VCF varCalling/1094PC0021.indel.vcf \
-B:1094PC0022,VCF varCalling/1094PC0022.indel.vcf \
-B:1094PC0023,VCF varCalling/1094PC0023.indel.vcf \
-B:1094PC0025,VCF varCalling/1094PC0025.indel.vcf " | $QSUB;
echo "cd /home/arq5x/cphg-home/projects/rs-exome/varCalling/; annotateVcf.py -i snps.raw.vcf -p /home/arq5x/cphg-home/bin/annovar/ -s /home/arq5x/cphg-home/shared/annotations/hg19/dbSNP/dbsnp.131.hg19.bed -a /home/arq5x/cphg-home/shared/data/exomes/1000G-210Exomes-20110214/ALL.broad_exome.20110124.both.sites.vcf -o /home/arq5x/cphg-home/projects/rs-exome/varCalling/" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=12000m:ncpus=1 -N snp -m bea -M arq5x@virginia.edu
echo "cd /home/arq5x/cphg-home/projects/rs-exome/varCalling/; annotateVcf.py -i indels.raw.vcf -p /home/arq5x/cphg-home/bin/annovar/ -s /home/arq5x/cphg-home/shared/annotations/hg19/dbSNP/dbsnp.131.hg19.bed -a /home/arq5x/cphg-home/shared/data/exomes/1000G-210Exomes-20110214/ALL.broad_exome.20110124.both.sites.vcf -o /home/arq5x/cphg-home/projects/rs-exome/varCalling/" | qsub -q cphg -W group_list=CPHG -V -l select=1:mem=12000m:ncpus=1 -N indel -m bea -M arq5x@virginia.edu
# combine the output of snps and indel annotations into a single file
cat snps.raw.vcf.1.annovar_anno.2.dbsnp_anno.3.vcf_anno indels.raw.vcf.1.annovar_anno.2.dbsnp_anno.3.vcf_anno > all.vcf.anno
# move the file to my laptop
scp elder.itc.virginia.edu:~/cphg-home/projects/rs-exome/varCalling/all.vcf.anno all.vcf.anno.txt
# extract some add'l information from the VCF INFO field
perl -lane 'if (/DP=(\d+)/) {print $1} else {print "-1"}' all.vcf.anno.txt > depths.txt
perl -lane 'if (/HRun=(\d+)/) {print $1} else {print "-1"}' all.vcf.anno.txt > hruns.txt
perl -lane 'if (/AF=(\d.\d+)/) {print $1} else {print "-1"}' all.vcf.anno.txt > dafs.txt
perl -lane 'if (/MQ=(\d+.\d+)/) {print $1} else {print "-1"}' all.vcf.anno.txt > mapqs.txt
# add the extra fields.
paste depths.txt mapqs.txt dafs.txt hruns.txt all.vcf.anno.txt > all.vcf.anno.info.txt
# require: at least 100 reads
# mapq>=30
# no intronic, intergenic, upstream, downstream or synonymous
awk '$1>=100 && $2 >30' all.vcf.anno.info.txt | \
grep -v intronic | \
grep -v intergenic | \
grep -v upstream | \
grep -v downstream > \
all.vcf.anno.info.final.txt
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment