Skip to content

Instantly share code, notes, and snippets.

@arq5x
Created June 15, 2012 13:35
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save arq5x/2936510 to your computer and use it in GitHub Desktop.
Save arq5x/2936510 to your computer and use it in GitHub Desktop.
Exome pipeline for Charles
##########################################
# Step 0. setup a list of sample names.
# Assume that each of your gzipped
# FASTQ files is named as follows:
# sample1.1.fq.gz
# sample1.2.fq.gz
# sample2.1.fq.gz
# sample2.2.fq.gz
# ...
# sampleN.1.fq.gz
# sampleN.2.fq.gz
##########################################
export SAMPLES="sample1 sample2 sample3"
##########################################
# Step 1. Align the raw FASTQ with BWA
##########################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=bwa-aln
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=8 -N $STEPNAME"
echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.1.fq.gz > bam/$sample.1.sai" | $QSUB
echo "cd $EXHOME; bwa aln -e 1 -t 8 $GENOME fastq/$sample.2.fq.gz > bam/$sample.2.sai" | $QSUB
done
############################################################
# Step 2. Pair the alignments (assumes PE data).
# Keep proper, on-target (i.e. +/- 500 bp of a probe) pairs
# using bedtools.
# Require mapping quality >= 20
# Add the sample ID as a RG
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=bwa-pair
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
for sample in `echo $SAMPLES`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME"
echo "cd $EXHOME; bwa sampe -r '@RG\tID:$sample\tSM:$sample' $GENOME bam/$sample.1.sai bam/$sample.2.sai fastq/$sample.1.fq.gz fastq/$sample.2.fq.gz \
| samtools view -q 20 -f 2 -Su - \
| intersectBed -abam stdin -b bed/sureselect.liftedTohg19.slop500.bed \
> bam/$sample.conc.on.bam" | $QSUB
done
############################################################
# Step 3. Sort the on-target concordants by genome position.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=possrt
for sample in `echo $SAMPLES`
do
# -m ea sends an email when job (e)nds or (a)borts
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=1 -N $STEPNAME"
echo "cd $EXHOME; samtools sort -m 4000000000 bam/$sample.conc.on.bam bam/$sample.conc.on.pos" | $QSUB
done
############################################################
# Step 4. Index the position-sorted BAM files.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=index
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME"
echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.bam" | $QSUB
done
############################################################
# Step 5. Mark duplicates
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rs-ex-dupe
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60
for sample in `echo $SAMPLES`;
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=2 -N $STEPNAME";
echo "cd $EXHOME; java -Xmx2g -jar $BIN/MarkDuplicates.jar \
INPUT=bam/$sample.conc.on.pos.bam \
OUTPUT=bam/$sample.conc.on.pos.dedup.bam \
TMP_DIR=$EXHOME/bam \
VALIDATION_STRINGENCY=LENIENT \
ASSUME_SORTED=true \
METRICS_FILE=bam/$sample.markdup_metrics" | $QSUB;
done
############################################################
# Step 6. Index the position-sorted, deduped BAM files.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=dup-index
for sample in `echo $SAMPLES`
do
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=2000m:ncpus=1 -N $STEPNAME"
echo "cd $EXHOME; samtools index bam/$sample.conc.on.pos.dedup.bam" | $QSUB
done
############################################################
# Step 7. Merge BAM files from each sample into a _single_ bam
# file. This makes variant calling more accurate and
# facilitates visualization and variant calling.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=merge
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/picard-tools-1.60
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=4 -N $STEPNAME";
echo "cd $EXHOME; java -Xmx2g -jar $BIN/MergeSamFiles.jar \
O=bam/all.conc.on.pos.dedup.bam \
I=bam/sample1.conc.on.pos.dedup.bam \
I=bam/sample2.conc.on.pos.dedup.bam \
I=bam/sample3.conc.on.pos.dedup.bam \
TMP_DIR=$EXHOME/bam \
USE_THREADING=true" | $QSUB;
############################################################
# Step 8. Identify realignment targets. These are cases where
# the alignments are suspicious owing to INDELs in the region
#. This step identifies such regions and the next step
# corrects them. Improves accuracy of variant detection.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=realtgts
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=8000m:ncpus=3 -N $STEPNAME";
echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \
-T RealignerTargetCreator \
-R $GENOME \
-I bam/all.conc.on.pos.dedup.bam \
-o bam/all.conc.on.pos.dedup.bam.intervals" | $QSUB;
############################################################
# Step 9. Use GATK to realign the BAMs at the suspicious targets.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=realtgts
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=4 -N $STEPNAME";
echo "cd $RSHOME; java -Xmx2g -Djava.io.tmpdir=$EXHOME/bam -jar $BIN/GenomeAnalysisTK.jar \
-T IndelRealigner \
--num_threads 1 \
-R $GENOME \
-targetIntervals bam/all.conc.on.pos.dedup.bam.intervals \
-I bam/all.conc.on.pos.dedup.bam \
-o bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB;
############################################################
# Step 10. Index the merged, deduped BAM file as well as the
# merged, deduped, realigned file.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-index
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=4000m:ncpus=4 -N $STEPNAME"
echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.bam" | $QSUB
echo "cd $EXHOME; samtools index bam/all.conc.on.pos.dedup.realigned.bam" | $QSUB
############################################################
# Step 11. Call SNPs and INDELs with GATK W/O using BAQ.
############################################################
export EXHOME=/home/arq5x/cphg-home/projects/rs-exome
export STEPNAME=rsex-varbaq
export GENOME=/home/arq5x/cphg-home/shared/genomes/hg19/bwa/gatk/hg19_gatk.fa
export BIN=/home/arq5x/cphg-home/shared/bin/GenomeAnalysisTK-1.4-9-g1f1233b
export QSUB="qsub -q cphg -W group_list=CPHG -V -l select=1:mem=16000m:ncpus=12 -N $STEPNAME";
echo "cd $EXHOME; java -Xmx2g -jar $BIN/GenomeAnalysisTK.jar \
-T UnifiedGenotyper \
-glm BOTH \
--num_threads 10 \
-baq OFF \
-R $GENOME \
-I bam/all.conc.on.pos.dedup.realigned.bam \
-o varCalling/2012-Feb-01/all.raw.nobaq.vcf" | $QSUB;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment