terrematte/rna_seq_variant_pipeline.sh

## rna_seq_variant_pipeline.sh
#!/bin/bash
#
# AUTHOR: Patrick Terrematte for HPC of NPAD/UFRN and hg38:
# http://hungria.imd.ufrn.br/~terrematte/aDNA/rna_seq_variant_pipeline.sh
#
# Based on pieline Anand M.
# RNA-Seq variant calling pieline according to GATK Best practices.
# https://www.broadinstitute.org/gatk/guide/article?id=3891
# Full Pipeline of Gatk of hg19:
# https://gist.github.com/PoisonAlien/c6c03539cf4b1ac41cf1
#
#
# Call with following arguments
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>
#
#
# Install and create a conda environment
# wget --no-check-certificate  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# bash Miniconda3-latest-Linux-x86_64.sh # add -u to update
# conda create -n genomic
# conda activate genomic
# conda install -c bioconda/label/cf201901 sra-tools
# conda install -c bioconda fastqc
# conda install -c bioconda star
# conda install -c bioconda picard
#
#
# Best practices of Gatk4 and Arguments Correspondence
# https://github.com/gatk-workflows/gatk4-rnaseq-germline-snps-indels/blob/master/gatk4-rna-best-practices.wdl
# https://gatk.broadinstitute.org/hc/en-us/articles/360036817931--Tool-Documentation-Index
#
#
# Download Genome hg38 (UCSC) and annotations
# Genomic public Data of hg38 - https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta  --no-check-certificate
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai  --no-check-certificate
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict --no-check-certificate
# wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.chr_patch_hapl_scaff.annotation.gtf.gz # GTF annotation
# unzip *gz
# cp gencode.v34.chr_patch_hapl_scaff.annotation.gtf  $SCRATCH_GLOBAL/ref_data/
# cp Homo_sapiens_assembly38.* $SCRATCH_GLOBAL/ref_data/
#
# This is required to build the indexes and dictionary of the reference genome, in case of another reference genome,
# for instance the Homo_sapiens.GRCh38.dna.primary_assembly.fa :
# wget ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz --no-check-certificate
# cd $SCRATCH_GLOBAL/ref_data/
# picard CreateSequenceDictionary \
#        R=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
#        O=Homo_sapiens.GRCh38.dna.primary_assembly.dict
# samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa


# SingleEnd parameters
if [ "$#" -eq 3 ]; then
	cpus=$1
	f=$2
	bn=$3
fi

# PairedEnd parameters
if [ "$#" -eq 4 ]; then
	cpus=$1
	fwd=$2
	rev=$3
	bn=$4
fi

# Wrong parameters
if [ "$#" -gt 4  ] || [ "$#" -lt 3 ]; then
   echo "Illegal number of parameters."
   echo "Call with following arguments:
			./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
			./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>"
   exit 1
fi


#Loading modules
module load softwares/samtools/1.3.1-gnu-4.8
module load softwares/gatk/4.0.9-pre-compiled

#Path to reference genome and Index files.
star_ref="$SCRATCH_GLOBAL/STAR_hg38_index/"
ref="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.fasta"

#Path to gatk and picard tools
gatk="gatk"
picard="picard"

#Path to gatk bundle set files
millsIndels="$SCRATCH_GLOBAL/ref_data/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
KGIndels="$SCRATCH_GLOBAL/ref_data/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
dbSNP138="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.dbsnp138.vcf.gz"

#Create an output directory
bn=$bn"."
opdir=$SCRATCH_GLOBAL"/"$bn"processed"
mkdir -p $opdir

mkdir -p $SCRATCH_GLOBAL/tmp

if [ "$#" -eq 3 ]; then
        #STAR SingleEnd basic mode run
        echo -e "["$(date)"]\tStep 1/7 -  Aligning with STAR SingleEnd..."
        STAR --genomeDir $star_ref \
                        --runThreadN $cpus  \
                        --readFilesIn $f  \
                        --outFileNamePrefix $opdir/$bn  \
                        --outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
                        --outSAMtype BAM Unsorted \
                        --outSAMunmapped Within


        mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
        mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
        mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"


        #samtools sort
        echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
        samtools sort -T  $opdir/$bn --threads  $cpus  $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"


        #samtools index
        echo -e "["$(date)"]\tStep 1/7 - Indexing with samtools.."
        samtools index $opdir/$bn"sorted.bam"


        #Building BaseRecalibrator
        echo -e "["$(date)"]\tStep 2/7 - Building BaseRecalibrator.."
        $gatk BaseRecalibrator \
                        -R $ref \
                        -I $opdir/$bn"sorted.bam" \
                        -O $opdir/$bn"recal.data.csv" \
                        --known-sites $KGIndels \
                        --known-sites $millsIndels \
                        --known-sites $dbSNP138 2> $opdir/$bn"2.BaseRecalibrator.log"


        #Perform ApplyBQSR
        # This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
        # in earlier versions of GATK (2.x and 3.x).
        echo -e "["$(date)"]\tStep 3/7 - Performing BQSR.."
        $gatk ApplyBQSR \
                        -R $ref \
                        -I $opdir/$bn"sorted.bam" \
                        -O $opdir/$bn"processed.bam" \
                        --bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"3.ApplyBQSR.log"


        #Run HaplotypeCaller
        echo -e "["$(date)"]\tStep 4/7 - Running HaplotypeCaller.."
        $gatk HaplotypeCaller \
                        -R $ref \
                        -I $opdir/$bn"processed.bam" \
                        -O $opdir/$bn"dbSNP.vcf" \
                        -stand-call-conf  20.0 \
                        --dont-use-soft-clipped-bases 2> $opdir/$bn"4.HaplotypeCaller.log"


        #Filter variants
        echo -e "["$(date)"]\tStep 5/7 - Filtering Variants.."
        $gatk VariantFiltration \
                        -R $ref \
                        -V $opdir/$bn"dbSNP.vcf" \
                        -O $opdir/$bn"filtered.vcf" \
                        -window 35 \
                        -cluster 3 \
                        -filter "FS > 30.0 || QD < 2.0" \
                        --filter-name FSQD 2>$opdir/$bn"5.VariantFilter.log"


        #Convert VariantsToTable - dbSNP.vcf.
        echo -e "["$(date)"]\tStep 6/7 - Convert VariantsToTable - dbSNP.vcf.."
        $gatk VariantsToTable \
             -V $opdir/$bn"dbSNP.vcf" \
             -F CHROM -F POS -F TYPE -GF AD \
             -O $opdir/$bn"dbSNP.vcf.tsv" 2> $opdir/$bn"6.VariantsToTable.dbSNP.log"


        #Convert VariantsToTable - filtered
        echo -e "["$(date)"]\tStep 7/7 - Convert VariantsToTable - filt.."
        $gatk VariantsToTable \
             -V $opdir/$bn"filtered.vcf" \
             -F CHROM -F POS -F TYPE -GF AD \
             -O $opdir/$bn"filt.vcf.tsv" 2> $opdir/$bn"7.VariantsToTable.filt.log"

        echo -e "["$(date)"]\tDONE!"

fi

if [ "$#" -eq 4 ]; then
        #STAR PairedEnd basic mode run
        echo -e "["$(date)"]\tStep 1/10 - Aligning with STAR PairedEnd..."
        STAR --outFileNamePrefix $opdir/$bn \
                        --outSAMtype BAM Unsorted \
                        --outSAMunmapped Within \
                        --outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
                        --outSAMstrandField intronMotif \
                        --genomeDir $star_ref \
                        --runThreadN $cpus \
                        --readFilesIn $fwd $rev

        mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
        mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
        mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"

        #samtools sort
        echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
        samtools sort -T  $opdir/$bn --threads  $cpus  $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"

        #samtools index
        echo -e "["$(date)"]\tStep 2/10 - Indexing with samtools.."
        samtools index $opdir/$bn"sorted.bam"

        #picard mark duplicates
        echo -e "["$(date)"]\tStep 3/10 - Marking duplicates.."
        $picard MarkDuplicates \
                        I=$opdir/$bn"sorted.bam" \
                        O=$opdir/$bn"dupMarked.bam" \
                        M=$opdir/$bn"dup.metrics" \
                        TMP_DIR=$SCRATCH_GLOBAL/tmp \
                        CREATE_INDEX=true \
                        VALIDATION_STRINGENCY=SILENT 2> $opdir/$bn"3.MarkDuplicates.log"

#		rm $opdir/$bn"sorted.bam"
#		rm $opdir/$bn"sorted.bam.bai"

        #SplitNCigarReads
        echo -e "["$(date)"]\tStep 4/10 - Spliting reads.."
        $gatk SplitNCigarReads \
                        -R $ref \
                        -I $opdir/$bn"dupMarked.bam" \
                        -O $opdir/$bn"split.bam" \
                         -skip-mq-transform \
                        --tmp-dir=$SCRATCH_GLOBAL/tmp 2> $opdir/$bn"4.SplitNCigarReads.log"

#		rm $opdir/$bn"dupMarked.bam"
#		rm $opdir/$bn"dupMarked.bai"

        #Building BaseRecalibrator
        echo -e "["$(date)"]\tStep 6/10 - Building BaseRecalibrator.."
        $gatk BaseRecalibrator \
                        -R $ref \
                        -I $opdir/$bn"split.bam" \
                        -O $opdir/$bn"recal.data.csv" \
                        --known-sites $KGIndels \
                        --known-sites $millsIndels \
                        --known-sites $dbSNP138 2> $opdir/$bn"6.BaseRecalibrator.log"

        #Perform ApplyBQSR
        # This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
        # in earlier versions of GATK (2.x and 3.x).
        echo -e "["$(date)"]\tStep 7/10 - Performing BQSR.."
        $gatk ApplyBQSR \
                        -R $ref \
                        -I $opdir/$bn"split.bam" \
                        -O $opdir/$bn"processed.bam" \
                        --bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"7.ApplyBQSR.log"

#		rm $opdir/$bn"split.bam"
#		rm $opdir/$bn"split.bai"

        #Run HaplotypeCaller
        echo -e "["$(date)"]\tStep 8/10 - Running HaplotypeCaller.."
        $gatk HaplotypeCaller \
                        -R $ref \
                        -I $opdir/$bn"processed.bam" \
                        -O $opdir/$bn"vcf" \
                        -stand-call-conf  20.0 \
                         -D $dbSNP138 \
                        --dont-use-soft-clipped-bases 2> $opdir/$bn"8.HaplotypeCaller.log"


        #Filter variants
        echo -e "["$(date)"]\tStep 9/10 - Filtering Variants.."
        $gatk VariantFiltration \
                        -R $ref \
                        -V $opdir/$bn"vcf" \
                        -O $opdir/$bn"filtered.vcf" \
                        -window 35 \
                        -cluster 3 \
                        -filter "FS > 30.0 || QD < 2.0" \
                        --filter-name FSQD 2>$opdir/$bn"9.VariantFilter.log"


        #Convert VariantsToTable
        echo -e "["$(date)"]\tStep 10/10 - Convert VariantsToTable.."
        $gatk VariantsToTable \
             -V $opdir/$bn"filtered.vcf" \
             -F CHROM -F POS -F TYPE -GF AD \
             -O $opdir/$bn".vcf.tsv" 2> $opdir/$bn"10.VariantsToTable.log"

        echo -e "["$(date)"]\tDONE!"
fi
	#!/bin/bash
	#
	# AUTHOR: Patrick Terrematte for HPC of NPAD/UFRN and hg38:
	# http://hungria.imd.ufrn.br/~terrematte/aDNA/rna_seq_variant_pipeline.sh
	#
	# Based on pieline Anand M.
	# RNA-Seq variant calling pieline according to GATK Best practices.
	# https://www.broadinstitute.org/gatk/guide/article?id=3891
	# Full Pipeline of Gatk of hg19:
	# https://gist.github.com/PoisonAlien/c6c03539cf4b1ac41cf1
	#
	#
	# Call with following arguments
	# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
	# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>
	#
	#
	# Install and create a conda environment
	# wget --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
	# bash Miniconda3-latest-Linux-x86_64.sh # add -u to update
	# conda create -n genomic
	# conda activate genomic
	# conda install -c bioconda/label/cf201901 sra-tools
	# conda install -c bioconda fastqc
	# conda install -c bioconda star
	# conda install -c bioconda picard
	#
	#
	# Best practices of Gatk4 and Arguments Correspondence
	# https://github.com/gatk-workflows/gatk4-rnaseq-germline-snps-indels/blob/master/gatk4-rna-best-practices.wdl
	# https://gatk.broadinstitute.org/hc/en-us/articles/360036817931--Tool-Documentation-Index
	#
	#
	# Download Genome hg38 (UCSC) and annotations
	# Genomic public Data of hg38 - https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1
	# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta --no-check-certificate
	# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai --no-check-certificate
	# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict --no-check-certificate
	# wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.chr_patch_hapl_scaff.annotation.gtf.gz # GTF annotation
	# unzip *gz
	# cp gencode.v34.chr_patch_hapl_scaff.annotation.gtf $SCRATCH_GLOBAL/ref_data/
	# cp Homo_sapiens_assembly38.* $SCRATCH_GLOBAL/ref_data/
	#
	# This is required to build the indexes and dictionary of the reference genome, in case of another reference genome,
	# for instance the Homo_sapiens.GRCh38.dna.primary_assembly.fa :
	# wget ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz --no-check-certificate
	# cd $SCRATCH_GLOBAL/ref_data/
	# picard CreateSequenceDictionary \
	# R=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
	# O=Homo_sapiens.GRCh38.dna.primary_assembly.dict
	# samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa



	# SingleEnd parameters
	if [ "$#" -eq 3 ]; then
	cpus=$1
	f=$2
	bn=$3
	fi

	# PairedEnd parameters
	if [ "$#" -eq 4 ]; then
	cpus=$1
	fwd=$2
	rev=$3
	bn=$4
	fi

	# Wrong parameters
	if [ "$#" -gt 4 ] \|\| [ "$#" -lt 3 ]; then
	echo "Illegal number of parameters."
	echo "Call with following arguments:
	./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
	./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>"
	exit 1
	fi


	#Loading modules
	module load softwares/samtools/1.3.1-gnu-4.8
	module load softwares/gatk/4.0.9-pre-compiled

	#Path to reference genome and Index files.
	star_ref="$SCRATCH_GLOBAL/STAR_hg38_index/"
	ref="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.fasta"

	#Path to gatk and picard tools
	gatk="gatk"
	picard="picard"

	#Path to gatk bundle set files
	millsIndels="$SCRATCH_GLOBAL/ref_data/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
	KGIndels="$SCRATCH_GLOBAL/ref_data/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
	dbSNP138="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.dbsnp138.vcf.gz"

	#Create an output directory
	bn=$bn"."
	opdir=$SCRATCH_GLOBAL"/"$bn"processed"
	mkdir -p $opdir

	mkdir -p $SCRATCH_GLOBAL/tmp

	if [ "$#" -eq 3 ]; then
	#STAR SingleEnd basic mode run
	echo -e "["$(date)"]\tStep 1/7 - Aligning with STAR SingleEnd..."
	STAR --genomeDir $star_ref \
	--runThreadN $cpus \
	--readFilesIn $f \
	--outFileNamePrefix $opdir/$bn \
	--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
	--outSAMtype BAM Unsorted \
	--outSAMunmapped Within


	mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
	mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
	mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"


	#samtools sort
	echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
	samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"


	#samtools index
	echo -e "["$(date)"]\tStep 1/7 - Indexing with samtools.."
	samtools index $opdir/$bn"sorted.bam"


	#Building BaseRecalibrator
	echo -e "["$(date)"]\tStep 2/7 - Building BaseRecalibrator.."
	$gatk BaseRecalibrator \
	-R $ref \
	-I $opdir/$bn"sorted.bam" \
	-O $opdir/$bn"recal.data.csv" \
	--known-sites $KGIndels \
	--known-sites $millsIndels \
	--known-sites $dbSNP138 2> $opdir/$bn"2.BaseRecalibrator.log"


	#Perform ApplyBQSR
	# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
	# in earlier versions of GATK (2.x and 3.x).
	echo -e "["$(date)"]\tStep 3/7 - Performing BQSR.."
	$gatk ApplyBQSR \
	-R $ref \
	-I $opdir/$bn"sorted.bam" \
	-O $opdir/$bn"processed.bam" \
	--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"3.ApplyBQSR.log"


	#Run HaplotypeCaller
	echo -e "["$(date)"]\tStep 4/7 - Running HaplotypeCaller.."
	$gatk HaplotypeCaller \
	-R $ref \
	-I $opdir/$bn"processed.bam" \
	-O $opdir/$bn"dbSNP.vcf" \
	-stand-call-conf 20.0 \
	--dont-use-soft-clipped-bases 2> $opdir/$bn"4.HaplotypeCaller.log"


	#Filter variants
	echo -e "["$(date)"]\tStep 5/7 - Filtering Variants.."
	$gatk VariantFiltration \
	-R $ref \
	-V $opdir/$bn"dbSNP.vcf" \
	-O $opdir/$bn"filtered.vcf" \
	-window 35 \
	-cluster 3 \
	-filter "FS > 30.0 \|\| QD < 2.0" \
	--filter-name FSQD 2>$opdir/$bn"5.VariantFilter.log"


	#Convert VariantsToTable - dbSNP.vcf.
	echo -e "["$(date)"]\tStep 6/7 - Convert VariantsToTable - dbSNP.vcf.."
	$gatk VariantsToTable \
	-V $opdir/$bn"dbSNP.vcf" \
	-F CHROM -F POS -F TYPE -GF AD \
	-O $opdir/$bn"dbSNP.vcf.tsv" 2> $opdir/$bn"6.VariantsToTable.dbSNP.log"


	#Convert VariantsToTable - filtered
	echo -e "["$(date)"]\tStep 7/7 - Convert VariantsToTable - filt.."
	$gatk VariantsToTable \
	-V $opdir/$bn"filtered.vcf" \
	-F CHROM -F POS -F TYPE -GF AD \
	-O $opdir/$bn"filt.vcf.tsv" 2> $opdir/$bn"7.VariantsToTable.filt.log"

	echo -e "["$(date)"]\tDONE!"

	fi

	if [ "$#" -eq 4 ]; then
	#STAR PairedEnd basic mode run
	echo -e "["$(date)"]\tStep 1/10 - Aligning with STAR PairedEnd..."
	STAR --outFileNamePrefix $opdir/$bn \
	--outSAMtype BAM Unsorted \
	--outSAMunmapped Within \
	--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
	--outSAMstrandField intronMotif \
	--genomeDir $star_ref \
	--runThreadN $cpus \
	--readFilesIn $fwd $rev

	mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
	mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
	mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"

	#samtools sort
	echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
	samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"

	#samtools index
	echo -e "["$(date)"]\tStep 2/10 - Indexing with samtools.."
	samtools index $opdir/$bn"sorted.bam"

	#picard mark duplicates
	echo -e "["$(date)"]\tStep 3/10 - Marking duplicates.."
	$picard MarkDuplicates \
	I=$opdir/$bn"sorted.bam" \
	O=$opdir/$bn"dupMarked.bam" \
	M=$opdir/$bn"dup.metrics" \
	TMP_DIR=$SCRATCH_GLOBAL/tmp \
	CREATE_INDEX=true \
	VALIDATION_STRINGENCY=SILENT 2> $opdir/$bn"3.MarkDuplicates.log"

	# rm $opdir/$bn"sorted.bam"
	# rm $opdir/$bn"sorted.bam.bai"

	#SplitNCigarReads
	echo -e "["$(date)"]\tStep 4/10 - Spliting reads.."
	$gatk SplitNCigarReads \
	-R $ref \
	-I $opdir/$bn"dupMarked.bam" \
	-O $opdir/$bn"split.bam" \
	-skip-mq-transform \
	--tmp-dir=$SCRATCH_GLOBAL/tmp 2> $opdir/$bn"4.SplitNCigarReads.log"

	# rm $opdir/$bn"dupMarked.bam"
	# rm $opdir/$bn"dupMarked.bai"

	#Building BaseRecalibrator
	echo -e "["$(date)"]\tStep 6/10 - Building BaseRecalibrator.."
	$gatk BaseRecalibrator \
	-R $ref \
	-I $opdir/$bn"split.bam" \
	-O $opdir/$bn"recal.data.csv" \
	--known-sites $KGIndels \
	--known-sites $millsIndels \
	--known-sites $dbSNP138 2> $opdir/$bn"6.BaseRecalibrator.log"

	#Perform ApplyBQSR
	# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
	# in earlier versions of GATK (2.x and 3.x).
	echo -e "["$(date)"]\tStep 7/10 - Performing BQSR.."
	$gatk ApplyBQSR \
	-R $ref \
	-I $opdir/$bn"split.bam" \
	-O $opdir/$bn"processed.bam" \
	--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"7.ApplyBQSR.log"

	# rm $opdir/$bn"split.bam"
	# rm $opdir/$bn"split.bai"

	#Run HaplotypeCaller
	echo -e "["$(date)"]\tStep 8/10 - Running HaplotypeCaller.."
	$gatk HaplotypeCaller \
	-R $ref \
	-I $opdir/$bn"processed.bam" \
	-O $opdir/$bn"vcf" \
	-stand-call-conf 20.0 \
	-D $dbSNP138 \
	--dont-use-soft-clipped-bases 2> $opdir/$bn"8.HaplotypeCaller.log"


	#Filter variants
	echo -e "["$(date)"]\tStep 9/10 - Filtering Variants.."
	$gatk VariantFiltration \
	-R $ref \
	-V $opdir/$bn"vcf" \
	-O $opdir/$bn"filtered.vcf" \
	-window 35 \
	-cluster 3 \
	-filter "FS > 30.0 \|\| QD < 2.0" \
	--filter-name FSQD 2>$opdir/$bn"9.VariantFilter.log"


	#Convert VariantsToTable
	echo -e "["$(date)"]\tStep 10/10 - Convert VariantsToTable.."
	$gatk VariantsToTable \
	-V $opdir/$bn"filtered.vcf" \
	-F CHROM -F POS -F TYPE -GF AD \
	-O $opdir/$bn".vcf.tsv" 2> $opdir/$bn"10.VariantsToTable.log"

	echo -e "["$(date)"]\tDONE!"
	fi