Skip to content

Instantly share code, notes, and snippets.

@terrematte
Forked from PoisonAlien/rna_seq_variant_pipeline.sh
Last active December 16, 2023 06:53
Show Gist options
  • Save terrematte/43116d40ce5756fb5978fb345eb3d586 to your computer and use it in GitHub Desktop.
Save terrematte/43116d40ce5756fb5978fb345eb3d586 to your computer and use it in GitHub Desktop.
RNA seq Variant calling pipeline according to gatk4 best practices for hg38
#!/bin/bash
#
# AUTHOR: Patrick Terrematte for HPC of NPAD/UFRN and hg38:
# http://hungria.imd.ufrn.br/~terrematte/aDNA/rna_seq_variant_pipeline.sh
#
# Based on pieline Anand M.
# RNA-Seq variant calling pieline according to GATK Best practices.
# https://www.broadinstitute.org/gatk/guide/article?id=3891
# Full Pipeline of Gatk of hg19:
# https://gist.github.com/PoisonAlien/c6c03539cf4b1ac41cf1
#
#
# Call with following arguments
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>
#
#
# Install and create a conda environment
# wget --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
# bash Miniconda3-latest-Linux-x86_64.sh # add -u to update
# conda create -n genomic
# conda activate genomic
# conda install -c bioconda/label/cf201901 sra-tools
# conda install -c bioconda fastqc
# conda install -c bioconda star
# conda install -c bioconda picard
#
#
# Best practices of Gatk4 and Arguments Correspondence
# https://github.com/gatk-workflows/gatk4-rnaseq-germline-snps-indels/blob/master/gatk4-rna-best-practices.wdl
# https://gatk.broadinstitute.org/hc/en-us/articles/360036817931--Tool-Documentation-Index
#
#
# Download Genome hg38 (UCSC) and annotations
# Genomic public Data of hg38 - https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta --no-check-certificate
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai --no-check-certificate
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict --no-check-certificate
# wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.chr_patch_hapl_scaff.annotation.gtf.gz # GTF annotation
# unzip *gz
# cp gencode.v34.chr_patch_hapl_scaff.annotation.gtf $SCRATCH_GLOBAL/ref_data/
# cp Homo_sapiens_assembly38.* $SCRATCH_GLOBAL/ref_data/
#
# This is required to build the indexes and dictionary of the reference genome, in case of another reference genome,
# for instance the Homo_sapiens.GRCh38.dna.primary_assembly.fa :
# wget ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz --no-check-certificate
# cd $SCRATCH_GLOBAL/ref_data/
# picard CreateSequenceDictionary \
# R=Homo_sapiens.GRCh38.dna.primary_assembly.fa \
# O=Homo_sapiens.GRCh38.dna.primary_assembly.dict
# samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa
# SingleEnd parameters
if [ "$#" -eq 3 ]; then
cpus=$1
f=$2
bn=$3
fi
# PairedEnd parameters
if [ "$#" -eq 4 ]; then
cpus=$1
fwd=$2
rev=$3
bn=$4
fi
# Wrong parameters
if [ "$#" -gt 4 ] || [ "$#" -lt 3 ]; then
echo "Illegal number of parameters."
echo "Call with following arguments:
./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename>
./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>"
exit 1
fi
#Loading modules
module load softwares/samtools/1.3.1-gnu-4.8
module load softwares/gatk/4.0.9-pre-compiled
#Path to reference genome and Index files.
star_ref="$SCRATCH_GLOBAL/STAR_hg38_index/"
ref="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.fasta"
#Path to gatk and picard tools
gatk="gatk"
picard="picard"
#Path to gatk bundle set files
millsIndels="$SCRATCH_GLOBAL/ref_data/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
KGIndels="$SCRATCH_GLOBAL/ref_data/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
dbSNP138="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.dbsnp138.vcf.gz"
#Create an output directory
bn=$bn"."
opdir=$SCRATCH_GLOBAL"/"$bn"processed"
mkdir -p $opdir
mkdir -p $SCRATCH_GLOBAL/tmp
if [ "$#" -eq 3 ]; then
#STAR SingleEnd basic mode run
echo -e "["$(date)"]\tStep 1/7 - Aligning with STAR SingleEnd..."
STAR --genomeDir $star_ref \
--runThreadN $cpus \
--readFilesIn $f \
--outFileNamePrefix $opdir/$bn \
--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
--outSAMtype BAM Unsorted \
--outSAMunmapped Within
mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"
#samtools sort
echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"
#samtools index
echo -e "["$(date)"]\tStep 1/7 - Indexing with samtools.."
samtools index $opdir/$bn"sorted.bam"
#Building BaseRecalibrator
echo -e "["$(date)"]\tStep 2/7 - Building BaseRecalibrator.."
$gatk BaseRecalibrator \
-R $ref \
-I $opdir/$bn"sorted.bam" \
-O $opdir/$bn"recal.data.csv" \
--known-sites $KGIndels \
--known-sites $millsIndels \
--known-sites $dbSNP138 2> $opdir/$bn"2.BaseRecalibrator.log"
#Perform ApplyBQSR
# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
# in earlier versions of GATK (2.x and 3.x).
echo -e "["$(date)"]\tStep 3/7 - Performing BQSR.."
$gatk ApplyBQSR \
-R $ref \
-I $opdir/$bn"sorted.bam" \
-O $opdir/$bn"processed.bam" \
--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"3.ApplyBQSR.log"
#Run HaplotypeCaller
echo -e "["$(date)"]\tStep 4/7 - Running HaplotypeCaller.."
$gatk HaplotypeCaller \
-R $ref \
-I $opdir/$bn"processed.bam" \
-O $opdir/$bn"dbSNP.vcf" \
-stand-call-conf 20.0 \
--dont-use-soft-clipped-bases 2> $opdir/$bn"4.HaplotypeCaller.log"
#Filter variants
echo -e "["$(date)"]\tStep 5/7 - Filtering Variants.."
$gatk VariantFiltration \
-R $ref \
-V $opdir/$bn"dbSNP.vcf" \
-O $opdir/$bn"filtered.vcf" \
-window 35 \
-cluster 3 \
-filter "FS > 30.0 || QD < 2.0" \
--filter-name FSQD 2>$opdir/$bn"5.VariantFilter.log"
#Convert VariantsToTable - dbSNP.vcf.
echo -e "["$(date)"]\tStep 6/7 - Convert VariantsToTable - dbSNP.vcf.."
$gatk VariantsToTable \
-V $opdir/$bn"dbSNP.vcf" \
-F CHROM -F POS -F TYPE -GF AD \
-O $opdir/$bn"dbSNP.vcf.tsv" 2> $opdir/$bn"6.VariantsToTable.dbSNP.log"
#Convert VariantsToTable - filtered
echo -e "["$(date)"]\tStep 7/7 - Convert VariantsToTable - filt.."
$gatk VariantsToTable \
-V $opdir/$bn"filtered.vcf" \
-F CHROM -F POS -F TYPE -GF AD \
-O $opdir/$bn"filt.vcf.tsv" 2> $opdir/$bn"7.VariantsToTable.filt.log"
echo -e "["$(date)"]\tDONE!"
fi
if [ "$#" -eq 4 ]; then
#STAR PairedEnd basic mode run
echo -e "["$(date)"]\tStep 1/10 - Aligning with STAR PairedEnd..."
STAR --outFileNamePrefix $opdir/$bn \
--outSAMtype BAM Unsorted \
--outSAMunmapped Within \
--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \
--outSAMstrandField intronMotif \
--genomeDir $star_ref \
--runThreadN $cpus \
--readFilesIn $fwd $rev
mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out"
mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out"
mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out"
#samtools sort
echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.."
samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam"
#samtools index
echo -e "["$(date)"]\tStep 2/10 - Indexing with samtools.."
samtools index $opdir/$bn"sorted.bam"
#picard mark duplicates
echo -e "["$(date)"]\tStep 3/10 - Marking duplicates.."
$picard MarkDuplicates \
I=$opdir/$bn"sorted.bam" \
O=$opdir/$bn"dupMarked.bam" \
M=$opdir/$bn"dup.metrics" \
TMP_DIR=$SCRATCH_GLOBAL/tmp \
CREATE_INDEX=true \
VALIDATION_STRINGENCY=SILENT 2> $opdir/$bn"3.MarkDuplicates.log"
# rm $opdir/$bn"sorted.bam"
# rm $opdir/$bn"sorted.bam.bai"
#SplitNCigarReads
echo -e "["$(date)"]\tStep 4/10 - Spliting reads.."
$gatk SplitNCigarReads \
-R $ref \
-I $opdir/$bn"dupMarked.bam" \
-O $opdir/$bn"split.bam" \
-skip-mq-transform \
--tmp-dir=$SCRATCH_GLOBAL/tmp 2> $opdir/$bn"4.SplitNCigarReads.log"
# rm $opdir/$bn"dupMarked.bam"
# rm $opdir/$bn"dupMarked.bai"
#Building BaseRecalibrator
echo -e "["$(date)"]\tStep 6/10 - Building BaseRecalibrator.."
$gatk BaseRecalibrator \
-R $ref \
-I $opdir/$bn"split.bam" \
-O $opdir/$bn"recal.data.csv" \
--known-sites $KGIndels \
--known-sites $millsIndels \
--known-sites $dbSNP138 2> $opdir/$bn"6.BaseRecalibrator.log"
#Perform ApplyBQSR
# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced
# in earlier versions of GATK (2.x and 3.x).
echo -e "["$(date)"]\tStep 7/10 - Performing BQSR.."
$gatk ApplyBQSR \
-R $ref \
-I $opdir/$bn"split.bam" \
-O $opdir/$bn"processed.bam" \
--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"7.ApplyBQSR.log"
# rm $opdir/$bn"split.bam"
# rm $opdir/$bn"split.bai"
#Run HaplotypeCaller
echo -e "["$(date)"]\tStep 8/10 - Running HaplotypeCaller.."
$gatk HaplotypeCaller \
-R $ref \
-I $opdir/$bn"processed.bam" \
-O $opdir/$bn"vcf" \
-stand-call-conf 20.0 \
-D $dbSNP138 \
--dont-use-soft-clipped-bases 2> $opdir/$bn"8.HaplotypeCaller.log"
#Filter variants
echo -e "["$(date)"]\tStep 9/10 - Filtering Variants.."
$gatk VariantFiltration \
-R $ref \
-V $opdir/$bn"vcf" \
-O $opdir/$bn"filtered.vcf" \
-window 35 \
-cluster 3 \
-filter "FS > 30.0 || QD < 2.0" \
--filter-name FSQD 2>$opdir/$bn"9.VariantFilter.log"
#Convert VariantsToTable
echo -e "["$(date)"]\tStep 10/10 - Convert VariantsToTable.."
$gatk VariantsToTable \
-V $opdir/$bn"filtered.vcf" \
-F CHROM -F POS -F TYPE -GF AD \
-O $opdir/$bn".vcf.tsv" 2> $opdir/$bn"10.VariantsToTable.log"
echo -e "["$(date)"]\tDONE!"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment