Forked from PoisonAlien/rna_seq_variant_pipeline.sh
Last active
December 16, 2023 06:53
-
-
Save terrematte/43116d40ce5756fb5978fb345eb3d586 to your computer and use it in GitHub Desktop.
RNA seq Variant calling pipeline according to gatk4 best practices for hg38
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# AUTHOR: Patrick Terrematte for HPC of NPAD/UFRN and hg38: | |
# http://hungria.imd.ufrn.br/~terrematte/aDNA/rna_seq_variant_pipeline.sh | |
# | |
# Based on pieline Anand M. | |
# RNA-Seq variant calling pieline according to GATK Best practices. | |
# https://www.broadinstitute.org/gatk/guide/article?id=3891 | |
# Full Pipeline of Gatk of hg19: | |
# https://gist.github.com/PoisonAlien/c6c03539cf4b1ac41cf1 | |
# | |
# | |
# Call with following arguments | |
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename> | |
# ./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename> | |
# | |
# | |
# Install and create a conda environment | |
# wget --no-check-certificate https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh | |
# bash Miniconda3-latest-Linux-x86_64.sh # add -u to update | |
# conda create -n genomic | |
# conda activate genomic | |
# conda install -c bioconda/label/cf201901 sra-tools | |
# conda install -c bioconda fastqc | |
# conda install -c bioconda star | |
# conda install -c bioconda picard | |
# | |
# | |
# Best practices of Gatk4 and Arguments Correspondence | |
# https://github.com/gatk-workflows/gatk4-rnaseq-germline-snps-indels/blob/master/gatk4-rna-best-practices.wdl | |
# https://gatk.broadinstitute.org/hc/en-us/articles/360036817931--Tool-Documentation-Index | |
# | |
# | |
# Download Genome hg38 (UCSC) and annotations | |
# Genomic public Data of hg38 - https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0?pli=1 | |
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta --no-check-certificate | |
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta.fai --no-check-certificate | |
# wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.dict --no-check-certificate | |
# wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/gencode.v34.chr_patch_hapl_scaff.annotation.gtf.gz # GTF annotation | |
# unzip *gz | |
# cp gencode.v34.chr_patch_hapl_scaff.annotation.gtf $SCRATCH_GLOBAL/ref_data/ | |
# cp Homo_sapiens_assembly38.* $SCRATCH_GLOBAL/ref_data/ | |
# | |
# This is required to build the indexes and dictionary of the reference genome, in case of another reference genome, | |
# for instance the Homo_sapiens.GRCh38.dna.primary_assembly.fa : | |
# wget ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.28_GRCh38.p13/GCA_000001405.28_GRCh38.p13_genomic.fna.gz --no-check-certificate | |
# cd $SCRATCH_GLOBAL/ref_data/ | |
# picard CreateSequenceDictionary \ | |
# R=Homo_sapiens.GRCh38.dna.primary_assembly.fa \ | |
# O=Homo_sapiens.GRCh38.dna.primary_assembly.dict | |
# samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa | |
# SingleEnd parameters | |
if [ "$#" -eq 3 ]; then | |
cpus=$1 | |
f=$2 | |
bn=$3 | |
fi | |
# PairedEnd parameters | |
if [ "$#" -eq 4 ]; then | |
cpus=$1 | |
fwd=$2 | |
rev=$3 | |
bn=$4 | |
fi | |
# Wrong parameters | |
if [ "$#" -gt 4 ] || [ "$#" -lt 3 ]; then | |
echo "Illegal number of parameters." | |
echo "Call with following arguments: | |
./rna_seq_variant_pipeline.sh <cpus> <Input_Reads1.fastq> <Input_Reads2.fastq> <output_basename> | |
./rna_seq_variant_pipeline.sh <cpus> <Input_Reads.fastq> <output_basename>" | |
exit 1 | |
fi | |
#Loading modules | |
module load softwares/samtools/1.3.1-gnu-4.8 | |
module load softwares/gatk/4.0.9-pre-compiled | |
#Path to reference genome and Index files. | |
star_ref="$SCRATCH_GLOBAL/STAR_hg38_index/" | |
ref="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.fasta" | |
#Path to gatk and picard tools | |
gatk="gatk" | |
picard="picard" | |
#Path to gatk bundle set files | |
millsIndels="$SCRATCH_GLOBAL/ref_data/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" | |
KGIndels="$SCRATCH_GLOBAL/ref_data/1000G_phase1.snps.high_confidence.hg38.vcf.gz" | |
dbSNP138="$SCRATCH_GLOBAL/ref_data/Homo_sapiens_assembly38.dbsnp138.vcf.gz" | |
#Create an output directory | |
bn=$bn"." | |
opdir=$SCRATCH_GLOBAL"/"$bn"processed" | |
mkdir -p $opdir | |
mkdir -p $SCRATCH_GLOBAL/tmp | |
if [ "$#" -eq 3 ]; then | |
#STAR SingleEnd basic mode run | |
echo -e "["$(date)"]\tStep 1/7 - Aligning with STAR SingleEnd..." | |
STAR --genomeDir $star_ref \ | |
--runThreadN $cpus \ | |
--readFilesIn $f \ | |
--outFileNamePrefix $opdir/$bn \ | |
--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \ | |
--outSAMtype BAM Unsorted \ | |
--outSAMunmapped Within | |
mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out" | |
mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out" | |
mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out" | |
#samtools sort | |
echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.." | |
samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam" | |
#samtools index | |
echo -e "["$(date)"]\tStep 1/7 - Indexing with samtools.." | |
samtools index $opdir/$bn"sorted.bam" | |
#Building BaseRecalibrator | |
echo -e "["$(date)"]\tStep 2/7 - Building BaseRecalibrator.." | |
$gatk BaseRecalibrator \ | |
-R $ref \ | |
-I $opdir/$bn"sorted.bam" \ | |
-O $opdir/$bn"recal.data.csv" \ | |
--known-sites $KGIndels \ | |
--known-sites $millsIndels \ | |
--known-sites $dbSNP138 2> $opdir/$bn"2.BaseRecalibrator.log" | |
#Perform ApplyBQSR | |
# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced | |
# in earlier versions of GATK (2.x and 3.x). | |
echo -e "["$(date)"]\tStep 3/7 - Performing BQSR.." | |
$gatk ApplyBQSR \ | |
-R $ref \ | |
-I $opdir/$bn"sorted.bam" \ | |
-O $opdir/$bn"processed.bam" \ | |
--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"3.ApplyBQSR.log" | |
#Run HaplotypeCaller | |
echo -e "["$(date)"]\tStep 4/7 - Running HaplotypeCaller.." | |
$gatk HaplotypeCaller \ | |
-R $ref \ | |
-I $opdir/$bn"processed.bam" \ | |
-O $opdir/$bn"dbSNP.vcf" \ | |
-stand-call-conf 20.0 \ | |
--dont-use-soft-clipped-bases 2> $opdir/$bn"4.HaplotypeCaller.log" | |
#Filter variants | |
echo -e "["$(date)"]\tStep 5/7 - Filtering Variants.." | |
$gatk VariantFiltration \ | |
-R $ref \ | |
-V $opdir/$bn"dbSNP.vcf" \ | |
-O $opdir/$bn"filtered.vcf" \ | |
-window 35 \ | |
-cluster 3 \ | |
-filter "FS > 30.0 || QD < 2.0" \ | |
--filter-name FSQD 2>$opdir/$bn"5.VariantFilter.log" | |
#Convert VariantsToTable - dbSNP.vcf. | |
echo -e "["$(date)"]\tStep 6/7 - Convert VariantsToTable - dbSNP.vcf.." | |
$gatk VariantsToTable \ | |
-V $opdir/$bn"dbSNP.vcf" \ | |
-F CHROM -F POS -F TYPE -GF AD \ | |
-O $opdir/$bn"dbSNP.vcf.tsv" 2> $opdir/$bn"6.VariantsToTable.dbSNP.log" | |
#Convert VariantsToTable - filtered | |
echo -e "["$(date)"]\tStep 7/7 - Convert VariantsToTable - filt.." | |
$gatk VariantsToTable \ | |
-V $opdir/$bn"filtered.vcf" \ | |
-F CHROM -F POS -F TYPE -GF AD \ | |
-O $opdir/$bn"filt.vcf.tsv" 2> $opdir/$bn"7.VariantsToTable.filt.log" | |
echo -e "["$(date)"]\tDONE!" | |
fi | |
if [ "$#" -eq 4 ]; then | |
#STAR PairedEnd basic mode run | |
echo -e "["$(date)"]\tStep 1/10 - Aligning with STAR PairedEnd..." | |
STAR --outFileNamePrefix $opdir/$bn \ | |
--outSAMtype BAM Unsorted \ | |
--outSAMunmapped Within \ | |
--outSAMattrRGline ID:${bn%?} CN:lab LB:PairedEnd PL:Illumina PU:Unknown SM:${bn%?} \ | |
--outSAMstrandField intronMotif \ | |
--genomeDir $star_ref \ | |
--runThreadN $cpus \ | |
--readFilesIn $fwd $rev | |
mv $opdir/$bn"Log.out" $opdir/$bn"1.STAR.Log.out" | |
mv $opdir/$bn"Log.progress.out" $opdir/$bn"1.STAR.Log.progress.out" | |
mv $opdir/$bn"Log.final.out" $opdir/$bn"1.STAR.Log.final.out" | |
#samtools sort | |
echo -e "["$(date)"]\tStep 1/7 - Sortin with samtools.." | |
samtools sort -T $opdir/$bn --threads $cpus $opdir/$bn"Aligned.out.bam" -o $opdir/$bn"sorted.bam" | |
#samtools index | |
echo -e "["$(date)"]\tStep 2/10 - Indexing with samtools.." | |
samtools index $opdir/$bn"sorted.bam" | |
#picard mark duplicates | |
echo -e "["$(date)"]\tStep 3/10 - Marking duplicates.." | |
$picard MarkDuplicates \ | |
I=$opdir/$bn"sorted.bam" \ | |
O=$opdir/$bn"dupMarked.bam" \ | |
M=$opdir/$bn"dup.metrics" \ | |
TMP_DIR=$SCRATCH_GLOBAL/tmp \ | |
CREATE_INDEX=true \ | |
VALIDATION_STRINGENCY=SILENT 2> $opdir/$bn"3.MarkDuplicates.log" | |
# rm $opdir/$bn"sorted.bam" | |
# rm $opdir/$bn"sorted.bam.bai" | |
#SplitNCigarReads | |
echo -e "["$(date)"]\tStep 4/10 - Spliting reads.." | |
$gatk SplitNCigarReads \ | |
-R $ref \ | |
-I $opdir/$bn"dupMarked.bam" \ | |
-O $opdir/$bn"split.bam" \ | |
-skip-mq-transform \ | |
--tmp-dir=$SCRATCH_GLOBAL/tmp 2> $opdir/$bn"4.SplitNCigarReads.log" | |
# rm $opdir/$bn"dupMarked.bam" | |
# rm $opdir/$bn"dupMarked.bai" | |
#Building BaseRecalibrator | |
echo -e "["$(date)"]\tStep 6/10 - Building BaseRecalibrator.." | |
$gatk BaseRecalibrator \ | |
-R $ref \ | |
-I $opdir/$bn"split.bam" \ | |
-O $opdir/$bn"recal.data.csv" \ | |
--known-sites $KGIndels \ | |
--known-sites $millsIndels \ | |
--known-sites $dbSNP138 2> $opdir/$bn"6.BaseRecalibrator.log" | |
#Perform ApplyBQSR | |
# This tool replaces the use of PrintReads for the application of base quality score recalibration as practiced | |
# in earlier versions of GATK (2.x and 3.x). | |
echo -e "["$(date)"]\tStep 7/10 - Performing BQSR.." | |
$gatk ApplyBQSR \ | |
-R $ref \ | |
-I $opdir/$bn"split.bam" \ | |
-O $opdir/$bn"processed.bam" \ | |
--bqsr-recal-file $opdir/$bn"recal.data.csv" 2> $opdir/$bn"7.ApplyBQSR.log" | |
# rm $opdir/$bn"split.bam" | |
# rm $opdir/$bn"split.bai" | |
#Run HaplotypeCaller | |
echo -e "["$(date)"]\tStep 8/10 - Running HaplotypeCaller.." | |
$gatk HaplotypeCaller \ | |
-R $ref \ | |
-I $opdir/$bn"processed.bam" \ | |
-O $opdir/$bn"vcf" \ | |
-stand-call-conf 20.0 \ | |
-D $dbSNP138 \ | |
--dont-use-soft-clipped-bases 2> $opdir/$bn"8.HaplotypeCaller.log" | |
#Filter variants | |
echo -e "["$(date)"]\tStep 9/10 - Filtering Variants.." | |
$gatk VariantFiltration \ | |
-R $ref \ | |
-V $opdir/$bn"vcf" \ | |
-O $opdir/$bn"filtered.vcf" \ | |
-window 35 \ | |
-cluster 3 \ | |
-filter "FS > 30.0 || QD < 2.0" \ | |
--filter-name FSQD 2>$opdir/$bn"9.VariantFilter.log" | |
#Convert VariantsToTable | |
echo -e "["$(date)"]\tStep 10/10 - Convert VariantsToTable.." | |
$gatk VariantsToTable \ | |
-V $opdir/$bn"filtered.vcf" \ | |
-F CHROM -F POS -F TYPE -GF AD \ | |
-O $opdir/$bn".vcf.tsv" 2> $opdir/$bn"10.VariantsToTable.log" | |
echo -e "["$(date)"]\tDONE!" | |
fi | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment