edawson/v3.7_accelerated_annotation_filtering_and_qc.sh

## v3.7_accelerated_annotation_filtering_and_qc.sh
#!/bin/bash

########################
## In this gist, we'll reuse the commands from our 3.6 tutorial to align reads and generate BAM files.
## Check out the full post at https://medium.com/@johnnyisraeli/accelerating-germline-and-somatic-genomic-analysis-of-whole-genomes-and-exomes-with-nvidia-clara-e3deeae2acc9
and Gists at:
## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-full_pipeline_and_data_prep_parabricks3-6-sh
## and
## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-step_1_align_reads_parabricks3-6-sh
###########

###########
## We'll run this tutorial on a GCP VM with 64 vCPUs, 240GB of RAM, and 8x NVIDIA V100 GPUs
## To save costs, you can also run this on a GCP VM with 32 vCPUS, 120GB of RAM, and 4x V100 GPUs
###########


## After aligning our reads, we'll rerun the variant calling stages of our past gist
## since we've updated the haplotypecaller and DeepVariant tools. We'll
## also run Strelka2 as an additional variant caller.
##
## After that, we'll merge our VCFs to generate a union callset and an intersection VCF
## with variants called by all three variant callers, annotate our new intersection VCF,
## and remove variants that fail certain criteria for population frequency.
## Finally, we'll run our vcfqc and vcfqcbybam tools to generate simple quality control reports.
#############


################
## HaplotypeCaller
## This step should take roughly 15 minutes on our 8xV100 VM.
################
time pbrun haplotypecaller \
--ref ~/refs/Homo_sapiens_assembly38.fasta \
--in-bam   HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
--in-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt \
--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf

################
## DeepVariant
## This step should take approximately 20 minutes on an 8xV100 VM
################
 time pbrun deepvariant \
 --ref Homo_sapiens_assembly38.fasta  \
 --in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam  \
 --out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf

 ###############
 ## Strelka
 ## This step should take ~10 minutes on a 64-core VM.
 ###############
 time pbrun strelka \
 --ref Homo_sapiens_assembly38.fasta \
 --in-bams HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
 --out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka \
 --num-threads 64


 ## Copy strelka results to current directory.
 cp HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka.strelka_work/results/variants/variants.vcf.gz* .

 ## BGZIP and tabix-index the deepvariant VCFs
 bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf
 tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz

 ## BGZIP and tabix index the haplotypecaller VCFs
 bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf
 tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz


 ## Run the votebasedvcfmerger tool to generate a union and intersection VCF.
 time pbrun votebasedvcfmerger \
 --in-vcf strelka:variants.vcf.gz \
 --in-vcf deepvariant:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz \
 --in-vcf haplotypecaller:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz \
 --min-votes 3
 --out-dir HG002.realign.vbvm

## The HG002.realign.vbvm directory should now contain a
## unionVCF.vcf file with the union callset of HaplotypeCaller, Strelka, and DeepVariant
## and aa filteredVCF.vcf file with only calls produced by all three callers.

## Annotate the intersection VCF with gnomAD, ClinVar, 1000 Genomes
## Download our annotation VCFs and tabix indices
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi

wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi

wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz.tbi

## Download an Ensembl GTF to annotate the VCF file with gene names
wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
## Unzip the GTF file and add the "chr" prefix to the chromosome names (Ensembl excludes this prefix by default.
gunzip Homo_sapiens.GRCh38.105.gtf.gz
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' Homo_sapiens.GRCh38.105.gtf > Homo_sapiens.GRCh38.105.chr.gtf

time pbrun snpswift \
--input-vcf HG002.realign.vbvm/filteredVCF.vcf \
--anno-vcf 1000Genomes:ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz \
--anno-vcf gnomad_v2.1.1:gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz \
--anno-vcf ClinVar:clinvar.vcf.gz \
--ensembl Homo_sapiens.GRCh38.105.chr.gtf  \
--output-vcf HG002.realign.3callers.annotated.vcf

##################
## frequencyfiltration
## Next we'll filter our VCF to remove variants with 1000Genomes allele frequency > 0.05
## and gnomAD AF < 0.05
##################
time pbrun frequencyfiltration \
--in-vcf HG002.realign.3callers.annotated.vcf \
--and-expression "1000Genomes_AF < 0.05" \
--and-expression "gnomad_v2.1.1_AF < 0.05" \
--out-vcf HG002.realign.3callers.annotated.filtered.vcf

##################
## Finally, we'll run our automated vcfqc tool to generate some
## basic QC stats. The vcfqcbybam tool could also be run
## to produce QC stats using an auxilliary BAM file (e.g., when variant calls don't have the desired fields).
##################
time pbrun vcfqc --in-vcf HG002.realign.3callers.annotated.filtered.vcf \
--output-dir HG002.realign.3callers.annotated.filtered.qc \
--depth haplotypecaller_DP --allele-depth deepvariant_AD
	#!/bin/bash

	########################
	## In this gist, we'll reuse the commands from our 3.6 tutorial to align reads and generate BAM files.
	## Check out the full post at https://medium.com/@johnnyisraeli/accelerating-germline-and-somatic-genomic-analysis-of-whole-genomes-and-exomes-with-nvidia-clara-e3deeae2acc9
	and Gists at:
	## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-full_pipeline_and_data_prep_parabricks3-6-sh
	## and
	## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-step_1_align_reads_parabricks3-6-sh
	###########

	###########
	## We'll run this tutorial on a GCP VM with 64 vCPUs, 240GB of RAM, and 8x NVIDIA V100 GPUs
	## To save costs, you can also run this on a GCP VM with 32 vCPUS, 120GB of RAM, and 4x V100 GPUs
	###########


	## After aligning our reads, we'll rerun the variant calling stages of our past gist
	## since we've updated the haplotypecaller and DeepVariant tools. We'll
	## also run Strelka2 as an additional variant caller.
	##
	## After that, we'll merge our VCFs to generate a union callset and an intersection VCF
	## with variants called by all three variant callers, annotate our new intersection VCF,
	## and remove variants that fail certain criteria for population frequency.
	## Finally, we'll run our vcfqc and vcfqcbybam tools to generate simple quality control reports.
	#############


	################
	## HaplotypeCaller
	## This step should take roughly 15 minutes on our 8xV100 VM.
	################
	time pbrun haplotypecaller \
	--ref ~/refs/Homo_sapiens_assembly38.fasta \
	--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--in-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt \
	--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf

	################
	## DeepVariant
	## This step should take approximately 20 minutes on an 8xV100 VM
	################
	time pbrun deepvariant \
	--ref Homo_sapiens_assembly38.fasta \
	--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf

	###############
	## Strelka
	## This step should take ~10 minutes on a 64-core VM.
	###############
	time pbrun strelka \
	--ref Homo_sapiens_assembly38.fasta \
	--in-bams HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \
	--out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka \
	--num-threads 64


	## Copy strelka results to current directory.
	cp HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka.strelka_work/results/variants/variants.vcf.gz* .

	## BGZIP and tabix-index the deepvariant VCFs
	bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf
	tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz

	## BGZIP and tabix index the haplotypecaller VCFs
	bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf
	tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz


	## Run the votebasedvcfmerger tool to generate a union and intersection VCF.
	time pbrun votebasedvcfmerger \
	--in-vcf strelka:variants.vcf.gz \
	--in-vcf deepvariant:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz \
	--in-vcf haplotypecaller:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz \
	--min-votes 3
	--out-dir HG002.realign.vbvm

	## The HG002.realign.vbvm directory should now contain a
	## unionVCF.vcf file with the union callset of HaplotypeCaller, Strelka, and DeepVariant
	## and aa filteredVCF.vcf file with only calls produced by all three callers.

	## Annotate the intersection VCF with gnomAD, ClinVar, 1000 Genomes
	## Download our annotation VCFs and tabix indices
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi

	wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz
	wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi

	wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz
	wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz.tbi

	## Download an Ensembl GTF to annotate the VCF file with gene names
	wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz
	## Unzip the GTF file and add the "chr" prefix to the chromosome names (Ensembl excludes this prefix by default.
	gunzip Homo_sapiens.GRCh38.105.gtf.gz
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' Homo_sapiens.GRCh38.105.gtf > Homo_sapiens.GRCh38.105.chr.gtf

	time pbrun snpswift \
	--input-vcf HG002.realign.vbvm/filteredVCF.vcf \
	--anno-vcf 1000Genomes:ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz \
	--anno-vcf gnomad_v2.1.1:gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz \
	--anno-vcf ClinVar:clinvar.vcf.gz \
	--ensembl Homo_sapiens.GRCh38.105.chr.gtf \
	--output-vcf HG002.realign.3callers.annotated.vcf

	##################
	## frequencyfiltration
	## Next we'll filter our VCF to remove variants with 1000Genomes allele frequency > 0.05
	## and gnomAD AF < 0.05
	##################
	time pbrun frequencyfiltration \
	--in-vcf HG002.realign.3callers.annotated.vcf \
	--and-expression "1000Genomes_AF < 0.05" \
	--and-expression "gnomad_v2.1.1_AF < 0.05" \
	--out-vcf HG002.realign.3callers.annotated.filtered.vcf

	##################
	## Finally, we'll run our automated vcfqc tool to generate some
	## basic QC stats. The vcfqcbybam tool could also be run
	## to produce QC stats using an auxilliary BAM file (e.g., when variant calls don't have the desired fields).
	##################
	time pbrun vcfqc --in-vcf HG002.realign.3callers.annotated.filtered.vcf \
	--output-dir HG002.realign.3callers.annotated.filtered.qc \
	--depth haplotypecaller_DP --allele-depth deepvariant_AD