Last active
May 27, 2022 06:10
-
-
Save edawson/cd11fb22c89ee4005d6374f0be1f1c64 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
######################## | |
## In this gist, we'll reuse the commands from our 3.6 tutorial to align reads and generate BAM files. | |
## Check out the full post at https://medium.com/@johnnyisraeli/accelerating-germline-and-somatic-genomic-analysis-of-whole-genomes-and-exomes-with-nvidia-clara-e3deeae2acc9 | |
and Gists at: | |
## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-full_pipeline_and_data_prep_parabricks3-6-sh | |
## and | |
## https://gist.github.com/edawson/e84b2785db75d3c0aea9cc6a59969d45#file-step_1_align_reads_parabricks3-6-sh | |
########### | |
########### | |
## We'll run this tutorial on a GCP VM with 64 vCPUs, 240GB of RAM, and 8x NVIDIA V100 GPUs | |
## To save costs, you can also run this on a GCP VM with 32 vCPUS, 120GB of RAM, and 4x V100 GPUs | |
########### | |
## After aligning our reads, we'll rerun the variant calling stages of our past gist | |
## since we've updated the haplotypecaller and DeepVariant tools. We'll | |
## also run Strelka2 as an additional variant caller. | |
## | |
## After that, we'll merge our VCFs to generate a union callset and an intersection VCF | |
## with variants called by all three variant callers, annotate our new intersection VCF, | |
## and remove variants that fail certain criteria for population frequency. | |
## Finally, we'll run our vcfqc and vcfqcbybam tools to generate simple quality control reports. | |
############# | |
################ | |
## HaplotypeCaller | |
## This step should take roughly 15 minutes on our 8xV100 VM. | |
################ | |
time pbrun haplotypecaller \ | |
--ref ~/refs/Homo_sapiens_assembly38.fasta \ | |
--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \ | |
--in-recal-file HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.BQSR-REPORT.txt \ | |
--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf | |
################ | |
## DeepVariant | |
## This step should take approximately 20 minutes on an 8xV100 VM | |
################ | |
time pbrun deepvariant \ | |
--ref Homo_sapiens_assembly38.fasta \ | |
--in-bam HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \ | |
--out-variants HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf | |
############### | |
## Strelka | |
## This step should take ~10 minutes on a 64-core VM. | |
############### | |
time pbrun strelka \ | |
--ref Homo_sapiens_assembly38.fasta \ | |
--in-bams HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.bam \ | |
--out-prefix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka \ | |
--num-threads 64 | |
## Copy strelka results to current directory. | |
cp HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.strelka.strelka_work/results/variants/variants.vcf.gz* . | |
## BGZIP and tabix-index the deepvariant VCFs | |
bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf | |
tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz | |
## BGZIP and tabix index the haplotypecaller VCFs | |
bgzip -@16 HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf | |
tabix HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz | |
## Run the votebasedvcfmerger tool to generate a union and intersection VCF. | |
time pbrun votebasedvcfmerger \ | |
--in-vcf strelka:variants.vcf.gz \ | |
--in-vcf deepvariant:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.deepvariant.vcf.gz \ | |
--in-vcf haplotypecaller:HG002.hiseqx.pcr-free.30x.dedup.grch37.bam2fq.hg38.pb.haplotypecaller.vcf.gz \ | |
--min-votes 3 | |
--out-dir HG002.realign.vbvm | |
## The HG002.realign.vbvm directory should now contain a | |
## unionVCF.vcf file with the union callset of HaplotypeCaller, Strelka, and DeepVariant | |
## and aa filteredVCF.vcf file with only calls produced by all three callers. | |
## Annotate the intersection VCF with gnomAD, ClinVar, 1000 Genomes | |
## Download our annotation VCFs and tabix indices | |
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz | |
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi | |
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz | |
wget https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/liftover_grch38/vcf/exomes/gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz.tbi | |
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz | |
wget http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz.tbi | |
## Download an Ensembl GTF to annotate the VCF file with gene names | |
wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz | |
## Unzip the GTF file and add the "chr" prefix to the chromosome names (Ensembl excludes this prefix by default. | |
gunzip Homo_sapiens.GRCh38.105.gtf.gz | |
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' Homo_sapiens.GRCh38.105.gtf > Homo_sapiens.GRCh38.105.chr.gtf | |
time pbrun snpswift \ | |
--input-vcf HG002.realign.vbvm/filteredVCF.vcf \ | |
--anno-vcf 1000Genomes:ALL.wgs.shapeit2_integrated_v1a.GRCh38.20181129.sites.vcf.gz \ | |
--anno-vcf gnomad_v2.1.1:gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz \ | |
--anno-vcf ClinVar:clinvar.vcf.gz \ | |
--ensembl Homo_sapiens.GRCh38.105.chr.gtf \ | |
--output-vcf HG002.realign.3callers.annotated.vcf | |
################## | |
## frequencyfiltration | |
## Next we'll filter our VCF to remove variants with 1000Genomes allele frequency > 0.05 | |
## and gnomAD AF < 0.05 | |
################## | |
time pbrun frequencyfiltration \ | |
--in-vcf HG002.realign.3callers.annotated.vcf \ | |
--and-expression "1000Genomes_AF < 0.05" \ | |
--and-expression "gnomad_v2.1.1_AF < 0.05" \ | |
--out-vcf HG002.realign.3callers.annotated.filtered.vcf | |
################## | |
## Finally, we'll run our automated vcfqc tool to generate some | |
## basic QC stats. The vcfqcbybam tool could also be run | |
## to produce QC stats using an auxilliary BAM file (e.g., when variant calls don't have the desired fields). | |
################## | |
time pbrun vcfqc --in-vcf HG002.realign.3callers.annotated.filtered.vcf \ | |
--output-dir HG002.realign.3callers.annotated.filtered.qc \ | |
--depth haplotypecaller_DP --allele-depth deepvariant_AD |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment