edawson/full_pipeline_and_data_prep_Parabricks3.6.sh

## full_pipeline_and_data_prep_Parabricks3.6.sh
#!/bin/bash

## Download the HG002 30X BAMs, kindly sequenced and shared by Google
gsutil cp gs://brain-genomics-public/research/sequencing/fastq/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.R1.fastq.gz .
gsutil cp gs://brain-genomics-public/research/sequencing/fastq/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.R2.fastq.gz .

## Download GRCh38
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

## Make a .fai index using samtools faidx
samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

## Create the BWA indices
bwa index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

## Download dbSNP to use as your known-sites file. Don't worry about it's tabix index as we
## need to first fix the missing "chr" prefix to match the reference, then recompress/reindex.
wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/dbsnp_146.hg38.vcf.gz

## We'll need to tabix index our dbSNP file
tabix dbsnp_146.hg38.vcf.gz

## step_1_align_reads_Parabricks3.6.sh
## Align reads using Parabricks fq2bam, which includes BWA mem, BQSR, markduplicates, and sorting / indexing of the bam
## Assumes pbrun has been copied to /usr/local/bin or some other acessible path
pbrun fq2bam \
  --in-fq HG002.hiseqx.pcr-free.30x.R1.fastq.gz HG002.hiseqx.pcr-free.30x.R2.fastq.gz \
  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --known-sites dbsnp_146.hg38.vcf.gzz \
  --out-bam HG002.hiseqx.pcr-free.30x.pb.bam \
  --out-recal-file HG002.hiseqx.pcr-free.30x.pb.recal.txt \
  --num-gpu 4

## step_2_deepvariant_Parabricks3.6.sh
#!/bin/bash
pbrun deepvariant \
  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
  --out-variants HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf \
  --num-gpu 4

## step_3_haplotypecaller_Parabricks3.6.sh
#!/bin/bash
pbrun haplotypecaller \
  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
  --in-recal-file HG002.hiseqx.pcr-free.30x.pb.recal.txt \
  --out-variants HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
  --num-gpu 4

## step_4_mutect2_Parabricks3.6.sh
#!/bin/bash

## Since it's so fast, and sometimes we'll want to generate mutect calls (e.g. for a Panel of Normals),
## I've shown how to run accelerated MuTect2 below.
pbrun mutectcaller \
  --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
  --in-tumor-bam HG002.hiseqx.pcr-free.30x.pb.bam \
  --tumor-name sample \
  --out-vcf HG002.hiseqx.pcr-free.30x.pb.mutect2.vcf \
  --num-gpu 4

## step_5_post_calling_Parabricks3.6.sh
#!/bin/bash

## In this portion, we'll first generate a quality control report for our
## HaplotypeCaller and DeepVariant VCFs.
## We'll then merge our HaplotypeCaller and DeepVariant calls
## into a single VCF using the vote-based VCF merger (VBVM) tool. We'll
## then annotate our calls with information from dbSNP using the VCFANNO tool and
## filter them based on allele frequency using the FREQUENCYFILTRATION tool.


## VCFQC relies on fields generated by individual callers.
## We pass them in on the command line to tell the tool
## what variable names are used by each caller.
## DeepVariant has well-formed fields for mapq, depth, vaf, and allele depth.
pbrun vcfqc \
    --in-vcf HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf \
    --mapq MQ \
    --depth DP \
    --vaf VAF \
    --allele-depth AD

 ## Perform QC only on the HaplotypeCaller depth and AF fields.
 pbrun vcfqc \
    --in-vcf ../HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
    --output-dir HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.qc \
    --depth DP \
    --vaf AF

## We can also run VCFQCBYBAM, which can pull information from a BAM file
## to calculate depth and allele depth if they aren't present in the VCF.
## You'll also need the reference the BAM was aligned to.
pbrun vcfqcbybam \
    --ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
    --in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
    --in-vcf HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf  \
    --out-file HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.pileup \
    --output-dir HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcfqcbybam

## The VBVM tool merges VCFs, annotating each variant
pbrun vbvm \
    --in-vcf DeepVariant:HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf
    --in-vcf HaplotypeCaller:HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
    --min-votes 2 \
    --out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.vcf

## We will download the clinvar VCF and its index to add it to our annotations.
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi

## The VCFANNO tool annotates a VCF file with INFO fields from
## database VCF files (such as ClinVar, ExAC, 1000 Genomes, dbSNP,
## or COSMIC. Here, we'll add ClinVar and dbSNP annotations. Note
## that dbSNP receives a distinct command line flag to address
## some formatting issues in the database. Also note that we pass
## a prefix with each file that is added to the INFO fields in the
## output VCF.
pbrun vcfanno \
    --in-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.vcf \
    --dbsnp dbSNP:dbsnp_146.hg38.vcf.gz \
    --annotations clinvar:clinvar.vcf.gz \
    --out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.vcf


## FREQUENCYFILTRATION allows us to quickly filter variants using queries
## composed of boolean AND and OR expressions. Here, we'll keep only variants
## that have a dbSNP_CAF frequency less than 0.05 and a clinvar_AF_EXAC frequency
## of less than 0.05. We'll write any variants that fail these filters to failed_variants.vcf.
## We won't drop missing data (i.e. variants with no clinvar/dbSNP annotations), but we
## could do so using the --drop-missing flag.
pbrun frequencyfiltration \
    --in-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.vcf \
    --and-expression "dbSNP_CAF < 0.05" \
    --and-expression "clinvar_AF_EXAC < 0.05" \
    --excluded-vcf failed_variants.vcf \
    --out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.filtered.vcf
	#!/bin/bash

	## Download the HG002 30X BAMs, kindly sequenced and shared by Google
	gsutil cp gs://brain-genomics-public/research/sequencing/fastq/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.R1.fastq.gz .
	gsutil cp gs://brain-genomics-public/research/sequencing/fastq/hiseqx/wgs_pcr_free/30x/HG002.hiseqx.pcr-free.30x.R2.fastq.gz .

	## Download GRCh38
	wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

	## Make a .fai index using samtools faidx
	samtools faidx GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

	## Create the BWA indices
	bwa index GCA_000001405.15_GRCh38_no_alt_analysis_set.fna

	## Download dbSNP to use as your known-sites file. Don't worry about it's tabix index as we
	## need to first fix the missing "chr" prefix to match the reference, then recompress/reindex.
	wget ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/dbsnp_146.hg38.vcf.gz

	## We'll need to tabix index our dbSNP file
	tabix dbsnp_146.hg38.vcf.gz
	## Align reads using Parabricks fq2bam, which includes BWA mem, BQSR, markduplicates, and sorting / indexing of the bam
	## Assumes pbrun has been copied to /usr/local/bin or some other acessible path
	pbrun fq2bam \
	--in-fq HG002.hiseqx.pcr-free.30x.R1.fastq.gz HG002.hiseqx.pcr-free.30x.R2.fastq.gz \
	--ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--known-sites dbsnp_146.hg38.vcf.gzz \
	--out-bam HG002.hiseqx.pcr-free.30x.pb.bam \
	--out-recal-file HG002.hiseqx.pcr-free.30x.pb.recal.txt \
	--num-gpu 4
	#!/bin/bash
	pbrun deepvariant \
	--ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
	--out-variants HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf \
	--num-gpu 4
	#!/bin/bash
	pbrun haplotypecaller \
	--ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
	--in-recal-file HG002.hiseqx.pcr-free.30x.pb.recal.txt \
	--out-variants HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
	--num-gpu 4
	#!/bin/bash

	## Since it's so fast, and sometimes we'll want to generate mutect calls (e.g. for a Panel of Normals),
	## I've shown how to run accelerated MuTect2 below.
	pbrun mutectcaller \
	--ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--in-tumor-bam HG002.hiseqx.pcr-free.30x.pb.bam \
	--tumor-name sample \
	--out-vcf HG002.hiseqx.pcr-free.30x.pb.mutect2.vcf \
	--num-gpu 4
	#!/bin/bash

	## In this portion, we'll first generate a quality control report for our
	## HaplotypeCaller and DeepVariant VCFs.
	## We'll then merge our HaplotypeCaller and DeepVariant calls
	## into a single VCF using the vote-based VCF merger (VBVM) tool. We'll
	## then annotate our calls with information from dbSNP using the VCFANNO tool and
	## filter them based on allele frequency using the FREQUENCYFILTRATION tool.


	## VCFQC relies on fields generated by individual callers.
	## We pass them in on the command line to tell the tool
	## what variable names are used by each caller.
	## DeepVariant has well-formed fields for mapq, depth, vaf, and allele depth.
	pbrun vcfqc \
	--in-vcf HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf \
	--mapq MQ \
	--depth DP \
	--vaf VAF \
	--allele-depth AD

	## Perform QC only on the HaplotypeCaller depth and AF fields.
	pbrun vcfqc \
	--in-vcf ../HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
	--output-dir HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.qc \
	--depth DP \
	--vaf AF

	## We can also run VCFQCBYBAM, which can pull information from a BAM file
	## to calculate depth and allele depth if they aren't present in the VCF.
	## You'll also need the reference the BAM was aligned to.
	pbrun vcfqcbybam \
	--ref GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--in-bam HG002.hiseqx.pcr-free.30x.pb.bam \
	--in-vcf HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
	--out-file HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.pileup \
	--output-dir HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcfqcbybam

	## The VBVM tool merges VCFs, annotating each variant
	pbrun vbvm \
	--in-vcf DeepVariant:HG002.hiseqx.pcr-free.30x.pb.deepvariant.vcf
	--in-vcf HaplotypeCaller:HG002.hiseqx.pcr-free.30x.pb.haplotypecaller.vcf \
	--min-votes 2 \
	--out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.vcf

	## We will download the clinvar VCF and its index to add it to our annotations.
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi

	## The VCFANNO tool annotates a VCF file with INFO fields from
	## database VCF files (such as ClinVar, ExAC, 1000 Genomes, dbSNP,
	## or COSMIC. Here, we'll add ClinVar and dbSNP annotations. Note
	## that dbSNP receives a distinct command line flag to address
	## some formatting issues in the database. Also note that we pass
	## a prefix with each file that is added to the INFO fields in the
	## output VCF.
	pbrun vcfanno \
	--in-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.vcf \
	--dbsnp dbSNP:dbsnp_146.hg38.vcf.gz \
	--annotations clinvar:clinvar.vcf.gz \
	--out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.vcf


	## FREQUENCYFILTRATION allows us to quickly filter variants using queries
	## composed of boolean AND and OR expressions. Here, we'll keep only variants
	## that have a dbSNP_CAF frequency less than 0.05 and a clinvar_AF_EXAC frequency
	## of less than 0.05. We'll write any variants that fail these filters to failed_variants.vcf.
	## We won't drop missing data (i.e. variants with no clinvar/dbSNP annotations), but we
	## could do so using the --drop-missing flag.
	pbrun frequencyfiltration \
	--in-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.vcf \
	--and-expression "dbSNP_CAF < 0.05" \
	--and-expression "clinvar_AF_EXAC < 0.05" \
	--excluded-vcf failed_variants.vcf \
	--out-vcf HG002.hiseqx.pcr-free.30x.pb.intersection.annotated.filtered.vcf