ricketts/annotation_pipeline_download_prep_data_PB3.8.sh

## annotation_pipeline_download_prep_data_PB3.8.sh
#!/bin/bash

## Download the HG002 30X VCFs shared by google
gsutil cp gs://brain-genomics-public/research/sequencing/grch38/vcf/novaseq/wgs_pcr_free/30x/HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz .

## Split multi allelics using bcftools norm
## (Assumes bcftools is installed and in your PATH)
bcftools norm -m- -Oz --output HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz

## Tabix index our VCF file
tabix HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz

##Download ClinVar VCF Database
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz

##unzip and add missing chr to ClinVar file
gunzip clinvar.vcf.gz
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < clinvar.vcf > clinvar.chr.vcf

## Compress and tabix index our ClinVar vcf
bgzip clinvar.chr.vcf
tabix clinvar.chr.vcf.gz

## Let's download ENSEMBL GTF annotations
wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gtf.gz

## Let's unzip and add missing 'chr'
gunzip Homo_sapiens.GRCh38.105.chr.gtf.gz
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < Homo_sapiens.GRCh38.105.chr.gtf > Homo_sapiens.GRCh38.105.withchr.gtf

## annotation_snpswift_PB3.8.sh
#!/bin/bash

## Now's lets annotate with CLinVar (note: you can use multiple --anno-vcf to use multiple anno sources)
pbrun snpswift \
    --input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
    --anno-vcf ClinVar:./clinvar.chr.vcf.gz \
    --output-vcf HG002.clinvar.vcf

## Check out a few ClinVar annotations by searchig for the prefix 'ClinVar' in annotated VCF

cat HG002.clinvar.vcf | grep -v '#' | grep ClinVar | head

## We can also use our ENSEMBL GTF to add Gene names and Gene IDs to each record in our VCF

pbrun snpswift \
    --input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
    --ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
    --output-vcf HG002.ensembl.vcf

## Check out some of the ENSEMBL annotations

grep ENSEMBL_GENE_NAME HG002.ensembl.vcf | head

## We can jointly annotate with ClinVar and ENSEMBL

pbrun snpswift \
    --input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
    --anno-vcf ClinVar:./clinvar.chr.vcf.gz \
    --ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
    --output-vcf HG002.cv.ensembl.vcf

## -----
## Custom DB/Tab delimited DB annotation
## -----

## Download a tab separated annotation database (let's use the HGVD) and extract
wget https://www.hgvd.genome.med.kyoto-u.ac.jp/HGVD1210-V2_30-dbSNP150.tar.gz
tar -xvf HGVD1210-V2_30-dbSNP150.tar.gz

## DBexome20170802.tab contains information on over 500K variants
## Prepare the database for use with snpswift based on requirements found in the documentation (add column headers and ensure columns in required order)

## Assuming DBexome20170802.tab is ready for use, let's annotate our VCF with information from this DB
##
## Run the command 'head -2 DBexome20170802.tab' and it should look similar to the outout below:
## #Chr    Position        Ref     Alt     rsID_freq       Sample  Filter  Mean_depth      SD_depth        RR      RA      AA      NR      NA      Gene
## chr1    XXXXX   X       X       rsXXXXXXXXX     XX      PASS,XXXXX XX.XX   123.3  X       X       XX      3       XX     XXXXX

pbrun snpswift \
    --input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
    --anno-tsv HGVD:./DBexome20170802.tab \
    --output-vcf HG002.hgvd.vcf

## Let's look at some of the annotations now added to our VCF file
## Each attribute should be added from the tab separated db with HGVD_ prefix
cat HG002.hgvd.vcf | grep -v '#' | grep HGVD | head

## Now let's put it all together and annotate our VCF with a VCF, GTF and a Tab separated database (.tab/.tsv)
pbrun snpswift \
    --input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
    --anno-vcf ClinVar:./clinvar.chr.vcf.gz \
    --ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
    --anno-tsv HGVD:./DBexome20170802.tab \
    --output-vcf HG002.annotated.vcf

## csq_pred_PB3.8.sh
#!/bin/bash

## Let's run consequence prediction on our variants annotated through snpswift to get functional information in each transcript

##This requires gene annotations in GFF3 format, let's download that from ENSEMBL to our directory and unzip

wget http://ftp.ensembl.org/pub/release-105/gff3/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gff3.gz
gunzip Homo_sapiens.GRCh38.105.chr.gff3.gz

##Download GRCh38

wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

##Run bcftoolscsq through parabricks
pbrun bcftoolscsq \
    --in-file HG002.clinvar.vcf \
    --out-file HG002.annotated.csq.vcf \
    --ref ./GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
    --gff-annot ./Homo_sapiens.GRCh38.105.chr.gff3 \
    --phase a

##Take a look at a few of the predicted variant consequences by searching for the 'BCSQ' tag

grep BCSQ HG002.annotated.csq.vcf | head

## Let's look at the first 10 variants for which a loss of the stop codon has been predicted by bcftoolscsq
## Note bcftoolscsq predicts consequence of a variant for each known transcript of a gene

grep stop_lost HG002.annotated.csq.vcf | head
	#!/bin/bash

	## Download the HG002 30X VCFs shared by google
	gsutil cp gs://brain-genomics-public/research/sequencing/grch38/vcf/novaseq/wgs_pcr_free/30x/HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz .

	## Split multi allelics using bcftools norm
	## (Assumes bcftools is installed and in your PATH)
	bcftools norm -m- -Oz --output HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz

	## Tabix index our VCF file
	tabix HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz

	##Download ClinVar VCF Database
	wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz

	##unzip and add missing chr to ClinVar file
	gunzip clinvar.vcf.gz
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < clinvar.vcf > clinvar.chr.vcf

	## Compress and tabix index our ClinVar vcf
	bgzip clinvar.chr.vcf
	tabix clinvar.chr.vcf.gz

	## Let's download ENSEMBL GTF annotations
	wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gtf.gz

	## Let's unzip and add missing 'chr'
	gunzip Homo_sapiens.GRCh38.105.chr.gtf.gz
	awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < Homo_sapiens.GRCh38.105.chr.gtf > Homo_sapiens.GRCh38.105.withchr.gtf
	#!/bin/bash

	## Now's lets annotate with CLinVar (note: you can use multiple --anno-vcf to use multiple anno sources)
	pbrun snpswift \
	--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
	--anno-vcf ClinVar:./clinvar.chr.vcf.gz \
	--output-vcf HG002.clinvar.vcf

	## Check out a few ClinVar annotations by searchig for the prefix 'ClinVar' in annotated VCF

	cat HG002.clinvar.vcf \| grep -v '#' \| grep ClinVar \| head

	## We can also use our ENSEMBL GTF to add Gene names and Gene IDs to each record in our VCF

	pbrun snpswift \
	--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
	--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
	--output-vcf HG002.ensembl.vcf

	## Check out some of the ENSEMBL annotations

	grep ENSEMBL_GENE_NAME HG002.ensembl.vcf \| head

	## We can jointly annotate with ClinVar and ENSEMBL

	pbrun snpswift \
	--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
	--anno-vcf ClinVar:./clinvar.chr.vcf.gz \
	--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
	--output-vcf HG002.cv.ensembl.vcf

	## -----
	## Custom DB/Tab delimited DB annotation
	## -----

	## Download a tab separated annotation database (let's use the HGVD) and extract
	wget https://www.hgvd.genome.med.kyoto-u.ac.jp/HGVD1210-V2_30-dbSNP150.tar.gz
	tar -xvf HGVD1210-V2_30-dbSNP150.tar.gz

	## DBexome20170802.tab contains information on over 500K variants
	## Prepare the database for use with snpswift based on requirements found in the documentation (add column headers and ensure columns in required order)

	## Assuming DBexome20170802.tab is ready for use, let's annotate our VCF with information from this DB
	##
	## Run the command 'head -2 DBexome20170802.tab' and it should look similar to the outout below:
	## #Chr Position Ref Alt rsID_freq Sample Filter Mean_depth SD_depth RR RA AA NR NA Gene
	## chr1 XXXXX X X rsXXXXXXXXX XX PASS,XXXXX XX.XX 123.3 X X XX 3 XX XXXXX

	pbrun snpswift \
	--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
	--anno-tsv HGVD:./DBexome20170802.tab \
	--output-vcf HG002.hgvd.vcf

	## Let's look at some of the annotations now added to our VCF file
	## Each attribute should be added from the tab separated db with HGVD_ prefix
	cat HG002.hgvd.vcf \| grep -v '#' \| grep HGVD \| head

	## Now let's put it all together and annotate our VCF with a VCF, GTF and a Tab separated database (.tab/.tsv)
	pbrun snpswift \
	--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \
	--anno-vcf ClinVar:./clinvar.chr.vcf.gz \
	--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \
	--anno-tsv HGVD:./DBexome20170802.tab \
	--output-vcf HG002.annotated.vcf
	#!/bin/bash

	## Let's run consequence prediction on our variants annotated through snpswift to get functional information in each transcript

	##This requires gene annotations in GFF3 format, let's download that from ENSEMBL to our directory and unzip

	wget http://ftp.ensembl.org/pub/release-105/gff3/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gff3.gz
	gunzip Homo_sapiens.GRCh38.105.chr.gff3.gz

	##Download GRCh38

	wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz
	gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz

	##Run bcftoolscsq through parabricks
	pbrun bcftoolscsq \
	--in-file HG002.clinvar.vcf \
	--out-file HG002.annotated.csq.vcf \
	--ref ./GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \
	--gff-annot ./Homo_sapiens.GRCh38.105.chr.gff3 \
	--phase a

	##Take a look at a few of the predicted variant consequences by searching for the 'BCSQ' tag

	grep BCSQ HG002.annotated.csq.vcf \| head

	## Let's look at the first 10 variants for which a loss of the stop codon has been predicted by bcftoolscsq
	## Note bcftoolscsq predicts consequence of a variant for each known transcript of a gene

	grep stop_lost HG002.annotated.csq.vcf \| head