Last active
July 5, 2022 11:38
-
-
Save ricketts/5588ccf0c2846c53615e77d6da452c88 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## Download the HG002 30X VCFs shared by google | |
gsutil cp gs://brain-genomics-public/research/sequencing/grch38/vcf/novaseq/wgs_pcr_free/30x/HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz . | |
## Split multi allelics using bcftools norm | |
## (Assumes bcftools is installed and in your PATH) | |
bcftools norm -m- -Oz --output HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.vcf.gz | |
## Tabix index our VCF file | |
tabix HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz | |
##Download ClinVar VCF Database | |
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz | |
##unzip and add missing chr to ClinVar file | |
gunzip clinvar.vcf.gz | |
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < clinvar.vcf > clinvar.chr.vcf | |
## Compress and tabix index our ClinVar vcf | |
bgzip clinvar.chr.vcf | |
tabix clinvar.chr.vcf.gz | |
## Let's download ENSEMBL GTF annotations | |
wget http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gtf.gz | |
## Let's unzip and add missing 'chr' | |
gunzip Homo_sapiens.GRCh38.105.chr.gtf.gz | |
awk '{if($0 !~ /^#/) print "chr"$0; else print $0}' < Homo_sapiens.GRCh38.105.chr.gtf > Homo_sapiens.GRCh38.105.withchr.gtf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## Now's lets annotate with CLinVar (note: you can use multiple --anno-vcf to use multiple anno sources) | |
pbrun snpswift \ | |
--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \ | |
--anno-vcf ClinVar:./clinvar.chr.vcf.gz \ | |
--output-vcf HG002.clinvar.vcf | |
## Check out a few ClinVar annotations by searchig for the prefix 'ClinVar' in annotated VCF | |
cat HG002.clinvar.vcf | grep -v '#' | grep ClinVar | head | |
## We can also use our ENSEMBL GTF to add Gene names and Gene IDs to each record in our VCF | |
pbrun snpswift \ | |
--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \ | |
--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \ | |
--output-vcf HG002.ensembl.vcf | |
## Check out some of the ENSEMBL annotations | |
grep ENSEMBL_GENE_NAME HG002.ensembl.vcf | head | |
## We can jointly annotate with ClinVar and ENSEMBL | |
pbrun snpswift \ | |
--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \ | |
--anno-vcf ClinVar:./clinvar.chr.vcf.gz \ | |
--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \ | |
--output-vcf HG002.cv.ensembl.vcf | |
## ----- | |
## Custom DB/Tab delimited DB annotation | |
## ----- | |
## Download a tab separated annotation database (let's use the HGVD) and extract | |
wget https://www.hgvd.genome.med.kyoto-u.ac.jp/HGVD1210-V2_30-dbSNP150.tar.gz | |
tar -xvf HGVD1210-V2_30-dbSNP150.tar.gz | |
## DBexome20170802.tab contains information on over 500K variants | |
## Prepare the database for use with snpswift based on requirements found in the documentation (add column headers and ensure columns in required order) | |
## Assuming DBexome20170802.tab is ready for use, let's annotate our VCF with information from this DB | |
## | |
## Run the command 'head -2 DBexome20170802.tab' and it should look similar to the outout below: | |
## #Chr Position Ref Alt rsID_freq Sample Filter Mean_depth SD_depth RR RA AA NR NA Gene | |
## chr1 XXXXX X X rsXXXXXXXXX XX PASS,XXXXX XX.XX 123.3 X X XX 3 XX XXXXX | |
pbrun snpswift \ | |
--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \ | |
--anno-tsv HGVD:./DBexome20170802.tab \ | |
--output-vcf HG002.hgvd.vcf | |
## Let's look at some of the annotations now added to our VCF file | |
## Each attribute should be added from the tab separated db with HGVD_ prefix | |
cat HG002.hgvd.vcf | grep -v '#' | grep HGVD | head | |
## Now let's put it all together and annotate our VCF with a VCF, GTF and a Tab separated database (.tab/.tsv) | |
pbrun snpswift \ | |
--input-vcf HG002.novaseq.pcr-free.30x.deepvariant-v1.0.grch38.split.vcf.gz \ | |
--anno-vcf ClinVar:./clinvar.chr.vcf.gz \ | |
--ensembl ./Homo_sapiens.GRCh38.105.withchr.gtf \ | |
--anno-tsv HGVD:./DBexome20170802.tab \ | |
--output-vcf HG002.annotated.vcf |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
## Let's run consequence prediction on our variants annotated through snpswift to get functional information in each transcript | |
##This requires gene annotations in GFF3 format, let's download that from ENSEMBL to our directory and unzip | |
wget http://ftp.ensembl.org/pub/release-105/gff3/homo_sapiens/Homo_sapiens.GRCh38.105.chr.gff3.gz | |
gunzip Homo_sapiens.GRCh38.105.chr.gff3.gz | |
##Download GRCh38 | |
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | |
gunzip GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | |
##Run bcftoolscsq through parabricks | |
pbrun bcftoolscsq \ | |
--in-file HG002.clinvar.vcf \ | |
--out-file HG002.annotated.csq.vcf \ | |
--ref ./GCA_000001405.15_GRCh38_no_alt_analysis_set.fna \ | |
--gff-annot ./Homo_sapiens.GRCh38.105.chr.gff3 \ | |
--phase a | |
##Take a look at a few of the predicted variant consequences by searching for the 'BCSQ' tag | |
grep BCSQ HG002.annotated.csq.vcf | head | |
## Let's look at the first 10 variants for which a loss of the stop codon has been predicted by bcftoolscsq | |
## Note bcftoolscsq predicts consequence of a variant for each known transcript of a gene | |
grep stop_lost HG002.annotated.csq.vcf | head |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment