Skip to content

Instantly share code, notes, and snippets.

@fo40225
Last active January 12, 2024 07:16
Show Gist options
  • Save fo40225/d3c8f1648ca2455fbf8c2fcdadbd228f to your computer and use it in GitHub Desktop.
Save fo40225/d3c8f1648ca2455fbf8c2fcdadbd228f to your computer and use it in GitHub Desktop.
Run the Ensembl Variant Effect Predictor (VEP)
# Ubuntu: system packages (compiler toolchain, download/archive tools and
# -dev headers), then the required Perl modules from CPAN.
sudo apt install -y \
  build-essential unzip parallel curl git \
  zlib1g-dev libbz2-dev liblzma-dev libncurses5-dev libperl-dev libgsl-dev
sudo cpan \
  DBI Try::Tiny LWP::Simple Set::IntervalTree PerlIO::gzip
# Windows, ActivePerl 5.24:
#   https://downloads.activestate.com/ActivePerl/releases/5.24.3.2404/ActivePerl-5.24.3.2404-MSWin32-x64-404865.exe
#   ppm install Set-IntervalTree
#   (about ppm vs. the State Tool: https://www.activestate.com/blog/goodbye-ppm-hello-state-tool)
# Windows, Strawberry Perl:
#   http://strawberryperl.com/download/5.32.0.1/strawberry-perl-5.32.0.1-64bit.msi
# NOTE(review): the cpan call below presumably belongs to the Strawberry Perl
# route; on Ubuntu the same module is already requested by the cpan line above.
cpan Set::IntervalTree
# Fetch the ensembl-vep release/110.1 archive from GitHub, unpack it and
# enter the extracted directory.
wget --output-document=ensembl-vep-release-110.1.zip \
  https://github.com/Ensembl/ensembl-vep/archive/release/110.1.zip
unzip ensembl-vep-release-110.1.zip
cd ensembl-vep-release-110.1
# Point Perl's module search path at the checkout directory.
export PERL5LIB="${PWD}"
# INSTALL.pl invocations, labelled per platform; each one uses .vep (relative
# to the checkout) as the cache directory.
# Ubuntu 16.04 (perl 5.22)
perl INSTALL.pl --CACHEDIR .vep
# Ubuntu 18.04 (newer perl): tests disabled
perl INSTALL.pl --NO_TEST --CACHEDIR .vep
# Windows: htslib and tests disabled
perl INSTALL.pl --NO_HTSLIB --NO_TEST --CACHEDIR .vep
# skip cache & fasta in INSTALL.pl? (both are downloaded manually further down)
# install plugin MaxEntScan + AlphaMissense
# Run from the ensembl-vep checkout root: the vep command later in this file
# refers to "fordownload" and "AlphaMissense_hg19.tsv.gz" by relative path.
# MaxEntScan
wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz
tar axvf fordownload.tar.gz
# AlphaMissense
wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg19.tsv.gz
wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz
# tabix is not among the apt packages installed above; make the htslib
# directory inside the checkout (the one this file later puts on PATH for
# bgzip) available before indexing. A tabix already on PATH keeps working if
# ./htslib does not exist.
export PATH="${PWD}/htslib:${PATH}"
# Index the tables: sequence name in column 1 (-s), position in column 2 for
# both begin and end (-b/-e), skip 1 header line (-S), overwrite an existing
# index (-f).
tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg19.tsv.gz
tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
# Cache
# Download all release-110 cache archives matching homo_sapiens* into .vep
# and unpack them there.
cd .vep
#### wget ftp://ftp.ensembl.org/pub/grch37/release-110/variation/vep/homo_sapiens*GRCh37*
# The URL is quoted so the wildcard is handed to wget (which performs the FTP
# globbing) instead of being subject to the shell's pathname expansion.
wget 'ftp://ftp.ensembl.org/pub/release-110/variation/vep/homo_sapiens*'
# Extract the archives in parallel; remove them only if every extraction
# succeeded, so a failed unpack does not cost a re-download.
parallel tar axf ::: *.tar.gz && rm -- *.tar.gz
# FASTA
# GRCh37 reference, stored next to the GRCh37 cache.
cd homo_sapiens/110_GRCh37
#### wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.toplevel.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.toplevel.fa.gz
gunzip Homo_sapiens.GRCh37.dna.toplevel.fa.gz
# GRCh38 reference, stored next to the GRCh38 cache.
cd ../110_GRCh38
wget ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz
gunzip Homo_sapiens.GRCh38.dna.toplevel.fa.gz
# Return to the ensembl-vep checkout root. The current directory is
# .vep/homo_sapiens/110_GRCh38, i.e. three levels below it; the vep command
# that follows uses paths relative to the root (vep, examples/, .vep).
cd ../../..
# Annotate the GRCh37 example VCF offline against the merged cache in .vep and
# write tab-delimited output, with the MaxEntScan and AlphaMissense plugins.
# The fork count is the number of processors reported by nproc, minus one.
# Windows notes: use ^ as the line-continuation character instead of the
# backslash, and --fork did not work there.
perl vep \
  -i examples/homo_sapiens_GRCh37.vcf \
  -o homo_sapiens_GRCh37.txt \
  --sift b --polyphen b \
  --ccds --symbol --numbers --domains --regulatory --canonical \
  --protein --biotype --uniprot --tsl --appris --gene_phenotype \
  --af --af_1kg --af_esp --af_gnomad --max_af \
  --pubmed --var_synonyms --variant_class \
  --assembly GRCh37 \
  --fork "$(( $(nproc) - 1 ))" \
  --dir .vep \
  --offline \
  --fasta .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa \
  --merged \
  --buffer_size 131072 \
  --plugin MaxEntScan,fordownload \
  --plugin AlphaMissense,file=AlphaMissense_hg19.tsv.gz \
  --tab
# speedup: build and install ensembl-xs (branch/tag 2.3.2).
git clone -b 2.3.2 https://github.com/Ensembl/ensembl-xs.git
# Build inside a subshell so the calling shell stays in the ensembl-vep
# checkout root: the commands after this section rely on ${PWD}/htslib and on
# .vep being relative to that root. Each step runs only if the previous one
# succeeded.
(
  cd ensembl-xs &&
    perl Makefile.PL &&
    make -j "$(nproc)" &&
    sudo make install
)
# == pre-indexed ==
# Alternative downloads (indexed cache / indexed FASTA) instead of the steps below:
# wget ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens*
# wget ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna_index/Homo_sapiens.GRCh38.dna.toplevel.fa.gz*
# ==
# Run from the ensembl-vep checkout root: ${PWD}/htslib, convert_cache.pl and
# .vep are all resolved relative to it.
export PATH="${PWD}/htslib:${PATH}"
# Recompress each reference FASTA with bgzip (level 9, one thread per
# processor) and index the result. The files live in the per-assembly cache
# directories they were downloaded into, so they are addressed by their path
# below .vep rather than by bare file name.
bgzip -l 9 -@ "$(nproc)" .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa
samtools faidx .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz
bgzip -l 9 -@ "$(nproc)" .vep/homo_sapiens/110_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa
samtools faidx .vep/homo_sapiens/110_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa.gz
# export LD_LIBRARY_PATH=${PWD}/htslib
# ./vep ... --fasta .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz
# Optional tweak: edit convert_cache.pl:L424 as follows (64 is the number of cpus):
# my $bgzipout = `$bgzip -l 1 -@ 64 $outfilepath 2>&1`;
# The conversion takes a very LONG time - is fork broken?
# Convert the caches under .vep for all species and all versions.
perl convert_cache.pl --species all --version all --dir .vep --remove
# Commented-out commands for recompressing and re-indexing all_vars by hand:
#find .vep -name all_vars.gz.csi | parallel rm
#find .vep -name all_vars.gz | parallel gunzip
#find .vep -name all_vars | parallel bgzip -l 9
#find .vep -name all_vars.gz | parallel tabix -C -s 1 -b 5 -e 5 -f
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment