Last active
January 12, 2024 07:16
-
-
Save fo40225/d3c8f1648ca2455fbf8c2fcdadbd228f to your computer and use it in GitHub Desktop.
run Variant Effect Predictor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Prerequisites ---------------------------------------------------------
# Ubuntu: compiler toolchain, download/unpack helpers, GNU parallel, and the
# development headers (zlib, bz2, lzma, ncurses, perl, gsl) used by the build
# steps, followed by the Perl modules VEP is run with here.
sudo apt install -y build-essential unzip parallel curl git zlib1g-dev libbz2-dev liblzma-dev libncurses5-dev libperl-dev libgsl-dev
sudo cpan DBI Try::Tiny LWP::Simple Set::IntervalTree PerlIO::gzip
# Windows, option 1 - ActivePerl 5.24:
#   https://downloads.activestate.com/ActivePerl/releases/5.24.3.2404/ActivePerl-5.24.3.2404-MSWin32-x64-404865.exe
#   ppm install Set-IntervalTree
#   (see https://www.activestate.com/blog/goodbye-ppm-hello-state-tool regarding ppm)
# Windows, option 2 - Strawberry Perl 5.32:
#   http://strawberryperl.com/download/5.32.0.1/strawberry-perl-5.32.0.1-64bit.msi
# NOTE(review): the cpan call below presumably belongs to the Strawberry Perl
# route; on Ubuntu the module is already covered by the sudo cpan line above.
cpan Set::IntervalTree
# --- Download and install VEP release 110.1 ---------------------------------
wget -O ensembl-vep-release-110.1.zip https://github.com/Ensembl/ensembl-vep/archive/release/110.1.zip
unzip ensembl-vep-release-110.1.zip
cd ensembl-vep-release-110.1
# Make the modules in the unpacked release directory visible to Perl.
# Quoted so that a working directory containing spaces survives in any shell.
export PERL5LIB="${PWD}"
# The three INSTALL.pl invocations below are per-platform alternatives;
# run only the one that matches your system.
# ubuntu 16.04 (perl 5.22)
perl INSTALL.pl --CACHEDIR .vep
# ubuntu 18.04 (newer perl)
perl INSTALL.pl --NO_TEST --CACHEDIR .vep
# windows
perl INSTALL.pl --NO_HTSLIB --NO_TEST --CACHEDIR .vep
# NOTE(review): "skip cache&fasta?" - presumably a reminder to decline the
# installer's cache/FASTA download prompts, since both are fetched manually
# further down; confirm.
# --- Plugin data: MaxEntScan + AlphaMissense --------------------------------
# MaxEntScan: the archive is expected to unpack into ./fordownload, which is
# the directory name later passed to --plugin MaxEntScan.
wget http://hollywood.mit.edu/burgelab/maxent/download/fordownload.tar.gz
tar axvf fordownload.tar.gz
# AlphaMissense: per-assembly score tables (hg19 and hg38).
wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg19.tsv.gz
wget https://storage.googleapis.com/dm_alphamissense/AlphaMissense_hg38.tsv.gz
# tabix is not installed by the apt step, so pick it up from the htslib build.
# Assumes INSTALL.pl built htslib under ./htslib - TODO confirm; a missing
# directory on PATH is harmless if tabix is already installed system-wide.
export PATH="${PWD}/htslib:${PATH}"
# Index by column 1 (sequence) and column 2 (position, used as both begin and
# end); -S 1 skips the first line, -f overwrites an existing index.
tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg19.tsv.gz
tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
# --- Cache -------------------------------------------------------------------
cd .vep
#### wget ftp://ftp.ensembl.org/pub/grch37/release-110/variation/vep/homo_sapiens*GRCh37*
# The URL is quoted so the '*' reaches wget (which performs the FTP glob
# itself) instead of being expanded or rejected by the local shell.
wget 'ftp://ftp.ensembl.org/pub/release-110/variation/vep/homo_sapiens*'
# Unpack all archives in parallel; delete them only if every extraction
# succeeded, so a failed unpack does not cost a fresh download.
ls *.tar.gz | parallel tar axf && rm *.tar.gz
# --- FASTA -------------------------------------------------------------------
# Entered from the cache directory (.vep): place each assembly's reference
# FASTA next to its cache and decompress it.
cd homo_sapiens/110_GRCh37
#### wget ftp://ftp.ensembl.org/pub/release-75/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.75.dna.toplevel.fa.gz
wget ftp://ftp.ensembl.org/pub/grch37/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh37.dna.toplevel.fa.gz
gunzip Homo_sapiens.GRCh37.dna.toplevel.fa.gz
cd ..
cd 110_GRCh38
wget ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz
gunzip Homo_sapiens.GRCh38.dna.toplevel.fa.gz
# We are three levels below the VEP root here (.vep/homo_sapiens/110_GRCh38).
# Go all the way back up: the vep invocation that follows uses root-relative
# paths (examples/, .vep, fordownload), so stopping at .vep would break it.
cd ../../..
# --- Run VEP on the bundled GRCh37 example -----------------------------------
# Run from the VEP root directory: all paths below are relative to it.
# Offline, using the merged cache under .vep and the local GRCh37 FASTA;
# output is tab-delimited. --fork uses one worker fewer than the CPU count.
# windows use ^ instead of \
# fork didn't work on windows
# (No comments may be placed between the continued lines below: each
# backslash must be the last character on its line.)
perl vep -i examples/homo_sapiens_GRCh37.vcf -o homo_sapiens_GRCh37.txt \
--sift b --polyphen b --ccds --symbol --numbers --domains --regulatory --canonical --protein --biotype --uniprot --tsl --appris --gene_phenotype --af --af_1kg --af_esp --af_gnomad --max_af --pubmed --var_synonyms --variant_class \
--assembly GRCh37 \
--fork $(expr $(nproc) - 1) \
--dir .vep \
--offline \
--fasta .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa \
--merged \
--buffer_size 131072 \
--plugin MaxEntScan,fordownload \
--plugin AlphaMissense,file=AlphaMissense_hg19.tsv.gz \
--tab
# --- Speedup: build and install ensembl-xs -----------------------------------
# Clones the 2.3.2 tag/branch into ./ensembl-xs, builds with all cores and
# installs system-wide.
git clone https://github.com/Ensembl/ensembl-xs.git -b 2.3.2
cd ensembl-xs
perl Makefile.PL
make -j $(nproc)
sudo make install
# Return to the directory the clone was made from, so that subsequent
# commands using relative paths do not run inside ensembl-xs.
cd ..
# --- Compress and index the FASTA files --------------------------------------
# Run from the VEP root directory.
# == pre-indexed ==
# Alternative: download the already-indexed cache and FASTA instead.
# wget ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens*
# wget ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna_index/Homo_sapiens.GRCh38.dna.toplevel.fa.gz*
# ==
# Use bgzip from the htslib directory under the VEP root.
export PATH="${PWD}/htslib:${PATH}"
# The FASTA files live next to their caches, so address them by their
# root-relative paths (the same location the vep --fasta option points at).
# NOTE(review): samtools is not installed by any step shown in this file;
# it must already be available on PATH.
bgzip -l 9 -@ $(nproc) .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa
samtools faidx .vep/homo_sapiens/110_GRCh37/Homo_sapiens.GRCh37.dna.toplevel.fa.gz
bgzip -l 9 -@ $(nproc) .vep/homo_sapiens/110_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa
samtools faidx .vep/homo_sapiens/110_GRCh38/Homo_sapiens.GRCh38.dna.toplevel.fa.gz
# To run VEP against the compressed FASTA:
# export LD_LIBRARY_PATH=${PWD}/htslib
# ./vep ... --fasta Homo_sapiens.GRCh37.dna.toplevel.fa.gz
# --- Convert the cache to the indexed (tabix) format -------------------------
# Optional tweak before running: edit convert_cache.pl around line 424 so
# bgzip runs multi-threaded at the fastest compression level (64 = CPU count):
# my $bgzipout = `$bgzip -l 1 -@ 64 $outfilepath 2>&1`;
# The conversion takes a very LONG time - is fork broken?
perl convert_cache.pl --species all --version all --dir .vep --remove
# Optional follow-up: recompress every all_vars file at level 9 and rebuild
# its index (CSI via -C; sequence in column 1, position in column 5).
#find .vep -name all_vars.gz.csi | parallel rm
#find .vep -name all_vars.gz | parallel gunzip
#find .vep -name all_vars | parallel bgzip -l 9
#find .vep -name all_vars.gz | parallel tabix -C -s 1 -b 5 -e 5 -f
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment