# crazyhottommy/vcf2gemini_db.sh

## vcf2gemini_db.sh
#! /bin/bash

# This script decomposes and normalizes a VCF file with vt, annotates it with
# VEP, and loads the result into a GEMINI database.
# See https://gemini.readthedocs.org/en/latest/index.html

# Strict mode: stop on any failing command (errexit), on any use of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Require exactly one positional argument (the input VCF). On any other
# argument count, print the error and usage to stderr and exit with status 1.
if [[ "$#" -ne 1 ]]; then
        echo "error: arguments not correct, you provide $#, 1 required" >&2
        echo "usage: vcf2gemini_db.sh input_vcf" >&2
        exit 1
fi

# Path of the input VCF, taken from the first command-line argument.
input_vcf=$1
# Prefix for every output file: the input's file name with the directory and
# a trailing ".vcf" removed. The operand is quoted and preceded by "--" so a
# path containing whitespace or starting with "-" is passed as one file name.
PREFIX=$(basename -- "$input_vcf" .vcf)
# Reference FASTA handed to `vt normalize -r`. A non-empty REF in the
# environment takes precedence; otherwise the original location is used.
REF=${REF:-/Users/mtang1/annotations/human/speed_seq_genome_fasta_hg19/human_g1k_v37.fasta}

# Decompose the input VCF and normalize it against the reference FASTA with
# vt, writing the result to "${PREFIX}_vt_norm.vcf". The input reaches
# `vt decompose` on stdin ("-") through a redirection rather than `cat`, and
# every expansion is quoted so paths containing whitespace stay intact.
vt decompose -s - < "$input_vcf" \
    | vt normalize -r "$REF" - > "${PREFIX}_vt_norm.vcf"

# Sort VCF text read from stdin by column 1, then numerically by column 2,
# while passing lines that start with "#" through unchanged and in their
# original order. Sorting the whole stream would reorder the header lines
# as well (for example "##fileformat" would no longer be guaranteed first).
# Returns non-zero if the underlying sort fails.
sort_vcf_records() {
    awk '
        BEGIN { sorter = "sort -k1,1 -k2,2n" }
        /^#/  { print; next }
              {
                  # Flush the header before sort starts writing to the same stdout.
                  if (!piped) { fflush(); piped = 1 }
                  print | sorter
              }
        END   { if (piped && close(sorter) != 0) exit 1 }
    '
}

# Annotate the normalized VCF with VEP (--offline, assembly GRCh37, VCF
# output on STDOUT), sort the records with the header kept in place, and
# bgzip the result into "${PREFIX}_VEP.vcf.gz".
variant_effect_predictor.pl -i "${PREFIX}_vt_norm.vcf" \
    --offline \
    --assembly GRCh37 \
    --sift b \
    --polyphen b \
    --symbol \
    --numbers \
    --biotype \
    --total_length \
    -o STDOUT \
    --vcf \
    --fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE \
    | sort_vcf_records \
    | bgzip -c > "${PREFIX}_VEP.vcf.gz"

# Index the bgzip-compressed VEP output with tabix. The VCF preset is given
# explicitly rather than relying on tabix's default or auto-detected format.
tabix -p vcf "${PREFIX}_VEP.vcf.gz"

## Load into GEMINI
# Announce the step, then run `gemini load` on the indexed VEP VCF
# (-t VEP, 2 cores), writing the database to "${PREFIX}.db".
printf '%s\n' "now, loading the vt decomposed, normalized, bgziped, tabix indexed vcf into Gemini"
gemini load \
    --cores 2 \
    -t VEP \
    -v "${PREFIX}_VEP.vcf.gz" \
    "${PREFIX}.db"
	#! /bin/bash

	# This script decomposes and normalizes a VCF file with vt, annotates it with
	# VEP, and loads the result into a GEMINI database.
	# See https://gemini.readthedocs.org/en/latest/index.html

	# Strict mode: stop on any failing command (errexit), on any use of an unset
	# variable (nounset), and when any stage of a pipeline fails (pipefail).
	set -euo pipefail

	# Require exactly one positional argument (the input VCF). On any other
	# argument count, print the error and usage to stderr and exit with status 1.
	if [[ "$#" -ne 1 ]]; then
		echo "error: arguments not correct, you provide $#, 1 required" >&2
		echo "usage: vcf2gemini_db.sh input_vcf" >&2
		exit 1
	fi

	# Path of the input VCF, taken from the first command-line argument.
	input_vcf=$1
	# Prefix for every output file: the input's file name with the directory and
	# a trailing ".vcf" removed. The operand is quoted and preceded by "--" so a
	# path containing whitespace or starting with "-" is passed as one file name.
	PREFIX=$(basename -- "$input_vcf" .vcf)
	# Reference FASTA handed to `vt normalize -r`. A non-empty REF in the
	# environment takes precedence; otherwise the original location is used.
	REF=${REF:-/Users/mtang1/annotations/human/speed_seq_genome_fasta_hg19/human_g1k_v37.fasta}

	# Decompose the input VCF and normalize it against the reference FASTA with
	# vt, writing the result to "${PREFIX}_vt_norm.vcf". The stages are joined
	# with real pipes: a backslash-escaped "\|" is passed to the first command
	# as a literal "|" argument and no pipeline is formed. The input reaches
	# `vt decompose` on stdin ("-") through a redirection, and every expansion
	# is quoted so paths containing whitespace stay intact.
	vt decompose -s - < "$input_vcf" \
		| vt normalize -r "$REF" - > "${PREFIX}_vt_norm.vcf"

	# Sort VCF text read from stdin by column 1, then numerically by column 2,
	# while passing lines that start with "#" through unchanged and in their
	# original order. Sorting the whole stream would reorder the header lines
	# as well (for example "##fileformat" would no longer be guaranteed first).
	# Returns non-zero if the underlying sort fails.
	sort_vcf_records() {
		awk '
			BEGIN { sorter = "sort -k1,1 -k2,2n" }
			/^#/  { print; next }
			      {
			          # Flush the header before sort starts writing to the same stdout.
			          if (!piped) { fflush(); piped = 1 }
			          print | sorter
			      }
			END   { if (piped && close(sorter) != 0) exit 1 }
		'
	}

	# Annotate the normalized VCF with VEP (--offline, assembly GRCh37, VCF
	# output on STDOUT), sort the records with the header kept in place, and
	# bgzip the result into "${PREFIX}_VEP.vcf.gz". The stages are joined with
	# real pipes; an escaped "\|" would be handed to VEP as a literal argument.
	variant_effect_predictor.pl -i "${PREFIX}_vt_norm.vcf" \
		--offline \
		--assembly GRCh37 \
		--sift b \
		--polyphen b \
		--symbol \
		--numbers \
		--biotype \
		--total_length \
		-o STDOUT \
		--vcf \
		--fields Consequence,Codons,Amino_acids,Gene,SYMBOL,Feature,EXON,PolyPhen,SIFT,Protein_position,BIOTYPE \
		| sort_vcf_records \
		| bgzip -c > "${PREFIX}_VEP.vcf.gz"

	# Index the bgzip-compressed VEP output with tabix. The VCF preset is given
	# explicitly rather than relying on tabix's default or auto-detected format.
	tabix -p vcf "${PREFIX}_VEP.vcf.gz"

	## Load into GEMINI
	# Announce the step, then run `gemini load` on the indexed VEP VCF
	# (-t VEP, 2 cores), writing the database to "${PREFIX}.db".
	printf '%s\n' "now, loading the vt decomposed, normalized, bgziped, tabix indexed vcf into Gemini"
	gemini load \
		--cores 2 \
		-t VEP \
		-v "${PREFIX}_VEP.vcf.gz" \
		"${PREFIX}.db"