pontikos/CADD.sh

## CADD.sh
#!/bin/bash
#CADDpath="`dirname \"$0\"`"
#CADDpath="`( cd \"$CADDpath/..\" && pwd )`"
set +x

CADDpath=/share/apps/genomics/CADD_v1.3/
if [ -z "$VEPpath" ] ; then source ${CADDpath}/bin/config.sh; fi
echo $VEPpath

NCORES=`expr $(lscpu | grep '^CPU(s):' | cut -f2 -d: | tr -d ' ') - 2`

input=$1
output=$2

temp=${output%.vcf.gz}

echo prepare input
zcat $input | ${CADDpath}/bin/src/VCF2vepVCF.py | sort -k1,1 -k2,2n -k3,3 -k4,4 | uniq > ${temp}_clean.tmp

echo score pre-scored SNVs
cat ${temp}_clean.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/whole_genome_SNVs.tsv.gz --found_out=${temp}_scored_snvs.tmp > ${temp}_indels.tmp

echo score pre-scored indels
cat ${temp}_indels.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/InDels.tsv.gz --found_out=${temp}_scored_indels.tmp > ${temp}_novel.tmp


echo annotate novel indels with VEP
cat ${temp}_novel.tmp | perl ${VEPpath} --quiet --cache --offline --dir ${CADDpath}/annotations/vep/ --buffer 1000 --no_stats --species homo_sapiens --db_version=75 --format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains --numbers --canonical --total_length --force_overwrite --fork=${NCORES} --output_file ${temp}_VEP.tmp

echo score VEP-annotated novel indels
cat ${temp}_VEP.tmp | awk 'BEGIN{ FS="\t"; OFS="\t"; }{ if ($1 ~ /^#/) { if ($1 ~ /^#Up/) { sub("#","",$1); print "#Chrom","Start","End",$0 } else { print } } else { split($2,a,":"); split(a[2],b,"-"); if (length(b) == 2) { print a[1],b[1],b[2],$0 } else { print a[1],b[1],b[1],$0 } }}' | ${CADDpath}/bin/src/annotateVEP.py --all --PIPE | ${CADDpath}/bin/src/score_linModel.py --noMap --noSegDup -m ${CADDpath}/bin/src/model/model_coefficients.txt | ${CADDpath}/bin/src/max_line_hierarchy.py | ${CADDpath}/bin/src/appendPHREDscore.py -t ${CADDpath}/bin/src/model/conversion_table_ext.tsv | awk 'BEGIN{ OFS="\t"; FS="\t" }{ if ($0 !~ /^#/) { print $1,$2,$3,$5,$115,$116 } }' | uniq > ${temp}_scored_novel.tmp

echo combining scored output of SNVs, indels and novel indels

cat <(echo "## CADD v1.3 (c) University of Washington and Hudson-Alpha Institute for Biotechnology 2013-2015. All rights reserved.") <(echo -e "#CHROM\tPOS\tREF\tALT\tRawScore\tPHRED") <(cat ${temp}_scored_*.tmp | sort -k1,1 -k2,2n -k3,3 -k4,4) | /share/apps/genomics/htslib-1.1/bin/bgzip -c > ${output}
tabix -p vcf ${output}

rm ${temp}_*.tmp
	#!/bin/bash
	#CADDpath="`dirname \"$0\"`"
	#CADDpath="`( cd \"$CADDpath/..\" && pwd )`"
	set +x

	CADDpath=/share/apps/genomics/CADD_v1.3/
	if [ -z "$VEPpath" ] ; then source ${CADDpath}/bin/config.sh; fi
	echo $VEPpath

	NCORES=`expr $(lscpu \| grep '^CPU(s):' \| cut -f2 -d: \| tr -d ' ') - 2`

	input=$1
	output=$2

	temp=${output%.vcf.gz}

	echo prepare input
	zcat $input \| ${CADDpath}/bin/src/VCF2vepVCF.py \| sort -k1,1 -k2,2n -k3,3 -k4,4 \| uniq > ${temp}_clean.tmp

	echo score pre-scored SNVs
	cat ${temp}_clean.tmp \| ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/whole_genome_SNVs.tsv.gz --found_out=${temp}_scored_snvs.tmp > ${temp}_indels.tmp

	echo score pre-scored indels
	cat ${temp}_indels.tmp \| ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/InDels.tsv.gz --found_out=${temp}_scored_indels.tmp > ${temp}_novel.tmp


	echo annotate novel indels with VEP
	cat ${temp}_novel.tmp \| perl ${VEPpath} --quiet --cache --offline --dir ${CADDpath}/annotations/vep/ --buffer 1000 --no_stats --species homo_sapiens --db_version=75 --format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains --numbers --canonical --total_length --force_overwrite --fork=${NCORES} --output_file ${temp}_VEP.tmp

	echo score VEP-annotated novel indels
	cat ${temp}_VEP.tmp \| awk 'BEGIN{ FS="\t"; OFS="\t"; }{ if ($1 ~ /^#/) { if ($1 ~ /^#Up/) { sub("#","",$1); print "#Chrom","Start","End",$0 } else { print } } else { split($2,a,":"); split(a[2],b,"-"); if (length(b) == 2) { print a[1],b[1],b[2],$0 } else { print a[1],b[1],b[1],$0 } }}' \| ${CADDpath}/bin/src/annotateVEP.py --all --PIPE \| ${CADDpath}/bin/src/score_linModel.py --noMap --noSegDup -m ${CADDpath}/bin/src/model/model_coefficients.txt \| ${CADDpath}/bin/src/max_line_hierarchy.py \| ${CADDpath}/bin/src/appendPHREDscore.py -t ${CADDpath}/bin/src/model/conversion_table_ext.tsv \| awk 'BEGIN{ OFS="\t"; FS="\t" }{ if ($0 !~ /^#/) { print $1,$2,$3,$5,$115,$116 } }' \| uniq > ${temp}_scored_novel.tmp

	echo combining scored output of SNVs, indels and novel indels

	cat <(echo "## CADD v1.3 (c) University of Washington and Hudson-Alpha Institute for Biotechnology 2013-2015. All rights reserved.") <(echo -e "#CHROM\tPOS\tREF\tALT\tRawScore\tPHRED") <(cat ${temp}_scored_*.tmp \| sort -k1,1 -k2,2n -k3,3 -k4,4) \| /share/apps/genomics/htslib-1.1/bin/bgzip -c > ${output}
	tabix -p vcf ${output}

	rm ${temp}_*.tmp