Skip to content

Instantly share code, notes, and snippets.

@pontikos
Last active January 15, 2019 16:28
Show Gist options
  • Save pontikos/f5ce942cecfd9b587f5bf08cd0dbf99c to your computer and use it in GitHub Desktop.
Save pontikos/f5ce942cecfd9b587f5bf08cd0dbf99c to your computer and use it in GitHub Desktop.
nicer cadd script than the one provided
#!/bin/bash
#CADDpath="`dirname \"$0\"`"
#CADDpath="`( cd \"$CADDpath/..\" && pwd )`"
set +x
CADDpath=/share/apps/genomics/CADD_v1.3/
if [ -z "$VEPpath" ] ; then source ${CADDpath}/bin/config.sh; fi
echo $VEPpath
NCORES=`expr $(lscpu | grep '^CPU(s):' | cut -f2 -d: | tr -d ' ') - 2`
input=$1
output=$2
temp=${output%.vcf.gz}
echo prepare input
zcat $input | ${CADDpath}/bin/src/VCF2vepVCF.py | sort -k1,1 -k2,2n -k3,3 -k4,4 | uniq > ${temp}_clean.tmp
echo score pre-scored SNVs
cat ${temp}_clean.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/whole_genome_SNVs.tsv.gz --found_out=${temp}_scored_snvs.tmp > ${temp}_indels.tmp
echo score pre-scored indels
cat ${temp}_indels.tmp | ${CADDpath}/bin/src/extract_scored.py -p ${CADDpath}/prescored/InDels.tsv.gz --found_out=${temp}_scored_indels.tmp > ${temp}_novel.tmp
echo annotate novel indels with VEP
cat ${temp}_novel.tmp | perl ${VEPpath} --quiet --cache --offline --dir ${CADDpath}/annotations/vep/ --buffer 1000 --no_stats --species homo_sapiens --db_version=75 --format vcf --regulatory --sift b --polyphen b --per_gene --ccds --domains --numbers --canonical --total_length --force_overwrite --fork=${NCORES} --output_file ${temp}_VEP.tmp
echo score VEP-annotated novel indels
cat ${temp}_VEP.tmp | awk 'BEGIN{ FS="\t"; OFS="\t"; }{ if ($1 ~ /^#/) { if ($1 ~ /^#Up/) { sub("#","",$1); print "#Chrom","Start","End",$0 } else { print } } else { split($2,a,":"); split(a[2],b,"-"); if (length(b) == 2) { print a[1],b[1],b[2],$0 } else { print a[1],b[1],b[1],$0 } }}' | ${CADDpath}/bin/src/annotateVEP.py --all --PIPE | ${CADDpath}/bin/src/score_linModel.py --noMap --noSegDup -m ${CADDpath}/bin/src/model/model_coefficients.txt | ${CADDpath}/bin/src/max_line_hierarchy.py | ${CADDpath}/bin/src/appendPHREDscore.py -t ${CADDpath}/bin/src/model/conversion_table_ext.tsv | awk 'BEGIN{ OFS="\t"; FS="\t" }{ if ($0 !~ /^#/) { print $1,$2,$3,$5,$115,$116 } }' | uniq > ${temp}_scored_novel.tmp
echo combining scored output of SNVs, indels and novel indels
cat <(echo "## CADD v1.3 (c) University of Washington and Hudson-Alpha Institute for Biotechnology 2013-2015. All rights reserved.") <(echo -e "#CHROM\tPOS\tREF\tALT\tRawScore\tPHRED") <(cat ${temp}_scored_*.tmp | sort -k1,1 -k2,2n -k3,3 -k4,4) | /share/apps/genomics/htslib-1.1/bin/bgzip -c > ${output}
tabix -p vcf ${output}
rm ${temp}_*.tmp
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment