#!/bin/sh
#
# Sentieon script for basic DNAseq (FastQ to VCF) as instructed on
# https://support.sentieon.com/manual/DNAseq_usage/dnaseq/
#
# # Usage
#   <this script> SAMPLE_NAME FASTQ_1 [FASTQ_2]
#
#   SAMPLE_NAME  used as both the read-group ID and the sample (SM) name,
#                and as the prefix of the final output files
#   FASTQ_1      first FASTQ input, passed to `bwa mem`
#   FASTQ_2      second FASTQ input; when omitted it is simply not passed
#                to `bwa mem`
#
# # How To Use This Script
# * Put it wherever, it's all based on absolute paths
# * Open it in an editor of your choosing
# * Fill in all the <BLANKS> in the `exports` section
# * Change (or leave as is) the `bwt_max_mem` and `NUMBER_THREADS` env vars
#   to suit your needs
# * Make sure to raise file descriptor limits to the allowed max by running
#   `ulimit -n unlimited`
# * Run it with the arguments described under "Usage"
#
# # Expected Directory Structure
# * SENTIEON_PROJECT_HOME
#   * YOUR LICENSE FILE
#   * data
#     * reference
#       * REFERENCE FILES
#     * pon
#       * PON FILES
#   * EXTRACTED SENTIEON DIR
#   * output
#
# Author: Adam Lev-Libfeld (adam.inf@gmail.com)

# Stop at the first failing step (-e) and on any unset variable (-u), so a
# broken stage never feeds missing or stale files into the next one.
set -eu

# Print a message on stderr and abort the run.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [ "$#" -lt 2 ]; then
  printf 'Usage: %s SAMPLE_NAME FASTQ_1 [FASTQ_2]\n' "${0##*/}" >&2
  exit 2
fi

# ---------------------------------------------------------------- exports
# The placeholders are quoted so that the script still parses before they
# are filled in; unfilled ones are rejected just below.
export SENTIEON_PROJECT_HOME="/home/user/<YOUR PROJECT ROOT>"
export SENTIEON_LICENSE="$SENTIEON_PROJECT_HOME/<YOUR LICENSE FILE>.lic"
export SENTIEON_BIN="$SENTIEON_PROJECT_HOME/<EXTRACTED SENTIEON DIR>/bin/sentieon"
export SENTION_OUT_DIR="$SENTIEON_PROJECT_HOME/output"
export SENTION_DATA_DIR="$SENTIEON_PROJECT_HOME/data"

export REFERENCE="$SENTION_DATA_DIR/reference/<YOUR REFERENCE>.fasta"
export PON_FILE="$SENTION_DATA_DIR/pon/<YOUR PON FILE>.hdf5"

export SAMPLE1="$2"
export SAMPLE2="${3:-}"

# The first argument doubles as read-group ID and sample name.
export GROUP_NAME="$1"
export SAMPLE_NAME="$1"

# NOTE(review): exported for the sentieon tools; presumably the memory cap
# read by Sentieon's bwa - confirm against the Sentieon manual.
export bwt_max_mem=8G
export NUMBER_THREADS=8

# remove from this list any annotations that you don't need
export ANNOTATION_FLAGS="--annotation QD,FS,ReadPosRankSum,SAC,AC,AF,AN,BaseQRankSum,ClippingRankSum,DP,ExcessHet,InbreedingCoeff,MLEAC,MLEAF,MQ,MQ0,MQRankSum,RAW_MQ,SOR,AS_BaseQRankSum,AS_FS,AS_InbreedingCoeff,AS_MQRankSum,AS_QD,AS_MQ,AS_ReadPosRankSum,AS_SOR"

# ACCURACY_FLAGS is not defined anywhere in this script; honour it when the
# caller provides it through the environment, otherwise treat it as empty.
: "${ACCURACY_FLAGS:=}"

# ------------------------------------------------------------ sanity checks
for configured in "$SENTIEON_LICENSE" "$SENTIEON_BIN" "$REFERENCE" "$PON_FILE"; do
  case $configured in
    *'<'*'>'*) die "unfilled <BLANK> in the exports section: $configured" ;;
  esac
done

[ -x "$SENTIEON_BIN" ] || die "sentieon binary not found or not executable: $SENTIEON_BIN"
[ -r "$REFERENCE" ] || die "reference not readable: $REFERENCE"
[ -r "$PON_FILE" ] || die "PON file not readable: $PON_FILE"

mkdir -p -- "$SENTION_OUT_DIR"

# Run `sentieon driver` with the configured thread count; every remaining
# argument is forwarded unchanged.
driver() {
  "$SENTIEON_BIN" driver -t "$NUMBER_THREADS" "$@"
}

#UNCOMMENT to Clean previous run outputs
#rm -rf -- "${SENTION_OUT_DIR:?}"/*
#echo "Done cleaning. Starting run"

#START RUN
printf 'Started at %s\n' "$(date +%H:%M:%S)"

#ALIGN
# The read-group header needs a literal backslash-t, hence the single quotes.
read_group='@RG\tID:'"$GROUP_NAME"'\tSM:'"$SAMPLE_NAME"

# If `bwa mem` fails, the text "error" is appended to the SAM stream; this is
# intended to make the sorter choke on malformed input instead of writing a
# truncated BAM. The pipeline status is that of the sorter, so `set -e` then
# stops the run.
( "$SENTIEON_BIN" bwa mem -M -R "$read_group" -t "$NUMBER_THREADS" \
    "$REFERENCE" "$SAMPLE1" ${SAMPLE2:+"$SAMPLE2"} \
  || printf 'error' ) \
  | "$SENTIEON_BIN" util sort -r "$REFERENCE" \
      -o "$SENTION_OUT_DIR/sorted.bam" -t "$NUMBER_THREADS" --sam2bam -i -

#REMOVE DUP
driver -i "$SENTION_OUT_DIR/sorted.bam" \
  --algo LocusCollector --fun score_info "$SENTION_OUT_DIR/SCORE.gz"
driver -i "$SENTION_OUT_DIR/sorted.bam" \
  --algo Dedup --rmdup --score_info "$SENTION_OUT_DIR/SCORE.gz" \
  --metrics "$SENTION_OUT_DIR/dedup_metrics.txt" "$SENTION_OUT_DIR/dedup.bam"

#Indel realignment
driver -r "$REFERENCE" -i "$SENTION_OUT_DIR/dedup.bam" \
  --algo Realigner "$SENTION_OUT_DIR/realigned.bam"

#Base quality score recalibration (BQSR)
driver -r "$REFERENCE" -i "$SENTION_OUT_DIR/realigned.bam" \
  --algo QualCal "$SENTION_OUT_DIR/recal_data.table"

#VARCALLING
# ANNOTATION_FLAGS was defined above but never handed to any step; it is now
# passed to Haplotyper together with the optional ACCURACY_FLAGS.
# Both variables hold several words on purpose, so they stay unquoted.
# shellcheck disable=SC2086
driver -r "$REFERENCE" -i "$SENTION_OUT_DIR/realigned.bam" \
  -q "$SENTION_OUT_DIR/recal_data.table" \
  --algo Haplotyper $ACCURACY_FLAGS $ANNOTATION_FLAGS \
  "$SENTION_OUT_DIR/${SAMPLE_NAME}_variants.vcf"

#STRUCTURAL VARIANTS
# DNAscope writes an intermediate VCF that SVSolver turns into the final one.
driver -r "$REFERENCE" -i "$SENTION_OUT_DIR/realigned.bam" \
  -q "$SENTION_OUT_DIR/recal_data.table" \
  --algo DNAscope --var_type bnd "$SENTION_OUT_DIR/structural_variants.vcf.tmp"
driver -r "$REFERENCE" \
  --algo SVSolver -v "$SENTION_OUT_DIR/structural_variants.vcf.tmp" \
  "$SENTION_OUT_DIR/${SAMPLE_NAME}_structural_variants.vcf"

#CNV
# NOTE(review): the sample name appears twice in the output file name; kept
# as-is so anything consuming the existing name keeps working.
driver -r "$REFERENCE" -i "$SENTION_OUT_DIR/realigned.bam" \
  -q "$SENTION_OUT_DIR/recal_data.table" \
  --algo CNV --pon "$PON_FILE" \
  "$SENTION_OUT_DIR/${SAMPLE_NAME}_${SAMPLE_NAME}.cnv"

# Only reached when every step above succeeded.
printf 'Finished at %s\n' "$(date +%H:%M:%S)"