mzdravkov/genomics_disease_annotation_pipeline.sh

## genomics_disease_annotation_pipeline.sh
# recessive.awk can be found here: https://gist.github.com/mzdravkov/443868e111263f8521268434436434e4
# dominant.awk can be found here: https://gist.github.com/mzdravkov/85409f4d0e7d5234f350a9a814a283bf

if [[ $# -lt 2 ]]; then
	echo "./pipeline.sh CASE_NUMBER recessive/dominant"
	exit;
fi

test_case=$1
pattern=$2

reports_dir=reports/case_${test_case}

mkdir -p $reports_dir
mkdir -p bams

fastqc --threads 6 --outdir $reports_dir input_reads/case${test_case}_*

# Align, index and run qualimap for each input fq file
for file in input_reads/case${test_case}_*; do
	base_name=$(basename $file)
	fname=${base_name%.*.*}
	bam=bams/${fname}.bam

	bowtie2 -U $file \
		-x reference/uni \
		--rg-id $fname \
		--rg "SM:${fname}" \
		--threads 4 \
	| samtools view -b -@ 4 \
	| samtools sort -@ 4 -o $bam

	samtools index -@ 4 $bam
	qualimap bamqc -gff exons16.bed \
		       --outdir $reports_dir/${fname} \
		       -bam $bam
done

multiqc --outdir $reports_dir $reports_dir

vcf=case_${test_case}.vcf

# If we're looking for a autosomal recessive disease,
# then we want to hae at least two alternative alleles.
# For dominant, one alternative allele is sufficient.
min_alternate_count=2
if [[ $2 == "dominant" ]]; then
	min_alternate_count=1
fi

freebayes -f reference/universe.fasta \
	--min-base-quality 25 \
	--min-mapping-quality 1 \
        --min-alternate-count $min_alternate_count \
	--targets exons16Padded_sorted.bed \
	bams/case${test_case}_child.bam \
	bams/case${test_case}_father.bam \
	bams/case${test_case}_mother.bam \
       | bcftools sort > $vcf

filtered_vcf=case_${test_case}_filtered.vcf

awk -F "\t" -f $pattern.awk $vcf > $filtered_vcf

# As we know the diseases that we're looking for
# we can easily annotate the VCF with ClinVar info
# and use it to filter only variants that are
# potentially associated with those diseases.
diseases_filter="(^#)|(polycystic)|(dyskinesia)|(cylindromatosis)|(KBG)|(Rubinstein)|(Taybi)|(Townes)|(Brocks)|(Tuberous)"
if [[ $2 == "recessive" ]]; then
	diseases_filter="(^#)|(Bardet)|(Biedl)|(hypomagnesemia)|(Fanconi)|(Joubert)|(Meckel)|(MHC)|(Mucopolysaccharidosis)|(Thalassemia)"
fi

# Use SnpEff to annotate the VCF. We then remove
# non-relevant variants (low quality or low impact).
# The resulting VCF is then annotated with ClinVar
# to get the diseases associated with the variants.
java -Xmx8g -jar snpEff/snpEff.jar -v \
     -stats $reports_dir/case_${test_case}_annotation.html \
     GRCh37.75 $filtered_vcf \
  | java -jar snpEff/SnpSift.jar \
         filter "QUAL > 20 & (ANN[*].IMPACT = 'HIGH' | ANN[*].IMPACT = 'MODERATE')"  \
  | java -Xmx8g -jar snpEff/SnpSift.jar \
         annotate \
         -v \
         -sorted \
         clinvar.vcf \
  | grep -i -E $diseases_filter \
  > case_${test_case}_annotated.vcf
	# recessive.awk can be found here: https://gist.github.com/mzdravkov/443868e111263f8521268434436434e4
	# dominant.awk can be found here: https://gist.github.com/mzdravkov/85409f4d0e7d5234f350a9a814a283bf

	if [[ $# -lt 2 ]]; then
	echo "./pipeline.sh CASE_NUMBER recessive/dominant"
	exit;
	fi

	test_case=$1
	pattern=$2

	reports_dir=reports/case_${test_case}

	mkdir -p $reports_dir
	mkdir -p bams

	fastqc --threads 6 --outdir $reports_dir input_reads/case${test_case}_*

	# Align, index and run qualimap for each input fq file
	for file in input_reads/case${test_case}_*; do
	base_name=$(basename $file)
	fname=${base_name%..}
	bam=bams/${fname}.bam

	bowtie2 -U $file \
	-x reference/uni \
	--rg-id $fname \
	--rg "SM:${fname}" \
	--threads 4 \
	\| samtools view -b -@ 4 \
	\| samtools sort -@ 4 -o $bam

	samtools index -@ 4 $bam
	qualimap bamqc -gff exons16.bed \
	--outdir $reports_dir/${fname} \
	-bam $bam
	done

	multiqc --outdir $reports_dir $reports_dir

	vcf=case_${test_case}.vcf

	# If we're looking for a autosomal recessive disease,
	# then we want to hae at least two alternative alleles.
	# For dominant, one alternative allele is sufficient.
	min_alternate_count=2
	if [[ $2 == "dominant" ]]; then
	min_alternate_count=1
	fi

	freebayes -f reference/universe.fasta \
	--min-base-quality 25 \
	--min-mapping-quality 1 \
	--min-alternate-count $min_alternate_count \
	--targets exons16Padded_sorted.bed \
	bams/case${test_case}_child.bam \
	bams/case${test_case}_father.bam \
	bams/case${test_case}_mother.bam \
	\| bcftools sort > $vcf

	filtered_vcf=case_${test_case}_filtered.vcf

	awk -F "\t" -f $pattern.awk $vcf > $filtered_vcf

	# As we know the diseases that we're looking for
	# we can easily annotate the VCF with ClinVar info
	# and use it to filter only variants that are
	# potentially associated with those diseases.
	diseases_filter="(^#)\|(polycystic)\|(dyskinesia)\|(cylindromatosis)\|(KBG)\|(Rubinstein)\|(Taybi)\|(Townes)\|(Brocks)\|(Tuberous)"
	if [[ $2 == "recessive" ]]; then
	diseases_filter="(^#)\|(Bardet)\|(Biedl)\|(hypomagnesemia)\|(Fanconi)\|(Joubert)\|(Meckel)\|(MHC)\|(Mucopolysaccharidosis)\|(Thalassemia)"
	fi

	# Use SnpEff to annotate the VCF. We then remove
	# non-relevant variants (low quality or low impact).
	# The resulting VCF is then annotated with ClinVar
	# to get the diseases associated with the variants.
	java -Xmx8g -jar snpEff/snpEff.jar -v \
	-stats $reports_dir/case_${test_case}_annotation.html \
	GRCh37.75 $filtered_vcf \
	\| java -jar snpEff/SnpSift.jar \
	filter "QUAL > 20 & (ANN[].IMPACT = 'HIGH' \| ANN[].IMPACT = 'MODERATE')" \
	\| java -Xmx8g -jar snpEff/SnpSift.jar \
	annotate \
	-v \
	-sorted \
	clinvar.vcf \
	\| grep -i -E $diseases_filter \
	> case_${test_case}_annotated.vcf