Last active
April 13, 2023 21:55
-
-
Save mzdravkov/a60f7906bb4f67dad6a8b694daae3f44 to your computer and use it in GitHub Desktop.
A pipeline for aligning, variant calling and annotating genomes for diagnosing rare genetic diseases.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# recessive.awk can be found here: https://gist.github.com/mzdravkov/443868e111263f8521268434436434e4 | |
# dominant.awk can be found here: https://gist.github.com/mzdravkov/85409f4d0e7d5234f350a9a814a283bf | |
if [[ $# -lt 2 ]]; then | |
echo "./pipeline.sh CASE_NUMBER recessive/dominant" | |
exit; | |
fi | |
test_case=$1 | |
pattern=$2 | |
reports_dir=reports/case_${test_case} | |
mkdir -p $reports_dir | |
mkdir -p bams | |
fastqc --threads 6 --outdir $reports_dir input_reads/case${test_case}_* | |
# Align, index and run qualimap for each input fq file | |
for file in input_reads/case${test_case}_*; do | |
base_name=$(basename $file) | |
fname=${base_name%.*.*} | |
bam=bams/${fname}.bam | |
bowtie2 -U $file \ | |
-x reference/uni \ | |
--rg-id $fname \ | |
--rg "SM:${fname}" \ | |
--threads 4 \ | |
| samtools view -b -@ 4 \ | |
| samtools sort -@ 4 -o $bam | |
samtools index -@ 4 $bam | |
qualimap bamqc -gff exons16.bed \ | |
--outdir $reports_dir/${fname} \ | |
-bam $bam | |
done | |
multiqc --outdir $reports_dir $reports_dir | |
vcf=case_${test_case}.vcf | |
# If we're looking for a autosomal recessive disease, | |
# then we want to hae at least two alternative alleles. | |
# For dominant, one alternative allele is sufficient. | |
min_alternate_count=2 | |
if [[ $2 == "dominant" ]]; then | |
min_alternate_count=1 | |
fi | |
freebayes -f reference/universe.fasta \ | |
--min-base-quality 25 \ | |
--min-mapping-quality 1 \ | |
--min-alternate-count $min_alternate_count \ | |
--targets exons16Padded_sorted.bed \ | |
bams/case${test_case}_child.bam \ | |
bams/case${test_case}_father.bam \ | |
bams/case${test_case}_mother.bam \ | |
| bcftools sort > $vcf | |
filtered_vcf=case_${test_case}_filtered.vcf | |
awk -F "\t" -f $pattern.awk $vcf > $filtered_vcf | |
# As we know the diseases that we're looking for | |
# we can easily annotate the VCF with ClinVar info | |
# and use it to filter only variants that are | |
# potentially associated with those diseases. | |
diseases_filter="(^#)|(polycystic)|(dyskinesia)|(cylindromatosis)|(KBG)|(Rubinstein)|(Taybi)|(Townes)|(Brocks)|(Tuberous)" | |
if [[ $2 == "recessive" ]]; then | |
diseases_filter="(^#)|(Bardet)|(Biedl)|(hypomagnesemia)|(Fanconi)|(Joubert)|(Meckel)|(MHC)|(Mucopolysaccharidosis)|(Thalassemia)" | |
fi | |
# Use SnpEff to annotate the VCF. We then remove | |
# non-relevant variants (low quality or low impact). | |
# The resulting VCF is then annotated with ClinVar | |
# to get the diseases associated with the variants. | |
java -Xmx8g -jar snpEff/snpEff.jar -v \ | |
-stats $reports_dir/case_${test_case}_annotation.html \ | |
GRCh37.75 $filtered_vcf \ | |
| java -jar snpEff/SnpSift.jar \ | |
filter "QUAL > 20 & (ANN[*].IMPACT = 'HIGH' | ANN[*].IMPACT = 'MODERATE')" \ | |
| java -Xmx8g -jar snpEff/SnpSift.jar \ | |
annotate \ | |
-v \ | |
-sorted \ | |
clinvar.vcf \ | |
| grep -i -E $diseases_filter \ | |
> case_${test_case}_annotated.vcf |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment