JamesKane/ena_align.sh

## ena_align.sh
#!/bin/bash

# USAGE: sh ena_align.sh
#
# This simple script was originally developed to automate aligning and filtering samples from ENA for ydna-warehouse.org.  It has
# become the default workflow for all NGS data needing to be standardized for keeping samples as consistent as possible from the
# menagerie of D2C vendors.
#
# The script is built on the assumption your FASTQ read data is pre-trimmed and organized with this structure.
# SAMPLE/SAMPLE_[1|2].fastq.gz
#
# The workflow creates an unsorted SAM from the pair of FASTQ files using all available CPU threads.
#
# Aligned reads are filtered to only reads on the GRCh38 chrY and chrM regions.  Then base quality scores are recalibrated via
# GATK to improve calling accurancy.  Finally, the file is encoded with CRAM to reduce disc usage for AWS S3 storage.
#
# The script finishes up by creating a gVCF for joint genotyping and some metrics of interest for comparison.
#
# DEPENDENCIES:
# GATK4 - https://github.com/broadinstitute/gatk/releases/download/4.1.9.0/gatk-4.1.9.0.zip
# Sambamba - https://github.com/biod/sambamba
# BWA MEM - https://github.com/lh3/bwa
# Samtools - https://github.com/samtools/
# GRCh38 Reference - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
# Mills & 1000G Gold Standard INDELS - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/other_mapping_resources/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
# DBSNP - Unsure where sourced.  Likely in the GATK resource bundle.  https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle
# GATK3 - There a few residual dependencies from legacy workflows.  If you can't find the JAR anymore via Googlefoo, remove the lines using it.

# All these need to be updated to match where you installed the dependencies.
gatk4=~/Applications/gatk-4.1.9.0/gatk
sambamba=~/Applications/sambamba/bin/sambamba-0.7.1
# There is a small alignment benefit to not using the full set here, but the core of 1KG samples used this during initial
# development of the warehouse's dataset.  It remains for legacy comparison reasons and it's only a couple hundred thousand
# callable base differences.
reference=/mnt/genomics/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
mills=/mnt/genomics/GRCh38_reference_genome/other_mapping_resources/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
dbsnp=/mnt/genomics/GRCh38_reference_genome/All_20180418.vcf.gz

sample=${PWD##*/}

# Align the FASTQ files with BWA.  Since ydna-warehouse.org really only wants chrY and chrM reads, we are piping the mapping results through a filter
# that discards regions outside the sex haplogroups.  Mates are presevered for potential realignment experiments with later reference releases.
#
# Users with workflows that need the entire genome should just pipe to a file and change the file names appropriately.
# Experimentally bwa mem can be switched out for minimap2 -ax sr -t $(nproc).  It's 3x faster but 4.5% less accurate.
bwa mem -t $(nproc) -M -R '@RG\tID:'${sample}'\tSM:'${sample}'\tLB:lib1\tPL:illumina\tPU:unit1' $reference ${sample}_?.fastq.gz | \
	$sambamba view -F "ref_name == 'chrY' or mate_ref_name =='chrY' or ref_name == 'chrY_KI270740v1_random' or mate_ref_name =='chrY_KI270740v1_random' or ref_name == 'chrM' or mate_ref_name =='chrM'" -f bam -S -o ${sample}.chrYM.bam /dev/stdin

# Sort the chrYM SAM by read name - using GATK here because the samtools sort fails some of GATK's validations
$gatk4 SortSam -I ${sample}.chrYM.bam -O ${sample}.ns.bam -SO queryname

# Mark the duplicates
$gatk4 --java-options "-Xmx4G" \
	MarkDuplicates -I ${sample}.ns.bam -O ${sample}.dedup.bam -METRICS_FILE metrics.txt

# Resort by coordinate
$gatk4 SortSam -I ${sample}.dedup.bam -O ${sample}.sorted.bam -SORT_ORDER coordinate

# Score bases for systematic error in the Illumina instruments
$gatk4 --java-options "-Xmx8G" BaseRecalibrator -R $reference -O recal_data.table -I ${sample}.sorted.bam \
	--known-sites $mills --known-sites $dbsnp

# Apply base quality score recalibration.  This will inflate the file size significantly, since the original base scores are also retained.
# See Heng Li's blog for why we want them to be recalibrated.
# https://lh3.github.io/2020/05/27/base-quality-scores-are-essential-to-short-read-variant-calling
# NOTE: We only look at a small fraction of the library in the designed flow of this script.  The results are not the best possible but better than
# not performing it.
$gatk4 --java-options "-Xmx8G" ApplyBQSR \
	-R $reference \
	-I ${sample}.sorted.bam \
	--bqsr-recal-file recal_data.table \
	-O ${sample}.recal.bam

# Re-encode the BAM as a CRAM file.  The resulting file will be about 50% of the BAM's size.
samtools view -T $reference -C -o ${sample}.GRCh38.cram ${sample}.recal.bam
samtools index ${sample}.GRCh38.cram

# Create a gVCF suitable for ydna-warehouse.org, which uses them for joint-genotype operations
$gatk4 --java-options "-Xmx8G" HaplotypeCaller -R $reference \
	-L chrY -L chrY_KI270740v1_random \
	-O ${sample}.b38.g.vcf.gz \
	-I ${sample}.recal.bam \
	-ERC GVCF

# Produce a BED regions file for callable loci histogram generation
java -jar ~/Applications/GenomeAnalysisTK.jar -T CallableLoci \
	-R $reference \
	-I ${sample}.recal.bam -summary table.txt -o callable_status.bed \
	-L chrY -L chrY_KI270740v1_random

# Collect misc. metrics about the raw data
$gatk4 CollectMultipleMetrics -I ${sample}.recal.bam -O read_metrics -R $reference

# GATK4 has a new version of this tool, but it doesn't work the same.
java -jar ~/Applications/GenomeAnalysisTK.jar -T DepthOfCoverage -R $reference -o coverage -I ${sample}.recal.bam -L chrY
	#!/bin/bash

	# USAGE: sh ena_align.sh
	#
	# This simple script was originally developed to automate aligning and filtering samples from ENA for ydna-warehouse.org. It has
	# become the default workflow for all NGS data needing to be standardized for keeping samples as consistent as possible from the
	# menagerie of D2C vendors.
	#
	# The script is built on the assumption your FASTQ read data is pre-trimmed and organized with this structure.
	# SAMPLE/SAMPLE_[1\|2].fastq.gz
	#
	# The workflow creates an unsorted SAM from the pair of FASTQ files using all available CPU threads.
	#
	# Aligned reads are filtered to only reads on the GRCh38 chrY and chrM regions. Then base quality scores are recalibrated via
	# GATK to improve calling accurancy. Finally, the file is encoded with CRAM to reduce disc usage for AWS S3 storage.
	#
	# The script finishes up by creating a gVCF for joint genotyping and some metrics of interest for comparison.
	#
	# DEPENDENCIES:
	# GATK4 - https://github.com/broadinstitute/gatk/releases/download/4.1.9.0/gatk-4.1.9.0.zip
	# Sambamba - https://github.com/biod/sambamba
	# BWA MEM - https://github.com/lh3/bwa
	# Samtools - https://github.com/samtools/
	# GRCh38 Reference - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
	# Mills & 1000G Gold Standard INDELS - ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/other_mapping_resources/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
	# DBSNP - Unsure where sourced. Likely in the GATK resource bundle. https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle
	# GATK3 - There a few residual dependencies from legacy workflows. If you can't find the JAR anymore via Googlefoo, remove the lines using it.

	# All these need to be updated to match where you installed the dependencies.
	gatk4=~/Applications/gatk-4.1.9.0/gatk
	sambamba=~/Applications/sambamba/bin/sambamba-0.7.1
	# There is a small alignment benefit to not using the full set here, but the core of 1KG samples used this during initial
	# development of the warehouse's dataset. It remains for legacy comparison reasons and it's only a couple hundred thousand
	# callable base differences.
	reference=/mnt/genomics/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
	mills=/mnt/genomics/GRCh38_reference_genome/other_mapping_resources/Mills_and_1000G_gold_standard.indels.b38.primary_assembly.vcf.gz
	dbsnp=/mnt/genomics/GRCh38_reference_genome/All_20180418.vcf.gz

	sample=${PWD##*/}

	# Align the FASTQ files with BWA. Since ydna-warehouse.org really only wants chrY and chrM reads, we are piping the mapping results through a filter
	# that discards regions outside the sex haplogroups. Mates are presevered for potential realignment experiments with later reference releases.
	#
	# Users with workflows that need the entire genome should just pipe to a file and change the file names appropriately.
	# Experimentally bwa mem can be switched out for minimap2 -ax sr -t $(nproc). It's 3x faster but 4.5% less accurate.
	bwa mem -t $(nproc) -M -R '@RG\tID:'${sample}'\tSM:'${sample}'\tLB:lib1\tPL:illumina\tPU:unit1' $reference ${sample}_?.fastq.gz \| \
	$sambamba view -F "ref_name == 'chrY' or mate_ref_name =='chrY' or ref_name == 'chrY_KI270740v1_random' or mate_ref_name =='chrY_KI270740v1_random' or ref_name == 'chrM' or mate_ref_name =='chrM'" -f bam -S -o ${sample}.chrYM.bam /dev/stdin

	# Sort the chrYM SAM by read name - using GATK here because the samtools sort fails some of GATK's validations
	$gatk4 SortSam -I ${sample}.chrYM.bam -O ${sample}.ns.bam -SO queryname

	# Mark the duplicates
	$gatk4 --java-options "-Xmx4G" \
	MarkDuplicates -I ${sample}.ns.bam -O ${sample}.dedup.bam -METRICS_FILE metrics.txt

	# Resort by coordinate
	$gatk4 SortSam -I ${sample}.dedup.bam -O ${sample}.sorted.bam -SORT_ORDER coordinate

	# Score bases for systematic error in the Illumina instruments
	$gatk4 --java-options "-Xmx8G" BaseRecalibrator -R $reference -O recal_data.table -I ${sample}.sorted.bam \
	--known-sites $mills --known-sites $dbsnp

	# Apply base quality score recalibration. This will inflate the file size significantly, since the original base scores are also retained.
	# See Heng Li's blog for why we want them to be recalibrated.
	# https://lh3.github.io/2020/05/27/base-quality-scores-are-essential-to-short-read-variant-calling
	# NOTE: We only look at a small fraction of the library in the designed flow of this script. The results are not the best possible but better than
	# not performing it.
	$gatk4 --java-options "-Xmx8G" ApplyBQSR \
	-R $reference \
	-I ${sample}.sorted.bam \
	--bqsr-recal-file recal_data.table \
	-O ${sample}.recal.bam

	# Re-encode the BAM as a CRAM file. The resulting file will be about 50% of the BAM's size.
	samtools view -T $reference -C -o ${sample}.GRCh38.cram ${sample}.recal.bam
	samtools index ${sample}.GRCh38.cram

	# Create a gVCF suitable for ydna-warehouse.org, which uses them for joint-genotype operations
	$gatk4 --java-options "-Xmx8G" HaplotypeCaller -R $reference \
	-L chrY -L chrY_KI270740v1_random \
	-O ${sample}.b38.g.vcf.gz \
	-I ${sample}.recal.bam \
	-ERC GVCF

	# Produce a BED regions file for callable loci histogram generation
	java -jar ~/Applications/GenomeAnalysisTK.jar -T CallableLoci \
	-R $reference \
	-I ${sample}.recal.bam -summary table.txt -o callable_status.bed \
	-L chrY -L chrY_KI270740v1_random

	# Collect misc. metrics about the raw data
	$gatk4 CollectMultipleMetrics -I ${sample}.recal.bam -O read_metrics -R $reference

	# GATK4 has a new version of this tool, but it doesn't work the same.
	java -jar ~/Applications/GenomeAnalysisTK.jar -T DepthOfCoverage -R $reference -o coverage -I ${sample}.recal.bam -L chrY