Created
January 23, 2017 16:06
-
-
Save tlangs/d65717a2387a531ecc8db439cab444fa to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Copyright Broad Institute, 2016 | |
## | |
## This WDL pipeline implements data pre-processing and initial variant calling (GVCF | |
## generation) according to the GATK Best Practices (June 2016) for germline SNP and | |
## Indel discovery in human whole-genome sequencing (WGS) data. | |
## | |
## Requirements/expectations : | |
## - Human whole-genome pair-end sequencing data in unmapped BAM (uBAM) format | |
## - One or more read groups, one per uBAM file, all belonging to a single sample (SM) | |
## - Input uBAM files must additionally comply with the following requirements: | |
## - - filenames all have the same suffix (we use ".unmapped.bam") | |
## - - files must pass validation by ValidateSamFile | |
## - - reads are provided in query-sorted order | |
## - - all reads must have an RG tag | |
## - Reference genome must be Hg38 with ALT contigs | |
## | |
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. | |
## For program versions, see docker containers. | |
## | |
## LICENSING : | |
## This script is released under the WDL source code license (BSD-3) (see LICENSE in | |
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may | |
## be subject to different licenses. Users are responsible for checking that they are | |
## authorized to run all programs before running this script. Please see the docker | |
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed | |
## licensing information pertaining to the included programs. | |
# PRIVATE # | |
# The internal production version of this WDL must have Private and Public block tags | |
# annotated to distinguish the content that can be made public versus what cannot. | |
# PUBLIC # | |
# TASK DEFINITIONS | |
# PRIVATE # | |
task CollectQualityYieldMetrics { | |
File input_bam | |
String metrics_filename | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx128m -jar /usr/gitc/picard.jar \ | |
CollectQualityYieldMetrics \ | |
INPUT=${input_bam} \ | |
OQ=true \ | |
OUTPUT=${metrics_filename} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
disks: "local-disk " + disk_size + " HDD" | |
memory: "2 GB" | |
preemptible: preemptible_tries | |
} | |
output { | |
File metrics = "${metrics_filename}" | |
} | |
} | |
# PUBLIC # | |
# Get version of BWA | |
task GetBwaVersion { | |
command { | |
/usr/gitc/bwa 2>&1 | \ | |
grep -e '^Version' | \ | |
sed 's/Version: //' | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "1 GB" | |
} | |
output { | |
String version = read_string(stdout()) | |
} | |
} | |
# PUBLIC # | |
# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment | |
task SamToFastqAndBwaMem { | |
File input_bam | |
String bwa_commandline | |
String output_bam_basename | |
File ref_fasta | |
File ref_fasta_index | |
File ref_dict | |
# This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit), | |
# listing the reference contigs that are "alternative". | |
File ref_alt | |
File ref_amb | |
File ref_ann | |
File ref_bwt | |
File ref_pac | |
File ref_sa | |
Int disk_size | |
Int preemptible_tries | |
command <<< | |
set -o pipefail | |
# set the bash variable needed for the command-line | |
bash_ref_fasta=${ref_fasta} | |
# if ref_alt has data in it, | |
if [ -s ${ref_alt} ]; then | |
java -Xmx3000m -jar /usr/gitc/picard.jar \ | |
SamToFastq \ | |
INPUT=${input_bam} \ | |
FASTQ=/dev/stdout \ | |
INTERLEAVE=true \ | |
NON_PF=true | \ | |
/usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \ | |
samtools view -1 - > ${output_bam_basename}.bam && \ | |
grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \ | |
grep -v "read 0 ALT contigs" | |
# else ref_alt is empty or could not be found | |
else | |
exit 1; | |
fi | |
>>> | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "14 GB" | |
cpu: "16" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_bam = "${output_bam_basename}.bam" | |
File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log" | |
} | |
} | |
# PUBLIC # | |
# Merge original input uBAM file with BWA-aligned BAM file | |
task MergeBamAlignment { | |
File unmapped_bam | |
String bwa_commandline | |
String bwa_version | |
File aligned_bam | |
String output_bam_basename | |
File ref_fasta | |
File ref_fasta_index | |
File ref_dict | |
Int disk_size | |
Int preemptible_tries | |
command { | |
# set the bash variable needed for the command-line | |
bash_ref_fasta=${ref_fasta} | |
java -Xmx3000m -jar /usr/gitc/picard.jar \ | |
MergeBamAlignment \ | |
VALIDATION_STRINGENCY=SILENT \ | |
EXPECTED_ORIENTATIONS=FR \ | |
ATTRIBUTES_TO_RETAIN=X0 \ | |
ALIGNED_BAM=${aligned_bam} \ | |
UNMAPPED_BAM=${unmapped_bam} \ | |
OUTPUT=${output_bam_basename}.bam \ | |
REFERENCE_SEQUENCE=${ref_fasta} \ | |
PAIRED_RUN=true \ | |
SORT_ORDER="unsorted" \ | |
IS_BISULFITE_SEQUENCE=false \ | |
ALIGNED_READS_ONLY=false \ | |
CLIP_ADAPTERS=false \ | |
MAX_RECORDS_IN_RAM=2000000 \ | |
ADD_MATE_CIGAR=true \ | |
MAX_INSERTIONS_OR_DELETIONS=-1 \ | |
PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ | |
PROGRAM_RECORD_ID="bwamem" \ | |
PROGRAM_GROUP_VERSION="${bwa_version}" \ | |
PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \ | |
PROGRAM_GROUP_NAME="bwamem" \ | |
UNMAP_CONTAMINANT_READS=true | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3500 MB" | |
cpu: "1" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_bam = "${output_bam_basename}.bam" | |
} | |
} | |
# PUBLIC # | |
# Sort BAM file by coordinate order and fix tag values for NM and UQ | |
task SortAndFixTags { | |
File input_bam | |
String output_bam_basename | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx4000m -jar /usr/gitc/picard.jar \ | |
SortSam \ | |
INPUT=${input_bam} \ | |
OUTPUT=/dev/stdout \ | |
SORT_ORDER="coordinate" \ | |
CREATE_INDEX=false \ | |
CREATE_MD5_FILE=false | \ | |
java -Xmx500m -jar /usr/gitc/picard.jar \ | |
SetNmAndUqTags \ | |
INPUT=/dev/stdin \ | |
OUTPUT=${output_bam_basename}.bam \ | |
CREATE_INDEX=true \ | |
CREATE_MD5_FILE=true \ | |
REFERENCE_SEQUENCE=${ref_fasta} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
disks: "local-disk " + disk_size + " HDD" | |
cpu: "1" | |
memory: "5000 MB" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_bam = "${output_bam_basename}.bam" | |
File output_bam_index = "${output_bam_basename}.bai" | |
File output_bam_md5 = "${output_bam_basename}.bam.md5" | |
} | |
} | |
# PRIVATE # | |
# The tasks CollectUnsortedReadgroupBamQualityMetrics, CollectReadgroupBamQualityMetrics, | |
# and CollectAggregationMetrics are all variations of CollectMultipleMetrics. | |
# They have been separated because we cannot glob their outputs. | |
# When glob works again, they can be merged back into one task. | |
# PRIVATE # | |
task CollectUnsortedReadgroupBamQualityMetrics { | |
File input_bam | |
String output_bam_prefix | |
Int disk_size | |
command { | |
java -Xmx5000m -jar /usr/gitc/picard.jar \ | |
CollectMultipleMetrics \ | |
INPUT=${input_bam} \ | |
OUTPUT=${output_bam_prefix} \ | |
ASSUME_SORTED=true \ | |
PROGRAM="null" \ | |
PROGRAM="CollectBaseDistributionByCycle" \ | |
PROGRAM="CollectInsertSizeMetrics" \ | |
PROGRAM="MeanQualityByCycle" \ | |
PROGRAM="QualityScoreDistribution" \ | |
METRIC_ACCUMULATION_LEVEL="null" \ | |
METRIC_ACCUMULATION_LEVEL="ALL_READS" | |
touch ${output_bam_prefix}.insert_size_metrics | |
touch ${output_bam_prefix}.insert_size_histogram.pdf | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "7 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File base_distribution_by_cycle_pdf = "${output_bam_prefix}.base_distribution_by_cycle.pdf" | |
File base_distribution_by_cycle_metrics = "${output_bam_prefix}.base_distribution_by_cycle_metrics" | |
File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf" | |
File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics" | |
File quality_by_cycle_pdf = "${output_bam_prefix}.quality_by_cycle.pdf" | |
File quality_by_cycle_metrics = "${output_bam_prefix}.quality_by_cycle_metrics" | |
File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf" | |
File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics" | |
} | |
} | |
# PRIVATE # | |
task CollectReadgroupBamQualityMetrics { | |
File input_bam | |
File input_bam_index | |
String output_bam_prefix | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
command { | |
java -Xmx5000m -jar /usr/gitc/picard.jar \ | |
CollectMultipleMetrics \ | |
INPUT=${input_bam} \ | |
REFERENCE_SEQUENCE=${ref_fasta} \ | |
OUTPUT=${output_bam_prefix} \ | |
ASSUME_SORTED=true \ | |
PROGRAM="null" \ | |
PROGRAM="CollectAlignmentSummaryMetrics" \ | |
PROGRAM="CollectGcBiasMetrics" \ | |
METRIC_ACCUMULATION_LEVEL="null" \ | |
METRIC_ACCUMULATION_LEVEL="READ_GROUP" | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "7 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics" | |
File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics" | |
File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf" | |
File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics" | |
} | |
} | |
# PRIVATE # | |
task CollectAggregationMetrics { | |
File input_bam | |
File input_bam_index | |
String output_bam_prefix | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
command { | |
java -Xmx5000m -jar /usr/gitc/picard.jar \ | |
CollectMultipleMetrics \ | |
INPUT=${input_bam} \ | |
REFERENCE_SEQUENCE=${ref_fasta} \ | |
OUTPUT=${output_bam_prefix} \ | |
ASSUME_SORTED=true \ | |
PROGRAM="null" \ | |
PROGRAM="CollectAlignmentSummaryMetrics" \ | |
PROGRAM="CollectInsertSizeMetrics" \ | |
PROGRAM="CollectSequencingArtifactMetrics" \ | |
PROGRAM="CollectGcBiasMetrics" \ | |
PROGRAM="QualityScoreDistribution" \ | |
METRIC_ACCUMULATION_LEVEL="null" \ | |
METRIC_ACCUMULATION_LEVEL="SAMPLE" \ | |
METRIC_ACCUMULATION_LEVEL="LIBRARY" | |
touch ${output_bam_prefix}.insert_size_metrics | |
touch ${output_bam_prefix}.insert_size_histogram.pdf | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "7 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics" | |
File bait_bias_detail_metrics = "${output_bam_prefix}.bait_bias_detail_metrics" | |
File bait_bias_summary_metrics = "${output_bam_prefix}.bait_bias_summary_metrics" | |
File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics" | |
File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf" | |
File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics" | |
File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf" | |
File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics" | |
File pre_adapter_detail_metrics = "${output_bam_prefix}.pre_adapter_detail_metrics" | |
File pre_adapter_summary_metrics = "${output_bam_prefix}.pre_adapter_summary_metrics" | |
File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf" | |
File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics" | |
} | |
} | |
# PRIVATE # | |
task CrossCheckFingerprints { | |
Array[File] input_bams | |
Array[File] input_bam_indexes | |
File haplotype_database_file # if this file is empty (0-length) the workflow should not do fingerprint comparison (as there are no fingerprints for the sample) | |
String metrics_filename | |
Int disk_size | |
Int preemptible_tries | |
command <<< | |
if [ -s ${haplotype_database_file} ]; then | |
java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx2000m \ | |
-jar /usr/gitc/picard.jar \ | |
CrosscheckReadGroupFingerprints \ | |
OUTPUT=${metrics_filename} \ | |
HAPLOTYPE_MAP=${haplotype_database_file} \ | |
EXPECT_ALL_READ_GROUPS_TO_MATCH=true \ | |
INPUT=${sep=' INPUT=' input_bams} \ | |
LOD_THRESHOLD=-20.0 | |
else | |
echo "No haplotype_database_file. Skipping Fingerprint check" | |
touch ${metrics_filename} | |
fi | |
>>> | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "2 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File metrics = "${metrics_filename}" | |
} | |
} | |
# PRIVATE # | |
task CheckFingerprint { | |
File input_bam | |
File input_bam_index | |
File haplotype_database_file | |
File genotypes | |
String sample | |
Int disk_size | |
Int preemptible_tries | |
command <<< | |
if [ -s ${genotypes} ]; then | |
java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx1024m \ | |
-jar /usr/gitc/picard.jar \ | |
CheckFingerprint \ | |
INPUT=${input_bam} \ | |
OUTPUT=${sample} \ | |
GENOTYPES=${genotypes} \ | |
HAPLOTYPE_MAP=${haplotype_database_file} \ | |
SAMPLE_ALIAS=${sample} \ | |
IGNORE_READ_GROUPS=true | |
else | |
echo "No fingerprint found. Skipping Fingerprint check" | |
# We touch the outputs here in order to create 0 length files. Otherwise the task will fail since the expected outputs are not to be found | |
touch ${sample}.fingerprinting_summary_metrics | |
touch ${sample}.fingerprinting_detail_metrics | |
fi | |
>>> | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "1 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File summary_metrics = "${sample}.fingerprinting_summary_metrics" | |
File detail_metrics = "${sample}.fingerprinting_detail_metrics" | |
} | |
} | |
# PUBLIC # | |
# Mark duplicate reads to avoid counting non-independent observations | |
task MarkDuplicates { | |
Array[File] input_bams | |
String output_bam_basename | |
String metrics_filename | |
Int disk_size | |
# Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly | |
# This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment. | |
# While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" | |
command { | |
java -Xmx4000m -jar /usr/gitc/picard.jar \ | |
MarkDuplicates \ | |
INPUT=${sep=' INPUT=' input_bams} \ | |
OUTPUT=${output_bam_basename}.bam \ | |
METRICS_FILE=${metrics_filename} \ | |
VALIDATION_STRINGENCY=SILENT \ | |
OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ | |
ASSUME_SORT_ORDER="queryname" | |
CREATE_MD5_FILE=true | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "7 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File output_bam = "${output_bam_basename}.bam" | |
File duplicate_metrics = "${metrics_filename}" | |
} | |
} | |
# PUBLIC # | |
# Generate sets of intervals for scatter-gathering over chromosomes | |
task CreateSequenceGroupingTSV { | |
File ref_dict | |
Int preemptible_tries | |
# Use python to create the Sequencing Groupings used for BQSR and PrintReads Scatter. It outputs to stdout | |
# where it is parsed into a wdl Array[Array[String]] | |
# e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]] | |
command <<< | |
python <<CODE | |
with open("${ref_dict}", "r") as ref_dict_file: | |
sequence_tuple_list = [] | |
longest_sequence = 0 | |
for line in ref_dict_file: | |
if line.startswith("@SQ"): | |
line_split = line.split("\t") | |
# (Sequence_Name, Sequence_Length) | |
sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1]))) | |
longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1] | |
# We are adding this to the intervals because hg38 has contigs named with embedded colons and a bug in GATK strips off | |
# the last element after a :, so we add this as a sacrificial element. | |
hg38_protection_tag = ":1+" | |
# initialize the tsv string with the first sequence | |
tsv_string = sequence_tuple_list[0][0] + hg38_protection_tag | |
temp_size = sequence_tuple_list[0][1] | |
for sequence_tuple in sequence_tuple_list[1:]: | |
if temp_size + sequence_tuple[1] <= longest_sequence: | |
temp_size += sequence_tuple[1] | |
tsv_string += "\t" + sequence_tuple[0] + hg38_protection_tag | |
else: | |
tsv_string += "\n" + sequence_tuple[0] + hg38_protection_tag | |
temp_size = sequence_tuple[1] | |
print tsv_string | |
CODE | |
>>> | |
runtime { | |
docker: "python:2.7" | |
memory: "2 GB" | |
preemptible: preemptible_tries | |
} | |
output { | |
Array[Array[String]] sequence_grouping = read_tsv(stdout()) | |
} | |
} | |
# PUBLIC # | |
# Generate Base Quality Score Recalibration (BQSR) model | |
task BaseRecalibrator { | |
File input_bam | |
File input_bam_index | |
String recalibration_report_filename | |
Array[String] sequence_group_interval | |
File dbSNP_vcf | |
File dbSNP_vcf_index | |
Array[File] known_indels_sites_VCFs | |
Array[File] known_indels_sites_indices | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ | |
-XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ | |
-Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \ | |
-jar /usr/gitc/GATK4.jar \ | |
BaseRecalibrator \ | |
-R ${ref_fasta} \ | |
-I ${input_bam} \ | |
--useOriginalQualities \ | |
-O ${recalibration_report_filename} \ | |
-knownSites ${dbSNP_vcf} \ | |
-knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \ | |
-L ${sep=" -L " sequence_group_interval} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "6 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File recalibration_report = "${recalibration_report_filename}" | |
#this output is only for GOTC STAGING to give some GC statistics to the GATK4 team | |
#File gc_logs = "gc_log.log" | |
} | |
} | |
# PUBLIC # | |
# Apply Base Quality Score Recalibration (BQSR) model | |
task ApplyBQSR { | |
File input_bam | |
File input_bam_index | |
String output_bam_basename | |
File recalibration_report | |
Array[String] sequence_group_interval | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ | |
-XX:+PrintGCDetails -Xloggc:gc_log.log -Dsamjdk.use_async_io=false \ | |
-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \ | |
-jar /usr/gitc/GATK4.jar \ | |
ApplyBQSR \ | |
--createOutputBamMD5 \ | |
--addOutputSAMProgramRecord \ | |
-R ${ref_fasta} \ | |
-I ${input_bam} \ | |
--useOriginalQualities \ | |
-O ${output_bam_basename}.bam \ | |
-bqsr ${recalibration_report} \ | |
-SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \ | |
--emit_original_quals \ | |
-L ${sep=" -L " sequence_group_interval} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3500 MB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File recalibrated_bam = "${output_bam_basename}.bam" | |
File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5" | |
#this output is only for GOTC STAGING to give some GC statistics to the GATK4 team | |
#File gc_logs = "gc_log.log" | |
} | |
} | |
# PUBLIC # | |
# Combine multiple recalibration tables from scattered BaseRecalibrator runs | |
task GatherBqsrReports { | |
Array[File] input_bqsr_reports | |
String output_report_filename | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx3000m -jar /usr/gitc/GATK4.jar \ | |
GatherBQSRReports \ | |
-I ${sep=' -I ' input_bqsr_reports} \ | |
-O ${output_report_filename} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3500 MB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_bqsr_report = "${output_report_filename}" | |
} | |
} | |
# PUBLIC # | |
# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs | |
task GatherBamFiles { | |
Array[File] input_bams | |
File input_unmapped_reads_bam | |
String output_bam_basename | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx2000m -jar /usr/gitc/picard.jar \ | |
GatherBamFiles \ | |
INPUT=${sep=' INPUT=' input_bams} \ | |
INPUT=${input_unmapped_reads_bam} \ | |
OUTPUT=${output_bam_basename}.bam \ | |
CREATE_INDEX=true \ | |
CREATE_MD5_FILE=true | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_bam = "${output_bam_basename}.bam" | |
File output_bam_index = "${output_bam_basename}.bai" | |
File output_bam_md5 = "${output_bam_basename}.bam.md5" | |
} | |
} | |
# PRIVATE # | |
task ValidateSamFile { | |
File input_bam | |
File? input_bam_index | |
String report_filename | |
File? ref_dict | |
File? ref_fasta | |
File? ref_fasta_index | |
Int? max_output | |
Array[String]? ignore | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx4000m -jar /usr/gitc/picard.jar \ | |
ValidateSamFile \ | |
INPUT=${input_bam} \ | |
OUTPUT=${report_filename} \ | |
${"REFERENCE_SEQUENCE=" + ref_fasta} \ | |
${"MAX_OUTPUT=" + max_output} \ | |
IGNORE=${default="null" sep=" IGNORE=" ignore} \ | |
MODE=VERBOSE \ | |
IS_BISULFITE_SEQUENCED=false | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "7 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File report = "${report_filename}" | |
} | |
} | |
# PRIVATE # | |
# TODO - CollectWgsMetrics and CollectRawWgsMetrics can be combined by a locus-iterating collector | |
task CollectWgsMetrics { | |
File input_bam | |
File input_bam_index | |
String metrics_filename | |
File wgs_coverage_interval_list | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
command { | |
java -Xmx2000m -jar /usr/gitc/picard.jar \ | |
CollectWgsMetrics \ | |
INPUT=${input_bam} \ | |
VALIDATION_STRINGENCY=SILENT \ | |
REFERENCE_SEQUENCE=${ref_fasta} \ | |
INTERVALS=${wgs_coverage_interval_list} \ | |
OUTPUT=${metrics_filename} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File metrics = "${metrics_filename}" | |
} | |
} | |
# PRIVATE # | |
task CollectRawWgsMetrics { | |
File input_bam | |
File input_bam_index | |
String metrics_filename | |
File wgs_coverage_interval_list | |
File ref_fasta | |
File ref_fasta_index | |
Int disk_size | |
command { | |
java -Xmx2000m -jar /usr/gitc/picard.jar \ | |
CollectRawWgsMetrics \ | |
INPUT=${input_bam} \ | |
VALIDATION_STRINGENCY=SILENT \ | |
REFERENCE_SEQUENCE=${ref_fasta} \ | |
INTERVALS=${wgs_coverage_interval_list} \ | |
OUTPUT=${metrics_filename} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
} | |
output { | |
File metrics = "${metrics_filename}" | |
} | |
} | |
# PRIVATE # | |
task CalculateReadGroupChecksum { | |
File input_bam | |
File input_bam_index | |
String read_group_md5_filename | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx1000m -jar /usr/gitc/picard.jar \ | |
CalculateReadGroupChecksum \ | |
INPUT=${input_bam} \ | |
OUTPUT=${read_group_md5_filename} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "2 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File md5_file = "${read_group_md5_filename}" | |
} | |
} | |
# PRIVATE # | |
# Notes on the contamination estimate: | |
# The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId | |
# | |
# In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM | |
# | |
# Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER | |
# But first, it is divided by an underestimation factor thusly: | |
# float(FREEMIX) / ContaminationUnderestimationFactor | |
# where the denominator is hardcoded in Zamboni: | |
# val ContaminationUnderestimationFactor = 0.75f | |
# | |
# Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted | |
# contamination estimate for use in variant calling | |
task CheckContamination { | |
File input_bam | |
File input_bam_index | |
File contamination_sites_vcf | |
File contamination_sites_vcf_index | |
String output_prefix | |
Int disk_size | |
Int preemptible_tries | |
# Having to do this as a 2-step command in heredoc syntax, adding a python step to read the metrics | |
# This is a hack until read_object() is supported by Cromwell. | |
# It relies on knowing that there is only one data row in the 2-row selfSM TSV file | |
# Piping output of verifyBamId to /dev/null so only stdout is from the python command | |
command <<< | |
/usr/gitc/verifyBamID \ | |
--verbose \ | |
--ignoreRG \ | |
--vcf ${contamination_sites_vcf} \ | |
--out ${output_prefix} \ | |
--bam ${input_bam} \ | |
1>/dev/null && | |
python3 <<CODE | |
import csv | |
import sys | |
with open('${output_prefix}.selfSM') as selfSM: | |
reader = csv.DictReader(selfSM, delimiter='\t') | |
i = 0 | |
for row in reader: | |
if 0 and float(row["FREELK0"])==0 and float(row["FREELK1"])==0: | |
# a zero value for the likelihoods implies no data. This usually indicates a problem rather than a real event. | |
# if the bam isn't really empty, this is probably due to the use of a incompatible reference build between | |
# vcf and bam. | |
sys.stderr.write("Found zero likelihoods. Bam is either very-very shallow, or aligned to the wrong reference (relative to the vcf).") | |
sys.exit(1) | |
print(float(row["FREEMIX"])/0.75) | |
i = i + 1 | |
# there should be exactly one row, and if this isn't the case the format of the output is unexpectedly different | |
# and the results are not reliable. | |
if i != 1: | |
sys.stderr.write("Found %d rows in .selfSM file. Was expecting exactly 1. This is an error"%(i)) | |
sys.exit(2) | |
CODE | |
>>> | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "2 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File selfSM = "${output_prefix}.selfSM" | |
File depthSM = "${output_prefix}.depthSM" | |
File log = "${output_prefix}.log" | |
# I would like to do the following, however: | |
# The object is read as a string | |
# explicit string->float coercion via float(), as shown below, is supported by Cromwell | |
# the interim value cannot be stored as a string and then assigned to a float. Variables intialized in output cannot be dereferenced in output. | |
# Float contamination = float(read_object(${output_prefix} + ".selfSM").FREEMIX) / 0.75 | |
# In the interim, get the value from the python hack above: | |
Float contamination = read_float(stdout()) | |
} | |
} | |
# PUBLIC # | |
# Call variants on a single sample with HaplotypeCaller to produce a GVCF | |
task HaplotypeCaller { | |
File input_bam | |
File input_bam_index | |
File interval_list | |
String gvcf_basename | |
File ref_dict | |
File ref_fasta | |
File ref_fasta_index | |
Float? contamination | |
Int disk_size | |
Int preemptible_tries | |
# tried to find lowest memory variable where it would still work, might change once tested on JES | |
command { | |
java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \ | |
-jar /usr/gitc/GATK35.jar \ | |
-T HaplotypeCaller \ | |
-R ${ref_fasta} \ | |
-o ${gvcf_basename}.vcf.gz \ | |
-I ${input_bam} \ | |
-L ${interval_list} \ | |
-ERC GVCF \ | |
--max_alternate_alleles 3 \ | |
-variant_index_parameter 128000 \ | |
-variant_index_type LINEAR \ | |
-contamination ${default=0 contamination} \ | |
--read_filter OverclippedRead | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "10 GB" | |
cpu: "1" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_gvcf = "${gvcf_basename}.vcf.gz" | |
File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi" | |
} | |
} | |
# PUBLIC # | |
# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs | |
task MergeVCFs { | |
Array[File] input_vcfs | |
Array[File] input_vcfs_indexes | |
String output_vcf_name | |
Int disk_size | |
Int preemptible_tries | |
# using MergeVcfs instead of GatherVcfs so we can create indices | |
# WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs. | |
command { | |
java -Xmx2g -jar /usr/gitc/picard.jar \ | |
MergeVcfs \ | |
INPUT=${sep=' INPUT=' input_vcfs} \ | |
OUTPUT=${output_vcf_name} | |
} | |
output { | |
File output_vcf = "${output_vcf_name}" | |
File output_vcf_index = "${output_vcf_name}.tbi" | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
} | |
# PRIVATE # | |
task ValidateGVCF { | |
File input_vcf | |
File input_vcf_index | |
File ref_fasta | |
File ref_fasta_index | |
File ref_dict | |
File dbSNP_vcf | |
File dbSNP_vcf_index | |
File wgs_calling_interval_list | |
Int disk_size | |
Int preemptible_tries | |
command { | |
java -Xmx8g -jar /usr/gitc/GATK36.jar \ | |
-T ValidateVariants \ | |
-V ${input_vcf} \ | |
-R ${ref_fasta} \ | |
-L ${wgs_calling_interval_list} \ | |
-gvcf \ | |
--validationTypeToExclude ALLELES \ | |
--reference_window_stop 208 -U \ | |
--dbsnp ${dbSNP_vcf} | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "10 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
} | |
# PRIVATE # | |
task CollectGvcfCallingMetrics { | |
File input_vcf | |
File input_vcf_index | |
String metrics_basename | |
File dbSNP_vcf | |
File dbSNP_vcf_index | |
File ref_dict | |
Int disk_size | |
File wgs_evaluation_interval_list | |
Int preemptible_tries | |
command { | |
java -Xmx2000m -jar /usr/gitc/picard.jar \ | |
CollectVariantCallingMetrics \ | |
INPUT=${input_vcf} \ | |
OUTPUT=${metrics_basename} \ | |
DBSNP=${dbSNP_vcf} \ | |
SEQUENCE_DICTIONARY=${ref_dict} \ | |
TARGET_INTERVALS=${wgs_evaluation_interval_list} \ | |
GVCF_INPUT=true | |
} | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File summary_metrics = "${metrics_basename}.variant_calling_summary_metrics" | |
File detail_metrics = "${metrics_basename}.variant_calling_detail_metrics" | |
} | |
} | |
# PUBLIC # | |
# Convert BAM file to CRAM format | |
task ConvertToCram { | |
File input_bam | |
File ref_fasta | |
File ref_fasta_index | |
String output_basename | |
Int disk_size | |
Int preemptible_tries | |
command <<< | |
samtools view -C -T ${ref_fasta} ${input_bam} | \ | |
tee ${output_basename}.cram | \ | |
md5sum > ${output_basename}.cram.md5 | |
# Create REF_CACHE. Used when indexing a CRAM | |
seq_cache_populate.pl -root ./ref/cache ${ref_fasta} | |
export REF_PATH=: | |
export REF_CACHE=./ref/cache/%2s/%2s/%s | |
samtools index ${output_basename}.cram | |
mv ${output_basename}.cram.crai ${output_basename}.crai | |
>>> | |
runtime { | |
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282" | |
memory: "3 GB" | |
cpu: "1" | |
disks: "local-disk " + disk_size + " HDD" | |
preemptible: preemptible_tries | |
} | |
output { | |
File output_cram = "${output_basename}.cram" | |
File output_cram_index = "${output_basename}.crai" | |
File output_cram_md5 = "${output_basename}.cram.md5" | |
} | |
} | |
# PUBLIC # | |
# WORKFLOW DEFINITION | |
workflow PairedEndSingleSampleWorkflow { | |
# PRIVATE # | |
File contamination_sites_vcf | |
File contamination_sites_vcf_index | |
File fingerprint_genotypes_file # if this file is empty (0-length) the workflow should not do fingerprint comparison (as there are no fingerprints for the sample) | |
File haplotype_database_file | |
File wgs_evaluation_interval_list | |
File wgs_coverage_interval_list | |
# PUBLIC # | |
String sample_name | |
String final_gvcf_name | |
Array[File] flowcell_unmapped_bams | |
String unmapped_bam_suffix | |
Array[File] scattered_calling_intervals | |
File wgs_calling_interval_list | |
File ref_fasta | |
File ref_fasta_index | |
File ref_dict | |
File ref_alt | |
File ref_bwt | |
File ref_sa | |
File ref_amb | |
File ref_ann | |
File ref_pac | |
File dbSNP_vcf | |
File dbSNP_vcf_index | |
Array[File] known_indels_sites_VCFs | |
Array[File] known_indels_sites_indices | |
Int flowcell_small_disk | |
Int flowcell_medium_disk | |
Int agg_small_disk | |
Int agg_medium_disk | |
Int agg_large_disk | |
Int preemptible_tries | |
Int agg_preemptible_tries | |
String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta" | |
String recalibrated_bam_basename = sample_name + ".aligned.duplicates_marked.recalibrated" | |
# PUBLIC # | |
# Get the version of BWA to include in the PG record in the header of the BAM produced | |
# by MergeBamAlignment. | |
call GetBwaVersion | |
# PUBLIC # | |
# Align flowcell-level unmapped input bams in parallel | |
scatter (unmapped_bam in flowcell_unmapped_bams) { | |
# Because of a wdl/cromwell bug this is not currently valid so we have to sub(sub()) in each task | |
# String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "") | |
String sub_strip_path = "gs://.*/" | |
String sub_strip_unmapped = unmapped_bam_suffix + "$" | |
# PRIVATE # | |
call CollectQualityYieldMetrics { | |
input: | |
input_bam = unmapped_bam, | |
metrics_filename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmapped.quality_yield_metrics", | |
disk_size = flowcell_small_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PUBLIC # | |
# Map reads to reference | |
call SamToFastqAndBwaMem { | |
input: | |
input_bam = unmapped_bam, | |
bwa_commandline = bwa_commandline, | |
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged", | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
ref_dict = ref_dict, | |
ref_alt = ref_alt, | |
ref_bwt = ref_bwt, | |
ref_amb = ref_amb, | |
ref_ann = ref_ann, | |
ref_pac = ref_pac, | |
ref_sa = ref_sa, | |
disk_size = flowcell_medium_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PUBLIC # | |
# Merge original uBAM and BWA-aligned BAM | |
call MergeBamAlignment { | |
input: | |
unmapped_bam = unmapped_bam, | |
bwa_commandline = bwa_commandline, | |
bwa_version = GetBwaVersion.version, | |
aligned_bam = SamToFastqAndBwaMem.output_bam, | |
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted", | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
ref_dict = ref_dict, | |
disk_size = flowcell_medium_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PUBLIC # | |
# Sort and fix tags in the merged BAM | |
call SortAndFixTags as SortAndFixReadGroupBam { | |
input: | |
input_bam = MergeBamAlignment.output_bam, | |
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".sorted", | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = flowcell_medium_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PRIVATE # | |
# called to help in finding problems early. | |
# if too time consuming and not helpful, can be removed. | |
call ValidateSamFile as ValidateReadGroupSamFile { | |
input: | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
ref_dict = ref_dict, | |
input_bam = SortAndFixReadGroupBam.output_bam, | |
report_filename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".validation_report", | |
disk_size = flowcell_medium_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PRIVATE # | |
# no reference as the input here is unsorted, providing a reference would cause an error | |
call CollectUnsortedReadgroupBamQualityMetrics { | |
input: | |
input_bam = MergeBamAlignment.output_bam, | |
output_bam_prefix = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".readgroup", | |
disk_size = flowcell_medium_disk | |
} | |
# PUBLIC # | |
} | |
# PUBLIC # | |
# Aggregate aligned+merged flowcell BAM files and mark duplicates | |
call MarkDuplicates { | |
input: | |
input_bams = MergeBamAlignment.output_bam, | |
output_bam_basename = sample_name + ".aligned.unsorted.duplicates_marked", | |
metrics_filename = sample_name + ".duplicate_metrics", | |
disk_size = agg_large_disk | |
} | |
# PUBLIC # | |
# Sort aggregated+deduped BAM file and fix tags | |
call SortAndFixTags as SortAndFixSampleBam { | |
input: | |
input_bam = MarkDuplicates.output_bam, | |
output_bam_basename = sample_name + ".aligned.duplicate_marked.sorted", | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_large_disk, | |
preemptible_tries = 0 | |
} | |
# PRIVATE # | |
call CrossCheckFingerprints { | |
input: | |
input_bams = SortAndFixSampleBam.output_bam, | |
input_bam_indexes = SortAndFixSampleBam.output_bam_index, | |
haplotype_database_file = haplotype_database_file, | |
metrics_filename = sample_name + ".crosscheck", | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Create list of sequences for scatter-gather parallelization | |
call CreateSequenceGroupingTSV { | |
input: | |
ref_dict = ref_dict, | |
preemptible_tries = preemptible_tries | |
} | |
# PRIVATE # | |
call CheckContamination as PreBqsrCheckContamination { | |
input: | |
input_bam = SortAndFixSampleBam.output_bam, | |
input_bam_index = SortAndFixSampleBam.output_bam_index, | |
contamination_sites_vcf = contamination_sites_vcf, | |
contamination_sites_vcf_index = contamination_sites_vcf_index, | |
output_prefix = sample_name + ".preBqsr", | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel | |
scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) { | |
# Generate the recalibration model by interval | |
call BaseRecalibrator { | |
input: | |
input_bam = SortAndFixSampleBam.output_bam, | |
input_bam_index = SortAndFixSampleBam.output_bam_index, | |
recalibration_report_filename = sample_name + ".recal_data.csv", | |
sequence_group_interval = subgroup, | |
dbSNP_vcf = dbSNP_vcf, | |
dbSNP_vcf_index = dbSNP_vcf_index, | |
known_indels_sites_VCFs = known_indels_sites_VCFs, | |
known_indels_sites_indices = known_indels_sites_indices, | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Apply the recalibration model by interval | |
call ApplyBQSR { | |
input: | |
input_bam = SortAndFixSampleBam.output_bam, | |
input_bam_index = SortAndFixSampleBam.output_bam_index, | |
output_bam_basename = recalibrated_bam_basename, | |
recalibration_report = GatherBqsrReports.output_bqsr_report, | |
sequence_group_interval = subgroup, | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
} | |
# PUBLIC # | |
# Merge the recalibration reports resulting from by-interval recalibration | |
call GatherBqsrReports { | |
input: | |
input_bqsr_reports = BaseRecalibrator.recalibration_report, | |
output_report_filename = sample_name + ".recal_data.csv", | |
disk_size = flowcell_small_disk, | |
preemptible_tries = preemptible_tries | |
} | |
# PUBLIC # | |
# Do an additional round of recalibration on the unmapped reads (which would otherwise | |
# be left behind because they're not accounted for in the scatter intervals). This is | |
# done by running ApplyBQSR with "-L unmapped". | |
Array[String] unmapped_group_interval = ["unmapped"] | |
call ApplyBQSR as ApplyBQSRToUnmappedReads { | |
input: | |
input_bam = SortAndFixSampleBam.output_bam, | |
input_bam_index = SortAndFixSampleBam.output_bam_index, | |
output_bam_basename = recalibrated_bam_basename, | |
recalibration_report = GatherBqsrReports.output_bqsr_report, | |
sequence_group_interval = unmapped_group_interval, | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Merge the recalibrated BAM files resulting from by-interval recalibration | |
# TODO: when we have capability of adding elements to arrays, can just have one array | |
# as an input and add the output of the above task to the scattered printreads bams | |
call GatherBamFiles { | |
input: | |
input_bams = ApplyBQSR.recalibrated_bam, | |
input_unmapped_reads_bam = ApplyBQSRToUnmappedReads.recalibrated_bam, | |
output_bam_basename = sample_name, | |
disk_size = agg_large_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call CollectReadgroupBamQualityMetrics { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
output_bam_prefix = sample_name + ".readgroup", | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk | |
} | |
# PRIVATE # | |
call ValidateSamFile as ValidateAggregatedSamFile { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
report_filename = sample_name + ".validation_report", | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call CollectAggregationMetrics { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
output_bam_prefix = sample_name, | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk | |
} | |
# PRIVATE # | |
call CheckFingerprint { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
haplotype_database_file = haplotype_database_file, | |
genotypes = fingerprint_genotypes_file, | |
sample = sample_name, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call CollectWgsMetrics { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
metrics_filename = sample_name + ".wgs_metrics", | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
wgs_coverage_interval_list = wgs_coverage_interval_list, | |
disk_size = agg_small_disk | |
} | |
# PRIVATE # | |
call CollectRawWgsMetrics { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
metrics_filename = sample_name + ".raw_wgs_metrics", | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
wgs_coverage_interval_list = wgs_coverage_interval_list, | |
disk_size = agg_small_disk | |
} | |
# PRIVATE # | |
call CalculateReadGroupChecksum { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5", | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Convert the final merged recalibrated BAM file to CRAM format | |
call ConvertToCram { | |
input: | |
input_bam = GatherBamFiles.output_bam, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
output_basename = sample_name, | |
disk_size = agg_medium_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call ValidateSamFile as ValidateCram { | |
input: | |
input_bam = ConvertToCram.output_cram, | |
input_bam_index = ConvertToCram.output_cram_index, | |
report_filename = sample_name + ".cram.validation_report", | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
ignore = ["MISSING_TAG_NM"], | |
max_output = 1000000000, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# Call variants in parallel over WGS calling intervals | |
scatter (subInterval in scattered_calling_intervals) { | |
# PUBLIC # | |
# Generate GVCF by interval | |
call HaplotypeCaller { | |
input: | |
# PRIVATE # | |
contamination = PreBqsrCheckContamination.contamination, | |
# PUBLIC # | |
input_bam = GatherBamFiles.output_bam, | |
input_bam_index = GatherBamFiles.output_bam_index, | |
interval_list = subInterval, | |
gvcf_basename = sample_name, | |
ref_dict = ref_dict, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
} | |
# PUBLIC # | |
# Combine by-interval GVCFs into a single sample GVCF file | |
call MergeVCFs { | |
input: | |
input_vcfs = HaplotypeCaller.output_gvcf, | |
input_vcfs_indexes = HaplotypeCaller.output_gvcf_index, | |
output_vcf_name = final_gvcf_name, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call ValidateGVCF { | |
input: | |
input_vcf = MergeVCFs.output_vcf, | |
input_vcf_index = MergeVCFs.output_vcf_index, | |
dbSNP_vcf = dbSNP_vcf, | |
dbSNP_vcf_index = dbSNP_vcf_index, | |
ref_fasta = ref_fasta, | |
ref_fasta_index = ref_fasta_index, | |
ref_dict = ref_dict, | |
wgs_calling_interval_list = wgs_calling_interval_list, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PRIVATE # | |
call CollectGvcfCallingMetrics { | |
input: | |
input_vcf = MergeVCFs.output_vcf, | |
input_vcf_index = MergeVCFs.output_vcf_index, | |
metrics_basename = sample_name, | |
dbSNP_vcf = dbSNP_vcf, | |
dbSNP_vcf_index = dbSNP_vcf_index, | |
ref_dict = ref_dict, | |
wgs_evaluation_interval_list = wgs_evaluation_interval_list, | |
disk_size = agg_small_disk, | |
preemptible_tries = agg_preemptible_tries | |
} | |
# PUBLIC # | |
# NOTE TO COMMS: Outputs must be redacted manually pending bug fix!!! | |
# Outputs that will be retained when execution is complete | |
output { | |
CollectQualityYieldMetrics.* | |
ValidateReadGroupSamFile.* | |
CollectReadgroupBamQualityMetrics.* | |
CollectUnsortedReadgroupBamQualityMetrics.* | |
CrossCheckFingerprints.* | |
ValidateCram.* | |
CalculateReadGroupChecksum.* | |
ValidateAggregatedSamFile.* | |
CollectAggregationMetrics.* | |
CheckFingerprint.* | |
CollectWgsMetrics.* | |
CollectRawWgsMetrics.* | |
PreBqsrCheckContamination.* | |
CollectGvcfCallingMetrics.* | |
MarkDuplicates.duplicate_metrics | |
GatherBqsrReports.* | |
ConvertToCram.* | |
MergeVCFs.* | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment