## Copyright Broad Institute, 2016
##
## This WDL pipeline implements data pre-processing and initial variant calling (GVCF
## generation) according to the GATK Best Practices (June 2016) for germline SNP and
## Indel discovery in human whole-genome sequencing (WGS) data.
##
## Requirements/expectations :
## - Human whole-genome paired-end sequencing data in unmapped BAM (uBAM) format
## - One or more read groups, one per uBAM file, all belonging to a single sample (SM)
## - Input uBAM files must additionally comply with the following requirements:
## - - filenames all have the same suffix (we use ".unmapped.bam")
## - - files must pass validation by ValidateSamFile
## - - reads are provided in query-sorted order
## - - all reads must have an RG tag
## - Reference genome must be Hg38 with ALT contigs
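##
## For illustration only (not part of this pipeline): a uBAM satisfying the
## requirements above can be produced from paired FASTQs with Picard FastqToSam;
## the paths and read-group values below are hypothetical:
##
##   java -jar picard.jar FastqToSam \
##     FASTQ=sample_R1.fastq.gz \
##     FASTQ2=sample_R2.fastq.gz \
##     OUTPUT=sample.unmapped.bam \
##     READ_GROUP_NAME=H0164.2 \
##     SAMPLE_NAME=NA12878 \
##     LIBRARY_NAME=Solexa-272222 \
##     PLATFORM=illumina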
##
## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
## For program versions, see docker containers.
##
## LICENSING :
## This script is released under the WDL source code license (BSD-3) (see LICENSE in
## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
## be subject to different licenses. Users are responsible for checking that they are
## authorized to run all programs before running this script. Please see the docker
## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed
## licensing information pertaining to the included programs.
# PRIVATE #
# The internal production version of this WDL must have Private and Public block tags
# annotated to distinguish the content that can be made public versus what cannot.
# PUBLIC #
# TASK DEFINITIONS
# PRIVATE #
task CollectQualityYieldMetrics {
File input_bam
String metrics_filename
Int disk_size
Int preemptible_tries
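# disk_size is the local disk size in GB, consumed by the "disks" runtime attribute below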
command {
java -Xmx128m -jar /usr/gitc/picard.jar \
CollectQualityYieldMetrics \
INPUT=${input_bam} \
OQ=true \
OUTPUT=${metrics_filename}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
disks: "local-disk " + disk_size + " HDD"
memory: "2 GB"
preemptible: preemptible_tries
}
output {
File metrics = "${metrics_filename}"
}
}
# PUBLIC #
# Get version of BWA
task GetBwaVersion {
command {
/usr/gitc/bwa 2>&1 | \
grep -e '^Version' | \
sed 's/Version: //'
}
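# For example, bwa writes a line like "Version: 0.7.12-r1039" to stderr;
# the grep/sed above reduces it to "0.7.12-r1039"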
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "1 GB"
}
output {
String version = read_string(stdout())
}
}
# PUBLIC #
# Read unmapped BAM, convert on-the-fly to FASTQ and stream to BWA MEM for alignment
task SamToFastqAndBwaMem {
File input_bam
String bwa_commandline
String output_bam_basename
File ref_fasta
File ref_fasta_index
File ref_dict
# This is the .alt file from bwa-kit (https://github.com/lh3/bwa/tree/master/bwakit),
# listing the reference contigs that are "alternative".
File ref_alt
File ref_amb
File ref_ann
File ref_bwt
File ref_pac
File ref_sa
Int disk_size
Int preemptible_tries
command <<<
set -o pipefail
# set the bash variable needed for the command-line
bash_ref_fasta=${ref_fasta}
# if ref_alt has data in it,
if [ -s ${ref_alt} ]; then
java -Xmx3000m -jar /usr/gitc/picard.jar \
SamToFastq \
INPUT=${input_bam} \
FASTQ=/dev/stdout \
INTERLEAVE=true \
NON_PF=true | \
/usr/gitc/${bwa_commandline} /dev/stdin - 2> >(tee ${output_bam_basename}.bwa.stderr.log >&2) | \
samtools view -1 - > ${output_bam_basename}.bam && \
grep -m1 "read .* ALT contigs" ${output_bam_basename}.bwa.stderr.log | \
grep -v "read 0 ALT contigs"
# else ref_alt is empty or could not be found
else
exit 1;
fi
>>>
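# For reference, with the workflow's default bwa_commandline
# ("bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta"), the aligner stage of
# the pipe above expands to roughly:
#   /usr/gitc/bwa mem -K 100000000 -p -v 3 -t 16 <ref.fasta> /dev/stdin -
# reading the interleaved FASTQ that SamToFastq streams to stdout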
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "14 GB"
cpu: "16"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_bam = "${output_bam_basename}.bam"
File bwa_stderr_log = "${output_bam_basename}.bwa.stderr.log"
}
}
# PUBLIC #
# Merge original input uBAM file with BWA-aligned BAM file
task MergeBamAlignment {
File unmapped_bam
String bwa_commandline
String bwa_version
File aligned_bam
String output_bam_basename
File ref_fasta
File ref_fasta_index
File ref_dict
Int disk_size
Int preemptible_tries
command {
# set the bash variable needed for the command-line
bash_ref_fasta=${ref_fasta}
java -Xmx3000m -jar /usr/gitc/picard.jar \
MergeBamAlignment \
VALIDATION_STRINGENCY=SILENT \
EXPECTED_ORIENTATIONS=FR \
ATTRIBUTES_TO_RETAIN=X0 \
ALIGNED_BAM=${aligned_bam} \
UNMAPPED_BAM=${unmapped_bam} \
OUTPUT=${output_bam_basename}.bam \
REFERENCE_SEQUENCE=${ref_fasta} \
PAIRED_RUN=true \
SORT_ORDER="unsorted" \
IS_BISULFITE_SEQUENCE=false \
ALIGNED_READS_ONLY=false \
CLIP_ADAPTERS=false \
MAX_RECORDS_IN_RAM=2000000 \
ADD_MATE_CIGAR=true \
MAX_INSERTIONS_OR_DELETIONS=-1 \
PRIMARY_ALIGNMENT_STRATEGY=MostDistant \
PROGRAM_RECORD_ID="bwamem" \
PROGRAM_GROUP_VERSION="${bwa_version}" \
PROGRAM_GROUP_COMMAND_LINE="${bwa_commandline}" \
PROGRAM_GROUP_NAME="bwamem" \
UNMAP_CONTAMINANT_READS=true
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3500 MB"
cpu: "1"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_bam = "${output_bam_basename}.bam"
}
}
# PUBLIC #
# Sort BAM file by coordinate order and fix tag values for NM and UQ
task SortAndFixTags {
File input_bam
String output_bam_basename
File ref_dict
File ref_fasta
File ref_fasta_index
Int disk_size
Int preemptible_tries
command {
java -Xmx4000m -jar /usr/gitc/picard.jar \
SortSam \
INPUT=${input_bam} \
OUTPUT=/dev/stdout \
SORT_ORDER="coordinate" \
CREATE_INDEX=false \
CREATE_MD5_FILE=false | \
java -Xmx500m -jar /usr/gitc/picard.jar \
SetNmAndUqTags \
INPUT=/dev/stdin \
OUTPUT=${output_bam_basename}.bam \
CREATE_INDEX=true \
CREATE_MD5_FILE=true \
REFERENCE_SEQUENCE=${ref_fasta}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
disks: "local-disk " + disk_size + " HDD"
cpu: "1"
memory: "5000 MB"
preemptible: preemptible_tries
}
output {
File output_bam = "${output_bam_basename}.bam"
File output_bam_index = "${output_bam_basename}.bai"
File output_bam_md5 = "${output_bam_basename}.bam.md5"
}
}
# PRIVATE #
# The tasks CollectUnsortedReadgroupBamQualityMetrics, CollectReadgroupBamQualityMetrics,
# and CollectAggregationMetrics are all variations of CollectMultipleMetrics.
# They have been separated because we cannot glob their outputs.
# When glob works again, they can be merged back into one task.
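# A rough sketch (untested) of what the merged task's output section could look
# like once glob works; the task name below is illustrative:
#
#   task CollectMultipleMetricsAll {
#     ...same inputs and CollectMultipleMetrics command as in the tasks below...
#     output {
#       Array[File] metrics = glob("${output_bam_prefix}.*")
#     }
#   }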
# PRIVATE #
task CollectUnsortedReadgroupBamQualityMetrics {
File input_bam
String output_bam_prefix
Int disk_size
command {
java -Xmx5000m -jar /usr/gitc/picard.jar \
CollectMultipleMetrics \
INPUT=${input_bam} \
OUTPUT=${output_bam_prefix} \
ASSUME_SORTED=true \
PROGRAM="null" \
PROGRAM="CollectBaseDistributionByCycle" \
PROGRAM="CollectInsertSizeMetrics" \
PROGRAM="MeanQualityByCycle" \
PROGRAM="QualityScoreDistribution" \
METRIC_ACCUMULATION_LEVEL="null" \
METRIC_ACCUMULATION_LEVEL="ALL_READS"
touch ${output_bam_prefix}.insert_size_metrics
touch ${output_bam_prefix}.insert_size_histogram.pdf
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "7 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File base_distribution_by_cycle_pdf = "${output_bam_prefix}.base_distribution_by_cycle.pdf"
File base_distribution_by_cycle_metrics = "${output_bam_prefix}.base_distribution_by_cycle_metrics"
File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf"
File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics"
File quality_by_cycle_pdf = "${output_bam_prefix}.quality_by_cycle.pdf"
File quality_by_cycle_metrics = "${output_bam_prefix}.quality_by_cycle_metrics"
File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf"
File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics"
}
}
# PRIVATE #
task CollectReadgroupBamQualityMetrics {
File input_bam
File input_bam_index
String output_bam_prefix
File ref_dict
File ref_fasta
File ref_fasta_index
Int disk_size
command {
java -Xmx5000m -jar /usr/gitc/picard.jar \
CollectMultipleMetrics \
INPUT=${input_bam} \
REFERENCE_SEQUENCE=${ref_fasta} \
OUTPUT=${output_bam_prefix} \
ASSUME_SORTED=true \
PROGRAM="null" \
PROGRAM="CollectAlignmentSummaryMetrics" \
PROGRAM="CollectGcBiasMetrics" \
METRIC_ACCUMULATION_LEVEL="null" \
METRIC_ACCUMULATION_LEVEL="READ_GROUP"
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "7 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics"
File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics"
File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf"
File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics"
}
}
# PRIVATE #
task CollectAggregationMetrics {
File input_bam
File input_bam_index
String output_bam_prefix
File ref_dict
File ref_fasta
File ref_fasta_index
Int disk_size
command {
java -Xmx5000m -jar /usr/gitc/picard.jar \
CollectMultipleMetrics \
INPUT=${input_bam} \
REFERENCE_SEQUENCE=${ref_fasta} \
OUTPUT=${output_bam_prefix} \
ASSUME_SORTED=true \
PROGRAM="null" \
PROGRAM="CollectAlignmentSummaryMetrics" \
PROGRAM="CollectInsertSizeMetrics" \
PROGRAM="CollectSequencingArtifactMetrics" \
PROGRAM="CollectGcBiasMetrics" \
PROGRAM="QualityScoreDistribution" \
METRIC_ACCUMULATION_LEVEL="null" \
METRIC_ACCUMULATION_LEVEL="SAMPLE" \
METRIC_ACCUMULATION_LEVEL="LIBRARY"
touch ${output_bam_prefix}.insert_size_metrics
touch ${output_bam_prefix}.insert_size_histogram.pdf
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "7 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File alignment_summary_metrics = "${output_bam_prefix}.alignment_summary_metrics"
File bait_bias_detail_metrics = "${output_bam_prefix}.bait_bias_detail_metrics"
File bait_bias_summary_metrics = "${output_bam_prefix}.bait_bias_summary_metrics"
File gc_bias_detail_metrics = "${output_bam_prefix}.gc_bias.detail_metrics"
File gc_bias_pdf = "${output_bam_prefix}.gc_bias.pdf"
File gc_bias_summary_metrics = "${output_bam_prefix}.gc_bias.summary_metrics"
File insert_size_histogram_pdf = "${output_bam_prefix}.insert_size_histogram.pdf"
File insert_size_metrics = "${output_bam_prefix}.insert_size_metrics"
File pre_adapter_detail_metrics = "${output_bam_prefix}.pre_adapter_detail_metrics"
File pre_adapter_summary_metrics = "${output_bam_prefix}.pre_adapter_summary_metrics"
File quality_distribution_pdf = "${output_bam_prefix}.quality_distribution.pdf"
File quality_distribution_metrics = "${output_bam_prefix}.quality_distribution_metrics"
}
}
# PRIVATE #
task CrossCheckFingerprints {
Array[File] input_bams
Array[File] input_bam_indexes
File haplotype_database_file # if this file is empty (0-length) the workflow should not do fingerprint comparison (as there are no fingerprints for the sample)
String metrics_filename
Int disk_size
Int preemptible_tries
command <<<
if [ -s ${haplotype_database_file} ]; then
java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx2000m \
-jar /usr/gitc/picard.jar \
CrosscheckReadGroupFingerprints \
OUTPUT=${metrics_filename} \
HAPLOTYPE_MAP=${haplotype_database_file} \
EXPECT_ALL_READ_GROUPS_TO_MATCH=true \
INPUT=${sep=' INPUT=' input_bams} \
LOD_THRESHOLD=-20.0
else
echo "No haplotype_database_file. Skipping Fingerprint check"
touch ${metrics_filename}
fi
>>>
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "2 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File metrics = "${metrics_filename}"
}
}
# PRIVATE #
task CheckFingerprint {
File input_bam
File input_bam_index
File haplotype_database_file
File genotypes
String sample
Int disk_size
Int preemptible_tries
command <<<
if [ -s ${genotypes} ]; then
java -Dsamjdk.buffer_size=131072 -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx1024m \
-jar /usr/gitc/picard.jar \
CheckFingerprint \
INPUT=${input_bam} \
OUTPUT=${sample} \
GENOTYPES=${genotypes} \
HAPLOTYPE_MAP=${haplotype_database_file} \
SAMPLE_ALIAS=${sample} \
IGNORE_READ_GROUPS=true
else
echo "No fingerprint found. Skipping Fingerprint check"
# We touch the outputs here to create 0-length files; otherwise the task would fail because the expected outputs are not found
touch ${sample}.fingerprinting_summary_metrics
touch ${sample}.fingerprinting_detail_metrics
fi
>>>
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "1 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File summary_metrics = "${sample}.fingerprinting_summary_metrics"
File detail_metrics = "${sample}.fingerprinting_detail_metrics"
}
}
# PUBLIC #
# Mark duplicate reads to avoid counting non-independent observations
task MarkDuplicates {
Array[File] input_bams
String output_bam_basename
String metrics_filename
Int disk_size
# This task assumes query-sorted input so that secondary and supplementary reads get marked correctly.
# This works because the output of BWA is query-grouped, and thus so is the output of MergeBamAlignment.
# While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname".
command {
java -Xmx4000m -jar /usr/gitc/picard.jar \
MarkDuplicates \
INPUT=${sep=' INPUT=' input_bams} \
OUTPUT=${output_bam_basename}.bam \
METRICS_FILE=${metrics_filename} \
VALIDATION_STRINGENCY=SILENT \
OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \
ASSUME_SORT_ORDER="queryname" \
CREATE_MD5_FILE=true
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "7 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File output_bam = "${output_bam_basename}.bam"
File duplicate_metrics = "${metrics_filename}"
}
}
# PUBLIC #
# Generate sets of intervals for scatter-gathering over chromosomes
task CreateSequenceGroupingTSV {
File ref_dict
Int preemptible_tries
# Use python to create the sequence groupings used to scatter the BQSR and PrintReads steps.
# The script writes to stdout, which is parsed into a WDL Array[Array[String]],
# e.g. [["1"], ["2"], ["3", "4"], ["5"], ["6", "7", "8"]]
command <<<
python <<CODE
with open("${ref_dict}", "r") as ref_dict_file:
sequence_tuple_list = []
longest_sequence = 0
for line in ref_dict_file:
if line.startswith("@SQ"):
line_split = line.split("\t")
# (Sequence_Name, Sequence_Length)
sequence_tuple_list.append((line_split[1].split("SN:")[1], int(line_split[2].split("LN:")[1])))
longest_sequence = sorted(sequence_tuple_list, key=lambda x: x[1], reverse=True)[0][1]
# We are adding this to the intervals because hg38 has contigs named with embedded colons and a bug in GATK strips off
# the last element after a :, so we add this as a sacrificial element.
hg38_protection_tag = ":1+"
# initialize the tsv string with the first sequence
tsv_string = sequence_tuple_list[0][0] + hg38_protection_tag
temp_size = sequence_tuple_list[0][1]
for sequence_tuple in sequence_tuple_list[1:]:
if temp_size + sequence_tuple[1] <= longest_sequence:
temp_size += sequence_tuple[1]
tsv_string += "\t" + sequence_tuple[0] + hg38_protection_tag
else:
tsv_string += "\n" + sequence_tuple[0] + hg38_protection_tag
temp_size = sequence_tuple[1]
print tsv_string
CODE
>>>
runtime {
docker: "python:2.7"
memory: "2 GB"
preemptible: preemptible_tries
}
output {
Array[Array[String]] sequence_grouping = read_tsv(stdout())
}
}
# PUBLIC #
# Generate Base Quality Score Recalibration (BQSR) model
task BaseRecalibrator {
File input_bam
File input_bam_index
String recalibration_report_filename
Array[String] sequence_group_interval
File dbSNP_vcf
File dbSNP_vcf_index
Array[File] known_indels_sites_VCFs
Array[File] known_indels_sites_indices
File ref_dict
File ref_fasta
File ref_fasta_index
Int disk_size
Int preemptible_tries
command {
java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \
-XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \
-Xloggc:gc_log.log -Dsamjdk.use_async_io=false -Xmx4000m \
-jar /usr/gitc/GATK4.jar \
BaseRecalibrator \
-R ${ref_fasta} \
-I ${input_bam} \
--useOriginalQualities \
-O ${recalibration_report_filename} \
-knownSites ${dbSNP_vcf} \
-knownSites ${sep=" -knownSites " known_indels_sites_VCFs} \
-L ${sep=" -L " sequence_group_interval}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "6 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File recalibration_report = "${recalibration_report_filename}"
#this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
#File gc_logs = "gc_log.log"
}
}
# PUBLIC #
# Apply Base Quality Score Recalibration (BQSR) model
task ApplyBQSR {
File input_bam
File input_bam_index
String output_bam_basename
File recalibration_report
Array[String] sequence_group_interval
File ref_dict
File ref_fasta
File ref_fasta_index
Int disk_size
Int preemptible_tries
command {
java -XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \
-XX:+PrintGCDetails -Xloggc:gc_log.log -Dsamjdk.use_async_io=false \
-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx3000m \
-jar /usr/gitc/GATK4.jar \
ApplyBQSR \
--createOutputBamMD5 \
--addOutputSAMProgramRecord \
-R ${ref_fasta} \
-I ${input_bam} \
--useOriginalQualities \
-O ${output_bam_basename}.bam \
-bqsr ${recalibration_report} \
-SQQ 10 -SQQ 20 -SQQ 30 -SQQ 40 \
--emit_original_quals \
-L ${sep=" -L " sequence_group_interval}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3500 MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File recalibrated_bam = "${output_bam_basename}.bam"
File recalibrated_bam_checksum = "${output_bam_basename}.bam.md5"
#this output is only for GOTC STAGING to give some GC statistics to the GATK4 team
#File gc_logs = "gc_log.log"
}
}
# PUBLIC #
# Combine multiple recalibration tables from scattered BaseRecalibrator runs
task GatherBqsrReports {
Array[File] input_bqsr_reports
String output_report_filename
Int disk_size
Int preemptible_tries
command {
java -Xmx3000m -jar /usr/gitc/GATK4.jar \
GatherBQSRReports \
-I ${sep=' -I ' input_bqsr_reports} \
-O ${output_report_filename}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3500 MB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_bqsr_report = "${output_report_filename}"
}
}
# PUBLIC #
# Combine multiple recalibrated BAM files from scattered ApplyRecalibration runs
task GatherBamFiles {
Array[File] input_bams
File input_unmapped_reads_bam
String output_bam_basename
Int disk_size
Int preemptible_tries
command {
java -Xmx2000m -jar /usr/gitc/picard.jar \
GatherBamFiles \
INPUT=${sep=' INPUT=' input_bams} \
INPUT=${input_unmapped_reads_bam} \
OUTPUT=${output_bam_basename}.bam \
CREATE_INDEX=true \
CREATE_MD5_FILE=true
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_bam = "${output_bam_basename}.bam"
File output_bam_index = "${output_bam_basename}.bai"
File output_bam_md5 = "${output_bam_basename}.bam.md5"
}
}
# PRIVATE #
task ValidateSamFile {
File input_bam
File? input_bam_index
String report_filename
File? ref_dict
File? ref_fasta
File? ref_fasta_index
Int? max_output
Array[String]? ignore
Int disk_size
Int preemptible_tries
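# In the command below, the ${"KEY=" + optional_input} placeholders render as empty
# strings when the optional inputs are unset, so REFERENCE_SEQUENCE and MAX_OUTPUT
# are only passed when provided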
command {
java -Xmx4000m -jar /usr/gitc/picard.jar \
ValidateSamFile \
INPUT=${input_bam} \
OUTPUT=${report_filename} \
${"REFERENCE_SEQUENCE=" + ref_fasta} \
${"MAX_OUTPUT=" + max_output} \
IGNORE=${default="null" sep=" IGNORE=" ignore} \
MODE=VERBOSE \
IS_BISULFITE_SEQUENCED=false
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "7 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File report = "${report_filename}"
}
}
# PRIVATE #
# TODO - CollectWgsMetrics and CollectRawWgsMetrics can be combined by a locus-iterating collector
task CollectWgsMetrics {
File input_bam
File input_bam_index
String metrics_filename
File wgs_coverage_interval_list
File ref_fasta
File ref_fasta_index
Int disk_size
command {
java -Xmx2000m -jar /usr/gitc/picard.jar \
CollectWgsMetrics \
INPUT=${input_bam} \
VALIDATION_STRINGENCY=SILENT \
REFERENCE_SEQUENCE=${ref_fasta} \
INTERVALS=${wgs_coverage_interval_list} \
OUTPUT=${metrics_filename}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File metrics = "${metrics_filename}"
}
}
# PRIVATE #
task CollectRawWgsMetrics {
File input_bam
File input_bam_index
String metrics_filename
File wgs_coverage_interval_list
File ref_fasta
File ref_fasta_index
Int disk_size
command {
java -Xmx2000m -jar /usr/gitc/picard.jar \
CollectRawWgsMetrics \
INPUT=${input_bam} \
VALIDATION_STRINGENCY=SILENT \
REFERENCE_SEQUENCE=${ref_fasta} \
INTERVALS=${wgs_coverage_interval_list} \
OUTPUT=${metrics_filename}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
}
output {
File metrics = "${metrics_filename}"
}
}
# PRIVATE #
task CalculateReadGroupChecksum {
File input_bam
File input_bam_index
String read_group_md5_filename
Int disk_size
Int preemptible_tries
command {
java -Xmx1000m -jar /usr/gitc/picard.jar \
CalculateReadGroupChecksum \
INPUT=${input_bam} \
OUTPUT=${read_group_md5_filename}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "2 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File md5_file = "${read_group_md5_filename}"
}
}
# PRIVATE #
# Notes on the contamination estimate:
# The contamination value is read from the FREEMIX field of the selfSM file output by verifyBamId
#
# In Zamboni production, this value is stored directly in METRICS.AGGREGATION_CONTAM
#
# Contamination is also stored in GVCF_CALLING and thereby passed to HAPLOTYPE_CALLER
# But first, it is divided by an underestimation factor, as follows:
# float(FREEMIX) / ContaminationUnderestimationFactor
# where the denominator is hardcoded in Zamboni:
# val ContaminationUnderestimationFactor = 0.75f
#
# Here, I am handling this by returning both the original selfSM file for reporting, and the adjusted
# contamination estimate for use in variant calling
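# For example, a FREEMIX of 0.015 is reported as-is in the selfSM output but is
# passed to the variant caller as 0.015 / 0.75 = 0.02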
task CheckContamination {
File input_bam
File input_bam_index
File contamination_sites_vcf
File contamination_sites_vcf_index
String output_prefix
Int disk_size
Int preemptible_tries
# This has to be done as a 2-step command in heredoc syntax, with a python step added to read the metrics.
# It is a hack until read_object() is supported by Cromwell.
# It relies on knowing that there is only one data row in the 2-row selfSM TSV file.
# The stdout of verifyBamID is piped to /dev/null so that the task's stdout comes only from the python command.
command <<<
/usr/gitc/verifyBamID \
--verbose \
--ignoreRG \
--vcf ${contamination_sites_vcf} \
--out ${output_prefix} \
--bam ${input_bam} \
1>/dev/null &&
python3 <<CODE
import csv
import sys
with open('${output_prefix}.selfSM') as selfSM:
reader = csv.DictReader(selfSM, delimiter='\t')
i = 0
for row in reader:
if float(row["FREELK0"])==0 and float(row["FREELK1"])==0:
# a zero value for the likelihoods implies no data. This usually indicates a problem rather than a real event.
# if the bam isn't really empty, this is probably due to the use of an incompatible reference build between
# the vcf and the bam.
sys.stderr.write("Found zero likelihoods. Bam is either very shallow, or aligned to the wrong reference (relative to the vcf).")
sys.exit(1)
print(float(row["FREEMIX"])/0.75)
i = i + 1
# there should be exactly one row, and if this isn't the case the format of the output is unexpectedly different
# and the results are not reliable.
if i != 1:
sys.stderr.write("Found %d rows in .selfSM file. Was expecting exactly 1. This is an error"%(i))
sys.exit(2)
CODE
>>>
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "2 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File selfSM = "${output_prefix}.selfSM"
File depthSM = "${output_prefix}.depthSM"
File log = "${output_prefix}.log"
# I would like to do the following, however:
# - the object is read as a string
# - explicit string->float coercion via float(), as shown below, is supported by Cromwell
# - but the interim value cannot be stored as a string and then assigned to a float,
#   since variables initialized in output cannot be dereferenced in output:
# Float contamination = float(read_object(${output_prefix} + ".selfSM").FREEMIX) / 0.75
# In the interim, get the value from the python hack above:
Float contamination = read_float(stdout())
}
}
# PUBLIC #
# Call variants on a single sample with HaplotypeCaller to produce a GVCF
task HaplotypeCaller {
File input_bam
File input_bam_index
File interval_list
String gvcf_basename
File ref_dict
File ref_fasta
File ref_fasta_index
Float? contamination
Int disk_size
Int preemptible_tries
# Tried to find the lowest memory setting at which this would still work; might change once tested on JES
command {
java -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Xmx8000m \
-jar /usr/gitc/GATK35.jar \
-T HaplotypeCaller \
-R ${ref_fasta} \
-o ${gvcf_basename}.vcf.gz \
-I ${input_bam} \
-L ${interval_list} \
-ERC GVCF \
--max_alternate_alleles 3 \
-variant_index_parameter 128000 \
-variant_index_type LINEAR \
-contamination ${default=0 contamination} \
--read_filter OverclippedRead
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "10 GB"
cpu: "1"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_gvcf = "${gvcf_basename}.vcf.gz"
File output_gvcf_index = "${gvcf_basename}.vcf.gz.tbi"
}
}
# PUBLIC #
# Combine multiple VCFs or GVCFs from scattered HaplotypeCaller runs
task MergeVCFs {
Array[File] input_vcfs
Array[File] input_vcfs_indexes
String output_vcf_name
Int disk_size
Int preemptible_tries
# using MergeVcfs instead of GatherVcfs so we can create indices
# WARNING 2015-10-28 15:01:48 GatherVcfs Index creation not currently supported when gathering block compressed VCFs.
command {
java -Xmx2g -jar /usr/gitc/picard.jar \
MergeVcfs \
INPUT=${sep=' INPUT=' input_vcfs} \
OUTPUT=${output_vcf_name}
}
output {
File output_vcf = "${output_vcf_name}"
File output_vcf_index = "${output_vcf_name}.tbi"
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
}
# PRIVATE #
task ValidateGVCF {
File input_vcf
File input_vcf_index
File ref_fasta
File ref_fasta_index
File ref_dict
File dbSNP_vcf
File dbSNP_vcf_index
File wgs_calling_interval_list
Int disk_size
Int preemptible_tries
command {
java -Xmx8g -jar /usr/gitc/GATK36.jar \
-T ValidateVariants \
-V ${input_vcf} \
-R ${ref_fasta} \
-L ${wgs_calling_interval_list} \
-gvcf \
--validationTypeToExclude ALLELES \
--reference_window_stop 208 -U \
--dbsnp ${dbSNP_vcf}
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "10 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
}
# PRIVATE #
task CollectGvcfCallingMetrics {
File input_vcf
File input_vcf_index
String metrics_basename
File dbSNP_vcf
File dbSNP_vcf_index
File ref_dict
Int disk_size
File wgs_evaluation_interval_list
Int preemptible_tries
command {
java -Xmx2000m -jar /usr/gitc/picard.jar \
CollectVariantCallingMetrics \
INPUT=${input_vcf} \
OUTPUT=${metrics_basename} \
DBSNP=${dbSNP_vcf} \
SEQUENCE_DICTIONARY=${ref_dict} \
TARGET_INTERVALS=${wgs_evaluation_interval_list} \
GVCF_INPUT=true
}
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File summary_metrics = "${metrics_basename}.variant_calling_summary_metrics"
File detail_metrics = "${metrics_basename}.variant_calling_detail_metrics"
}
}
# PUBLIC #
# Convert BAM file to CRAM format
task ConvertToCram {
File input_bam
File ref_fasta
File ref_fasta_index
String output_basename
Int disk_size
Int preemptible_tries
command <<<
samtools view -C -T ${ref_fasta} ${input_bam} | \
tee ${output_basename}.cram | \
md5sum > ${output_basename}.cram.md5
# Create REF_CACHE. Used when indexing a CRAM
seq_cache_populate.pl -root ./ref/cache ${ref_fasta}
export REF_PATH=:
export REF_CACHE=./ref/cache/%2s/%2s/%s
samtools index ${output_basename}.cram
mv ${output_basename}.cram.crai ${output_basename}.crai
>>>
runtime {
docker: "broadinstitute/genomes-in-the-cloud:2.2.4-1469632282"
memory: "3 GB"
cpu: "1"
disks: "local-disk " + disk_size + " HDD"
preemptible: preemptible_tries
}
output {
File output_cram = "${output_basename}.cram"
File output_cram_index = "${output_basename}.crai"
File output_cram_md5 = "${output_basename}.cram.md5"
}
}
# PUBLIC #
# WORKFLOW DEFINITION
workflow PairedEndSingleSampleWorkflow {
# PRIVATE #
File contamination_sites_vcf
File contamination_sites_vcf_index
File fingerprint_genotypes_file # if this file is empty (0-length) the workflow should not do fingerprint comparison (as there are no fingerprints for the sample)
File haplotype_database_file
File wgs_evaluation_interval_list
File wgs_coverage_interval_list
# PUBLIC #
String sample_name
String final_gvcf_name
Array[File] flowcell_unmapped_bams
String unmapped_bam_suffix
Array[File] scattered_calling_intervals
File wgs_calling_interval_list
File ref_fasta
File ref_fasta_index
File ref_dict
File ref_alt
File ref_bwt
File ref_sa
File ref_amb
File ref_ann
File ref_pac
File dbSNP_vcf
File dbSNP_vcf_index
Array[File] known_indels_sites_VCFs
Array[File] known_indels_sites_indices
Int flowcell_small_disk
Int flowcell_medium_disk
Int agg_small_disk
Int agg_medium_disk
Int agg_large_disk
Int preemptible_tries
Int agg_preemptible_tries
String bwa_commandline="bwa mem -K 100000000 -p -v 3 -t 16 $bash_ref_fasta"
String recalibrated_bam_basename = sample_name + ".aligned.duplicates_marked.recalibrated"
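# A minimal sketch of a Cromwell inputs JSON for this workflow; the bucket paths
# below are hypothetical, and the remaining required keys follow the inputs above:
#   {
#     "PairedEndSingleSampleWorkflow.sample_name": "NA12878",
#     "PairedEndSingleSampleWorkflow.final_gvcf_name": "NA12878.g.vcf.gz",
#     "PairedEndSingleSampleWorkflow.flowcell_unmapped_bams": ["gs://my-bucket/NA12878.unmapped.bam"],
#     "PairedEndSingleSampleWorkflow.unmapped_bam_suffix": ".unmapped.bam",
#     "PairedEndSingleSampleWorkflow.ref_fasta": "gs://my-bucket/Homo_sapiens_assembly38.fasta",
#     ...
#   }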
# PUBLIC #
# Get the version of BWA to include in the PG record in the header of the BAM produced
# by MergeBamAlignment.
call GetBwaVersion
# PUBLIC #
# Align flowcell-level unmapped input bams in parallel
scatter (unmapped_bam in flowcell_unmapped_bams) {
# Because of a WDL/Cromwell bug this is not currently valid, so we have to sub(sub()) in each task
# String base_name = sub(sub(unmapped_bam, "gs://.*/", ""), unmapped_bam_suffix + "$", "")
String sub_strip_path = "gs://.*/"
String sub_strip_unmapped = unmapped_bam_suffix + "$"
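# For example, sub(sub("gs://bucket/run1/NA12878_A.unmapped.bam", sub_strip_path, ""), sub_strip_unmapped, "")
# yields "NA12878_A" (hypothetical path shown for illustration)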
# PRIVATE #
call CollectQualityYieldMetrics {
input:
input_bam = unmapped_bam,
metrics_filename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmapped.quality_yield_metrics",
disk_size = flowcell_small_disk,
preemptible_tries = preemptible_tries
}
# PUBLIC #
# Map reads to reference
call SamToFastqAndBwaMem {
input:
input_bam = unmapped_bam,
bwa_commandline = bwa_commandline,
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".unmerged",
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
ref_alt = ref_alt,
ref_bwt = ref_bwt,
ref_amb = ref_amb,
ref_ann = ref_ann,
ref_pac = ref_pac,
ref_sa = ref_sa,
disk_size = flowcell_medium_disk,
preemptible_tries = preemptible_tries
}
# PUBLIC #
# Merge original uBAM and BWA-aligned BAM
call MergeBamAlignment {
input:
unmapped_bam = unmapped_bam,
bwa_commandline = bwa_commandline,
bwa_version = GetBwaVersion.version,
aligned_bam = SamToFastqAndBwaMem.output_bam,
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".aligned.unsorted",
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
disk_size = flowcell_medium_disk,
preemptible_tries = preemptible_tries
}
# PUBLIC #
# Sort and fix tags in the merged BAM
call SortAndFixTags as SortAndFixReadGroupBam {
input:
input_bam = MergeBamAlignment.output_bam,
output_bam_basename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".sorted",
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = flowcell_medium_disk,
preemptible_tries = preemptible_tries
}
# PRIVATE #
# This validation is called to help find problems early.
# If it proves too time-consuming and not helpful, it can be removed.
call ValidateSamFile as ValidateReadGroupSamFile {
input:
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
input_bam = SortAndFixReadGroupBam.output_bam,
report_filename = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".validation_report",
disk_size = flowcell_medium_disk,
preemptible_tries = preemptible_tries
}
# PRIVATE #
# No reference is passed here because the input is unsorted; providing a reference would cause an error
call CollectUnsortedReadgroupBamQualityMetrics {
input:
input_bam = MergeBamAlignment.output_bam,
output_bam_prefix = sub(sub(unmapped_bam, sub_strip_path, ""), sub_strip_unmapped, "") + ".readgroup",
disk_size = flowcell_medium_disk
}
# PUBLIC #
}
# PUBLIC #
# Aggregate aligned+merged flowcell BAM files and mark duplicates
call MarkDuplicates {
input:
input_bams = MergeBamAlignment.output_bam,
output_bam_basename = sample_name + ".aligned.unsorted.duplicates_marked",
metrics_filename = sample_name + ".duplicate_metrics",
disk_size = agg_large_disk
}
# PUBLIC #
# Sort aggregated+deduped BAM file and fix tags
call SortAndFixTags as SortAndFixSampleBam {
input:
input_bam = MarkDuplicates.output_bam,
output_bam_basename = sample_name + ".aligned.duplicate_marked.sorted",
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_large_disk,
preemptible_tries = 0
}
# PRIVATE #
call CrossCheckFingerprints {
input:
input_bams = SortAndFixSampleBam.output_bam,
input_bam_indexes = SortAndFixSampleBam.output_bam_index,
haplotype_database_file = haplotype_database_file,
metrics_filename = sample_name + ".crosscheck",
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Create list of sequences for scatter-gather parallelization
call CreateSequenceGroupingTSV {
input:
ref_dict = ref_dict,
preemptible_tries = preemptible_tries
}
# PRIVATE #
call CheckContamination as PreBqsrCheckContamination {
input:
input_bam = SortAndFixSampleBam.output_bam,
input_bam_index = SortAndFixSampleBam.output_bam_index,
contamination_sites_vcf = contamination_sites_vcf,
contamination_sites_vcf_index = contamination_sites_vcf_index,
output_prefix = sample_name + ".preBqsr",
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Perform Base Quality Score Recalibration (BQSR) on the sorted BAM in parallel
scatter (subgroup in CreateSequenceGroupingTSV.sequence_grouping) {
# Generate the recalibration model by interval
call BaseRecalibrator {
input:
input_bam = SortAndFixSampleBam.output_bam,
input_bam_index = SortAndFixSampleBam.output_bam_index,
recalibration_report_filename = sample_name + ".recal_data.csv",
sequence_group_interval = subgroup,
dbSNP_vcf = dbSNP_vcf,
dbSNP_vcf_index = dbSNP_vcf_index,
known_indels_sites_VCFs = known_indels_sites_VCFs,
known_indels_sites_indices = known_indels_sites_indices,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Apply the recalibration model by interval
call ApplyBQSR {
input:
input_bam = SortAndFixSampleBam.output_bam,
input_bam_index = SortAndFixSampleBam.output_bam_index,
output_bam_basename = recalibrated_bam_basename,
recalibration_report = GatherBqsrReports.output_bqsr_report,
sequence_group_interval = subgroup,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
}
# PUBLIC #
# Merge the recalibration reports resulting from by-interval recalibration
call GatherBqsrReports {
input:
input_bqsr_reports = BaseRecalibrator.recalibration_report,
output_report_filename = sample_name + ".recal_data.csv",
disk_size = flowcell_small_disk,
preemptible_tries = preemptible_tries
}
# PUBLIC #
# Do an additional round of recalibration on the unmapped reads (which would otherwise
# be left behind because they're not accounted for in the scatter intervals). This is
# done by running ApplyBQSR with "-L unmapped".
Array[String] unmapped_group_interval = ["unmapped"]
call ApplyBQSR as ApplyBQSRToUnmappedReads {
input:
input_bam = SortAndFixSampleBam.output_bam,
input_bam_index = SortAndFixSampleBam.output_bam_index,
output_bam_basename = recalibrated_bam_basename,
recalibration_report = GatherBqsrReports.output_bqsr_report,
sequence_group_interval = unmapped_group_interval,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Merge the recalibrated BAM files resulting from by-interval recalibration
# TODO: when we have the capability of appending elements to arrays, this can take a single
# array as input, with the output of the above task added to the scattered printreads bams
call GatherBamFiles {
input:
input_bams = ApplyBQSR.recalibrated_bam,
input_unmapped_reads_bam = ApplyBQSRToUnmappedReads.recalibrated_bam,
output_bam_basename = sample_name,
disk_size = agg_large_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call CollectReadgroupBamQualityMetrics {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
output_bam_prefix = sample_name + ".readgroup",
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk
}
# PRIVATE #
call ValidateSamFile as ValidateAggregatedSamFile {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
report_filename = sample_name + ".validation_report",
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call CollectAggregationMetrics {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
output_bam_prefix = sample_name,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk
}
# PRIVATE #
call CheckFingerprint {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
haplotype_database_file = haplotype_database_file,
genotypes = fingerprint_genotypes_file,
sample = sample_name,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call CollectWgsMetrics {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
metrics_filename = sample_name + ".wgs_metrics",
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
wgs_coverage_interval_list = wgs_coverage_interval_list,
disk_size = agg_small_disk
}
# PRIVATE #
call CollectRawWgsMetrics {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
metrics_filename = sample_name + ".raw_wgs_metrics",
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
wgs_coverage_interval_list = wgs_coverage_interval_list,
disk_size = agg_small_disk
}
# PRIVATE #
call CalculateReadGroupChecksum {
input:
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
read_group_md5_filename = recalibrated_bam_basename + ".bam.read_group_md5",
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Convert the final merged recalibrated BAM file to CRAM format
call ConvertToCram {
input:
input_bam = GatherBamFiles.output_bam,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
output_basename = sample_name,
disk_size = agg_medium_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call ValidateSamFile as ValidateCram {
input:
input_bam = ConvertToCram.output_cram,
input_bam_index = ConvertToCram.output_cram_index,
report_filename = sample_name + ".cram.validation_report",
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ignore = ["MISSING_TAG_NM"],
max_output = 1000000000,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# Call variants in parallel over WGS calling intervals
scatter (subInterval in scattered_calling_intervals) {
# PUBLIC #
# Generate GVCF by interval
call HaplotypeCaller {
input:
# PRIVATE #
contamination = PreBqsrCheckContamination.contamination,
# PUBLIC #
input_bam = GatherBamFiles.output_bam,
input_bam_index = GatherBamFiles.output_bam_index,
interval_list = subInterval,
gvcf_basename = sample_name,
ref_dict = ref_dict,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
}
# PUBLIC #
# Combine by-interval GVCFs into a single sample GVCF file
call MergeVCFs {
input:
input_vcfs = HaplotypeCaller.output_gvcf,
input_vcfs_indexes = HaplotypeCaller.output_gvcf_index,
output_vcf_name = final_gvcf_name,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call ValidateGVCF {
input:
input_vcf = MergeVCFs.output_vcf,
input_vcf_index = MergeVCFs.output_vcf_index,
dbSNP_vcf = dbSNP_vcf,
dbSNP_vcf_index = dbSNP_vcf_index,
ref_fasta = ref_fasta,
ref_fasta_index = ref_fasta_index,
ref_dict = ref_dict,
wgs_calling_interval_list = wgs_calling_interval_list,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PRIVATE #
call CollectGvcfCallingMetrics {
input:
input_vcf = MergeVCFs.output_vcf,
input_vcf_index = MergeVCFs.output_vcf_index,
metrics_basename = sample_name,
dbSNP_vcf = dbSNP_vcf,
dbSNP_vcf_index = dbSNP_vcf_index,
ref_dict = ref_dict,
wgs_evaluation_interval_list = wgs_evaluation_interval_list,
disk_size = agg_small_disk,
preemptible_tries = agg_preemptible_tries
}
# PUBLIC #
# NOTE TO COMMS: Outputs must be redacted manually pending bug fix!!!
# Outputs that will be retained when execution is complete
output {
CollectQualityYieldMetrics.*
ValidateReadGroupSamFile.*
CollectReadgroupBamQualityMetrics.*
CollectUnsortedReadgroupBamQualityMetrics.*
CrossCheckFingerprints.*
ValidateCram.*
CalculateReadGroupChecksum.*
ValidateAggregatedSamFile.*
CollectAggregationMetrics.*
CheckFingerprint.*
CollectWgsMetrics.*
CollectRawWgsMetrics.*
PreBqsrCheckContamination.*
CollectGvcfCallingMetrics.*
MarkDuplicates.duplicate_metrics
GatherBqsrReports.*
ConvertToCram.*
MergeVCFs.*
}
}