Created
August 21, 2018 16:00
-
-
Save weinstockj/30e0d99d11e9a2633cf7602b74cbf5fe to your computer and use it in GitHub Desktop.
WDL script that filters VCF files against a BED file, then bgzip-compresses and tabix-indexes the results
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Subset a VCF to the records that overlap a BED file, then bgzip-compress
# and tabix-index the result.
#
# Inputs:
#   vcf               - input VCF; linked into the working directory as input.vcf.gz
#   tabix             - index for the input VCF; linked as input.vcf.gz.tbi
#   bed               - intervals to keep; linked as sites.bed
#   basename          - prefix used to name the output files
#   disk_size         - number placed in the "local-disk <n> HDD" disks string
#   preemptible_tries - value passed to the `preemptible` runtime attribute
#
# Outputs:
#   output_vcf        - <basename>.recab.cram-filtered.known.vcf.gz
#   output_vcf_idx    - <basename>.recab.cram-filtered.known.vcf.gz.tbi
#
# NOTE(review): without -u, bedtools intersect may emit an input record once
# per overlapping BED interval; confirm the BED file has no overlapping
# intervals, or consider adding -u.
task subset_vcf {
  File vcf
  File tabix
  File bed
  String basename
  Int disk_size
  Int preemptible_tries

  command <<<
    # Fail the task if any command fails, including bedtools on the left-hand
    # side of the pipe; otherwise bgzip's exit status would mask the failure.
    set -eo pipefail

    echo "basename is: ${basename}"
    echo "disk_size is: ${disk_size}"

    ln -s "${bed}" sites.bed
    ln -s "${vcf}" input.vcf.gz
    ln -s "${tabix}" input.vcf.gz.tbi

    bedtools intersect -header -a input.vcf.gz -b sites.bed | bgzip > "${basename}.recab.cram-filtered.known.vcf.gz"
    tabix -p vcf "${basename}.recab.cram-filtered.known.vcf.gz"
  >>>

  runtime {
    docker: "jweinstk/bedtools_and_tabix:latest"
    cpu: "1"
    memory: "3.7 GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: preemptible_tries
  }

  output {
    File output_vcf = "${basename}.recab.cram-filtered.known.vcf.gz"
    File output_vcf_idx = "${basename}.recab.cram-filtered.known.vcf.gz.tbi"
  }
}
# Run subset_vcf once for every row of a TSV manifest.
#
# Inputs:
#   vcf_uris          - TSV file read with read_tsv; column 0 of each row is
#                       used as the VCF and column 1 as its tabix index
#   bed               - BED file handed to every subset_vcf call
#   suffix            - pattern stripped from the end of each VCF file name
#                       (it is concatenated with "$" and used by sub())
#   preemptible_tries - forwarded unchanged to every subset_vcf call
workflow subset_vcf_for_known {
  File vcf_uris
  File bed
  String suffix
  Int preemptible_tries

  Array[Array[String]] vcfs = read_tsv(vcf_uris)

  scatter (vcf_row in vcfs) {
    # Drop everything up to the last "/" and then the trailing suffix.
    String vcf_file_name = sub(vcf_row[0], "^.*/", "")
    String vcf_basename = sub(vcf_file_name, suffix + "$", "")

    # Twice the rounded-up input size (input + output VCF) plus 5 of padding.
    Int vcf_disk_gb = 5 + 2 * ceil(size(vcf_row[0], "GB"))

    call subset_vcf {
      input:
        vcf = vcf_row[0],
        tabix = vcf_row[1],
        bed = bed,
        basename = vcf_basename,
        disk_size = vcf_disk_gb,
        preemptible_tries = preemptible_tries
    }
  }

  output {
    subset_vcf.output_vcf
    subset_vcf.output_vcf_idx
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment