Last active
December 23, 2023 23:51
-
-
Save lmtani/48a83a7052c90443fbde9e772a83a46f to your computer and use it in GitHub Desktop.
[blog] WDL workflow with compleasm
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
version development | |
# Run compleasm on a set of NCBI assemblies.
#
# Steps:
#   1. FetchNCBI resolves every accession to an assembly FASTA URL.
#   2. MinibuscoDownload fetches the requested lineage once.
#   3. For each URL, DownloadAssembly fetches the FASTA and MinibuscoRun
#      analyses it against the shared lineage directory.
#
# Output: one compleasm summary report per assembly.
workflow MinibuscoWorkflow {
    input {
        Array[String] accessions
        String busco_lineage
    }

    call FetchNCBI {
        input:
            accessions = accessions
    }

    call MinibuscoDownload {
        input:
            lineage = busco_lineage
    }

    scatter (assembly_url in FetchNCBI.assemblies) {
        # Name each run after its FASTA file, without the ".fna.gz" suffix.
        String run_name = basename(assembly_url, ".fna.gz")

        call DownloadAssembly {
            input:
                fasta_url = assembly_url
        }

        call MinibuscoRun {
            input:
                fasta = DownloadAssembly.fasta,
                lineage = busco_lineage,
                lineage_dir = MinibuscoDownload.lineage_dir,
                output_directory = run_name
        }
    }

    output {
        Array[File] reports = MinibuscoRun.report
    }
}
# Download a lineage dataset with `compleasm download` into the local
# "mb_downloads" directory and expose that directory as the task output.
task MinibuscoDownload {
    input {
        String lineage  # BUSCO compatible lineage name
    }

    command <<<
        # Fail the task as soon as the download fails.
        set -e
        compleasm download --library_path mb_downloads "~{lineage}"
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        docker: "quay.io/biocontainers/compleasm:0.2.4--pyh7cba7a3_0"
        # The runtime attribute is spelled "disks"; "disk" is not recognised.
        disks: "local-disk 10 HDD"
    }

    output {
        Directory lineage_dir = "mb_downloads"
    }
}
# Resolve NCBI assembly accessions to genomic FASTA URLs.
#
# Inputs:
#   accessions  - assembly accessions to look up (e.g. "GCA_949128135.1").
#   your_email  - contact address sent to NCBI with every Entrez request.
#
# Output:
#   assemblies  - one "*_genomic.fna.gz" URL per accession, in input order.
task FetchNCBI {
    input {
        Array[String] accessions
        String your_email = "your.email@domain.com"
    }

    command <<<
        set -e
        # The delimiter is quoted so the shell never expands anything inside
        # the Python source; WDL placeholders are substituted beforehand.
        python <<'CODE'
        import sys

        from Bio import Entrez

        # Always tell NCBI who you are
        Entrez.email = "~{your_email}"


        def get_ftp_url(accession):
            """Return the genomic FASTA URL of the first assembly hit."""
            handle = Entrez.esearch(db="assembly", term=accession)
            id_list = Entrez.read(handle)["IdList"]
            handle.close()
            if not id_list:
                # Name the offending accession instead of raising IndexError.
                sys.exit("No NCBI assembly found for accession: " + accession)

            handle = Entrez.esummary(db="assembly", id=id_list[0])
            record = Entrez.read(handle)
            handle.close()

            summary = record["DocumentSummarySet"]["DocumentSummary"][0]
            # The stats report sits next to the genomic FASTA, so swapping the
            # file suffix yields the FASTA URL.
            return summary["FtpPath_Stats_rpt"].replace(
                "_assembly_stats.txt", "_genomic.fna.gz"
            )


        # One accession per line; drop newlines and skip blank lines.
        with open("~{write_lines(accessions)}") as f:
            accessions = [line.strip() for line in f if line.strip()]

        for accession in accessions:
            print(get_ftp_url(accession))
        CODE
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        # The runtime attribute is spelled "disks"; "disk" is not recognised.
        disks: "local-disk 10 HDD"
        docker: "quay.io/biocontainers/biopython:1.75"
    }

    output {
        Array[String] assemblies = read_lines(stdout())
    }
}
# Download one assembly FASTA with wget.
#
# Input:
#   fasta_url - URL of the FASTA file to download.
#
# Output:
#   fasta     - the downloaded file, named after the last component of the URL.
task DownloadAssembly {
    input {
        String fasta_url
    }

    String output_name = basename(fasta_url)

    command <<<
        # Fail the task if the download fails.
        set -e
        # Write to an explicit file name so the declared output always matches
        # what wget produced; quoting protects URLs with shell metacharacters.
        wget -O "~{output_name}" "~{fasta_url}"
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        docker: "quay.io/biocontainers/wget:1.20.1"
        # The runtime attribute is spelled "disks"; "disk" is not recognised.
        disks: "local-disk 10 HDD"
    }

    output {
        File fasta = output_name
    }
}
# Run `compleasm run` on one assembly against a pre-downloaded lineage.
#
# Inputs:
#   fasta            - assembly FASTA to analyse.
#   lineage_dir      - library directory holding the downloaded lineage.
#   lineage          - lineage name passed to compleasm with -l.
#   output_directory - directory compleasm writes its results to.
#   threads          - number of threads, also requested as CPUs.
#   memory           - memory to request, in GB.
#
# Output:
#   report           - "summary.txt" inside output_directory.
task MinibuscoRun {
    input {
        File fasta
        Directory lineage_dir
        String lineage
        String output_directory
        Int threads = 4
        Int memory = 12
    }

    command <<<
        set -e
        compleasm run \
            --library_path "~{lineage_dir}" \
            -a "~{fasta}" \
            -t ~{threads} \
            -l "~{lineage}" \
            -o "~{output_directory}"
    >>>

    runtime {
        cpu: threads
        # A bare Int is interpreted as bytes, so attach the unit explicitly.
        memory: "~{memory} GB"
        docker: "quay.io/biocontainers/compleasm:0.2.4--pyh7cba7a3_0"
        # The runtime attribute is spelled "disks"; "disk" is not recognised.
        disks: "local-disk 10 HDD"
    }

    output {
        File report = "~{output_directory}/summary.txt"
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment