Skip to content

Instantly share code, notes, and snippets.

@lmtani
Last active December 23, 2023 23:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lmtani/48a83a7052c90443fbde9e772a83a46f to your computer and use it in GitHub Desktop.
Save lmtani/48a83a7052c90443fbde9e772a83a46f to your computer and use it in GitHub Desktop.
[blog] WDL workflow with compleasm
version development
workflow MinibuscoWorkflow {
    # Runs compleasm against a set of NCBI assemblies.
    #
    # Inputs:
    #   accessions     - assembly accessions, resolved to download URLs by FetchNCBI
    #   busco_lineage  - lineage name, passed to both the lineage download and
    #                    every compleasm run
    # Output:
    #   reports        - one summary file per resolved assembly URL
    input {
        Array[String] accessions
        String busco_lineage
    }

    # Resolve every accession to the URL of its genomic FASTA.
    call FetchNCBI {
        input:
            accessions = accessions
    }

    # Fetch the lineage data once; it is shared by all scattered runs below.
    call MinibuscoDownload {
        input:
            lineage = busco_lineage
    }

    # One download + one compleasm run per assembly URL.
    scatter (assembly_url in FetchNCBI.assemblies) {
        call DownloadAssembly {
            input:
                fasta_url = assembly_url
        }

        call MinibuscoRun {
            input:
                fasta = DownloadAssembly.fasta,
                lineage_dir = MinibuscoDownload.lineage_dir,
                lineage = busco_lineage,
                # Name the result directory after the assembly file, minus its extension.
                output_directory = basename(assembly_url, ".fna.gz")
        }
    }

    output {
        Array[File] reports = MinibuscoRun.report
    }
}
task MinibuscoDownload {
    # Downloads the data for one lineage with `compleasm download` into the
    # `mb_downloads` directory and exposes that directory as the task output.
    input {
        String lineage  # BUSCO compatible lineage name
    }

    command <<<
        set -e
        compleasm download --library_path mb_downloads "~{lineage}"
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        docker: "quay.io/biocontainers/compleasm:0.2.4--pyh7cba7a3_0"
        # `disks` (plural) is the runtime attribute name; the previous `disk`
        # key is not a defined attribute, so the request was not honoured.
        disks: "local-disk 10 HDD"
    }

    output {
        Directory lineage_dir = "mb_downloads"
    }
}
task FetchNCBI {
    # Resolves each assembly accession to the URL of its `_genomic.fna.gz`
    # file by querying the NCBI `assembly` database through Biopython's Entrez
    # module. One URL is printed per accession, in input order.
    #
    # Inputs:
    #   accessions - assembly accessions to look up (e.g. "GCA_949128135.1")
    #   your_email - contact address sent to NCBI with every request
    # Output:
    #   assemblies - one URL per non-blank accession
    #
    # The task fails with a message on stderr if an accession has no search
    # hit or if its summary record carries no stats-report path.
    input {
        Array[String] accessions
        String your_email = "your.email@domain.com"
    }

    command <<<
        # The heredoc delimiter is quoted so the shell does not expand `$` or
        # backticks inside the Python source.
        python <<'CODE'
        import sys
        from Bio import Entrez
        # Always tell NCBI who you are
        Entrez.email = "~{your_email}"
        def get_ftp_url(accession):
            # Search the assembly database for the accession.
            handle = Entrez.esearch(db="assembly", term=accession)
            try:
                id_list = Entrez.read(handle)["IdList"]
            finally:
                handle.close()
            if not id_list:
                sys.exit("No NCBI assembly record found for accession: {}".format(accession))
            # Only the first hit is used, as before.
            handle = Entrez.esummary(db="assembly", id=id_list[0])
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()
            stats_url = record["DocumentSummarySet"]["DocumentSummary"][0]["FtpPath_Stats_rpt"]
            if not stats_url:
                sys.exit("No stats report path in NCBI summary for accession: {}".format(accession))
            # Derive the genomic FASTA URL from the stats report URL.
            return stats_url.replace("_assembly_stats.txt", "_genomic.fna.gz")
        # Read the accessions, dropping line terminators and blank lines.
        with open("~{write_lines(accessions)}") as f:
            accessions = [line.strip() for line in f if line.strip()]
        for accession in accessions:
            print(get_ftp_url(accession))
        CODE
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        # `disks` (plural) is the runtime attribute name; the previous `disk`
        # key is not a defined attribute, so the request was not honoured.
        disks: "local-disk 10 HDD"
        docker: "quay.io/biocontainers/biopython:1.75"
    }

    output {
        Array[String] assemblies = read_lines(stdout())
    }
}
task DownloadAssembly {
    # Downloads a single assembly file with wget.
    #
    # Input:
    #   fasta_url - URL of the file to download
    # Output:
    #   fasta     - the downloaded file, named after the last path component
    #               of the URL
    input {
        String fasta_url
    }

    # File name the download is saved under; also used by the output section.
    String output_name = basename(fasta_url)

    command <<<
        set -e
        # Quote the URL so shell metacharacters in it are not interpreted, and
        # set the target name explicitly so it always matches the declared output.
        wget -O "~{output_name}" "~{fasta_url}"
    >>>

    runtime {
        cpu: 1
        memory: "2 GB"
        docker: "quay.io/biocontainers/wget:1.20.1"
        # `disks` (plural) is the runtime attribute name; the previous `disk`
        # key is not a defined attribute, so the request was not honoured.
        disks: "local-disk 10 HDD"
    }

    output {
        File fasta = output_name
    }
}
task MinibuscoRun {
    # Runs `compleasm run` on one assembly against a previously downloaded
    # lineage library and returns the summary file from the output directory.
    #
    # Inputs:
    #   fasta            - assembly file passed to `-a`
    #   lineage_dir      - library directory passed to `--library_path`
    #   lineage          - lineage name passed to `-l`
    #   output_directory - directory passed to `-o`; the report is read from it
    #   threads          - value for `-t`, also requested as the CPU count
    #   memory           - memory to request, in GB
    # Output:
    #   report           - `<output_directory>/summary.txt`
    input {
        File fasta
        Directory lineage_dir
        String lineage
        String output_directory
        Int threads = 4
        Int memory = 12
    }

    command <<<
        set -e
        compleasm run \
            --library_path "~{lineage_dir}" \
            -a "~{fasta}" \
            -t ~{threads} \
            -l "~{lineage}" \
            -o "~{output_directory}"
    >>>

    runtime {
        cpu: threads
        # A bare Int memory value is interpreted as bytes, so the unit is
        # attached explicitly, matching the "2 GB" strings in the other tasks.
        memory: "~{memory} GB"
        docker: "quay.io/biocontainers/compleasm:0.2.4--pyh7cba7a3_0"
        # `disks` (plural) is the runtime attribute name; the previous `disk`
        # key is not a defined attribute, so the request was not honoured.
        disks: "local-disk 10 HDD"
    }

    output {
        File report = "~{output_directory}/summary.txt"
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment