Skip to content

Instantly share code, notes, and snippets.

@audy
Created April 3, 2023 02:30
Show Gist options
  • Save audy/beb2edce512aa24acb55f9cafaecf87b to your computer and use it in GitHub Desktop.
Save audy/beb2edce512aa24acb55f9cafaecf87b to your computer and use it in GitHub Desktop.
quickly fetch assemblies from NCBI
#!/usr/bin/env python3
import os
import downloads
from joblib import Parallel, delayed
from itertools import islice
from tqdm import tqdm
def get_gbk_path(assembly) -> str:
    """
    Given an Assembly object, return the https path to its GenBank file

    ``assembly`` is a mapping with an ``"ftp_path"`` entry holding the
    assembly's remote directory. A leading ``ftp://`` scheme is rewritten
    to ``https://``; any other scheme is left untouched. The file name is
    the last component of the directory followed by ``_genomic.gbff.gz``.

    Raises ``KeyError`` if ``assembly`` has no ``"ftp_path"`` entry.
    """
    # Drop trailing slashes so the last path component is never empty.
    base = assembly["ftp_path"].rstrip("/")
    if base.startswith("ftp://"):
        base = "https://" + base[len("ftp://"):]
    filename = f"{base.rsplit('/', 1)[-1]}_genomic.gbff.gz"
    # URLs are always "/"-separated; os.path.join would use "\\" on Windows.
    return f"{base}/{filename}"
def iter_assembly_summary(handle):
    """
    Iterate over an assembly_summary.txt from NCBI yielding rows as dictionaries

    ``handle`` is an iterator over text lines. The first line is discarded
    (it is not the real header) and the second line supplies the
    tab-separated column names; the leading ``#`` marker is removed from
    the first column name. Every following line is yielded as a dict
    mapping column name to the stripped field value.

    Blank lines and lines starting with ``#`` after the header are skipped,
    so several summary files concatenated into one (each carrying its own
    header lines) can be read in a single pass. Input with fewer than two
    lines yields nothing.

    # requires assembly summary file from ncbi genbank/refseq
    curl --silent https://ftp.ncbi.nih.gov/genomes/{genbank,refseq}/{bacteria,archaea,viruses,fungi}/assembly_summary.txt > combined_assembly_summary.txt
    """
    next(handle, None)  # skip false header
    header_line = next(handle, None)
    if header_line is None:
        # Empty or one-line input: nothing to yield. A bare next() here
        # would surface as RuntimeError inside a generator.
        return
    header = [x.strip() for x in header_line.split("\t")]
    # The marker may be written "# name" or "#name"; handle both.
    header[0] = header[0].lstrip("#").strip()
    for line in handle:
        # Repeated header/comment lines and blank lines are not data rows.
        if not line.strip() or line.startswith("#"):
            continue
        yield dict(zip(header, [x.strip() for x in line.split("\t")]))
def download_assembly(assembly):
    """
    Download the GenBank file for one assembly unless it is already on disk

    ``assembly`` is a row dict providing ``"assembly_accession"`` (used to
    name the local file ``assemblies/<accession>.gbk.gz``) and
    ``"ftp_path"`` (used by ``get_gbk_path`` to build the remote URL).
    An existing local file is left untouched and nothing is fetched.

    Always returns ``True``, whether or not a download took place.
    """
    out_path = f"assemblies/{assembly['assembly_accession']}.gbk.gz"
    remote_path = get_gbk_path(assembly)
    if not os.path.exists(out_path):
        # Make sure the target directory exists; exist_ok keeps this safe
        # when several workers reach this point at the same time.
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
        downloads.download(remote_path, out_path=out_path)
    return True
def main():
    """
    Download every assembly listed in combined_assembly_summary.txt

    Rows are streamed from the summary file through a tqdm progress bar
    and handed to ``download_assembly`` via a joblib ``Parallel`` pool
    configured with ``n_jobs=8``.
    """
    with open("combined_assembly_summary.txt") as handle:
        pool = Parallel(n_jobs=8)
        rows = tqdm(iter_assembly_summary(handle))
        tasks = (delayed(download_assembly)(assembly) for assembly in rows)
        pool(tasks)
# Run the download only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment