Skip to content

Instantly share code, notes, and snippets.

@nvictus
Last active February 6, 2024 16:31
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nvictus/d1a04c2c2d3e949e75e12aa03b35485f to your computer and use it in GitHub Desktop.
Save nvictus/d1a04c2c2d3e949e75e12aa03b35485f to your computer and use it in GitHub Desktop.
Dump fastq-dump! Download NCBI-SRA FASTQs directly from the European Nucleotide Archive.
#!/usr/bin/env python
import hashlib
import os.path as op
import os
import re
import warnings
from contextlib import closing
from urllib.parse import urlsplit
from urllib.request import urlopen
import requests
import tqdm
# API docs PDF available here: https://www.ebi.ac.uk/ena/portal/api/doc
ENA_BASE_URL = "https://www.ebi.ac.uk/ena/portal/api"
def _get_run_report(accession):
    """
    Fetch the ENA ``read_run`` file report for an accession.

    Parameters
    ----------
    accession : str
        Accession to query. It is converted with ``str()`` before being sent.

    Returns
    -------
    list of dict
        The decoded JSON response, one record per run. In every record the
        ``fastq_ftp`` and ``fastq_md5`` fields are lists of str and
        ``fastq_bytes`` is a list of int. A record whose FASTQ fields are
        empty or missing gets empty lists.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    params = {
        "accession": str(accession),
        "result": "read_run",
        "fields": "run_accession,sample_title,fastq_ftp,fastq_md5,fastq_bytes",
        "format": "json",
    }
    r = requests.get(ENA_BASE_URL + "/filereport", params=params)
    r.raise_for_status()
    records = r.json()
    for record in records:
        for field in ("fastq_ftp", "fastq_md5", "fastq_bytes"):
            raw = record.get(field)
            # "".split(";") yields [""], which would make int("") raise below
            # and hand an empty URL to the downloader. Treat an empty or
            # missing field as "no files" instead.
            record[field] = raw.split(";") if raw else []
        record["fastq_bytes"] = [int(size) for size in record["fastq_bytes"]]
    return records
def _copyfileobj_with_progbar(fsrc, fdst, filesize, bufsize=16 * 1024, desc=None):
    """
    Copy ``fsrc`` to ``fdst`` in chunks while showing a progress bar.

    Parameters
    ----------
    fsrc : file-like
        Binary source; only ``read(bufsize)`` is called on it.
    fdst : file-like
        Binary destination; only ``write(buf)`` is called on it.
    filesize : int
        Expected total number of bytes, used as the progress bar total.
    bufsize : int, optional
        Maximum number of bytes requested per read. Default is 16 KiB.
    desc : str, optional
        Label shown next to the progress bar.

    Returns
    -------
    str
        Hexadecimal MD5 digest of all the bytes that were copied.
    """
    file_hash = hashlib.md5()
    progbar = tqdm.tqdm(total=filesize, desc=desc, unit="B", unit_scale=True)
    try:
        while True:
            buf = fsrc.read(bufsize)
            if not buf:
                break
            fdst.write(buf)
            file_hash.update(buf)
            # A read may return fewer than bufsize bytes (always true for the
            # final chunk), so advance by what was actually received.
            progbar.update(len(buf))
    finally:
        # Close the bar even if the transfer fails midway.
        progbar.close()
    return file_hash.hexdigest()
def _download_fastq(report, dest_dir):
dest_paths = []
for url, filesize, filemd5 in zip(
report["fastq_ftp"], report["fastq_bytes"], report["fastq_md5"]
):
if not url.startswith("ftp://"):
url = "ftp://" + url
filename = urlsplit(url).path.split("/")[-1]
outpath = op.join(dest_dir, filename)
dest_paths.append(outpath)
with closing(urlopen(url)) as r, open(outpath, "wb") as f:
checksum = _copyfileobj_with_progbar(r, f, filesize, desc=outpath)
if checksum != filemd5:
warnings.warn(
f"Checksum mismatch for {filename}: {checksum} != {filemd5}"
)
return dest_paths
def download_fastq(run_accession, dest_dir=""):
    """
    Fetch the FASTQ files of a sequencing run from ENA.

    The accession is looked up through the ENA file report, and every FASTQ
    file listed in each returned record is downloaded. Runs may originate
    from NCBI-SRA (SRR), EMBL-SRA (ERR) or DDBJ-SRA (DRR).

    Parameters
    ----------
    run_accession : str
        Sequencing run accession, having format (E|D|S)RR[0-9]{6,}.
    dest_dir : str, optional
        Directory the files are written to. The default, an empty string,
        resolves to the current working directory.

    Returns
    -------
    dict or list of dict
        Download metadata (sample title, urls, file sizes, md5 hashes) with
        the local file paths added under ``fastq_download``. When the report
        holds exactly one record that record is returned on its own;
        otherwise the whole list of records is returned.

    Examples
    --------
    Illustrative output; the exact keys are whatever the report contains
    plus ``fastq_download``.

    >>> download_fastq('SRR001030', '/tmp')
    {'sample_accession': 'SAMN00000119',
     'run_accession': 'SRR001030',
     'sample_title': 'Generic sample from Homo sapiens',
     'fastq_ftp': ['ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001030/SRR001030.fastq.gz'],
     'fastq_md5': ['56ad9495ef258a7fd589ef384130797f'],
     'fastq_bytes': [19778064],
     'fastq_download': ['/tmp/SRR001030.fastq.gz']}
    """
    records = _get_run_report(run_accession)
    for entry in records:
        print("Downloading:", entry["sample_title"])
        entry["fastq_download"] = _download_fastq(entry, dest_dir)

    # Unwrap the common single-run case so callers get the record directly.
    is_single = isinstance(records, list) and len(records) == 1
    return records[0] if is_single else records
if __name__ == "__main__":
    # Command-line entry point: download the FASTQ files for one accession
    # and optionally create symlinks named after the sample title.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "accession", help="Sequencing run accession, having format (E|D|S)RR[0-9]{6,}."
    )
    parser.add_argument(
        "-d", "--dest-dir", default="", help="Specify the download directory."
    )
    parser.add_argument(
        "-s", "--symlink-title",
        action="store_true",
        help="Create (hopefully) informative symlinks from the sample's title.",
    )
    args = parser.parse_args()
    report = download_fastq(args.accession, args.dest_dir)

    if args.symlink_title:
        # download_fastq returns a bare dict for a single record and a list
        # otherwise; normalise so both shapes are handled.
        records = report if isinstance(report, list) else [report]
        for record in records:
            # A "/" in the title would be read as a directory separator and
            # send the link into a directory that does not exist.
            sample_title = record["sample_title"].replace("/", "_")
            for i, download_path in enumerate(record["fastq_download"]):
                dirpath, filename = op.split(download_path)
                # Keep the mate suffix and extension, e.g. "_1.fastq.gz".
                match = re.search(r"(_[12])?\.fastq\S*", filename)
                if match:
                    side_ext = match.group(0)
                else:
                    # Unrecognised file name: fall back to a 1-based index.
                    side_ext = "." + str(i + 1)
                if side_ext.startswith("_"):
                    side_ext = "." + side_ext[1:]
                # The link lives in the same directory as the file, so the
                # target must be the bare file name. A relative download_path
                # would be resolved against the link's directory and dangle.
                os.symlink(filename, op.join(dirpath, sample_title + side_ext))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment