nvictus/ena_download.py

## ena_download.py
#!/usr/bin/env python
import hashlib
import os.path as op
import os
import re
import warnings
from contextlib import closing
from urllib.parse import urlsplit
from urllib.request import urlopen

import requests
import tqdm

# Base URL of the ENA Portal API; endpoint paths such as "/filereport" are
# appended to it when a request is built.
# API docs PDF available here: https://www.ebi.ac.uk/ena/portal/api/doc
ENA_BASE_URL = "https://www.ebi.ac.uk/ena/portal/api"


def _get_run_report(accession, timeout=60):
    """
    Fetch and parse the ENA file report for an accession.

    Sends a GET request to the ``/filereport`` endpoint with
    ``result=read_run`` and decodes the JSON response.

    Parameters
    ----------
    accession : str
        Accession to query. It is passed through ``str()`` first.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that a stalled
        connection cannot block forever. Default is 60.

    Returns
    -------
    list of dict
        The decoded response (expected to be a list of records). In every
        record ``fastq_ftp`` and ``fastq_md5`` are lists of str and
        ``fastq_bytes`` is a list of int. A field that came back empty
        becomes an empty list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    params = {
        "accession": str(accession),
        "result": "read_run",
        "fields": "run_accession,sample_title,fastq_ftp,fastq_md5,fastq_bytes",
        "format": "json",
    }
    r = requests.get(ENA_BASE_URL + "/filereport", params=params, timeout=timeout)
    r.raise_for_status()
    msg = r.json()
    for record in msg:
        for field in ("fastq_ftp", "fastq_md5", "fastq_bytes"):
            value = record[field]
            # "".split(";") yields [""], which would make int("") below raise
            # ValueError. An empty field (presumably a run without FASTQ
            # files) is therefore mapped to an empty list.
            record[field] = value.split(";") if value else []
        record["fastq_bytes"] = [int(size) for size in record["fastq_bytes"]]
    return msg


def _copyfileobj_with_progbar(fsrc, fdst, filesize, bufsize=16 * 1024, desc=None):
    """
    Copy a stream to a file object, showing progress and hashing on the fly.

    Parameters
    ----------
    fsrc : file-like
        Source object; read in chunks of at most ``bufsize`` via ``read``.
    fdst : file-like
        Destination object; each chunk is passed to its ``write`` method.
    filesize : int
        Expected total size in bytes, used as the progress bar total.
    bufsize : int, optional
        Maximum number of bytes requested per read. Default is 16 KiB.
    desc : str, optional
        Label displayed next to the progress bar.

    Returns
    -------
    str
        Hexadecimal MD5 digest of all the data that was copied.
    """
    file_hash = hashlib.md5()
    progbar = tqdm.tqdm(total=filesize, desc=desc, unit="B", unit_scale=True)
    try:
        while True:
            buf = fsrc.read(bufsize)
            if not buf:
                break
            fdst.write(buf)
            file_hash.update(buf)
            # Advance by the number of bytes actually read: a read may return
            # fewer than bufsize bytes (always true for the last chunk), and
            # counting bufsize would push the bar past the real total.
            progbar.update(len(buf))
    finally:
        # Close the bar even when reading or writing fails.
        progbar.close()
    return file_hash.hexdigest()


def _download_fastq(report, dest_dir):
    """
    Download every FASTQ file listed in one run record.

    Parameters
    ----------
    report : dict
        Record holding the parallel lists ``fastq_ftp`` (URLs),
        ``fastq_bytes`` (sizes) and ``fastq_md5`` (expected digests).
    dest_dir : str
        Directory the files are written to.

    Returns
    -------
    list of str
        Local path of each file, in the order of the URL list.

    Notes
    -----
    A digest mismatch is reported through ``warnings.warn``; the file is
    kept and the remaining downloads continue.
    """
    urls = report["fastq_ftp"]
    sizes = report["fastq_bytes"]
    digests = report["fastq_md5"]

    saved = []
    for link, nbytes, expected_md5 in zip(urls, sizes, digests):
        # The report may list bare host/path strings without a scheme.
        ftp_url = link if link.startswith("ftp://") else "ftp://" + link
        basename = urlsplit(ftp_url).path.rpartition("/")[2]
        target = op.join(dest_dir, basename)
        saved.append(target)

        with closing(urlopen(ftp_url)) as remote, open(target, "wb") as local:
            actual_md5 = _copyfileobj_with_progbar(
                remote, local, nbytes, desc=target
            )
            if actual_md5 != expected_md5:
                warnings.warn(
                    f"Checksum mismatch for {basename}: {actual_md5} != {expected_md5}"
                )
    return saved


def download_fastq(run_accession, dest_dir=""):
    """
    Fetch the FASTQ files of a sequencing run from ENA.

    Accepted runs may originate from NCBI-SRA (SRR), EMBL-SRA (ERR) or
    DDBJ-SRA (DRR).

    Parameters
    ----------
    run_accession : str
        Sequencing run accession, having format (E|D|S)RR[0-9]{6,}.

    dest_dir: str, optional
        Directory the files are saved in. Default is the cwd.

    Returns
    -------
    dict or list of dict
        Download metadata (sample name, urls, file sizes and md5 hashes),
        with the local paths added under ``fastq_download``. A report
        holding exactly one record is returned as that record; otherwise
        the report is returned unchanged in shape.

    Examples
    --------
    >>> download_fastq('SRR001030', '/tmp')
    {'sample_accession': 'SAMN00000119',
     'run_accession': 'SRR001030',
     'sample_title': 'Generic sample from Homo sapiens',
     'fastq_ftp': ['ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001030/SRR001030.fastq.gz'],
     'fastq_md5': ['56ad9495ef258a7fd589ef384130797f'],
     'fastq_bytes': [19778064],
     'fastq_download': ['/tmp/SRR001030.fastq.gz']}

    """
    records = _get_run_report(run_accession)
    for entry in records:
        print("Downloading:", entry["sample_title"])
        entry["fastq_download"] = _download_fastq(entry, dest_dir)

    # Unwrap a one-element report so the common case yields a plain dict.
    is_single = isinstance(records, list) and len(records) == 1
    return records[0] if is_single else records


if __name__ == "__main__":
    # Command-line entry point: download the FASTQ files of one accession
    # and optionally create symlinks named after the sample title.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "accession", help="Sequencing run accession, having format (E|D|S)RR[0-9]{6,}."
    )
    parser.add_argument(
        "-d", "--dest-dir", default="", help="Specify the download directory."
    )
    parser.add_argument(
        "-s", "--symlink-title",
        action="store_true",
        help="Create (hopefully) informative symlinks from the sample's title.",
    )
    args = parser.parse_args()

    report = download_fastq(args.accession, args.dest_dir)

    if args.symlink_title:
        # download_fastq returns a bare record for a single-record report and
        # the whole report otherwise; normalise so both shapes are handled.
        records = report if isinstance(report, list) else [report]
        for record in records:
            # A path separator inside the title would be read as a directory
            # component of the link path.
            sample_title = record["sample_title"].replace(os.sep, "_")
            for i, download_path in enumerate(record["fastq_download"]):
                dirpath, filename = op.split(download_path)

                # Keep the read-side marker and extension, e.g. "_1.fastq.gz";
                # fall back to the file's position when the name has none.
                match = re.search(r"(_[12])?\.fastq\S*", filename)
                side_ext = match.group(0) if match else "." + str(i + 1)

                if side_ext.startswith("_"):
                    side_ext = "." + side_ext[1:]

                # The link is created next to the file, and a relative link
                # target is resolved against the link's own directory, so the
                # target must be the bare file name rather than download_path
                # (which would dangle for any relative --dest-dir).
                os.symlink(filename, op.join(dirpath, sample_title + side_ext))
	#!/usr/bin/env python
	import hashlib
	import os.path as op
	import os
	import re
	import warnings
	from contextlib import closing
	from urllib.parse import urlsplit
	from urllib.request import urlopen

	import requests
	import tqdm

	# Base URL of the ENA Portal API; endpoint paths such as "/filereport" are
	# appended to it when a request is built.
	# API docs PDF available here: https://www.ebi.ac.uk/ena/portal/api/doc
	ENA_BASE_URL = "https://www.ebi.ac.uk/ena/portal/api"


	def _get_run_report(accession, timeout=60):
	    """
	    Fetch and parse the ENA file report for an accession.

	    Sends a GET request to the ``/filereport`` endpoint with
	    ``result=read_run`` and decodes the JSON response.

	    Parameters
	    ----------
	    accession : str
	        Accession to query. It is passed through ``str()`` first.
	    timeout : float or tuple, optional
	        Timeout in seconds handed to ``requests.get`` so that a stalled
	        connection cannot block forever. Default is 60.

	    Returns
	    -------
	    list of dict
	        The decoded response (expected to be a list of records). In every
	        record ``fastq_ftp`` and ``fastq_md5`` are lists of str and
	        ``fastq_bytes`` is a list of int. A field that came back empty
	        becomes an empty list.

	    Raises
	    ------
	    requests.HTTPError
	        If the server answers with an error status.
	    """
	    params = {
	        "accession": str(accession),
	        "result": "read_run",
	        "fields": "run_accession,sample_title,fastq_ftp,fastq_md5,fastq_bytes",
	        "format": "json",
	    }
	    r = requests.get(ENA_BASE_URL + "/filereport", params=params, timeout=timeout)
	    r.raise_for_status()
	    msg = r.json()
	    for record in msg:
	        for field in ("fastq_ftp", "fastq_md5", "fastq_bytes"):
	            value = record[field]
	            # "".split(";") yields [""], which would make int("") below
	            # raise ValueError. An empty field (presumably a run without
	            # FASTQ files) is therefore mapped to an empty list.
	            record[field] = value.split(";") if value else []
	        record["fastq_bytes"] = [int(size) for size in record["fastq_bytes"]]
	    return msg


	def _copyfileobj_with_progbar(fsrc, fdst, filesize, bufsize=16 * 1024, desc=None):
	    """
	    Copy a stream to a file object, showing progress and hashing on the fly.

	    Parameters
	    ----------
	    fsrc : file-like
	        Source object; read in chunks of at most ``bufsize`` via ``read``.
	    fdst : file-like
	        Destination object; each chunk is passed to its ``write`` method.
	    filesize : int
	        Expected total size in bytes, used as the progress bar total.
	    bufsize : int, optional
	        Maximum number of bytes requested per read. Default is 16 KiB.
	    desc : str, optional
	        Label displayed next to the progress bar.

	    Returns
	    -------
	    str
	        Hexadecimal MD5 digest of all the data that was copied.
	    """
	    file_hash = hashlib.md5()
	    progbar = tqdm.tqdm(total=filesize, desc=desc, unit="B", unit_scale=True)
	    try:
	        while True:
	            buf = fsrc.read(bufsize)
	            if not buf:
	                break
	            fdst.write(buf)
	            file_hash.update(buf)
	            # Advance by the number of bytes actually read: a read may
	            # return fewer than bufsize bytes (always true for the last
	            # chunk), and counting bufsize would overshoot the total.
	            progbar.update(len(buf))
	    finally:
	        # Close the bar even when reading or writing fails.
	        progbar.close()
	    return file_hash.hexdigest()


	def _download_fastq(report, dest_dir):
	    """
	    Download every FASTQ file listed in one run record.

	    Parameters
	    ----------
	    report : dict
	        Record holding the parallel lists ``fastq_ftp`` (URLs),
	        ``fastq_bytes`` (sizes) and ``fastq_md5`` (expected digests).
	    dest_dir : str
	        Directory the files are written to.

	    Returns
	    -------
	    list of str
	        Local path of each file, in the order of the URL list.

	    Notes
	    -----
	    A digest mismatch is reported through ``warnings.warn``; the file is
	    kept and the remaining downloads continue.
	    """
	    dest_paths = []
	    for url, filesize, filemd5 in zip(
	        report["fastq_ftp"], report["fastq_bytes"], report["fastq_md5"]
	    ):
	        # The report may list bare host/path strings without a scheme.
	        if not url.startswith("ftp://"):
	            url = "ftp://" + url
	        filename = urlsplit(url).path.split("/")[-1]
	        outpath = op.join(dest_dir, filename)
	        dest_paths.append(outpath)

	        with closing(urlopen(url)) as r, open(outpath, "wb") as f:
	            checksum = _copyfileobj_with_progbar(r, f, filesize, desc=outpath)
	            if checksum != filemd5:
	                warnings.warn(
	                    f"Checksum mismatch for {filename}: {checksum} != {filemd5}"
	                )
	    return dest_paths


	def download_fastq(run_accession, dest_dir=""):
	    """
	    Download FASTQ files for a sequencing run from ENA.

	    Sequencing runs may be from NCBI-SRA (SRR), EMBL-SRA (ERR), DDBJ-SRA (DRR).

	    Parameters
	    ----------
	    run_accession : str
	        Sequencing run accession, having format (E|D|S)RR[0-9]{6,}.

	    dest_dir: str, optional
	        Destination directory for download. Default is the cwd.

	    Returns
	    -------
	    dict or list of dict
	        Download metadata (sample name, urls, file sizes and md5 hashes),
	        with the local paths added under ``fastq_download``. A report
	        holding exactly one record is returned as that record; otherwise
	        the report is returned unchanged in shape.

	    Examples
	    --------
	    >>> download_fastq('SRR001030', '/tmp')
	    {'sample_accession': 'SAMN00000119',
	     'run_accession': 'SRR001030',
	     'sample_title': 'Generic sample from Homo sapiens',
	     'fastq_ftp': ['ftp.sra.ebi.ac.uk/vol1/fastq/SRR001/SRR001030/SRR001030.fastq.gz'],
	     'fastq_md5': ['56ad9495ef258a7fd589ef384130797f'],
	     'fastq_bytes': [19778064],
	     'fastq_download': ['/tmp/SRR001030.fastq.gz']}

	    """
	    report = _get_run_report(run_accession)
	    for record in report:
	        print("Downloading:", record["sample_title"])
	        download_paths = _download_fastq(record, dest_dir)
	        record["fastq_download"] = download_paths
	    # Unwrap a one-element report so the common case yields a plain dict.
	    if isinstance(report, list) and len(report) == 1:
	        report = report[0]
	    return report


	if __name__ == "__main__":
	    # Command-line entry point: download the FASTQ files of one accession
	    # and optionally create symlinks named after the sample title.
	    import argparse

	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "accession", help="Sequencing run accession, having format (E|D|S)RR[0-9]{6,}."
	    )
	    parser.add_argument(
	        "-d", "--dest-dir", default="", help="Specify the download directory."
	    )
	    parser.add_argument(
	        "-s", "--symlink-title",
	        action="store_true",
	        help="Create (hopefully) informative symlinks from the sample's title.",
	    )
	    args = parser.parse_args()

	    report = download_fastq(args.accession, args.dest_dir)

	    if args.symlink_title:
	        # download_fastq returns a bare record for a single-record report
	        # and the whole report otherwise; normalise so both are handled.
	        records = report if isinstance(report, list) else [report]
	        for record in records:
	            # A path separator inside the title would be read as a
	            # directory component of the link path.
	            sample_title = record["sample_title"].replace(os.sep, "_")
	            for i, download_path in enumerate(record["fastq_download"]):
	                dirpath, filename = op.split(download_path)

	                # Keep the read-side marker and extension, e.g.
	                # "_1.fastq.gz"; fall back to the file's position when the
	                # name has none.
	                match = re.search(r"(_[12])?\.fastq\S*", filename)
	                side_ext = match.group(0) if match else "." + str(i + 1)

	                if side_ext.startswith("_"):
	                    side_ext = "." + side_ext[1:]

	                # The link is created next to the file, and a relative link
	                # target is resolved against the link's own directory, so
	                # the target must be the bare file name rather than
	                # download_path (which dangles for a relative --dest-dir).
	                os.symlink(filename, op.join(dirpath, sample_title + side_ext))