Skip to content

Instantly share code, notes, and snippets.

@kcha
Last active February 6, 2025 06:50
Show Gist options
  • Save kcha/44ee07e042bc17c483c7ab8d2626293f to your computer and use it in GitHub Desktop.
Save kcha/44ee07e042bc17c483c7ab8d2626293f to your computer and use it in GitHub Desktop.
Download SRA files from EBI via Aspera.
#!/usr/bin/env python
from __future__ import print_function
from six import iteritems
import sys
import os
import os.path
import re
import argparse
import pandas as pd
import subprocess
def getoptions(argv=None):
    """Parse the command-line options of the EBI/Aspera downloader.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``ebi_files``, ``delim``,
        ``sample_col``, ``norename``, ``test``, ``ascp`` and ``ssh``.
    """
    desc = """
Download datasets from EBI metafiles via Aspera
Requirements:
1) Aspera client installed (ascp)
2) Table of read metafiles from EBI browser
To obtain a metafile:
1) Go to https://www.ebi.ac.uk
2) Search for read files (e.g. PRJEB11960).
3) Click into the project page hosted by the European
Nucleotide Archive
4) Ensure that the FASTQ files (FTP) column is shown
5) Click on "Select columns" and check "Experiment title"
6) Click on "TEXT" to download the table as a text file
7) Execute this script using the text file as input"""
    # expanduser() instead of os.environ['HOME']: building the defaults
    # must not raise KeyError when HOME is not set in the environment.
    aspera_dir = os.path.join(os.path.expanduser('~'), '.aspera', 'connect')

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=desc)
    parser.add_argument('ebi_files', metavar='FILES', nargs='+',
                        help="Input EBI metafiles. Specify one or more.")
    parser.add_argument('-d', '--filedelim', default=".", dest="delim",
                        help="Delimiter for file name [%(default)s]")
    parser.add_argument('-s', '--sample_col',
                        choices=['experiment_title', 'sample_title'],
                        default='experiment_title',
                        help="Column to use to rename sample [%(default)s]")
    parser.add_argument('-R', '--norename', action='store_true',
                        dest='norename',
                        help='Do not rename files after downloading')
    parser.add_argument('-t', '--test', action='store_true', dest="test",
                        help="Test run")
    parser.add_argument('--ascp', dest='ascp',
                        default=os.path.join(aspera_dir, 'bin', 'ascp'),
                        help="Path of ascp command [%(default)s]")
    parser.add_argument('--ssh', dest='ssh',
                        default=os.path.join(aspera_dir, 'etc',
                                             'asperaweb_id_dsa.openssh'),
                        help="Path of asperaweb_id_dsa.openssh [%(default)s]")
    return parser.parse_args(argv)
def parse_sample_name(title):
    """Derive a sample name from an experiment title string.

    The title is split on "; " and only its second field is used. If that
    field contains ": ", the piece right after the first ": " is taken;
    otherwise the whole field is. Every whitespace character and every
    "&" in the chosen piece is replaced by an underscore.

    Raises:
        IndexError: if the title has no "; " separator (no second field).
    """
    second_field = title.split("; ")[1]
    pieces = second_field.split(": ")
    chosen = pieces[1] if len(pieces) > 1 else pieces[0]
    return re.sub(r"(\s|&)", "_", chosen)
def parse_ftp(element, sample, delim):
    """Turn a ';'-separated list of EBI FTP paths into Aspera sources.

    Args:
        element: String holding one or more FTP paths separated by ';'.
        sample: Unused; kept so existing callers keep working.
        delim: Unused; kept so existing callers keep working.

    Returns:
        List of strings, one per non-empty path, in which the literal
        'ftp.sra.ebi.ac.uk/' has been replaced by
        'era-fasp@fasp.sra.ebi.ac.uk:'. Empty entries (an empty field or
        a trailing ';') are dropped instead of yielding an empty URL.

    Raises:
        AttributeError: if ``element`` is not a string (has no ``split``).
    """
    aspera_urls = []
    for ftp in element.split(";"):
        ftp = ftp.strip()
        if not ftp:
            # str.split always returns at least one item, so an empty
            # field would otherwise produce a bogus '' download source.
            continue
        # Literal replacement: the host name is not a regular expression,
        # its dots must only match dots.
        aspera_urls.append(ftp.replace('ftp.sra.ebi.ac.uk/',
                                       'era-fasp@fasp.sra.ebi.ac.uk:'))
    return aspera_urls
def get_new_fq_name(fq, sample, delim):
    """Return ``(basename, renamed)`` for a FASTQ path or URL.

    ``basename`` is the final path component of ``fq``; ``renamed`` is
    that basename prefixed with ``sample`` and ``delim``.
    """
    basename = os.path.basename(fq)
    renamed = "".join([sample, delim, basename])
    return (basename, renamed)
def rename_fastq(fq, sample, delim, dryrun, new_fq_only=False):
    """Rename a downloaded FASTQ file in the current directory.

    The source and target file names come from ``get_new_fq_name``.

    Args:
        fq: Path or URL of the FASTQ file; only its basename is used.
        sample: Sample name used as the prefix of the new file name.
        delim: Delimiter placed between the sample name and the basename.
        dryrun: When true, only print the planned "old -> new" mapping.
        new_fq_only: When true, return the new file name and do nothing
            else.

    Returns:
        The new file name if ``new_fq_only`` is true, otherwise ``None``.
    """
    source_name, target_name = get_new_fq_name(fq, sample, delim)

    if new_fq_only:
        return target_name

    if dryrun:
        print("%s -> %s" % (source_name, target_name))
        return None

    if os.path.isfile(source_name):
        os.rename(source_name, target_name)
        print("Renaming %s -> %s" % (source_name, target_name))
    elif not os.path.isfile(target_name):
        # Neither the downloaded file nor an already-renamed copy exists.
        print("%s does not exist" % source_name)
    return None
def _collect_urls(ebi_files, sample_col, delim):
    """Map every Aspera URL found in the EBI metafiles to its sample name.

    Exits with status 1 when the sample column is missing or a sample
    name cannot be derived. Files without a ``fastq_ftp`` column and rows
    without FASTQ URLs are reported on stderr and skipped.
    """
    urls = {}
    for textfile in ebi_files:
        ebi = pd.read_table(textfile)

        # Check the columns once per file instead of relying on an
        # AttributeError raised for every row.
        if sample_col not in ebi.columns:
            print("Error: Can't find %s column in file %s"
                  % (sample_col, textfile), file=sys.stderr)
            sys.exit(1)
        if 'fastq_ftp' not in ebi.columns:
            print("Error: Can't find fastq_ftp column in file %s" % textfile,
                  file=sys.stderr)
            continue

        for index, row in ebi.iterrows():
            # Get sample name
            title = row[sample_col]
            if pd.isnull(title):
                print("Error: Empty %s in row %s of file %s"
                      % (sample_col, index, textfile), file=sys.stderr)
                sys.exit(1)
            try:
                if sample_col == 'experiment_title':
                    sample = parse_sample_name(title)
                else:
                    sample = title
                sample = sample.replace(" ", "_")
                sample = sample.replace(",", ".")
            except AttributeError:
                # Raised when the cell is not a string (e.g. a number).
                print("Error: %s value %r in file %s is not text"
                      % (sample_col, title, textfile), file=sys.stderr)
                sys.exit(1)
            except IndexError:
                # parse_sample_name needs a second '; '-separated field.
                print("Error: Can't parse a sample name from %r in file %s"
                      " (expected at least two '; '-separated fields)"
                      % (title, textfile), file=sys.stderr)
                sys.exit(1)

            # Get the download URLs; an empty cell is read as NaN by pandas.
            ftp_field = row['fastq_ftp']
            if pd.isnull(ftp_field):
                sample_ftps = []
            else:
                sample_ftps = [link for link
                               in parse_ftp(ftp_field, sample, delim)
                               if link]
            if not sample_ftps:
                print("Warning: no FASTQ URLs for sample %s in file %s"
                      % (sample, textfile), file=sys.stderr)
                continue

            # store sample name for each ftp url
            for link in sample_ftps:
                urls[link] = sample
    return urls


def _download_urls(urls, args):
    """Fetch each URL with ascp and return how many downloads succeeded.

    On a test run (``args.test``) the ascp command and the planned rename
    are only printed. URLs whose renamed target file already exists are
    skipped.
    """
    count = 0
    for fq, sample in iteritems(urls):
        new_fq = get_new_fq_name(fq, sample, args.delim)[1]
        if os.path.isfile(new_fq):
            print("Skipping %s because %s exists" % (os.path.basename(fq),
                                                     new_fq), file=sys.stderr)
            continue

        command = [args.ascp, '-k1', '-QTr', '-l', '10000m', '-L', '.',
                   '-P33001', '-i', args.ssh,
                   fq,
                   '.']
        print(' '.join(command), file=sys.stderr)

        if args.test:
            if not args.norename:
                rename_fastq(fq, sample, args.delim, args.test)
            continue

        try:
            subprocess.check_call(command)
        except (OSError, subprocess.CalledProcessError) as err:
            # OSError: ascp could not be started. CalledProcessError: ascp
            # exited non-zero. Either way report it and try the next file.
            print("Problem with downloading %s: %s" % (fq, err),
                  file=sys.stderr)
            continue
        count += 1

        if not args.norename:
            try:
                rename_fastq(fq, sample, args.delim, args.test)
            except OSError as err:
                print("Problem with renaming %s: %s" % (fq, err),
                      file=sys.stderr)
    return count


def main():
    """Download every FASTQ file listed in the given EBI metafiles.

    Reads the command-line options, collects the Aspera URLs with their
    sample names, downloads each file with ascp and, unless disabled,
    renames it to ``<sample><delim><original name>``. Progress and the
    final summary are written to stderr.
    """
    args = getoptions()

    # Go through each file and extract URLs for download
    urls = _collect_urls(args.ebi_files, args.sample_col, args.delim)
    if not urls:
        # Explicit exit rather than assert, which is stripped under -O.
        sys.exit("Error: no FASTQ URLs found in the input files")

    # Download the files
    if args.test:
        print("~~~ DRY RUN ~~~", file=sys.stderr)
    count = _download_urls(urls, args)

    print("---------------------------------------------", file=sys.stderr)
    print("Downloaded %d/%d files" % (count, len(urls)), file=sys.stderr)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment