Last active
February 6, 2025 06:50
-
-
Save kcha/44ee07e042bc17c483c7ab8d2626293f to your computer and use it in GitHub Desktop.
Download SRA files from EBI via Aspera.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function | |
from six import iteritems | |
import sys | |
import os | |
import os.path | |
import re | |
import argparse | |
import pandas as pd | |
import subprocess | |
def getoptions():
    """Parse command-line arguments.

    Returns:
        argparse.Namespace with attributes: ebi_files, delim, sample_col,
        norename, test, ascp, ssh.
    """
    desc = """
Download datasets from EBI metafiles via Aspera

Requirements:
1) Aspera client installed (ascp)
2) Table of read metafiles from EBI browser

To obtain a metafile:
1) Go to https://www.ebi.ac.uk
2) Search for read files (e.g. PRJEB11960).
3) Click into the project page hosted by the European
   Nucleotide Archive
4) Ensure that the FASTQ files (FTP) column is shown
5) Click on "Select columns" and check "Experiment title"
6) Click on "TEXT" to download the table as a text file
7) Execute this script using the text file as input"""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=desc)
    parser.add_argument('ebi_files', metavar='FILES', nargs='+',
                        help="Input EBI metafiles. Specify one or more.")
    # Fixed typo in help text: "Deliminter" -> "Delimiter"
    parser.add_argument('-d', '--filedelim', default=".", dest="delim",
                        help="Delimiter for file name [%(default)s]")
    parser.add_argument('-s', '--sample_col',
                        choices=['experiment_title', 'sample_title'],
                        default='experiment_title',
                        help="Column to use to rename sample [%(default)s]")
    parser.add_argument('-R', '--norename', action='store_true',
                        dest='norename',
                        help='Do not rename files after downloading')
    parser.add_argument('-t', '--test', action='store_true', dest="test",
                        help="Test run")
    # os.path.expanduser('~') instead of os.environ['HOME']: does not raise
    # KeyError when HOME is unset and is portable across platforms.
    parser.add_argument('--ascp', dest='ascp',
                        default=os.path.join(os.path.expanduser('~'),
                                             '.aspera/connect/bin/ascp'),
                        help="Path of ascp command [%(default)s]")
    parser.add_argument('--ssh', dest='ssh',
                        default=os.path.join(
                            os.path.expanduser('~'),
                            '.aspera/connect/etc/asperaweb_id_dsa.openssh'),
                        help="Path of asperaweb_id_dsa.openssh [%(default)s]")
    args = parser.parse_args()
    return args
def parse_sample_name(title):
    """Derive a filesystem-friendly sample name from an experiment title.

    The title is expected to look like "<prefix>; <label>" or
    "<prefix>; <key>: <label>".  Whitespace and '&' characters in the
    chosen label are replaced with underscores.
    """
    label = title.split("; ")[1]
    pieces = label.split(": ")
    # Use the text after the colon when one is present, otherwise the
    # whole label.
    chosen = pieces[1] if len(pieces) > 1 else pieces[0]
    return re.sub(r"(\s|&)", "_", chosen)
def parse_ftp(element, sample, delim):
    """Convert a semicolon-separated list of EBI FTP URLs to Aspera URLs.

    Args:
        element: value of the fastq_ftp column, e.g.
            "ftp.sra.ebi.ac.uk/vol1/..._1.fastq.gz;ftp.sra.ebi.ac.uk/..."
        sample: sample name (unused here; kept for interface compatibility)
        delim: file-name delimiter (unused here; kept for interface
            compatibility)

    Returns:
        List of era-fasp Aspera URLs, one per non-empty FTP entry.
        An empty or all-empty input yields an empty list.
    """
    aspera_urls = []
    for ftp in element.split(";"):
        # str.split never yields None; it yields '' for empty input or a
        # trailing ';'.  The original checked `if m:` and `is not None`,
        # both of which were always true, so empty strings slipped through.
        if not ftp:
            continue
        # Plain string replacement instead of re.sub with an unescaped
        # pattern, where each '.' incorrectly matched any character.
        aspera_urls.append(ftp.replace('ftp.sra.ebi.ac.uk/',
                                       'era-fasp@fasp.sra.ebi.ac.uk:'))
    return aspera_urls
def get_new_fq_name(fq, sample, delim):
    """Return (basename, sample-prefixed basename) for a fastq path/URL."""
    basename = os.path.basename(fq)
    return basename, delim.join([sample, basename])
def rename_fastq(fq, sample, delim, dryrun, new_fq_only=False):
    """Rename a downloaded fastq so its name is prefixed with the sample.

    Args:
        fq: path or URL of the downloaded fastq file.
        sample: sample name used as the new prefix.
        delim: delimiter placed between sample and original basename.
        dryrun: if true, only print the planned rename.
        new_fq_only: if true, return the new name without renaming.

    Returns:
        The new file name when new_fq_only is true, otherwise None.
    """
    # Same computation as get_new_fq_name, done inline.
    original_fq = os.path.basename(fq)
    new_fq = sample + delim + original_fq
    if new_fq_only:
        return new_fq
    if dryrun:
        print("%s -> %s" % (original_fq, new_fq))
        return
    if os.path.isfile(original_fq):
        os.rename(original_fq, new_fq)
        print("Renaming %s -> %s" % (original_fq, new_fq))
    elif not os.path.isfile(new_fq):
        print("%s does not exist" % original_fq)
def main():
    """Read EBI metafiles, download each fastq via Aspera, and rename it.

    For every row of every input metafile: derive a sample name from the
    configured column, convert the fastq_ftp URLs to era-fasp Aspera URLs,
    then download each URL with ascp (unless the renamed file already
    exists) and prefix the downloaded file with the sample name.
    """
    args = getoptions()
    # Maps aspera URL -> sample name; duplicate URLs collapse to one entry.
    urls = {}
    # Go through each file and extract URLs for download
    for textfile in args.ebi_files:
        # NOTE(review): pd.read_table is deprecated in newer pandas;
        # read_csv(sep='\t') is the modern equivalent.
        ebi = pd.read_table(textfile)
        for index, row in ebi.iterrows():
            # Get sample name
            try:
                if args.sample_col == 'experiment_title':
                    sample = parse_sample_name(row.experiment_title)
                else:
                    # NOTE(review): nesting reconstructed — the replace()
                    # sanitization appears to apply only to the
                    # sample_title branch; confirm against original.
                    sample = row.sample_title
                    sample = sample.replace(" ", "_")
                    sample = sample.replace(",", ".")
            except AttributeError:
                # Missing column on the row -> attribute access fails.
                print(("Error: Can't find experiment_title column"
                       " in file %s" % textfile),
                      file=sys.stderr)
                sys.exit(1)
            except IndexError as e:
                # Raised by parse_sample_name when the title has no "; ".
                print(str(e), file=sys.stderr)
                sys.exit(1)
            # Get experiment name
            try:
                sample_ftps = parse_ftp(row.fastq_ftp, sample, args.delim)
                assert len(sample_ftps) > 0
                # store sample name for each ftp url
                for link in sample_ftps:
                    urls[link] = sample
            except AttributeError:
                print(("Error: Can't find fastq_ftp column"
                       " in file %s" % textfile),
                      file=sys.stderr)
    assert len(urls) > 0
    # Download the files
    if args.test:
        print("~~~ DRY RUN ~~~", file=sys.stderr)
    count = 0
    for fq, sample in iteritems(urls):
        (og, new_fq) = get_new_fq_name(fq, sample, args.delim)
        # Skip downloads whose renamed file is already on disk.
        if not os.path.isfile(new_fq):
            # -k1: resume; -QTr: fair policy, no encryption, recursive;
            # -l 10000m: bandwidth cap; -P33001: Aspera SSH port.
            command = [args.ascp, '-k1', '-QTr', '-l', '10000m', '-L', '.',
                       '-P33001', '-i', args.ssh,
                       fq,
                       '.']
            print(' '.join(command), file=sys.stderr)
            if not args.test:
                try:
                    subprocess.check_call(command)
                    if not args.norename:
                        rename_fastq(fq, sample, args.delim, args.test)
                    count += 1
                except OSError:
                    # e.g. ascp binary not found at args.ascp.
                    print("Problem with downloading %s" % fq, file=sys.stderr)
            else:
                # Dry run: show what the rename would do, download nothing.
                if not args.norename:
                    rename_fastq(fq, sample, args.delim, args.test)
        else:
            print("Skipping %s because %s exists" % (os.path.basename(fq),
                                                     new_fq), file=sys.stderr)
    print("---------------------------------------------", file=sys.stderr)
    print("Downloaded %d/%d files" % (count, len(urls)), file=sys.stderr)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment