kcha/download_ebi.py

## download_ebi.py
#!/usr/bin/env python

from __future__ import print_function
from six import iteritems
import sys
import os
import os.path
import re
import argparse
import pandas as pd
import subprocess


def getoptions(argv=None):
    """Build the command-line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) lets argparse read
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``ebi_files``, ``delim``,
        ``sample_col``, ``norename``, ``test``, ``ascp`` and ``ssh``.
    """
    desc = """
Download datasets from EBI metafiles via Aspera

Requirements:
  1) Aspera client installed (ascp)
  2) Table of read metafiles from EBI browser

To obtain a metafile:
  1) Go to https://www.ebi.ac.uk
  2) Search for read files (e.g. PRJEB11960).
  3) Click into the project page hosted by the European
     Nucleotide Archive
  4) Ensure that the FASTQ files (FTP) column is shown
  5) Click on "Select columns" and check "Experiment title"
  6) Click on "TEXT" to download the table as a text file
  7) Execute this script using the text file as input"""
    # expanduser() honours $HOME when it is set but, unlike
    # os.environ['HOME'], does not raise KeyError when it is missing.
    aspera_dir = os.path.expanduser('~') + '/.aspera/connect'

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=desc)
    parser.add_argument('ebi_files', metavar='FILES', nargs='+',
                        help="Input EBI metafiles. Specify one or more.")
    parser.add_argument('-d', '--filedelim', default=".", dest="delim",
                        help="Delimiter for file name [%(default)s]")
    parser.add_argument('-s', '--sample_col',
                        choices=['experiment_title', 'sample_title'],
                        default='experiment_title',
                        help="Column to use to rename sample [%(default)s]")
    parser.add_argument('-R', '--norename', action='store_true',
                        dest='norename',
                        help='Do not rename files after downloading')
    parser.add_argument('-t', '--test', action='store_true', dest="test",
                        help="Test run")
    parser.add_argument('--ascp', dest='ascp',
                        default=aspera_dir + '/bin/ascp',
                        help="Path of ascp command [%(default)s]")
    parser.add_argument('--ssh', dest='ssh',
                        default=aspera_dir + '/etc/asperaweb_id_dsa.openssh',
                        help="Path of asperaweb_id_dsa.openssh [%(default)s]")
    return parser.parse_args(argv)


def parse_sample_name(title):
    parts = title.split("; ")
    sample = parts[1].split(": ")
    if len(sample) > 1:
        sample = re.sub(r"(\s|&amp;)", "_", sample[1])
    else:
        sample = re.sub(r"(\s|&amp;)", "_", sample[0])
    return sample


def parse_ftp(element, sample, delim):
    """Convert a ``fastq_ftp`` cell into a list of Aspera source URLs.

    Args:
        element: String of FTP paths separated by ";".
        sample: Unused; kept so existing callers keep working.
        delim: Unused; kept so existing callers keep working.

    Returns:
        One URL per non-empty path, in input order, with the
        "ftp.sra.ebi.ac.uk/" prefix rewritten to
        "era-fasp@fasp.sra.ebi.ac.uk:".

    Raises:
        AttributeError: if ``element`` is not a string (has no ``split``).
    """
    aspera_urls = []
    for ftp in element.split(";"):
        ftp = ftp.strip()
        # A trailing or doubled ";" yields empty fragments; passing those on
        # would turn into download commands with an empty source.
        if not ftp:
            continue
        # Plain substring replacement: the host name is a literal, and a
        # regex would treat its dots as wildcards.
        aspera_urls.append(ftp.replace('ftp.sra.ebi.ac.uk/',
                                       'era-fasp@fasp.sra.ebi.ac.uk:'))
    return aspera_urls


def get_new_fq_name(fq, sample, delim):
    """Return the file name of ``fq`` and its sample-prefixed variant.

    Returns:
        Tuple ``(original, renamed)`` where ``original`` is the last path
        component of ``fq`` and ``renamed`` is ``sample``, ``delim`` and
        ``original`` joined together in that order.
    """
    file_name = os.path.split(fq)[1]
    renamed = "".join((sample, delim, file_name))
    return file_name, renamed


def rename_fastq(fq, sample, delim, dryrun, new_fq_only=False):
    """Rename a downloaded FASTQ file so it carries the sample prefix.

    Args:
        fq: URL or path whose last component is the downloaded file name.
        sample: Sample name used as the prefix.
        delim: Text placed between the sample name and the file name.
        dryrun: When true, only print the planned rename.
        new_fq_only: When true, return the new name without touching
            the file system or printing anything.

    Returns:
        The new file name if ``new_fq_only`` is true, otherwise ``None``.
    """
    source_name, target_name = get_new_fq_name(fq, sample, delim)

    if new_fq_only:
        return target_name

    if dryrun:
        print("%s -> %s" % (source_name, target_name))
        return None

    if os.path.isfile(source_name):
        os.rename(source_name, target_name)
        print("Renaming %s -> %s" % (source_name, target_name))
        return None

    # Stay quiet when the target already exists: nothing is left to rename.
    if not os.path.isfile(target_name):
        print("%s does not exist" % source_name)
    return None


def main():
    """Download every FASTQ file listed in the given EBI metafiles.

    Reads each metafile with pandas, derives a sample name per row from the
    column selected with ``--sample_col``, maps every URL in the row's
    ``fastq_ftp`` cell to that sample, then runs ``ascp`` once per URL and
    (unless ``--norename`` is given) renames the result. With ``--test`` the
    commands and renames are only printed.

    Exits with status 1 when a sample name cannot be derived or when no
    URL was collected at all.
    """
    args = getoptions()
    urls = {}

    # Go through each file and extract URLs for download
    for textfile in args.ebi_files:
        ebi = pd.read_table(textfile)

        for _, row in ebi.iterrows():
            # Get sample name
            try:
                if args.sample_col == 'experiment_title':
                    sample = parse_sample_name(row.experiment_title)
                else:
                    sample = row.sample_title
                sample = sample.replace(" ", "_").replace(",", ".")
            except AttributeError:
                # Name the column that was actually requested, not always
                # experiment_title.
                print("Error: Can't find %s column in file %s"
                      % (args.sample_col, textfile), file=sys.stderr)
                sys.exit(1)
            except IndexError as e:
                print(str(e), file=sys.stderr)
                sys.exit(1)

            # Get the download URLs of this row
            try:
                sample_ftps = parse_ftp(row.fastq_ftp, sample, args.delim)
            except AttributeError:
                print(("Error: Can't find fastq_ftp column"
                       " in file %s" % textfile),
                      file=sys.stderr)
                continue

            # Explicit check instead of assert, which is stripped under -O.
            if not sample_ftps:
                print("Error: No FASTQ URL for sample %s in file %s"
                      % (sample, textfile), file=sys.stderr)
                continue

            # store sample name for each ftp url
            for link in sample_ftps:
                urls[link] = sample

    if not urls:
        print("Error: No FASTQ URLs found in the input files",
              file=sys.stderr)
        sys.exit(1)

    # Download the files
    if args.test:
        print("~~~ DRY RUN ~~~", file=sys.stderr)

    count = 0

    for fq, sample in urls.items():
        new_fq = get_new_fq_name(fq, sample, args.delim)[1]

        if os.path.isfile(new_fq):
            print("Skipping %s because %s exists" % (os.path.basename(fq),
                                                     new_fq), file=sys.stderr)
            continue

        command = [args.ascp, '-k1', '-QTr', '-l', '10000m', '-L', '.',
                   '-P33001', '-i', args.ssh,
                   fq,
                   '.']
        print(' '.join(command), file=sys.stderr)

        if args.test:
            if not args.norename:
                rename_fastq(fq, sample, args.delim, args.test)
            continue

        try:
            subprocess.check_call(command)

            if not args.norename:
                rename_fastq(fq, sample, args.delim, args.test)

            count += 1
        except (OSError, subprocess.CalledProcessError):
            # OSError: ascp could not be started (or the rename failed).
            # CalledProcessError: ascp exited with a non-zero status.
            # Either way, report it and carry on with the remaining files.
            print("Problem with downloading %s" % fq, file=sys.stderr)

    print("---------------------------------------------", file=sys.stderr)
    print("Downloaded %d/%d files" % (count, len(urls)), file=sys.stderr)


# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python

	from __future__ import print_function
	from six import iteritems
	import sys
	import os
	import os.path
	import re
	import argparse
	import pandas as pd
	import subprocess


	def getoptions(argv=None):
		"""Build the command-line parser and parse the arguments.

		Args:
			argv: Optional list of argument strings to parse instead of the
				process arguments. ``None`` (the default) lets argparse read
				``sys.argv[1:]``.

		Returns:
			argparse.Namespace with the attributes ``ebi_files``, ``delim``,
			``sample_col``, ``norename``, ``test``, ``ascp`` and ``ssh``.
		"""
		# Assembled from a list so the help layout does not depend on how
		# this source file itself is indented.
		desc = "\n".join([
			"",
			"Download datasets from EBI metafiles via Aspera",
			"",
			"Requirements:",
			"  1) Aspera client installed (ascp)",
			"  2) Table of read metafiles from EBI browser",
			"",
			"To obtain a metafile:",
			"  1) Go to https://www.ebi.ac.uk",
			"  2) Search for read files (e.g. PRJEB11960).",
			"  3) Click into the project page hosted by the European",
			"     Nucleotide Archive",
			"  4) Ensure that the FASTQ files (FTP) column is shown",
			'  5) Click on "Select columns" and check "Experiment title"',
			'  6) Click on "TEXT" to download the table as a text file',
			"  7) Execute this script using the text file as input",
		])
		# expanduser() honours $HOME when it is set but, unlike
		# os.environ['HOME'], does not raise KeyError when it is missing.
		aspera_dir = os.path.expanduser('~') + '/.aspera/connect'

		parser = argparse.ArgumentParser(
			formatter_class=argparse.RawDescriptionHelpFormatter,
			description=desc)
		parser.add_argument(
			'ebi_files', metavar='FILES', nargs='+',
			help="Input EBI metafiles. Specify one or more.")
		parser.add_argument(
			'-d', '--filedelim', default=".", dest="delim",
			help="Delimiter for file name [%(default)s]")
		parser.add_argument(
			'-s', '--sample_col',
			choices=['experiment_title', 'sample_title'],
			default='experiment_title',
			help="Column to use to rename sample [%(default)s]")
		parser.add_argument(
			'-R', '--norename', action='store_true', dest='norename',
			help='Do not rename files after downloading')
		parser.add_argument(
			'-t', '--test', action='store_true', dest="test",
			help="Test run")
		parser.add_argument(
			'--ascp', dest='ascp',
			default=aspera_dir + '/bin/ascp',
			help="Path of ascp command [%(default)s]")
		parser.add_argument(
			'--ssh', dest='ssh',
			default=aspera_dir + '/etc/asperaweb_id_dsa.openssh',
			help="Path of asperaweb_id_dsa.openssh [%(default)s]")
		return parser.parse_args(argv)


	def parse_sample_name(title):
		"""Derive a file-name-friendly sample name from an experiment title.

		The title is split on "; " and its second field is used. If that
		field contains ": ", the piece right after the first ": " is kept;
		otherwise the whole field is kept. Every whitespace character and
		every "&" in the kept text is replaced by an underscore.

		Raises:
			IndexError: if the title has no "; " separator.
		"""
		second_field = title.split("; ")[1]
		pieces = second_field.split(": ")
		chosen = pieces[1] if len(pieces) > 1 else pieces[0]
		# The alternation bar must be unescaped: with "\|" the pattern only
		# matched whitespace followed by a literal "|&".
		return re.sub(r"(\s|&)", "_", chosen)


	def parse_ftp(element, sample, delim):
		"""Convert a ``fastq_ftp`` cell into a list of Aspera source URLs.

		Args:
			element: String of FTP paths separated by ";".
			sample: Unused; kept so existing callers keep working.
			delim: Unused; kept so existing callers keep working.

		Returns:
			One URL per non-empty path, in input order, with the
			"ftp.sra.ebi.ac.uk/" prefix rewritten to
			"era-fasp@fasp.sra.ebi.ac.uk:".

		Raises:
			AttributeError: if ``element`` is not a string.
		"""
		aspera_urls = []
		for ftp in element.split(";"):
			ftp = ftp.strip()
			# A trailing or doubled ";" yields empty fragments; skip them so
			# no download is attempted with an empty source.
			if not ftp:
				continue
			# Plain substring replacement: the host name is a literal, and
			# a regex would treat its dots as wildcards.
			aspera_urls.append(ftp.replace('ftp.sra.ebi.ac.uk/',
				'era-fasp@fasp.sra.ebi.ac.uk:'))
		return aspera_urls


	def get_new_fq_name(fq, sample, delim):
		"""Return the file name of ``fq`` and its sample-prefixed variant.

		Returns:
			Tuple ``(original, renamed)`` where ``original`` is the last
			path component of ``fq`` and ``renamed`` is
			``sample + delim + original``.
		"""
		original_fq = os.path.basename(fq)
		new_fq = sample + delim + original_fq
		return (original_fq, new_fq)


	def rename_fastq(fq, sample, delim, dryrun, new_fq_only=False):
		"""Rename a downloaded FASTQ file so it carries the sample prefix.

		Args:
			fq: URL or path whose last component is the downloaded file name.
			sample: Sample name used as the prefix.
			delim: Text placed between the sample name and the file name.
			dryrun: When true, only print the planned rename.
			new_fq_only: When true, return the new name without touching
				the file system or printing anything.

		Returns:
			The new file name if ``new_fq_only`` is true, otherwise ``None``.
		"""
		(original_fq, new_fq) = get_new_fq_name(fq, sample, delim)

		if new_fq_only:
			return new_fq

		if dryrun:
			print("%s -> %s" % (original_fq, new_fq))
		else:
			if os.path.isfile(original_fq):
				os.rename(original_fq, new_fq)
				print("Renaming %s -> %s" % (original_fq, new_fq))
			elif not os.path.isfile(new_fq):
				# Neither the download nor an already renamed copy is there.
				print("%s does not exist" % original_fq)


	def main():
		"""Download every FASTQ file listed in the given EBI metafiles.

		Reads each metafile with pandas, derives a sample name per row from
		the column selected with ``--sample_col``, maps every URL in the
		row's ``fastq_ftp`` cell to that sample, then runs ``ascp`` once per
		URL and (unless ``--norename`` is given) renames the result. With
		``--test`` the commands and renames are only printed.

		Exits with status 1 when a sample name cannot be derived or when no
		URL was collected at all.
		"""
		args = getoptions()
		urls = {}

		# Go through each file and extract URLs for download
		for textfile in args.ebi_files:
			ebi = pd.read_table(textfile)

			for _, row in ebi.iterrows():
				# Get sample name
				try:
					if args.sample_col == 'experiment_title':
						sample = parse_sample_name(row.experiment_title)
					else:
						sample = row.sample_title
					sample = sample.replace(" ", "_").replace(",", ".")
				except AttributeError:
					# Name the column that was actually requested.
					print("Error: Can't find %s column in file %s"
						% (args.sample_col, textfile), file=sys.stderr)
					sys.exit(1)
				except IndexError as e:
					print(str(e), file=sys.stderr)
					sys.exit(1)

				# Get the download URLs of this row
				try:
					sample_ftps = parse_ftp(row.fastq_ftp, sample, args.delim)
				except AttributeError:
					print(("Error: Can't find fastq_ftp column"
						" in file %s" % textfile),
						file=sys.stderr)
					continue

				# Explicit check instead of assert, which is stripped
				# under -O.
				if not sample_ftps:
					print("Error: No FASTQ URL for sample %s in file %s"
						% (sample, textfile), file=sys.stderr)
					continue

				# store sample name for each ftp url
				for link in sample_ftps:
					urls[link] = sample

		if not urls:
			print("Error: No FASTQ URLs found in the input files",
				file=sys.stderr)
			sys.exit(1)

		# Download the files
		if args.test:
			print("~~~ DRY RUN ~~~", file=sys.stderr)

		count = 0

		for fq, sample in urls.items():
			new_fq = get_new_fq_name(fq, sample, args.delim)[1]

			if os.path.isfile(new_fq):
				print("Skipping %s because %s exists"
					% (os.path.basename(fq), new_fq), file=sys.stderr)
				continue

			command = [args.ascp, '-k1', '-QTr', '-l', '10000m', '-L', '.',
				'-P33001', '-i', args.ssh,
				fq,
				'.']
			print(' '.join(command), file=sys.stderr)

			if args.test:
				if not args.norename:
					rename_fastq(fq, sample, args.delim, args.test)
				continue

			try:
				subprocess.check_call(command)

				if not args.norename:
					rename_fastq(fq, sample, args.delim, args.test)

				count += 1
			except (OSError, subprocess.CalledProcessError):
				# OSError: ascp could not be started (or the rename failed).
				# CalledProcessError: ascp exited with a non-zero status.
				# Either way, report it and carry on with the other files.
				print("Problem with downloading %s" % fq, file=sys.stderr)

		print("---------------------------------------------", file=sys.stderr)
		print("Downloaded %d/%d files" % (count, len(urls)), file=sys.stderr)


	# Run the downloader only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()