Last active
August 29, 2015 14:02
-
-
Save mscook/7e4584dab7ef08b98e3d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Copyright 2014 Beatson Laboratory Licensed under the
# Educational Community License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may
# obtain a copy of the License at
#
# http://www.osedu.org/licenses/ECL-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS"
# BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
"""
A tool to programmatically mass download data from the ENA
"""
import sys, argparse, urllib2, ftplib
def get_required_metadata(study_id):
    """
    Fetch a txt document containing useful data given a valid SRA/ENA study id

    :param study_id: a valid SRA/ENA study id
    :type study_id: string

    :returns: the txt formatted metadata info as a list of lines (header line
              first; a trailing empty string is present when the response
              ends with a newline)
    """
    # Field order matters: parse_meta_data() addresses columns by index
    fields = ("study_accession,secondary_study_accession,sample_accession,"
              "secondary_sample_accession,experiment_accession,run_accession,"
              "scientific_name,instrument_model,library_layout,fastq_ftp,"
              "fastq_galaxy,submitted_ftp,submitted_galaxy,col_tax_id,"
              "col_scientific_name,sra_ftp,sra_galaxy,experiment_title,"
              "fastq_md5")
    url = ("http://www.ebi.ac.uk/ena/data/warehouse/filereport?accession="
           "%s&result=read_run&fields=%s" % (study_id, fields))
    response = urllib2.urlopen(url)
    try:
        # Turn into a list of lines
        return response.read().split('\n')
    finally:
        # urllib2 handles are not context managers in Python 2; close
        # explicitly so the underlying socket is not leaked
        response.close()
def _register_read(sid, suffix, url, checksum, rename, md5sum):
    """
    Record one fastq file: append its Banzai renaming entry to *rename* and
    its md5 entry to *md5sum*. Exits the program when *url* does not contain
    the expected filename *suffix* (e.g. "_1.fastq.gz").
    """
    if suffix not in url:
        print("Read naming does not follow expected pattern")
        sys.exit(1)
    fname = url.split('/')[-1]
    rename.append(sid + suffix + "," + fname)
    md5sum.append(checksum + "\t" + fname)


def parse_meta_data(args, metadata_list):
    """
    Extract out strain ID & ftp_links from a metadata list.

    Column indices follow the field order requested by
    get_required_metadata(): column 6 is scientific_name, 8 is
    library_layout, 9 is fastq_ftp, -2 is experiment_title and -1 is
    fastq_md5.

    :param args: argparse namespace; args.single enables single-end records
    :param metadata_list: a metadata list as returned by
                          get_required_metadata(study_id)
    :type metadata_list: list

    :returns: 3 lists. First is a Banzai renaming file, 2nd is md5 hashes
              and 3rd is a list of FTP files to download
    """
    ftp_urls, rename, md5sum = [], [], []
    # Strip off the header line, skip the blank last line
    for record in metadata_list[1:-1]:
        cur = record.split('\t')
        # Strain ID = last whitespace-separated token of scientific_name.
        # NOTE(review): might be specific to this dataset - please check
        sid = cur[6].split()[-1]
        if cur[8] == 'PAIRED':
            r1, r2 = cur[9].split(';')           # forward/reverse FTP URLs
            c1, c2 = cur[-1].strip().split(';')  # matching md5 checksums
            _register_read(sid, "_1.fastq.gz", r1, c1, rename, md5sum)
            _register_read(sid, "_2.fastq.gz", r2, c2, rename, md5sum)
            ftp_urls.append(r1)
            ftp_urls.append(r2)
        elif cur[8] == 'SINGLE' and args.single and ';' not in cur[-1].strip():
            # Single-end: strain ID comes from the experiment title instead
            sid = cur[-2].split(" - ")[-1]
            r1 = cur[9]
            c1 = cur[-1].strip()
            _register_read(sid, ".fastq.gz", r1, c1, rename, md5sum)
            ftp_urls.append(r1)
        else:
            print("Something is wrong... Skipping")
            print(cur)
    return rename, md5sum, ftp_urls
def print_data_stats(args, urls):
    """
    Report the number of read sets (strains) that will be downloaded.

    :param args: argparse namespace; args.single says whether the URL list
                 holds single-end files (one per strain) or pairs
    :param urls: list of ftp urls
    """
    # Assume paired unless told otherwise
    if not args.single:
        # Paired-end: two fastq files per strain
        print("Will download paired reads for %i strains" % (len(urls) // 2))
    else:
        # BUG FIX: the original built this message but never printed it
        print("Will download single end reads for %i strains" % len(urls))
def build_files(rename, md5):
    """
    Write the renaming and checksum files to the working directory.

    Creates rename.dat (Banzai renaming entries) and checksums.md5
    (md5sum-style checksum entries), one entry per line.

    :param rename: Banzai renaming entries (strings)
    :param md5: md5 checksum entries (strings)
    """
    with open("rename.dat", 'w') as rename_fh:
        rename_fh.writelines(entry + "\n" for entry in rename)
    with open("checksums.md5", 'w') as md5_fh:
        md5_fh.writelines(entry + "\n" for entry in md5)
def download_files(ftp_data):
    """
    Use FTP to connect and download all files in the given list.

    Each URL is split as server/dir/.../filename; the file is saved to the
    working directory under its original name.

    :param ftp_data: a list of ftp server paths (no scheme prefix)
    :type ftp_data: list
    """
    for url in ftp_data:
        parts = url.split('/')
        server = parts[0]
        location = '/'.join(parts[1:])
        fname = parts[-1]
        ftp = ftplib.FTP(server, 'anonymous', 'm.stantoncook@gmail.com')
        try:
            print("Getting %s " % (fname))
            # BUG FIX: fastq.gz is binary data - must open in binary mode
            with open(fname, 'wb') as out:
                ftp.retrbinary('RETR ' + location, out.write)
        finally:
            # BUG FIX: the original leaked one FTP connection per file
            ftp.close()
def core(args):
    """
    Drive the full workflow: fetch metadata, parse it, report, write the
    bookkeeping files and download the fastq data.

    :param args: an argparse object
    """
    study_metadata = get_required_metadata(args.study_id)
    rename_entries, checksums, ftp_urls = parse_meta_data(args, study_metadata)
    print_data_stats(args, ftp_urls)
    build_files(rename_entries, checksums)
    download_files(ftp_urls)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Bulk downloader given SRA/ENA study accession',
        epilog='Written by the Beatson Lab. http://www.beatsonlab.com')
    parser.add_argument("study_id", help="A valid SRA/ENA accession")
    parser.add_argument("--single", default=False, action='store_true',
                        help="SE reads")
    parser.set_defaults(func=core)
    args = parser.parse_args()
    try:
        args.func(args)
    except KeyboardInterrupt:
        # Allow a clean Ctrl-C without a traceback
        sys.exit(1)
    # BUG FIX: the original wrapped everything in a bare `except: pass`,
    # which swallowed every error (including argparse's SystemExit for bad
    # usage) and made the script exit 0 silently on any failure.
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment