Skip to content

Instantly share code, notes, and snippets.

@dwinter
Created April 30, 2014 23:39
Show Gist options
  • Save dwinter/87cf6bb1127fa8fbfba4 to your computer and use it in GitHub Desktop.
Save dwinter/87cf6bb1127fa8fbfba4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Fetch all FASTQ files associated with an SRA ID.

Usage:
    sra_fetch.py <sra-id>
"""
import xml.etree.ElementTree as ET
import urllib2
import sys
import time
# URL template for the ENA XML record of an accession; {0} is the SRA ID.
# NOTE(review): this looks like the legacy ENA browser endpoint -- confirm
# it is still served before relying on it.
base_xml_url = "http://www.ebi.ac.uk/ena/data/view/{0}&display=xml"
# URL template for a FASTQ file on the ENA FTP server:
# {0} = first six characters of the run ID, {1} = run ID, {2} = file name.
base_fastq_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/{0}/{1}/{2}"
# Number of bytes requested per read while streaming a download to disk.
CHUNK = 16 * 1024
def _expand_run_range(token):
    """Expand an accession range such as "SRR001-SRR003" into single IDs.

    Anything that is not a well-formed ascending range (same alphabetic
    prefix, numeric tails) is returned unchanged as a one-element list, so
    plain accessions pass straight through.
    """
    first, sep, last = token.partition("-")
    if not sep:
        return [token]
    prefix = first.rstrip("0123456789")
    first_num = first[len(prefix):]
    last_num = last[len(prefix):]
    well_formed = (
        prefix
        and last.startswith(prefix)
        and first_num.isdigit()
        and last_num.isdigit()
        and len(first_num) <= len(last_num)
        and int(first_num) <= int(last_num)
    )
    if not well_formed:
        return [token]
    # Keep the zero padding of the range's first member.
    width = len(first_num)
    return [
        prefix + str(number).zfill(width)
        for number in range(int(first_num), int(last_num) + 1)
    ]


def fetch_run_ids(SRA_ID):
    """Look up the run accessions that belong to an SRA/ENA record.

    Downloads the record's XML, finds the first XREF_LINK element whose
    first child reads "ENA-RUN" and parses the comma-separated ID list held
    in its second child. Entries written as a range ("SRR001-SRR003") are
    expanded into individual accessions.

    Args:
        SRA_ID: accession substituted into ``base_xml_url``.

    Returns:
        A ``(run_ids, is_paired)`` tuple: a list of run accession strings
        and a bool that is True when the record contains a PAIRED element.

    Raises:
        IndexError: if the record has no usable ENA-RUN cross-reference.
    """
    response = urllib2.urlopen(base_xml_url.format(SRA_ID))
    try:
        rec = ET.parse(response)
    finally:
        # Python 2 urllib2 responses are not context managers.
        response.close()

    run_links = [
        link for link in rec.findall(".//XREF_LINK")
        if len(link) > 1 and link[0].text == "ENA-RUN"
    ]
    if not run_links or not run_links[0][1].text:
        # Same exception type as the original bare [0] lookup, but with a
        # message that says what is actually missing.
        raise IndexError(
            "no ENA-RUN cross-reference found for {0}".format(SRA_ID)
        )

    run_ids = []
    for token in run_links[0][1].text.split(","):
        token = token.strip()
        if token:
            run_ids.extend(_expand_run_range(token))
    return run_ids, rec.find(".//PAIRED") is not None
def fetch_fastq(run_id, paired=True):
    """Download the FASTQ file(s) of one run into the current directory.

    Args:
        run_id: run accession, e.g. "SRR000001".
        paired: when True fetch ``<run_id>_1.fastq.gz`` and
            ``<run_id>_2.fastq.gz``; otherwise fetch ``<run_id>.fastq.gz``.

    Each file is streamed to disk in ``CHUNK``-sized reads, followed by a
    six second pause so consecutive downloads do not hammer the ENA server.
    Errors from ``urllib2.urlopen`` or from writing the file propagate to
    the caller.
    """
    if paired:
        files = ["{0}_{1}.fastq.gz".format(run_id, mate) for mate in (1, 2)]
    else:
        # The original omitted the "." and requested "<run_id>fastq.gz".
        files = [run_id + ".fastq.gz"]

    # ENA FTP layout: accessions longer than nine characters live in an
    # extra sub-directory named after the trailing characters, zero-padded
    # to three digits (e.g. SRR1234567 -> SRR123/007/SRR1234567).
    volume = run_id[:6]
    overflow = run_id[9:]
    if overflow:
        volume += "/" + overflow.zfill(3)

    for f in files:
        url = base_fastq_url.format(volume, run_id, f)
        # Parenthesised single argument prints identically under the
        # Python 2 print statement.
        print("Downloading " + f + "...")
        response = urllib2.urlopen(url)
        try:
            with open(f, "wb") as out:
                # read() returns an empty string at end of stream.
                for chunk in iter(lambda: response.read(CHUNK), b""):
                    out.write(chunk)
        finally:
            # Close the connection even if writing to disk fails.
            response.close()
        time.sleep(6)  # let's not DoS the ENA
def main():
    """Command-line entry point: download every run of the given SRA ID.

    Reads the accession from the first command-line argument, resolves it
    to its run IDs and fetches the FASTQ files for each run in turn. Exits
    with a usage message when no accession is supplied.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: sra_fetch.py <sra-id>")
    SRA_ID = sys.argv[1]
    ids, is_paired = fetch_run_ids(SRA_ID)
    for sid in ids:
        fetch_fastq(sid, is_paired)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment