Skip to content

Instantly share code, notes, and snippets.

@bartaelterman
Created September 26, 2014 11:52
Show Gist options
  • Save bartaelterman/8f911ea385fe0e4d3715 to your computer and use it in GitHub Desktop.
Save bartaelterman/8f911ea385fe0e4d3715 to your computer and use it in GitHub Desktop.
fetch bacterial RNA-seq data
import sys
import os
ldir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(ldir)
from query_sra import SRAQueryInterface
q = SRAQueryInterface()
header = ['experiment', 'study', 'runs', 'title', 'submitter', 'species', 'taxon_id', 'library_name', 'library_strategy', 'library_source', 'library_selection', 'platform', 'instrument', 'filter_result']
retstart = 0
more_results_to_fetch = True
out = open('RNA_Seq_experiments.tsv', 'w+', 0)
out.write('\t'.join(header) + '\n')
while more_results_to_fetch:
result_uids = q.fetch_experiment_uuids(term='RNA-Seq bacteria', retmax=500, retstart=retstart)
if len(result_uids) > 0:
retstart += 500
for uid in result_uids:
try:
doc = q.fetch_exp_summary_doc(uid)
tree = q.get_experiment_tree(doc)
filter_result = q.passes_experiment_filter(tree)
data = q.get_experiment_data(tree, doc)
outline = [data['experiment'], data['study'], ':'.join(data['runs']), data['title'], data['submitter'], data['species'], str(data['taxon_id']), str(data['library_name']), data['library_strategy'], data['library_source'], data['library_selection'], data['platform'], data['instrument'], str(filter_result)]
out.write('\t'.join(outline) + '\n')
except Exception as e:
print e
dss = doc.find('DocumentSummarySet')
ds = dss.find('DocumentSummary')
e = ds.find('ExpXml')
try:
print e.text
except:
print '--could not print text'
out.close()
# Fetch all RNA-Seq datasets from bacteria and flag those from Illumina and a gut species
RNA_Seq_experiments.tsv: find_sra_exps.py update_SRA_run_dates.py query_sra.py test_query_sra.py
python find_sra_exps.py
python update_SRA_run_dates.py RNA_Seq_experiments.tsv RNA_Seq_experiments_date.tsv
mv RNA_Seq_experiments_date.tsv RNA_Seq_experiments.tsv
import requests
import re
import os
import csv
from lxml import etree
class SRAQueryInterface:
GENOMES_FILE = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/data/IMGv4.0_GIT_genomes.tsv'
def get_create_date_for_run(self, uid):
params = {'db': 'sra', 'id': uid, 'version': '2.0'}
experiment_summaries_result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=params)
experiment_summaries_doc = etree.fromstring(experiment_summaries_result.text.encode('utf-8'))
doc_summary = experiment_summaries_doc.getchildren()[0]
experiments = doc_summary.getchildren()[0]
runs = doc_summary.getchildren()[1]
date = None
for child in runs.iterchildren():
if child.tag == 'CreateDate':
date = child.text
return date
def fetch_experiment_uuids(self, term=None, retmax=20, retstart=0):
params = {'db': 'sra', 'term': term, 'retmax': retmax, 'retstart': retstart}
result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=params)
xml_result = etree.fromstring(result.text.encode('utf-8'))
experiment_ids = []
for child in xml_result.iterchildren():
if child.tag == 'IdList':
for idchild in child.iterchildren():
experiment_ids.append(idchild.text)
return experiment_ids
def _create_genomes_list(self):
self.genomes_list = []
with open(self.GENOMES_FILE) as f:
reader = csv.reader(f, delimiter='\t')
header = reader.next()
for row in reader:
genus = row[10]
species = row[11]
name1 = '{0} {1}'.format(genus, species)
name2 = '{0} {1}'.format(genus[0] + '.', species)
self.genomes_list.append(name1)
self.genomes_list.append(name2)
def is_gut_microbe(self, taxon_name):
if not 'genomes_list' in self.__dict__.keys():
self._create_genomes_list()
for gut_taxon in self.genomes_list:
if re.search(gut_taxon, taxon_name):
return True
return False
def fetch_all_RNA_experiments(self):
term = 'RNA'
retstart = 0
retmax = 500
more_to_fetch = True
experiment_uuids = []
while more_to_fetch:
uuids = fetch_uuid_experiments(term=term, retmax=retmax, retstart=retstart)
experiment_uuids += uuids
if len(uuids) == 0:
more_to_fetch = False
else:
retstart += 500
return experiment_uuids
def passes_experiment_filter(self, exp_tree):
experiment_is_ok = True
# should be rna seq
library_desctree = exp_tree.find('Library_descriptor')
source = library_desctree.find('LIBRARY_SOURCE').text
if source != 'TRANSCRIPTOMIC':
experiment_is_ok = False
# should be illumina
instr_tree = exp_tree.find('Instrument')
instr = instr_tree.values()[0]
if re.search('Illumina', instr) == None:
experiment_is_ok = False
# should be a gut microbe
species_tree = exp_tree.find('Organism')
try:
name = species_tree.attrib['ScientificName']
except KeyError, e:
name = species_tree.attrib['CommonName']
taxid = species_tree.attrib['taxid']
if not self.is_gut_microbe(name):
experiment_is_ok = False
return experiment_is_ok
def get_experiment_data(self, exp_tree, summary_tree):
# Title
title = exp_tree.find('Summary').find('Title').text
# Submitter center
submitter = exp_tree.find('Submitter').attrib['center_name']
# species
try:
name = exp_tree.find('Organism').attrib['ScientificName']
except KeyError, e:
name = exp_tree.find('Organism').attrib['CommonName']
taxid = exp_tree.find('Organism').attrib['taxid']
# library
library_desctree = exp_tree.find('Library_descriptor')
lib_name = library_desctree.find('LIBRARY_NAME').text
lib_strategy = library_desctree.find('LIBRARY_STRATEGY').text
lib_source = library_desctree.find('LIBRARY_SOURCE').text
lib_selection = library_desctree.find('LIBRARY_SELECTION').text
# platform
instr_tree = exp_tree.find('Instrument')
instr = instr_tree.values()[0]
platform = exp_tree.find('Summary').find('Platform').attrib['instrument_model']
#study accession
study_acc = exp_tree.find('Study').attrib['acc']
# experiment accession
exp_acc = exp_tree.find('Experiment').attrib['acc']
# runs
dss = summary_tree.find('DocumentSummarySet')
ds = dss.find('DocumentSummary')
runs = []
runs_tree = etree.fromstring('<Runs>' + ds.find('Runs').text + '</Runs>')
for child in runs_tree.iterchildren():
runs.append(child.attrib['acc'])
data = {
'experiment': exp_acc,
'study': study_acc,
'runs': runs,
'title': title,
'submitter': submitter,
'species': name,
'taxon_id': taxid,
'library_name': lib_name,
'library_strategy': lib_strategy,
'library_source': lib_source,
'library_selection': lib_selection,
'instrument': instr,
'platform': platform
}
return data
def fetch_exp_summary_doc(self, uuid):
params = {'db': 'sra', 'id': uuid, 'version': '2.0'}
print params
experiment_summaries_result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=params)
experiment_summaries_doc = etree.fromstring(experiment_summaries_result.text.encode('utf-8'))
return experiment_summaries_doc
def get_filtered_experiments_by_uuid(self, uuidlist):
outexperiments = []
for uuid in uuidlist:
experiment_summaries_doc = self.fetch_experiment_data
exp_tree = self.get_experiment_tree(experiment_summaries_doc)
if passes_experiment_filter(experiment_tree):
experiment_data = self.get_experiment_data(experiment_tree, experiment_summaries_doc)
created_date = ds.find('CreateDate').text
outexperiments.append(experiment_data)
return outexperiments
def get_experiment_tree(self, summary_tree):
dss = summary_tree.find('DocumentSummarySet')
ds = dss.find('DocumentSummary')
e = ds.find('ExpXml')
experiment_tree = etree.fromstring('<total>' + e.text + '</total>') # etree needs to have 1 root element
return experiment_tree
#!/usr/bin/python
import unittest
import sys
import os
ldir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(ldir)
from query_sra import SRAQueryInterface
from lxml import etree
class TestSRA_Query(unittest.TestCase):
def setUp(self):
self.test_data = [
{'accession': 'ERX014110', 'uid': ['86375'], 'create_date': '2011/06/13'},
{'accession': 'ERX014108', 'uid': ['86373'], 'create_date': '2011/06/13'}
]
self.q = SRAQueryInterface()
def _get_dummy_experiment_summary(self):
xml = """<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary sra 20130524//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130524/esummary_sra.dtd">
<eSummaryResult>
<DocumentSummarySet status="OK">
<DbBuild>Build140401-1723m.1</DbBuild>
<DocumentSummary uid="703993">
<ExpXml> &lt;Summary&gt;&lt;Title&gt;1135893704039 SAMN02711893 ROTAVIRUSA-RL-01-793 Experiment&lt;/Title&gt;&lt;Platform instrument_model=&quot;454 GS FLX Titanium&quot;&gt;LS454&lt;/Platform&gt;&lt;Statistics total_runs=&quot;1&quot; total_spots=&quot;0&quot; total_bases=&quot;0&quot; total_size=&quot;0&quot; load_done=&quot;true&quot; cluster_name=&quot;public&quot;/&gt;&lt;/Summary&gt;&lt;Submitter acc=&quot;SRA150000&quot; center_name=&quot;JCVI&quot; contact_name=&quot;Ewen Kirkness&quot; lab_name=&quot;&quot;/&gt;&lt;Experiment acc=&quot;SRX504714&quot; ver=&quot;1&quot; status=&quot;public&quot; name=&quot;1135893704039 SAMN02711893 ROTAVIRUSA-RL-01-793 Experiment&quot;/&gt;&lt;Study acc=&quot;SRP040739&quot; name=&quot;Human stool metatranscriptome&quot;/&gt;&lt;Organism taxid=&quot;408170&quot; ScientificName=&quot;human gut metagenome&quot;/&gt;&lt;Sample acc=&quot;SRS584396&quot; name=&quot;&quot;/&gt;&lt;Instrument LS454=&quot;454 GS FLX Titanium&quot;/&gt;&lt;Library_descriptor&gt;&lt;LIBRARY_NAME&gt;ROTAVIRUSA-RL-01-793&lt;/LIBRARY_NAME&gt;&lt;LIBRARY_STRATEGY&gt;RNA-Seq&lt;/LIBRARY_STRATEGY&gt;&lt;LIBRARY_SOURCE&gt;TRANSCRIPTOMIC&lt;/LIBRARY_SOURCE&gt;&lt;LIBRARY_SELECTION&gt;RANDOM&lt;/LIBRARY_SELECTION&gt;&lt;LIBRARY_LAYOUT&gt; &lt;SINGLE/&gt; &lt;/LIBRARY_LAYOUT&gt;&lt;/Library_descriptor&gt;&lt;Biosample id=&quot;2711893&quot; acc=&quot;SAMN02711893&quot; sample_id=&quot;742081&quot; sample_acc=&quot;SRS584396&quot;/&gt;&lt;Bioproject&gt;SRP040739&lt;/Bioproject&gt; </ExpXml>
<Runs> &lt;Run acc=&quot;SRR1210219&quot; total_spots=&quot;&quot; total_bases=&quot;&quot; load_done=&quot;&quot; is_public=&quot;&quot; cluster_name=&quot;&quot;/&gt; </Runs>
<ExtLinks> </ExtLinks>
<CreateDate>2014/03/31</CreateDate>
<UpdateDate>2014/03/31</UpdateDate>
</DocumentSummary>
</DocumentSummarySet>
</eSummaryResult>
"""
doc_sum = etree.fromstring(xml.encode('utf-8'))
return doc_sum
def _get_dummy_exp_tree(self):
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" ScientificName="human gut metagenome"/><Sample acc="SRS584415" name=""/><Instrument LS454="454 GS FLX Titanium"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> '
return etree.fromstring('<total>' + xml + '</total>')
def _get_dummy_gut_exp_tree(self):
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" ScientificName="Escherichia coli"/><Sample acc="SRS584415" name=""/><Instrument LS454="Illumina HiSeq"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> '
return etree.fromstring('<total>' + xml + '</total>')
def _get_dummy_exp_tree_no_scientifName(self):
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" CommonName="Escherichia coli"/><Sample acc="SRS584415" name=""/><Instrument LS454="Illumina HiSeq"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> '
return etree.fromstring('<total>' + xml + '</total>')
@unittest.skip('this one works')
def test_fetch_uid(self):
for test_data_point in self.test_data:
acc = test_data_point['accession']
uid = test_data_point['uid']
result_uid = self.q.fetch_experiment_uuids(term=acc)
self.assertEqual(result_uid, uid)
def test_get_create_date(self):
for test_data_point in self.test_data:
uid = test_data_point['uid'][0]
date = test_data_point['create_date']
result_date = self.q.get_create_date_for_run(uid)
self.assertEqual(result_date, date)
def test_fetch_exp_summary_doc(self):
for test_data_point in self.test_data:
uid = test_data_point['uid'][0]
result = self.q.fetch_exp_summary_doc(uid)
self.assertEqual(result.tag, 'eSummaryResult')
docSummSet = result.find('DocumentSummarySet')
self.assertEqual(type(docSummSet), etree._Element)
def test_get_experiment_tree(self):
summ_tree = self._get_dummy_experiment_summary()
tree = self.q.get_experiment_tree(summ_tree)
expected_tree = self._get_dummy_exp_tree()
tree_children_tags = [x.tag for x in tree.getchildren()]
exp_tree_children_tags = [x.tag for x in expected_tree.getchildren()]
self.assertTrue(tree_children_tags == exp_tree_children_tags)
def test_passes_experiment_filter(self):
tree = self._get_dummy_exp_tree()
self.assertFalse(self.q.passes_experiment_filter(tree))
tree = self._get_dummy_gut_exp_tree()
self.assertTrue(self.q.passes_experiment_filter(tree))
tree = self._get_dummy_exp_tree_no_scientifName()
self.assertTrue(self.q.passes_experiment_filter(tree))
def test_is_gut_microbe(self):
self.assertTrue(self.q.is_gut_microbe('E. coli'))
self.assertTrue(self.q.is_gut_microbe('Escherichia coli'))
self.assertFalse(self.q.is_gut_microbe('Staphylococcus aureus'))
def test_get_experiment_data(self):
docsumset = self._get_dummy_experiment_summary()
tree = self._get_dummy_exp_tree()
expected_data = {
'experiment': 'SRX504733',
'study': 'SRP040739',
'runs': ['SRR1210219'],
'title': '1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment',
'submitter': 'JCVI',
'create_date': '',
'species': 'human gut metagenome',
'taxon_id': '408170',
'library_name': 'ROTAVIRUSA-RL-01-793',
'library_strategy': 'RNA-Seq',
'library_source': 'TRANSCRIPTOMIC',
'library_selection': 'RANDOM',
'instrument': '454 GS FLX Titanium',
'platform': '454 GS FLX Titanium',
'reference_genome': ''
}
data = self.q.get_experiment_data(tree, docsumset)
self.assertEqual(expected_data['experiment'], data['experiment'])
self.assertEqual(expected_data['study'], data['study'])
self.assertEqual(expected_data['runs'], data['runs'])
self.assertEqual(expected_data['title'], data['title'])
self.assertEqual(expected_data['submitter'], data['submitter'])
self.assertEqual(expected_data['create_date'], data['create_date'])
self.assertEqual(expected_data['species'], data['species'])
self.assertEqual(expected_data['taxon_id'], data['taxon_id'])
self.assertEqual(expected_data['library_name'], data['library_name'])
self.assertEqual(expected_data['library_strategy'], data['library_strategy'])
self.assertEqual(expected_data['library_source'], data['library_source'])
self.assertEqual(expected_data['library_selection'], data['library_selection'])
self.assertEqual(expected_data['instrument'], data['instrument'])
self.assertEqual(expected_data['platform'], data['platform'])
tree = self._get_dummy_exp_tree_no_scientifName()
data = self.q.get_experiment_data(tree, docsumset)
self.assertEqual(data['species'], 'Escherichia coli')
self.assertEqual(data['reference_genome'], 'Escherichia coli')
suite = unittest.TestLoader().loadTestsFromTestCase(TestSRA_Query)
unittest.TextTestRunner(verbosity=2).run(suite)
#!/usr/bin/python
"""
This script will fetch the creation date of SRA experiments using the NCBI API.
The input file should be a tab delimited file, with the experiment id in the first
column, and the creation date in the last.
If a creation date is already documented on a row, the row will be skipped to
reduce the number of API calls.
"""
import sys
import os
import csv
import time
sys.path.append(os.path.dirname(os.path.realpath(__file__)))
import query_sra
TESTING = False ;# set to True if you want to mock the calls to the NCBI API
def fetch_experiment_uuids_mock(term=None):
""" Function used for testing. It mocks the query_sra function `fetch_experiment_uuids`. """
return 'mocked_uid'
def get_create_date_for_run_mock(uid):
""" Function used for testing. It mocks the query_sra function `get_create_date_for_run`. """
return 'mocked_date'
def mock_functions(q):
""" Function used for testing. It replaces the query_sra functions by their mock functions. """
q.fetch_experiment_uuids = fetch_experiment_uuids_mock
q.get_create_date_for_run = get_create_date_for_run_mock
def check_arguments():
if len(sys.argv) != 3:
print 'usage: ./update_SRA_run_dates.py <infile> <outfile>'
sys.exit(-1)
return sys.argv[1:]
def is_a_date(indate):
""" Check whether the given indate is a date of format %Y/%m/%d.
Returns True if the string matches that pattern, False if it doesn't.
"""
try:
time.strptime(indate, '%Y/%m/%d')
returnvalue = True
except ValueError, e:
returnvalue = False
return returnvalue
def main():
infile, outfile = check_arguments()
q = query_sra.SRAQueryInterface()
if TESTING:
mock_functions(q)
with open(infile) as f, open(outfile, 'w+') as out:
reader = csv.reader(f, delimiter='\t')
header = reader.next()
out.write('\t'.join(header) + '\n')
for row in reader:
date = row[-1]
if not is_a_date(date):
accession = row[0]
uid = q.fetch_experiment_uuids(term=accession)
try:
created_date = q.get_create_date_for_run(uid)
except:
print 'Could not fetch date for row: {0}'.format(row)
sys.exit(-1)
row[-1] = created_date
out.write('\t'.join(row) + '\n')
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment