Created
September 26, 2014 11:52
-
-
Save bartaelterman/8f911ea385fe0e4d3715 to your computer and use it in GitHub Desktop.
fetch bacterial RNA-seq data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
ldir = os.path.dirname(os.path.realpath(__file__)) | |
sys.path.append(ldir) | |
from query_sra import SRAQueryInterface | |
q = SRAQueryInterface() | |
header = ['experiment', 'study', 'runs', 'title', 'submitter', 'species', 'taxon_id', 'library_name', 'library_strategy', 'library_source', 'library_selection', 'platform', 'instrument', 'filter_result'] | |
retstart = 0 | |
more_results_to_fetch = True | |
out = open('RNA_Seq_experiments.tsv', 'w+', 0) | |
out.write('\t'.join(header) + '\n') | |
while more_results_to_fetch: | |
result_uids = q.fetch_experiment_uuids(term='RNA-Seq bacteria', retmax=500, retstart=retstart) | |
if len(result_uids) > 0: | |
retstart += 500 | |
for uid in result_uids: | |
try: | |
doc = q.fetch_exp_summary_doc(uid) | |
tree = q.get_experiment_tree(doc) | |
filter_result = q.passes_experiment_filter(tree) | |
data = q.get_experiment_data(tree, doc) | |
outline = [data['experiment'], data['study'], ':'.join(data['runs']), data['title'], data['submitter'], data['species'], str(data['taxon_id']), str(data['library_name']), data['library_strategy'], data['library_source'], data['library_selection'], data['platform'], data['instrument'], str(filter_result)] | |
out.write('\t'.join(outline) + '\n') | |
except Exception as e: | |
print e | |
dss = doc.find('DocumentSummarySet') | |
ds = dss.find('DocumentSummary') | |
e = ds.find('ExpXml') | |
try: | |
print e.text | |
except: | |
print '--could not print text' | |
out.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fetch all RNA-Seq datasets from bacteria and flag those from Illumina and a gut species | |
RNA_Seq_experiments.tsv: find_sra_exps.py update_SRA_run_dates.py query_sra.py test_query_sra.py | |
python find_sra_exps.py | |
python update_SRA_run_dates.py RNA_Seq_experiments.tsv RNA_Seq_experiments_date.tsv | |
mv RNA_Seq_experiments_date.tsv RNA_Seq_experiments.tsv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import re | |
import os | |
import csv | |
from lxml import etree | |
class SRAQueryInterface: | |
GENOMES_FILE = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + '/data/IMGv4.0_GIT_genomes.tsv' | |
def get_create_date_for_run(self, uid): | |
params = {'db': 'sra', 'id': uid, 'version': '2.0'} | |
experiment_summaries_result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=params) | |
experiment_summaries_doc = etree.fromstring(experiment_summaries_result.text.encode('utf-8')) | |
doc_summary = experiment_summaries_doc.getchildren()[0] | |
experiments = doc_summary.getchildren()[0] | |
runs = doc_summary.getchildren()[1] | |
date = None | |
for child in runs.iterchildren(): | |
if child.tag == 'CreateDate': | |
date = child.text | |
return date | |
def fetch_experiment_uuids(self, term=None, retmax=20, retstart=0): | |
params = {'db': 'sra', 'term': term, 'retmax': retmax, 'retstart': retstart} | |
result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=params) | |
xml_result = etree.fromstring(result.text.encode('utf-8')) | |
experiment_ids = [] | |
for child in xml_result.iterchildren(): | |
if child.tag == 'IdList': | |
for idchild in child.iterchildren(): | |
experiment_ids.append(idchild.text) | |
return experiment_ids | |
def _create_genomes_list(self): | |
self.genomes_list = [] | |
with open(self.GENOMES_FILE) as f: | |
reader = csv.reader(f, delimiter='\t') | |
header = reader.next() | |
for row in reader: | |
genus = row[10] | |
species = row[11] | |
name1 = '{0} {1}'.format(genus, species) | |
name2 = '{0} {1}'.format(genus[0] + '.', species) | |
self.genomes_list.append(name1) | |
self.genomes_list.append(name2) | |
def is_gut_microbe(self, taxon_name): | |
if not 'genomes_list' in self.__dict__.keys(): | |
self._create_genomes_list() | |
for gut_taxon in self.genomes_list: | |
if re.search(gut_taxon, taxon_name): | |
return True | |
return False | |
def fetch_all_RNA_experiments(self): | |
term = 'RNA' | |
retstart = 0 | |
retmax = 500 | |
more_to_fetch = True | |
experiment_uuids = [] | |
while more_to_fetch: | |
uuids = fetch_uuid_experiments(term=term, retmax=retmax, retstart=retstart) | |
experiment_uuids += uuids | |
if len(uuids) == 0: | |
more_to_fetch = False | |
else: | |
retstart += 500 | |
return experiment_uuids | |
def passes_experiment_filter(self, exp_tree): | |
experiment_is_ok = True | |
# should be rna seq | |
library_desctree = exp_tree.find('Library_descriptor') | |
source = library_desctree.find('LIBRARY_SOURCE').text | |
if source != 'TRANSCRIPTOMIC': | |
experiment_is_ok = False | |
# should be illumina | |
instr_tree = exp_tree.find('Instrument') | |
instr = instr_tree.values()[0] | |
if re.search('Illumina', instr) == None: | |
experiment_is_ok = False | |
# should be a gut microbe | |
species_tree = exp_tree.find('Organism') | |
try: | |
name = species_tree.attrib['ScientificName'] | |
except KeyError, e: | |
name = species_tree.attrib['CommonName'] | |
taxid = species_tree.attrib['taxid'] | |
if not self.is_gut_microbe(name): | |
experiment_is_ok = False | |
return experiment_is_ok | |
def get_experiment_data(self, exp_tree, summary_tree): | |
# Title | |
title = exp_tree.find('Summary').find('Title').text | |
# Submitter center | |
submitter = exp_tree.find('Submitter').attrib['center_name'] | |
# species | |
try: | |
name = exp_tree.find('Organism').attrib['ScientificName'] | |
except KeyError, e: | |
name = exp_tree.find('Organism').attrib['CommonName'] | |
taxid = exp_tree.find('Organism').attrib['taxid'] | |
# library | |
library_desctree = exp_tree.find('Library_descriptor') | |
lib_name = library_desctree.find('LIBRARY_NAME').text | |
lib_strategy = library_desctree.find('LIBRARY_STRATEGY').text | |
lib_source = library_desctree.find('LIBRARY_SOURCE').text | |
lib_selection = library_desctree.find('LIBRARY_SELECTION').text | |
# platform | |
instr_tree = exp_tree.find('Instrument') | |
instr = instr_tree.values()[0] | |
platform = exp_tree.find('Summary').find('Platform').attrib['instrument_model'] | |
#study accession | |
study_acc = exp_tree.find('Study').attrib['acc'] | |
# experiment accession | |
exp_acc = exp_tree.find('Experiment').attrib['acc'] | |
# runs | |
dss = summary_tree.find('DocumentSummarySet') | |
ds = dss.find('DocumentSummary') | |
runs = [] | |
runs_tree = etree.fromstring('<Runs>' + ds.find('Runs').text + '</Runs>') | |
for child in runs_tree.iterchildren(): | |
runs.append(child.attrib['acc']) | |
data = { | |
'experiment': exp_acc, | |
'study': study_acc, | |
'runs': runs, | |
'title': title, | |
'submitter': submitter, | |
'species': name, | |
'taxon_id': taxid, | |
'library_name': lib_name, | |
'library_strategy': lib_strategy, | |
'library_source': lib_source, | |
'library_selection': lib_selection, | |
'instrument': instr, | |
'platform': platform | |
} | |
return data | |
def fetch_exp_summary_doc(self, uuid): | |
params = {'db': 'sra', 'id': uuid, 'version': '2.0'} | |
print params | |
experiment_summaries_result = requests.get('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi', params=params) | |
experiment_summaries_doc = etree.fromstring(experiment_summaries_result.text.encode('utf-8')) | |
return experiment_summaries_doc | |
def get_filtered_experiments_by_uuid(self, uuidlist): | |
outexperiments = [] | |
for uuid in uuidlist: | |
experiment_summaries_doc = self.fetch_experiment_data | |
exp_tree = self.get_experiment_tree(experiment_summaries_doc) | |
if passes_experiment_filter(experiment_tree): | |
experiment_data = self.get_experiment_data(experiment_tree, experiment_summaries_doc) | |
created_date = ds.find('CreateDate').text | |
outexperiments.append(experiment_data) | |
return outexperiments | |
def get_experiment_tree(self, summary_tree): | |
dss = summary_tree.find('DocumentSummarySet') | |
ds = dss.find('DocumentSummary') | |
e = ds.find('ExpXml') | |
experiment_tree = etree.fromstring('<total>' + e.text + '</total>') # etree needs to have 1 root element | |
return experiment_tree |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import unittest | |
import sys | |
import os | |
ldir = os.path.dirname(os.path.realpath(__file__)) | |
sys.path.append(ldir) | |
from query_sra import SRAQueryInterface | |
from lxml import etree | |
class TestSRA_Query(unittest.TestCase): | |
def setUp(self): | |
self.test_data = [ | |
{'accession': 'ERX014110', 'uid': ['86375'], 'create_date': '2011/06/13'}, | |
{'accession': 'ERX014108', 'uid': ['86373'], 'create_date': '2011/06/13'} | |
] | |
self.q = SRAQueryInterface() | |
def _get_dummy_experiment_summary(self): | |
xml = """<?xml version="1.0" encoding="UTF-8"?> | |
<!DOCTYPE eSummaryResult PUBLIC "-//NLM//DTD esummary sra 20130524//EN" "http://eutils.ncbi.nlm.nih.gov/eutils/dtd/20130524/esummary_sra.dtd"> | |
<eSummaryResult> | |
<DocumentSummarySet status="OK"> | |
<DbBuild>Build140401-1723m.1</DbBuild> | |
<DocumentSummary uid="703993"> | |
<ExpXml> <Summary><Title>1135893704039 SAMN02711893 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150000" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504714" ver="1" status="public" name="1135893704039 SAMN02711893 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" ScientificName="human gut metagenome"/><Sample acc="SRS584396" name=""/><Instrument LS454="454 GS FLX Titanium"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711893" acc="SAMN02711893" sample_id="742081" sample_acc="SRS584396"/><Bioproject>SRP040739</Bioproject> </ExpXml> | |
<Runs> <Run acc="SRR1210219" total_spots="" total_bases="" load_done="" is_public="" cluster_name=""/> </Runs> | |
<ExtLinks> </ExtLinks> | |
<CreateDate>2014/03/31</CreateDate> | |
<UpdateDate>2014/03/31</UpdateDate> | |
</DocumentSummary> | |
</DocumentSummarySet> | |
</eSummaryResult> | |
""" | |
doc_sum = etree.fromstring(xml.encode('utf-8')) | |
return doc_sum | |
def _get_dummy_exp_tree(self): | |
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" ScientificName="human gut metagenome"/><Sample acc="SRS584415" name=""/><Instrument LS454="454 GS FLX Titanium"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> ' | |
return etree.fromstring('<total>' + xml + '</total>') | |
def _get_dummy_gut_exp_tree(self): | |
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" ScientificName="Escherichia coli"/><Sample acc="SRS584415" name=""/><Instrument LS454="Illumina HiSeq"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> ' | |
return etree.fromstring('<total>' + xml + '</total>') | |
def _get_dummy_exp_tree_no_scientifName(self): | |
xml = ' <Summary><Title>1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment</Title><Platform instrument_model="454 GS FLX Titanium">LS454</Platform><Statistics total_runs="1" total_spots="0" total_bases="0" total_size="0" load_done="true" cluster_name="public"/></Summary><Submitter acc="SRA150020" center_name="JCVI" contact_name="Ewen Kirkness" lab_name=""/><Experiment acc="SRX504733" ver="1" status="public" name="1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment"/><Study acc="SRP040739" name="Human stool metatranscriptome"/><Organism taxid="408170" CommonName="Escherichia coli"/><Sample acc="SRS584415" name=""/><Instrument LS454="Illumina HiSeq"/><Library_descriptor><LIBRARY_NAME>ROTAVIRUSA-RL-01-793</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOMIC</LIBRARY_SOURCE><LIBRARY_SELECTION>RANDOM</LIBRARY_SELECTION><LIBRARY_LAYOUT> <SINGLE/> </LIBRARY_LAYOUT></Library_descriptor><Biosample id="2711912" acc="SAMN02711912" sample_id="742100" sample_acc="SRS584415"/><Bioproject>SRP040739</Bioproject> ' | |
return etree.fromstring('<total>' + xml + '</total>') | |
@unittest.skip('this one works') | |
def test_fetch_uid(self): | |
for test_data_point in self.test_data: | |
acc = test_data_point['accession'] | |
uid = test_data_point['uid'] | |
result_uid = self.q.fetch_experiment_uuids(term=acc) | |
self.assertEqual(result_uid, uid) | |
def test_get_create_date(self): | |
for test_data_point in self.test_data: | |
uid = test_data_point['uid'][0] | |
date = test_data_point['create_date'] | |
result_date = self.q.get_create_date_for_run(uid) | |
self.assertEqual(result_date, date) | |
def test_fetch_exp_summary_doc(self): | |
for test_data_point in self.test_data: | |
uid = test_data_point['uid'][0] | |
result = self.q.fetch_exp_summary_doc(uid) | |
self.assertEqual(result.tag, 'eSummaryResult') | |
docSummSet = result.find('DocumentSummarySet') | |
self.assertEqual(type(docSummSet), etree._Element) | |
def test_get_experiment_tree(self): | |
summ_tree = self._get_dummy_experiment_summary() | |
tree = self.q.get_experiment_tree(summ_tree) | |
expected_tree = self._get_dummy_exp_tree() | |
tree_children_tags = [x.tag for x in tree.getchildren()] | |
exp_tree_children_tags = [x.tag for x in expected_tree.getchildren()] | |
self.assertTrue(tree_children_tags == exp_tree_children_tags) | |
def test_passes_experiment_filter(self): | |
tree = self._get_dummy_exp_tree() | |
self.assertFalse(self.q.passes_experiment_filter(tree)) | |
tree = self._get_dummy_gut_exp_tree() | |
self.assertTrue(self.q.passes_experiment_filter(tree)) | |
tree = self._get_dummy_exp_tree_no_scientifName() | |
self.assertTrue(self.q.passes_experiment_filter(tree)) | |
def test_is_gut_microbe(self): | |
self.assertTrue(self.q.is_gut_microbe('E. coli')) | |
self.assertTrue(self.q.is_gut_microbe('Escherichia coli')) | |
self.assertFalse(self.q.is_gut_microbe('Staphylococcus aureus')) | |
def test_get_experiment_data(self): | |
docsumset = self._get_dummy_experiment_summary() | |
tree = self._get_dummy_exp_tree() | |
expected_data = { | |
'experiment': 'SRX504733', | |
'study': 'SRP040739', | |
'runs': ['SRR1210219'], | |
'title': '1135893704077 SAMN02711912 ROTAVIRUSA-RL-01-793 Experiment', | |
'submitter': 'JCVI', | |
'create_date': '', | |
'species': 'human gut metagenome', | |
'taxon_id': '408170', | |
'library_name': 'ROTAVIRUSA-RL-01-793', | |
'library_strategy': 'RNA-Seq', | |
'library_source': 'TRANSCRIPTOMIC', | |
'library_selection': 'RANDOM', | |
'instrument': '454 GS FLX Titanium', | |
'platform': '454 GS FLX Titanium', | |
'reference_genome': '' | |
} | |
data = self.q.get_experiment_data(tree, docsumset) | |
self.assertEqual(expected_data['experiment'], data['experiment']) | |
self.assertEqual(expected_data['study'], data['study']) | |
self.assertEqual(expected_data['runs'], data['runs']) | |
self.assertEqual(expected_data['title'], data['title']) | |
self.assertEqual(expected_data['submitter'], data['submitter']) | |
self.assertEqual(expected_data['create_date'], data['create_date']) | |
self.assertEqual(expected_data['species'], data['species']) | |
self.assertEqual(expected_data['taxon_id'], data['taxon_id']) | |
self.assertEqual(expected_data['library_name'], data['library_name']) | |
self.assertEqual(expected_data['library_strategy'], data['library_strategy']) | |
self.assertEqual(expected_data['library_source'], data['library_source']) | |
self.assertEqual(expected_data['library_selection'], data['library_selection']) | |
self.assertEqual(expected_data['instrument'], data['instrument']) | |
self.assertEqual(expected_data['platform'], data['platform']) | |
tree = self._get_dummy_exp_tree_no_scientifName() | |
data = self.q.get_experiment_data(tree, docsumset) | |
self.assertEqual(data['species'], 'Escherichia coli') | |
self.assertEqual(data['reference_genome'], 'Escherichia coli') | |
suite = unittest.TestLoader().loadTestsFromTestCase(TestSRA_Query) | |
unittest.TextTestRunner(verbosity=2).run(suite) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
This script will fetch the creation date of SRA experiments using the NCBI API. | |
The input file should be a tab delimited file, with the experiment id in the first | |
column, and the creation date in the last. | |
If a creation date is already documented on a row, the row will be skipped to | |
reduce the number of API calls. | |
""" | |
import sys | |
import os | |
import csv | |
import time | |
sys.path.append(os.path.dirname(os.path.realpath(__file__))) | |
import query_sra | |
TESTING = False ;# set to True if you want to mock the calls to the NCBI API | |
def fetch_experiment_uuids_mock(term=None): | |
""" Function used for testing. It mocks the query_sra function `fetch_experiment_uuids`. """ | |
return 'mocked_uid' | |
def get_create_date_for_run_mock(uid): | |
""" Function used for testing. It mocks the query_sra function `get_create_date_for_run`. """ | |
return 'mocked_date' | |
def mock_functions(q): | |
""" Function used for testing. It replaces the query_sra functions by their mock functions. """ | |
q.fetch_experiment_uuids = fetch_experiment_uuids_mock | |
q.get_create_date_for_run = get_create_date_for_run_mock | |
def check_arguments(): | |
if len(sys.argv) != 3: | |
print 'usage: ./update_SRA_run_dates.py <infile> <outfile>' | |
sys.exit(-1) | |
return sys.argv[1:] | |
def is_a_date(indate): | |
""" Check whether the given indate is a date of format %Y/%m/%d. | |
Returns True if the string matches that pattern, False if it doesn't. | |
""" | |
try: | |
time.strptime(indate, '%Y/%m/%d') | |
returnvalue = True | |
except ValueError, e: | |
returnvalue = False | |
return returnvalue | |
def main(): | |
infile, outfile = check_arguments() | |
q = query_sra.SRAQueryInterface() | |
if TESTING: | |
mock_functions(q) | |
with open(infile) as f, open(outfile, 'w+') as out: | |
reader = csv.reader(f, delimiter='\t') | |
header = reader.next() | |
out.write('\t'.join(header) + '\n') | |
for row in reader: | |
date = row[-1] | |
if not is_a_date(date): | |
accession = row[0] | |
uid = q.fetch_experiment_uuids(term=accession) | |
try: | |
created_date = q.get_create_date_for_run(uid) | |
except: | |
print 'Could not fetch date for row: {0}'.format(row) | |
sys.exit(-1) | |
row[-1] = created_date | |
out.write('\t'.join(row) + '\n') | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment