Skip to content

Instantly share code, notes, and snippets.

@fbrundu
Created March 27, 2017 14:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fbrundu/a5aa436af7f3f1ff8069c7b31416f56b to your computer and use it in GitHub Desktop.
Save fbrundu/a5aa436af7f3f1ff8069c7b31416f56b to your computer and use it in GitHub Desktop.
Retrieve TCGA gene expression data using the GDC API
# -*- coding: utf-8 -*-
import logging as log
import pandas as pd
import requests as rq
class TCGA:
    '''
    Client that retrieves TCGA gene expression tables through the GDC API.

    File identifiers are looked up with the ``/files`` endpoint and every
    matching file is then read from ``/data/<file_id>``.
    '''

    def __init__(self, gdc_url='https://gdc-api.nci.nih.gov', per_page=100,
                 logfile=None):
        '''
        Initialisation

        Parameters
        ----------
        gdc_url : str
            Base URL of the GDC API, without a trailing slash.
            NOTE(review): confirm the default host is still the current
            GDC endpoint.
        per_page : int
            Number of records requested per ``/files`` query.
        logfile : str or None
            Forwarded to ``logging.basicConfig`` as ``filename``.
        '''
        self.gdc_url = gdc_url
        self.per_page = per_page
        log.basicConfig(
            filename=logfile, level=log.INFO,
            format='%(asctime)s : %(levelname)8s : %(message)s (%(module)s.%(funcName)s)',
            datefmt='%Y-%m-%d %H:%M:%S')

    def get_geneexp(self, projects, exp_strategy='RNA-Seq',
                    workflow='HTSeq - FPKM'):
        '''
        Download and merge the gene expression files of the given projects.

        Parameters
        ----------
        projects : list
            Project identifiers matched against ``cases.project.project_id``.
        exp_strategy : str
            Value required for ``experimental_strategy``.
        workflow : str
            Value required for ``files.analysis.workflow_type``.

        Returns
        -------
        pandas.DataFrame
            One column per file, labelled with the file id; the index is the
            first column of the downloaded tables. Empty when no file
            matches the filters.

        Raises
        ------
        RuntimeError
            If ``projects`` is empty (raised by ``_FilterBuilder.inclusion``).
        requests.HTTPError
            If a ``/files`` query does not succeed.
        '''
        filters = _FilterBuilder.logical(
            'and', [
                _FilterBuilder.equal('files.data_type',
                                     'Gene Expression Quantification'),
                _FilterBuilder.equal('experimental_strategy', exp_strategy),
                _FilterBuilder.equal('files.analysis.workflow_type', workflow),
                _FilterBuilder.inclusion('cases.project.project_id', projects)])
        file_ids = self._get_file_ids(filters)
        log.info('%d files found', len(file_ids))
        if not file_ids:
            # Nothing to download: hand back an empty table instead of
            # failing on a missing frame further down.
            return pd.DataFrame()
        frames = []
        for i, fid in enumerate(file_ids, start=1):
            frames.append(
                pd.read_table(f'{self.gdc_url}/data/{fid}', compression='gzip',
                              index_col=0, header=None))
            log.info('%4d. File %s integrated', i, fid)
        # Concatenate once; growing the frame inside the loop re-copies all
        # previously merged columns on every iteration.
        df = pd.concat(frames, axis=1)
        df.columns = file_ids
        return df

    def _fetch_page(self, filters, offset=None):
        '''
        POST one ``/files`` query and return the ``data`` part of the reply.

        ``offset`` is sent as the ``from`` query parameter; when it is None
        the parameter is omitted and the server picks its default start.
        Raises ``requests.HTTPError`` on an unsuccessful status code.
        '''
        params = {'size': self.per_page}
        if offset is not None:
            params['from'] = offset
        resp = rq.post(f'{self.gdc_url}/files', params=params,
                       json={'filters': filters})
        # Fail loudly: a silently dropped page would yield an incomplete
        # result that looks valid to the caller.
        resp.raise_for_status()
        return resp.json()['data']

    def _get_file_ids(self, filters):
        '''
        Return the ``file_id`` of every record matching ``filters``.

        Pages are requested until as many ids as the reported ``total`` have
        been collected, or until a page comes back empty.
        '''
        file_ids = []
        offset = None
        while True:
            data = self._fetch_page(filters, offset)
            hits = data['hits']
            file_ids.extend(h['file_id'] for h in hits)
            pagination = data['pagination']
            if not hits or len(file_ids) >= pagination['total']:
                break
            # Continue right after the last record received. The start of
            # the current page is taken from the server's own pagination
            # block, so no index base for `from` is assumed here.
            start = pagination.get('from', 0 if offset is None else offset)
            offset = start + len(hits)
        return file_ids
class _FilterBuilder:
@staticmethod
def logical(op, args):
''' Logical operator '''
_filter = { 'op': op, 'content': [o for o in args] }
return _filter
@staticmethod
def inclusion(field, values):
''' Inclusion operator '''
if len(values) < 1:
raise RuntimeError(f'Invalid number of values: {len(values)}')
_filter = { 'op': 'in', 'content': { 'field': field, 'value': values }}
return _filter
@staticmethod
def equal(field, value):
''' Equal operator '''
_filter = { 'op': '=', 'content': { 'field': field, 'value': value }}
return _filter
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment