Retrieve TCGA gene expression data using the GDC API
# -*- coding: utf-8 -*- | |
import logging as log | |
import pandas as pd | |
import requests as rq | |
class TCGA:
    ''' Client that retrieves gene expression tables through the GDC API.

    Instance attributes:
        gdc_url:  base URL that ``/files`` and ``/data/<file_id>`` are
                  appended to.
        per_page: number of file records requested per ``/files`` call.
    '''

    def __init__(self, gdc_url='https://gdc-api.nci.nih.gov', per_page=100,
                 logfile=None):
        ''' Store the API settings and configure logging.

        Args:
            gdc_url: base URL of the GDC API.
            per_page: page size used when listing files.
            logfile: file passed to ``logging.basicConfig``; ``None`` keeps
                the default stream handler.
        '''
        self.gdc_url = gdc_url
        self.per_page = per_page
        log.basicConfig(filename=logfile, level=log.INFO,
                        format='%(asctime)s : %(levelname)8s : %(message)s (%(module)s.%(funcName)s)',
                        datefmt='%Y-%m-%d %H:%M:%S')

    def get_geneexp(self, projects, exp_strategy='RNA-Seq',
                    workflow='HTSeq - FPKM'):
        ''' Download and merge the gene expression files of some projects.

        Args:
            projects: project ids matched against
                ``cases.project.project_id``.
            exp_strategy: value required for ``experimental_strategy``.
            workflow: value required for ``files.analysis.workflow_type``.

        Returns:
            A DataFrame with one column per downloaded file, labelled with
            the file id and indexed by the first column of the files.
            An empty DataFrame is returned when no file matches.

        Raises:
            requests.HTTPError: if the file listing request fails.
        '''
        filters = _FilterBuilder.logical(
            'and', [
                _FilterBuilder.equal('files.data_type',
                                     'Gene Expression Quantification'),
                _FilterBuilder.equal('experimental_strategy', exp_strategy),
                _FilterBuilder.equal('files.analysis.workflow_type', workflow),
                _FilterBuilder.inclusion('cases.project.project_id', projects)])
        file_ids = self._get_file_ids(filters)
        log.info('%d files found', len(file_ids))
        if not file_ids:
            # Nothing to merge: hand back an empty table instead of failing
            # on a missing frame further down.
            return pd.DataFrame()

        # Collect the per-file tables first and concatenate once; growing a
        # DataFrame inside the loop copies all previous columns every time.
        frames = []
        for position, fid in enumerate(file_ids, start=1):
            frames.append(pd.read_table(f'{self.gdc_url}/data/{fid}',
                                        compression='gzip', index_col=0,
                                        header=None))
            log.info('%4d. File %s integrated', position, fid)
        df = pd.concat(frames, axis=1)
        df.columns = file_ids
        return df

    def _get_file_ids(self, filters):
        ''' Return the ids of all files matching ``filters``.

        Pages through the ``/files`` endpoint ``per_page`` records at a time.

        Args:
            filters: filter dictionary sent as the ``filters`` JSON field.

        Returns:
            List of ``file_id`` strings in the order the API returned them.

        Raises:
            requests.HTTPError: if any page request returns an error status,
                so that a partial listing is never mistaken for a full one.
        '''
        file_ids = []
        params = {'size': self.per_page}
        while True:
            resp = rq.post(f'{self.gdc_url}/files', params=params,
                           json={'filters': filters})
            resp.raise_for_status()
            data = resp.json()['data']
            hits = data['hits']
            file_ids += [h['file_id'] for h in hits]
            meta = data['pagination']
            # Stop when everything was collected; the empty-page check
            # guards against looping forever on an inconsistent total.
            if not hits or len(file_ids) >= meta['total']:
                break
            # Continue right after the last record received. Deriving the
            # offset from the echoed 'from' avoids assuming whether the
            # API counts records from 0 or from 1.
            params = {'size': self.per_page,
                      'from': meta['from'] + len(hits)}
        return file_ids
class _FilterBuilder:
    ''' Static helpers that build filter dictionaries.

    Every helper returns a dictionary with an ``op`` key (the operator) and
    a ``content`` key (its operands); ``TCGA.get_geneexp`` nests them and
    sends the result as the ``filters`` field of its request.
    '''

    @staticmethod
    def logical(op, args):
        ''' Logical operator

        Args:
            op: operator name, e.g. ``'and'``.
            args: iterable of filter dictionaries to combine.

        Returns:
            ``{'op': op, 'content': [...]}`` holding a new list of ``args``.
        '''
        return {'op': op, 'content': list(args)}

    @staticmethod
    def inclusion(field, values):
        ''' Inclusion operator

        Args:
            field: name of the field to test.
            values: accepted values; either a single string or any iterable
                of values (list, tuple, set, generator, ...).

        Returns:
            ``{'op': 'in', 'content': {'field': field, 'value': [...]}}``
            where the value is always a list.

        Raises:
            RuntimeError: if ``values`` is empty.
        '''
        if isinstance(values, str):
            # A bare string means one value, not a sequence of characters.
            values = [values]
        else:
            # Materialize so that sets/generators can be counted and
            # serialized to JSON.
            values = list(values)
        if not values:
            raise RuntimeError(f'Invalid number of values: {len(values)}')
        return {'op': 'in', 'content': {'field': field, 'value': values}}

    @staticmethod
    def equal(field, value):
        ''' Equal operator

        Args:
            field: name of the field to test.
            value: value the field must be equal to.

        Returns:
            ``{'op': '=', 'content': {'field': field, 'value': value}}``.
        '''
        return {'op': '=', 'content': {'field': field, 'value': value}}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment