Skip to content

Instantly share code, notes, and snippets.

@uludag
Last active December 10, 2016 14:09
Show Gist options
  • Save uludag/3bb2b4ff045e49ac35c92d46e24c78b4 to your computer and use it in GitHub Desktop.
Save uludag/3bb2b4ff045e49ac35c92d46e24c78b4 to your computer and use it in GitHub Desktop.
Call genenames.org Biomart REST service to get gene names, symbols and aliases for a given list of Ensembl gene ids
#!/usr/bin/python
"""
Call genenames.org Biomart REST service to get gene names,
symbols and aliases for a given list of Ensembl gene ids.
Because this script calls the fetch API once for each individual ID, it is
slow for a large number of ids.
Script biomartxmlquery_gene_names_symbols.py[1] uses XML bulk queries
and is much faster when the number of ids is large.
[1] https://bitbucket.org/hspsdb/biomart-sample-clients/src
"""
import argparse
import json, sys
import httplib2 as http
if sys.version < '3':
from urlparse import urlparse
from urllib import urlencode
else:
from urllib.parse import urlparse, urlencode
# Request headers sent with every lookup in get_biomart_data; the Accept
# header asks the service for a JSON reply.
headers = {'Accept': 'application/json'}
# Base address of the REST service and the path prefix to which an Ensembl
# gene id is appended to form the request URL.
uri = 'http://rest.genenames.org'
path = '/fetch/ensembl_gene_id/'
# Command line: a single optional --inputFile argument naming the file of
# gene ids; it defaults to "ensemblgeneids.txt".
parser = argparse.ArgumentParser()
parser.add_argument('--inputFile', help='Input File', required=False,
default="ensemblgeneids.txt")
# NOTE: the arguments are parsed at module level, i.e. as soon as this file
# is executed or imported.
args = parser.parse_args()
def get_biomart_data(ensgid):
    """Look up one Ensembl gene id and format the reply as a tab-separated row.

    A GET request is sent to ``uri + path + ensgid`` with the module-level
    ``headers``.  When the status is '200' the body is decoded as UTF-8 JSON
    and the first entry of ``data['response']['docs']`` supplies the fields.

    Args:
        ensgid: Ensembl gene id; appended verbatim to the request URL.

    Returns:
        The id followed by the symbol, the double-quoted name, every alias
        symbol and every double-quoted alias name, separated by tabs.
        Only the id when the reply contains no docs.
        None when the status is not '200' (the status is printed).
    """
    request_url = urlparse(uri + path + ensgid).geturl()
    client = http.Http()
    response, content = client.request(request_url, 'GET', '', headers)

    status = response['status']
    if status != '200':
        print('Error detected: ' + status)
        return None

    docs = json.loads(content.decode('utf-8'))['response']['docs']
    if len(docs) == 0:
        # Nothing known for this id: the row consists of the id alone.
        return ensgid

    doc = docs[0]
    # Each piece carries its own leading tab, so a plain join yields the row.
    pieces = [ensgid, '\t' + doc['symbol'], '\t\"' + doc['name'] + "\""]
    if 'alias_symbol' in doc:
        pieces.extend('\t' + alias for alias in doc['alias_symbol'])
    if 'alias_name' in doc:
        pieces.extend('\t\"' + alias + "\"" for alias in doc['alias_name'])
    return ''.join(pieces)
def process_inputfile(inf):
    """Print one tab-separated result row for each Ensembl gene id in a file.

    The file is read line by line and surrounding whitespace is stripped.
    A line starting with "ENSG" is looked up with get_biomart_data() and the
    result is printed to stdout, prefixed with the 1-based line number.
    A first line that is not an id is treated as a column header and ignored.
    Blank lines are skipped.  Any other line stops the run.

    Args:
        inf: Path of the text file holding one Ensembl gene id per line.

    Raises:
        SystemExit: with code -1 when a non-blank line after the first one
            does not start with "ENSG".
    """
    with open(inf) as infile:
        for lineno, line in enumerate(infile, start=1):
            ensid = line.strip()
            if not ensid:
                # Blank lines (typically trailing ones at the end of the
                # file) carry no id and must not abort the whole run.
                continue
            if ensid.startswith("ENSG"):
                row = get_biomart_data(ensid)
                if row is None:
                    # The lookup produced nothing to print; concatenating
                    # None to the line number would raise TypeError.
                    sys.stderr.write(
                        "Line %d: no data retrieved for %s, skipping\n"
                        % (lineno, ensid))
                    continue
                print(str(lineno) + '\t' + row)
            elif lineno > 1:
                print("\n\nInvalid Ensembl gene id line: %s\n" % ensid)
                # sys.exit instead of the site-provided exit(), which is
                # meant for the interactive interpreter.
                sys.exit(-1)
# Column header for the rows printed by process_inputfile; the fields are
# tab-separated, matching the row layout.
print("Line#\tEnsembl-gene-ID\tApproved-symbol\tApproved-name\tAlias-symbols"
"\tAlias-names")
# Runs at module level: processes the file named by --inputFile.
process_inputfile(args.inputFile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment