uludag/biomart_gene_names_symbols.py

## biomart_gene_names_symbols.py
#!/usr/bin/python
"""
Call genenames.org Biomart REST service to get gene names
and symbols for a given list of Ensembl gene ids.
Since this script uses the fetch API for each individual ID, it is not fast
for a large number of ids.
Script biomartxmlquery_gene_names_symbols.py[1] uses XML bulk queries
and is much faster when the number of ids is large.
[1] https://bitbucket.org/hspsdb/biomart-sample-clients/src
"""

import argparse
import json, sys
import httplib2 as http
if sys.version < '3':
    from urlparse import urlparse
    from urllib import urlencode
else:
    from urllib.parse import urlparse, urlencode

# Endpoint pieces: a gene id is appended to `uri + path` for each lookup,
# and the Accept header asks for a JSON reply.
uri = 'http://rest.genenames.org'
path = '/fetch/ensembl_gene_id/'
headers = {'Accept': 'application/json'}

# Command line: a single optional flag naming the file of ids to process.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--inputFile',
    default="ensemblgeneids.txt",
    required=False,
    help='Input File',
)
args = parser.parse_args()


def get_biomart_data(ensgid):
    """Fetch the approved symbol, name and aliases for one Ensembl gene id.

    Sends a GET request to ``uri + path + ensgid`` with the module-level
    ``headers`` and reads the first entry of ``response.docs`` from the JSON
    reply.

    :param ensgid: Ensembl gene id appended to the fetch URL.
    :return: a tab-separated string starting with ``ensgid``, followed by the
        symbol, the double-quoted name, any alias symbols and any
        double-quoted alias names; just ``ensgid`` when the reply contains
        no docs; ``None`` when the HTTP status is not 200.
    """
    target = urlparse(uri + path + ensgid)
    response, content = http.Http().request(target.geturl(), 'GET', '',
                                            headers)
    if response['status'] != '200':
        # Report on stderr so the diagnostic never ends up inside the
        # tab-separated data that the script writes to stdout.
        sys.stderr.write('Error detected: ' + response['status'] + '\n')
        return None

    docs = json.loads(content.decode('utf-8'))['response']['docs']
    if not docs:
        return ensgid

    doc = docs[0]
    fields = [ensgid, doc['symbol'], '"' + doc['name'] + '"']
    # The alias keys are optional in a doc; a missing key adds no columns.
    fields.extend(doc.get('alias_symbol', []))
    fields.extend('"' + alias + '"' for alias in doc.get('alias_name', []))
    return '\t'.join(fields)


def process_inputfile(inf):
    """Print one tab-separated result row for every Ensembl gene id in a file.

    Each line of ``inf`` is stripped.  Lines starting with ``ENSG`` are
    looked up with ``get_biomart_data`` and printed prefixed with their
    1-based line number.  A first line that is not an id is ignored, and
    blank lines are skipped.

    :param inf: path of the text file holding one id per line.
    :raises SystemExit: with status -1 on the first non-blank line after
        line 1 that does not start with ``ENSG``.
    """
    with open(inf) as infile:
        for lineno, line in enumerate(infile, 1):
            ensid = line.strip()
            if ensid.startswith("ENSG"):
                row = get_biomart_data(ensid)
                if row is None:
                    # The lookup returns None on a non-200 reply;
                    # concatenating None would raise TypeError and abort
                    # the whole run, so report the id and carry on.
                    sys.stderr.write("No data retrieved for %s (line %d)\n"
                                     % (ensid, lineno))
                    continue
                print(str(lineno) + '\t' + row)
            elif not ensid:
                # A blank (typically trailing) line is not an invalid id.
                continue
            elif lineno > 1:
                print("\n\nInvalid Ensembl gene id line: %s\n" % ensid)
                sys.exit(-1)


# Column header for the tab-separated output, then one row per input id.
print("\t".join(("Line#", "Ensembl-gene-ID", "Approved-symbol",
                 "Approved-name", "Alias-symbols", "Alias-names")))
process_inputfile(args.inputFile)
	#!/usr/bin/python
	"""
	Call genenames.org Biomart REST service to get gene names
	and symbols for a given list of Ensembl gene ids.
	Since this script uses the fetch API for each individual ID, it is not fast
	for a large number of ids.
	Script biomartxmlquery_gene_names_symbols.py[1] uses XML bulk queries
	and is much faster when the number of ids is large.
	[1] https://bitbucket.org/hspsdb/biomart-sample-clients/src
	"""

	import argparse
	import json, sys
	import httplib2 as http
	if sys.version < '3':
	from urlparse import urlparse
	from urllib import urlencode
	else:
	from urllib.parse import urlparse, urlencode

	# Endpoint pieces: a gene id is appended to `uri + path` for each lookup,
	# and the Accept header asks for a JSON reply.
	uri = 'http://rest.genenames.org'
	path = '/fetch/ensembl_gene_id/'
	headers = {'Accept': 'application/json'}

	# Command line: a single optional flag naming the file of ids to process.
	parser = argparse.ArgumentParser()
	parser.add_argument(
	    '--inputFile',
	    default="ensemblgeneids.txt",
	    required=False,
	    help='Input File',
	)
	args = parser.parse_args()


	def get_biomart_data(ensgid):
	    """Fetch the approved symbol, name and aliases for one Ensembl gene id.

	    Sends a GET request to ``uri + path + ensgid`` with the module-level
	    ``headers`` and reads the first entry of ``response.docs`` from the
	    JSON reply.

	    :param ensgid: Ensembl gene id appended to the fetch URL.
	    :return: a tab-separated string starting with ``ensgid``, followed by
	        the symbol, the double-quoted name, any alias symbols and any
	        double-quoted alias names; just ``ensgid`` when the reply
	        contains no docs; ``None`` when the HTTP status is not 200.
	    """
	    target = urlparse(uri + path + ensgid)
	    response, content = http.Http().request(target.geturl(), 'GET', '',
	                                            headers)
	    if response['status'] != '200':
	        # Report on stderr so the diagnostic never ends up inside the
	        # tab-separated data that the script writes to stdout.
	        sys.stderr.write('Error detected: ' + response['status'] + '\n')
	        return None

	    docs = json.loads(content.decode('utf-8'))['response']['docs']
	    if not docs:
	        return ensgid

	    doc = docs[0]
	    fields = [ensgid, doc['symbol'], '"' + doc['name'] + '"']
	    # The alias keys are optional in a doc; a missing key adds no columns.
	    fields.extend(doc.get('alias_symbol', []))
	    fields.extend('"' + alias + '"' for alias in doc.get('alias_name', []))
	    return '\t'.join(fields)


	def process_inputfile(inf):
	    """Print one tab-separated result row for every Ensembl gene id in a file.

	    Each line of ``inf`` is stripped.  Lines starting with ``ENSG`` are
	    looked up with ``get_biomart_data`` and printed prefixed with their
	    1-based line number.  A first line that is not an id is ignored, and
	    blank lines are skipped.

	    :param inf: path of the text file holding one id per line.
	    :raises SystemExit: with status -1 on the first non-blank line after
	        line 1 that does not start with ``ENSG``.
	    """
	    with open(inf) as infile:
	        for lineno, line in enumerate(infile, 1):
	            ensid = line.strip()
	            if ensid.startswith("ENSG"):
	                row = get_biomart_data(ensid)
	                if row is None:
	                    # The lookup returns None on a non-200 reply;
	                    # concatenating None would raise TypeError and abort
	                    # the whole run, so report the id and carry on.
	                    sys.stderr.write("No data retrieved for %s (line %d)\n"
	                                     % (ensid, lineno))
	                    continue
	                print(str(lineno) + '\t' + row)
	            elif not ensid:
	                # A blank (typically trailing) line is not an invalid id.
	                continue
	            elif lineno > 1:
	                print("\n\nInvalid Ensembl gene id line: %s\n" % ensid)
	                sys.exit(-1)


	# Column header for the tab-separated output, then one row per input id.
	print("\t".join(("Line#", "Ensembl-gene-ID", "Approved-symbol",
	                 "Approved-name", "Alias-symbols", "Alias-names")))
	process_inputfile(args.inputFile)