Skip to content

Instantly share code, notes, and snippets.

@uludag
Last active December 12, 2016 13:05
Show Gist options
  • Save uludag/f07a28d92af1cba4abf83738e4974438 to your computer and use it in GitHub Desktop.
Save uludag/f07a28d92af1cba4abf83738e4974438 to your computer and use it in GitHub Desktop.
Call genenames.org Biomart xml query service to get gene names, symbols and aliases for a given list of Ensembl gene ids
#!/usr/bin/python
"""
Script to call the genenames.org Biomart XML query service to get gene names,
symbols and aliases for a given list of Ensembl gene ids
Usage:
> ./biomartxmlquery_gene_names_symbols.py --inputFile ensgeneids.txt > results.tsv
Input file: one Ensembl gene id per line
"""
import argparse
import os, io, csv, sys
import httplib2 as http
if sys.version < '3':
from urlparse import urlparse
from urllib import urlencode
else:
from urllib.parse import urlparse, urlencode
# Header sent with the POST request in query_biomart(); the query is submitted
# as a URL-encoded form body.
headers = {"Content-Type": "application/x-www-form-urlencoded"}
# URL the query is posted to by query_biomart().
uri = 'http://biomart.genenames.org/martservice/results'
# NOTE(review): not referenced anywhere else in the visible code.
path = '/fetch/ensembl_gene_id/'
# Command-line interface: a single optional --inputFile argument that
# defaults to "ensemblgeneids.txt".
parser = argparse.ArgumentParser()
parser.add_argument('--inputFile', help='Input File', required=False,
default="ensemblgeneids.txt")
# Parsed at module level, before any of the functions below are called.
args = parser.parse_args()
# Gene ids in input-file order; appended to by process_inputfile() and read
# back by print_sorted_results().
genes = []
# Biomart XML query template. The literal text "comma_separated_ensembl_ids"
# is a placeholder that process_inputfile() replaces with the actual ids.
#
# The pieces are joined by implicit string concatenation with explicit
# separators, so the resulting document does not depend on how the source
# lines happen to be indented. In particular the Filter element needs a space
# between its name and value attributes to be well-formed XML.
query = (
    '<!DOCTYPE Query>'
    '<Query client="biomartclient" processor="TSV" limit="-1" header="1">'
    '<Dataset name="hgnc_gene_mart" config="hgnc_gene_config">'
    '<Filter name="hgnc_gene__ensembl_gene__ensembl_gene_id_104"'
    ' value="comma_separated_ensembl_ids" filter_list=""/>'
    '<Attribute name="hgnc_gene__ensembl_gene__ensembl_gene_id_104"/>'
    '<Attribute name="hgnc_gene__approved_symbol_1010"/>'
    '<Attribute name="hgnc_gene__approved_name_1010"/>'
    '<Attribute name="hgnc_gene__hgnc_alias_symbol__alias_symbol_108"/>'
    '<Attribute name="hgnc_gene__hgnc_alias_name__alias_name_107"/>'
    '</Dataset>'
    '</Query>'
)
def query_biomart(xmlquery):
    """POST an XML query to the Biomart results service and return the reply.

    The query is sent to the module-level ``uri`` as the ``query`` field of a
    URL-encoded form, using the module-level ``headers``.

    Args:
        xmlquery: the Biomart XML query document, as text.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        SystemExit: with status -1 when the reported HTTP status is not 200.
    """
    target = urlparse(uri)
    response, content = http.Http().request(
        target.geturl(), 'POST',
        body=urlencode({"query": xmlquery}), headers=headers)
    if response['status'] != '200':
        # Report on stderr: stdout carries the TSV results and is normally
        # redirected to a file, where an error message would be mistaken
        # for data. sys.stderr.write works on both Python 2 and 3.
        sys.stderr.write(
            'Biomart query has returned error: ' + response['status'] + '\n')
        sys.exit(-1)
    return content.decode('utf-8')
# Customize this function if your input file carries any text besides the
# Ensembl gene ids themselves.
def pruneline(line):
    """Return the gene id contained in one (already stripped) input line.

    Lines that start with "WinLen" lose their first 20 characters and every
    occurrence of "_MethylCounts.xls"; all other lines are returned as-is.
    """
    if not line.startswith("WinLen"):
        return line
    return line[20:].replace("_MethylCounts.xls", "")
def process_inputfile(inf):
    """Read gene ids from a file and build the Biomart XML query for them.

    Every line of ``inf`` is stripped and passed through pruneline(); the
    resulting id is appended to the module-level ``genes`` list so the input
    order is available later.

    Args:
        inf: path of the input file, one gene id per line.

    Returns:
        The module-level ``query`` template with its
        "comma_separated_ensembl_ids" placeholder replaced by the ids joined
        with commas.
    """
    ensembl_ids = []
    with open(inf) as handle:
        for raw_line in handle:
            gene_id = pruneline(raw_line.strip())
            ensembl_ids.append(gene_id)
            genes.append(gene_id)
    return query.replace("comma_separated_ensembl_ids", ",".join(ensembl_ids))
def process_biomart_results(br):
    """Index the Biomart TSV reply by Ensembl gene id.

    The first row is treated as a header and skipped. Each remaining row is
    read as (gene id, approved symbol, approved name, alias symbol,
    alias name); rows sharing a gene id are merged, collecting their
    non-empty alias symbols and alias names into sets. Rows for one gene do
    not have to be adjacent, and the symbol/name of the first row seen for a
    gene is kept.

    Blank rows are ignored and short rows are padded with empty fields, so a
    reply holding only a header (or nothing at all) yields an empty mapping
    instead of raising.

    Args:
        br: the tab-separated reply text.

    Returns:
        dict mapping each gene id to its output line as built by printline().
    """
    reader = csv.reader(io.StringIO(br), delimiter='\t')
    next(reader, None)  # drop the header row, if there is one

    records = {}
    for row in reader:
        if not row:
            continue
        # Pad so that missing trailing columns read as empty strings.
        fields = row + [""] * (5 - len(row))
        ensid = fields[0]
        if ensid not in records:
            records[ensid] = (ensid, fields[1], fields[2], set(), set())
        record = records[ensid]
        if fields[3]:
            record[3].add(fields[3])
        if fields[4]:
            record[4].add(fields[4])

    return {ensid: printline(record) for ensid, record in records.items()}
def printline(l):
    """Format one gene record as a tab-separated output line.

    Args:
        l: 5-item sequence of (gene id, approved symbol, approved name,
            alias symbols, alias names), where the last two items are
            iterables of strings.

    Returns:
        The id and symbol as-is, the approved name in double quotes, then
        each alias symbol, then each alias name in double quotes, all
        separated by tabs.
    """
    fields = [str(l[0]), str(l[1]), '"' + str(l[2]) + '"']
    # The aliases arrive as sets, whose iteration order for strings can
    # change from one run to the next; sort them so output is reproducible.
    fields.extend(sorted(l[3]))
    fields.extend('"' + alias_name + '"' for alias_name in sorted(l[4]))
    return "\t".join(fields)
def print_sorted_results(rmap):
    """Print one numbered line per input gene, in input-file order.

    Walks the module-level ``genes`` list. Each printed line starts with a
    1-based counter and a tab, followed by the gene's entry in ``rmap`` or,
    when the gene has no entry, by the bare gene id.

    Args:
        rmap: dict mapping gene id to its formatted output line.
    """
    for line_no, gene_id in enumerate(genes, 1):
        print(str(line_no) + '\t' + rmap.get(gene_id, gene_id))
# Script body: build the query from the input file, send it, and index the
# reply by gene id.
xml_query = process_inputfile(args.inputFile)
br = query_biomart(xmlquery=xml_query)
rmap = process_biomart_results(br)

# Column header first, then one line per input gene in input order.
print(
    "Line#\tEnsembl-gene-ID\tApproved-symbol"
    "\tApproved-name\tAlias-symbols\tAlias-names"
)
print_sorted_results(rmap)
# Not used by the current workflow
def get_biomart_data_curl(ensgid):
    """Run a fixed curl command against the Biomart results URL.

    The command passes ``query@query.xml`` to curl's --data-urlencode option.
    ``ensgid`` is accepted but not used, and the status returned by
    os.system() is discarded.
    """
    command = (
        "curl --data-urlencode query@query.xml "
        "http://biomart.genenames.org/martservice/results"
    )
    os.system(command)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment