Last active
December 12, 2016 13:05
-
-
Save uludag/f07a28d92af1cba4abf83738e4974438 to your computer and use it in GitHub Desktop.
Call genenames.org Biomart xml query service to get gene names, symbols and aliases for a given list of Ensembl gene ids
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
""" | |
Script to call genenames.org Biomart xml query service to get gene names,
symbols and aliases for a given list of Ensembl gene ids
Usage: | |
> ./biomartxmlquery_gene_names_symbols.py --inputFile ensgeneids.txt > results.tsv | |
Input file: one Ensembl gene id per line | |
""" | |
import argparse | |
import os, io, csv, sys | |
import httplib2 as http | |
if sys.version < '3': | |
from urlparse import urlparse | |
from urllib import urlencode | |
else: | |
from urllib.parse import urlparse, urlencode | |
# HTTP settings used when posting the query to the Biomart service
uri = 'http://biomart.genenames.org/martservice/results'
path = '/fetch/ensembl_gene_id/'
headers = {"Content-Type": "application/x-www-form-urlencoded"}

# Command line: a single optional --inputFile argument
parser = argparse.ArgumentParser()
parser.add_argument(
    '--inputFile',
    default="ensemblgeneids.txt",
    required=False,
    help='Input File',
)
args = parser.parse_args()

# Gene ids in input-file order; process_inputfile() appends to this list
genes = []
# Biomart XML query template.  The text "comma_separated_ensembl_ids" is a
# placeholder that process_inputfile() replaces with the real id list.
# The template is assembled from adjacent string literals (rather than
# backslash continuations inside one literal) so its content does not depend
# on how the source is indented; note the explicit leading space that keeps
# the Filter's "name" and "value" attributes apart.
query = (
    '<!DOCTYPE Query>'
    '<Query client="biomartclient" processor="TSV" limit="-1" header="1">'
    '<Dataset name="hgnc_gene_mart" config="hgnc_gene_config">'
    '<Filter name="hgnc_gene__ensembl_gene__ensembl_gene_id_104"'
    ' value="comma_separated_ensembl_ids" filter_list=""/>'
    '<Attribute name="hgnc_gene__ensembl_gene__ensembl_gene_id_104"/>'
    '<Attribute name="hgnc_gene__approved_symbol_1010"/>'
    '<Attribute name="hgnc_gene__approved_name_1010"/>'
    '<Attribute name="hgnc_gene__hgnc_alias_symbol__alias_symbol_108"/>'
    '<Attribute name="hgnc_gene__hgnc_alias_name__alias_name_107"/>'
    '</Dataset>'
    '</Query>'
)
def query_biomart(xmlquery):
    """Post an XML query to the Biomart service and return the response text.

    Args:
        xmlquery: the Biomart XML query document, as a string.

    Returns:
        The response body decoded as UTF-8.

    If the service answers with any status other than 200, a message is
    written to stderr and the process exits with status -1.
    """
    target = urlparse(uri)
    response, content = http.Http().request(
        target.geturl(), 'POST',
        body=urlencode({"query": xmlquery}), headers=headers)
    if response['status'] != '200':
        # stdout is normally redirected into the results file (see the usage
        # line in the module docstring), so the error must go to stderr.
        sys.stderr.write('Biomart query has returned error: '
                         + response['status'] + '\n')
        # sys.exit rather than the interactive-only `exit` helper
        sys.exit(-1)
    return content.decode('utf-8')
# Customize this function if you have any additional text in input file
# other than Ensembl gene ids
def pruneline(line):
    """Return the gene id carried by one (already stripped) input line.

    Lines starting with "WinLen" lose their first 20 characters and every
    occurrence of "_MethylCounts.xls"; any other line is returned unchanged.
    """
    if not line.startswith("WinLen"):
        return line
    return line[20:].replace("_MethylCounts.xls", "")
def process_inputfile(inf):
    """Read gene ids from a file and build the Biomart XML query for them.

    Every line of the file is stripped, passed through pruneline() and
    appended to the module-level ``genes`` list, so that list keeps one entry
    per input line in file order.

    Args:
        inf: path of the input file, one Ensembl gene id per line.

    Returns:
        The ``query`` template with its "comma_separated_ensembl_ids"
        placeholder replaced by the comma-separated ids.
    """
    with open(inf) as infile:
        ids = [pruneline(line.strip()) for line in infile]
    # Blank lines stay in `genes` so that reported line numbers still match
    # the input file, but they are left out of the filter value: an empty id
    # would only add stray commas ("A,,B") to the query.
    genes.extend(ids)
    filter_value = ",".join(ensid for ensid in ids if ensid)
    return query.replace("comma_separated_ensembl_ids", filter_value)
def process_biomart_results(br):
    """Collapse the Biomart TSV answer into one output line per gene id.

    The first row is treated as the header and skipped.  Each remaining row
    is read as (gene id, approved symbol, approved name, alias symbol,
    alias name); the alias columns may be missing or empty.  Rows sharing a
    gene id are merged: the symbol and name come from the first such row and
    the non-empty aliases are collected into sets.

    Args:
        br: the TSV text returned by the service.

    Returns:
        A dict mapping each gene id to the string produced by printline()
        for it.  The dict is empty when the answer holds no data rows.
    """
    records = {}
    reader = csv.reader(io.StringIO(br), delimiter='\t')
    next(reader, None)  # skip the header row, if there is one
    for row in reader:
        if not row:
            # csv.reader yields [] for a blank line; nothing to record
            continue
        # Pad short rows so the fixed column positions below always exist
        row = row + [""] * (5 - len(row))
        ensid = row[0]
        if ensid not in records:
            records[ensid] = (ensid, row[1], row[2], set(), set())
        record = records[ensid]
        if row[3]:
            record[3].add(row[3])
        if row[4]:
            record[4].add(row[4])
    # Keying on the gene id (instead of tracking the previous row) also
    # merges rows of one gene that are not adjacent in the answer.
    return {ensid: printline(record) for ensid, record in records.items()}
def printline(l):
    """Format one gene record as a tab-separated output line.

    Args:
        l: a 5-item sequence (gene id, approved symbol, approved name,
           alias symbols, alias names) whose last two items are iterables
           of strings.

    Returns:
        The gene id, the symbol, the double-quoted name, then every alias
        symbol and every double-quoted alias name, joined with tabs.
    """
    fields = [str(l[0]), str(l[1]), '"' + str(l[2]) + '"']
    # The aliases arrive as sets, whose iteration order can differ between
    # runs; sorting keeps the output columns stable.
    fields.extend(sorted(l[3]))
    fields.extend('"' + alias_name + '"' for alias_name in sorted(l[4]))
    return "\t".join(fields)
def print_sorted_results(rmap):
    """Print one numbered line per entry of the module-level ``genes`` list.

    Lines are numbered from 1 in list order.  A gene id found in ``rmap`` is
    printed with its mapped text; any other id is printed as is.

    Args:
        rmap: dict mapping gene ids to their formatted output line.
    """
    for lineno, gene in enumerate(genes, 1):
        print(str(lineno) + '\t' + rmap.get(gene, gene))
    return
# Build the query from the input file, send it, and index the answer by id
xml_query = process_inputfile(args.inputFile)
br = query_biomart(xmlquery=xml_query)
rmap = process_biomart_results(br)

# Header row first, then one numbered row per input gene id
print("\t".join(["Line#", "Ensembl-gene-ID", "Approved-symbol",
                 "Approved-name", "Alias-symbols", "Alias-names"]))
print_sorted_results(rmap)
# Not used by the current workflow
def get_biomart_data_curl(ensgid):
    """Run curl to post the query stored in ``query.xml`` to the service.

    The ``ensgid`` argument is accepted but not used by the command.
    """
    os.system(
        "curl --data-urlencode query@query.xml "
        "http://biomart.genenames.org/martservice/results")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment