Created
August 24, 2012 15:43
-
-
Save nickschurch/3452151 to your computer and use it in GitHub Desktop.
Python routine for converting entrezIDs to ensemblIDs see differentialExpressions.blogspot.co.uk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import MySQLdb, re, time | |
def entrez2ensembl(entrez_gene_list, | |
genus_name, | |
species_name, | |
ensemblver=None): | |
''' Converts a list of EntrezIDs into ensemblIDs | |
Requires a list of EntrezIDs, a genus and species name, and | |
optionally the enembl version to use. Genus_name and | |
species_name must be the same that ensembl uses. | |
For each matching record the script returns a list of tuples containing: | |
(EntrezID, ensembl gene ID, biotype, description, chromosome | |
start coord, stop coord, strand). | |
Example: | |
ensembl_genes = entrez2ensembl(["SCFD2", "PPAP2C", "LASS6"], | |
genus_name="homo", | |
species_name="sapiens") | |
''' | |
# ensemlb details | |
ensembl_host = "ensembldb.ensembl.org" | |
ensembl_port = 5306 | |
# open connection | |
ensemblDB = MySQLdb.connect(host=ensembl_host, | |
user="anonymous", | |
port=ensembl_port) | |
cur = ensemblDB.cursor() | |
# build ensembl species string | |
ensembl_species = "%s_%s" % (genus_name.lower(), species_name.lower()) | |
# get tables in the schema matching the query species | |
schema_sql = 'show schemas like "%s_core%s_%s"' % (ensembl_species, | |
"\\", | |
"%") | |
cur.execute(schema_sql) | |
records = cur.fetchall() | |
if ensemblver is None: | |
# trim out the latest one and split the version number | |
# and table version number | |
latest_ver = records[-1][0].split("_")[-2:] | |
else: | |
for record in records: | |
if re.match("%s_core_%i.+" % (ensembl_species, | |
ensemblver), | |
record[0]): | |
latest_ver = record[0].split("_")[-2:] | |
# define dbs to use | |
core_db = "%s_core_%s_%s" % (ensembl_species, | |
latest_ver[0], | |
latest_ver[1]) | |
# id query string | |
# 'external_db_id=1300' = EntrezGene | |
query_str = ''' | |
SELECT x.display_label, | |
g.stable_id, | |
g.biotype, | |
g.description, | |
c.name, | |
g.seq_region_start, | |
g.seq_region_end, | |
g.seq_region_strand | |
FROM %s.xref x | |
JOIN %s.dependent_xref dx ON x.xref_id=dx.dependent_xref_id | |
JOIN %s.object_xref ox ON dx.object_xref_id = ox.object_xref_id | |
JOIN %s.gene g ON ox.ensembl_id=g.gene_id | |
JOIN %s.seq_region c ON g.seq_region_id=c.seq_region_id | |
JOIN %s.coord_system cs ON c.coord_system_id=cs.coord_system_id | |
WHERE x.external_db_id=1300 | |
AND cs.rank=1 | |
AND x.display_label IN ("%s"); | |
''' % (core_db, core_db, core_db, | |
core_db, core_db, core_db, | |
'", "'.join(entrez_gene_list) | |
) | |
print query_str | |
#execute query | |
ttime = time.gmtime() | |
timestring = "%02.0d:%02.0d:%02.0d" % (ttime[3],ttime[4],ttime[5]) | |
print "querying the ensembl database for all ensembl genes at %s..." % timestring | |
t1=time.time() | |
cur.execute(query_str) # this is not so quick (~5min) | |
records=cur.fetchall() | |
print "returned %i records in %.2fs" % (len(records), (time.time()-t1)) | |
return(records) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment