Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
A script to retrieve a mapping from Entrez Gene IDs to Wikidata QIDs and English Wikipedia pages.
import json
import urllib
import urllib.parse

import requests
__author__ = 'Sebastian Burgstaller'
__license__ = 'AGPLv3'

# Namespace declarations prepended to the SPARQL query. `rdfs:` is declared
# explicitly because the query below uses rdfs:label; the Wikidata endpoint
# predefines it, but declaring every prefix the query uses keeps the query
# valid on endpoints that do not.
prefix = '''
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
'''

# Select every item carrying property P351 (bound as ?entrez_id) and,
# optionally, its English label together with the English Wikipedia article
# about it. The label and article patterns share one OPTIONAL group, so they
# are bound together or not at all. The SUBSTR filter compares the first 25
# characters of the article URL, which is exactly the length of
# "https://en.wikipedia.org/".
query = '''
SELECT ?entrez_id ?cid ?article ?label WHERE {
?cid wdt:P351 ?entrez_id .
OPTIONAL {
?cid rdfs:label ?label filter (lang(?label) = "en") .
?article schema:about ?cid .
?article schema:inLanguage "en" .
FILTER (SUBSTR(str(?article), 1, 25) = "https://en.wikipedia.org/") .
}
}
'''

# Query-string parameters sent to the endpoint: the full query text plus the
# requested result format.
params = {
    'query': prefix + query,
    'format': 'json',
}

# Ask for SPARQL JSON results via content negotiation as well.
headers = {
    'Accept': 'application/sparql-results+json',
}

# SPARQL endpoint of the Wikidata Query Service.
url = 'https://query.wikidata.org/sparql'
# Run the SPARQL query and keep only the result rows. raise_for_status()
# turns a non-success HTTP response into an explicit HTTPError instead of
# letting it surface later as a JSON decoding failure.
response = requests.get(url, params=params, headers=headers)
response.raise_for_status()
results = response.json()['results']['bindings']

# Map each Entrez ID to its Wikidata QID and, when the row has an article
# bound, the title portion of the English Wikipedia URL:
#   {entrez_id: {'wikipedia': {'url_stub': <title>}, 'wikidata': <QID>}}
# The 'wikipedia' key is omitted entirely for rows without an article.
entrez_map = {}
for binding in results:
    entrez_id = binding['entrez_id']['value']

    # The same Entrez ID can appear in several rows; the first one wins.
    if entrez_id in entrez_map:
        continue

    entry = {}
    if 'article' in binding:
        # Last path segment of the article URL, percent-decoded.
        title = binding['article']['value'].split('/')[-1]
        entry['wikipedia'] = {'url_stub': urllib.parse.unquote(title)}
    # Last path segment of the entity URI is the QID.
    entry['wikidata'] = binding['cid']['value'].split('/')[-1]

    entrez_map[entrez_id] = entry
# Write the whole mapping as a single JSON document. The context manager
# closes the file even if serialization raises part-way through.
with open('mygene.info', 'w') as f:
    json.dump(entrez_map, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment