Created
August 7, 2019 19:01
-
-
Save gtsueng/3cd5886ff97b77eadde0853fae057734 to your computer and use it in GitHub Desktop.
Pull all human genes in Wikidata that have a corresponding English Wikipedia page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import urllib.request
from collections import OrderedDict

import mygene
import pandas
import requests
from pandas import read_csv

# Fetch every human gene in Wikidata that has an English Wikipedia article and
# load the result into the ``wikidata_genes_uri`` DataFrame, one row per SPARQL
# result binding.

# SPARQL endpoint of the Wikidata Query Service.
url = 'https://query.wikidata.org/sparql'

# Prefixes stripped from the returned URIs to obtain bare identifiers/titles.
WIKIDATA_ENTITY_PREFIX = "http://www.wikidata.org/entity/"
ENWIKI_PAGE_PREFIX = "https://en.wikipedia.org/wiki/"

# Output column -> SPARQL variable it is read from. Kept as an ordered tuple
# so the DataFrame column order is fixed regardless of interpreter version.
FIELDS = (
    ('gene_uri', 'gene'),
    ('gene_id', 'geneid'),
    ('protein_uri', 'protein'),
    ('wiki_uri', 'article'),
)

query = """
SELECT
?gene ?geneid ?protein ?article
WHERE {
?gene wdt:P31 wd:Q7187; #Find genes
wdt:P703 wd:Q15978631. #limit to humans
?article schema:about ?gene. #limit to genes with corresponding English Wikipedia articles
?article schema:inLanguage "en".
?article schema:isPartOf <https://en.wikipedia.org/>
OPTIONAL { ?gene wdt:P688 ?protein } #Get associated proteins
OPTIONAL { ?gene wdt:P351 ?geneid } #Get the geneid
}
"""

# Wikimedia services ask clients to identify themselves; generic library
# User-Agents may be throttled or rejected. Replace with your own contact info.
headers = {
    'User-Agent': (
        'wikidata-human-genes-script/1.0 '
        '(https://gist.github.com/gtsueng/3cd5886ff97b77eadde0853fae057734)'
    ),
}

# (connect, read) timeout so a stalled connection cannot hang the script forever.
r = requests.get(
    url,
    params={'format': 'json', 'query': query},
    headers=headers,
    timeout=(10, 120),
)
# Fail with a clear HTTP error instead of a confusing JSON decode error when
# the service answers with an error page (rate limit, query timeout, ...).
r.raise_for_status()
data = r.json()
print("query completed")

# Flatten each binding to plain values. Variables from OPTIONAL clauses are
# absent from a binding when unmatched, so those cells become None.
genes = [
    OrderedDict(
        (column, item[variable]['value'] if variable in item else None)
        for column, variable in FIELDS
    )
    for item in data['results']['bindings']
]

# Passing the columns explicitly keeps the frame usable (and the lookups below
# from raising KeyError) when the query returns no rows.
wikidata_genes_uri = pandas.DataFrame(
    genes, columns=[column for column, _ in FIELDS]
)

# Strip the URI prefixes. ``regex=False`` makes the match literal (the dots in
# the URLs would otherwise act as wildcards on pandas versions that default to
# regex matching). Missing values are left as nulls rather than being turned
# into the literal string "None" by a str cast.
wikidata_genes_uri['genes_wdid'] = wikidata_genes_uri['gene_uri'].str.replace(
    WIKIDATA_ENTITY_PREFIX, "", regex=False
)
wikidata_genes_uri['protein_wdid'] = wikidata_genes_uri['protein_uri'].str.replace(
    WIKIDATA_ENTITY_PREFIX, "", regex=False
)
wikidata_genes_uri['wiki_stub'] = wikidata_genes_uri['wiki_uri'].str.replace(
    ENWIKI_PAGE_PREFIX, "", regex=False
)

# A bare ``.head()`` only displays in an interactive cell; print it so the
# preview is visible when run as a script too.
print(wikidata_genes_uri.head())
print(len(wikidata_genes_uri))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment