Skip to content

Instantly share code, notes, and snippets.

@gtsueng
Created August 7, 2019 19:01
Show Gist options
  • Save gtsueng/3cd5886ff97b77eadde0853fae057734 to your computer and use it in GitHub Desktop.
Save gtsueng/3cd5886ff97b77eadde0853fae057734 to your computer and use it in GitHub Desktop.
Pull all human genes in Wikidata with a corresponding Wikipedia page
import mygene
import pandas
import json
import urllib.request
import requests
from collections import OrderedDict
from pandas import read_csv
# Public SPARQL endpoint of the Wikidata Query Service (WDQS).
url = 'https://query.wikidata.org/sparql'

# Selects every item that is an instance of "gene" (P31 = Q7187) found in
# human (P703 = Q15978631) and that has an English Wikipedia article.
# The encoded protein (P688) and the Entrez Gene ID (P351) are OPTIONAL,
# so either variable may be unbound in a given result row.
query = """
SELECT
?gene ?geneid ?protein ?article
WHERE {
?gene wdt:P31 wd:Q7187; #Find genes
wdt:P703 wd:Q15978631. #limit to humans
?article schema:about ?gene. #limit to genes with corresponding English Wikipedia articles
?article schema:inLanguage "en".
?article schema:isPartOf <https://en.wikipedia.org/>
OPTIONAL { ?gene wdt:P688 ?protein } #Get associated proteins
OPTIONAL { ?gene wdt:P351 ?geneid } #Get the geneid
}
"""

# Wikimedia's User-Agent policy asks API clients to identify themselves;
# generic library agents may be rate-limited or rejected outright.
headers = {
    'User-Agent': (
        'human-genes-with-wikipedia-page/1.0 '
        '(https://gist.github.com/gtsueng/3cd5886ff97b77eadde0853fae057734)'
    ),
}

# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(
    url,
    params={'format': 'json', 'query': query},
    headers=headers,
    timeout=120,
)
# Fail loudly on HTTP errors (e.g. 429 throttling, 5xx query timeout) rather
# than letting r.json() choke on an HTML/plain-text error body.
r.raise_for_status()
data = r.json()
print("query completed")
def _binding_value(binding, name):
    """Return the 'value' of variable *name* in one result row, or None if unbound."""
    return binding[name]['value'] if name in binding else None


# One flat record per SPARQL result row. OPTIONAL variables that did not
# match are simply absent from the row and are recorded as None.
genes = [
    OrderedDict([
        ('gene_uri', row['gene']['value']),
        ('gene_id', _binding_value(row, 'geneid')),
        ('protein_uri', _binding_value(row, 'protein')),
        ('wiki_uri', _binding_value(row, 'article')),
    ])
    for row in data['results']['bindings']
]
# URI prefixes stripped to obtain bare identifiers (Q-ids / article titles).
_WD_ENTITY_PREFIX = "http://www.wikidata.org/entity/"
_ENWIKI_PREFIX = "https://en.wikipedia.org/wiki/"

# Naming the columns explicitly keeps the frame well-formed (and the lookups
# below valid) even when the query returned no rows at all.
wikidata_genes_uri = pandas.DataFrame(
    genes,
    columns=['gene_uri', 'gene_id', 'protein_uri', 'wiki_uri'],
)

# Derive short IDs from the full URIs.
# * No .astype(str): that would turn a missing value (None) into the literal
#   string "None"; the .str accessor leaves missing values missing instead.
# * regex=False: the prefixes are literal text, not patterns, and the default
#   of `regex` differs between pandas versions.
wikidata_genes_uri['genes_wdid'] = wikidata_genes_uri['gene_uri'].str.replace(
    _WD_ENTITY_PREFIX, "", regex=False)
wikidata_genes_uri['protein_wdid'] = wikidata_genes_uri['protein_uri'].str.replace(
    _WD_ENTITY_PREFIX, "", regex=False)
wikidata_genes_uri['wiki_stub'] = wikidata_genes_uri['wiki_uri'].str.replace(
    _ENWIKI_PREFIX, "", regex=False)

# Preview of the first rows; the result is only shown when this is evaluated
# interactively (e.g. as the last expression of a notebook cell).
wikidata_genes_uri.head()
print(len(wikidata_genes_uri))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment