Skip to content

Instantly share code, notes, and snippets.

@sebotic
Created February 18, 2016 06:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sebotic/321c7b7bd3a5017a7d43 to your computer and use it in GitHub Desktop.
Save sebotic/321c7b7bd3a5017a7d43 to your computer and use it in GitHub Desktop.
Retrieves the latest revision of every Wikidata item returned by a SPARQL query
import requests
import PBB_login
import sys
import pprint
__author__ = 'Sebastian Burgstaller-Muehlbacher'
__license__ = 'AGPLv3'
def exec_wd_query(y):
    """Fetch revision metadata for one batch of Wikidata revisions.

    Args:
        y: Iterable of ``(qid, revision_id)`` pairs. Only the revision id
            (index 1) of each pair is sent to the API.

    Yields:
        ``(qid, rev_id, user_name, parent_id, timestamp)`` tuples, one per
        revision found in the API response.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        KeyError: If the JSON response carries no ``query`` section.

    This is a generator, so no request is made until it is iterated. It
    reads the session cookies from the module-level ``login`` object.
    """
    # Normalise to str so the '|'.join below also works for integer ids.
    rev_id_block = [str(x[1]) for x in y]
    if not rev_id_block:
        # Nothing to look up; an empty 'revids' parameter would only produce
        # a response without the keys parsed below.
        return
    requested = set(rev_id_block)  # O(1) membership test per revision

    params = {
        'action': 'query',
        'revids': '|'.join(rev_id_block),
        'prop': 'revisions',
        'rvprop': 'ids|user|timestamp',
        'format': 'json'
    }
    url = 'https://www.wikidata.org/w/api.php'
    r = requests.get(url, params=params, cookies=login.get_edit_cookie())
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    r.raise_for_status()

    # NOTE(review): when none of the requested ids exist the API appears to
    # answer with only 'badrevids' and no 'pages' -- treat that as "no rows".
    pages = r.json()['query'].get('pages', {})
    for page in pages.values():
        qid = page['title']
        for rev in page['revisions']:
            rev_id = rev['revid']
            if str(rev_id) not in requested:
                print('Warning: This reference was not requested')
            parent_id = rev['parentid']
            user_name = rev['user']
            timestamp = rev['timestamp']
            print('QID:', qid, 'rev_id:', rev_id, user_name, 'parent id:', parent_id, timestamp)
            yield (qid, rev_id, user_name, parent_id, timestamp)
def extract_values(x):
    """Reduce one SPARQL result binding to a ``(qid, revision_id)`` pair.

    Args:
        x: A binding dict with ``'gene'`` and ``'revision'`` entries, each a
            dict holding the bound term under ``'value'``.

    Returns:
        A 2-tuple: the last ``/``-separated segment of the gene value
        (the whole value if it contains no ``/``) and the revision value
        exactly as received.

    Raises:
        KeyError: If either entry or its ``'value'`` key is missing.
    """
    # The gene is bound as an entity URI; keep only its trailing segment.
    gene_qid = x['gene']['value'].rsplit('/', 1)[-1]
    revision_id = x['revision']['value']
    return gene_qid, revision_id
# Namespace declarations prepended to the SPARQL query below.
prefix = '''
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
'''

# Selects every item that has a P351 statement and P703 = Q5, together with
# its schema:version (the revision id handed to exec_wd_query).
# NOTE(review): P351 / P703 / Q5 are presumably "Entrez Gene ID" /
# "found in taxon" / "human" -- confirm against Wikidata.
query = '''
select ?gene ?revision where {
?gene wdt:P351 ?entrez .
?gene wdt:P703 wd:Q5 .
?gene schema:version ?revision .
}
'''

headers = {
    'Accept': 'application/sparql-results+json'
}

params = {
    'query': prefix + query,
    'format': 'json'
}

url = 'https://query.wikidata.org/sparql'

# Number of revision ids sent per API request.
# NOTE(review): 500 assumes the account has the raised bot API limits;
# regular accounts are presumably capped lower -- confirm.
CHUNK_SIZE = 500

# Run the network work only when executed as a script, so importing this
# module neither logs in nor fires any request.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('usage: {} <ProteinBoxBot password>'.format(sys.argv[0]))

    print(url)
    # The password (sys.argv[1]) is deliberately no longer echoed to stdout.
    login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

    sparql_results = requests.get(url, params=params, headers=headers)
    # Surface HTTP failures here rather than as a JSON decoding error below.
    sparql_results.raise_for_status()
    sr = sparql_results.json()['results']['bindings']

    chunks = [
        [extract_values(x) for x in sr[i:i + CHUNK_SIZE]]
        for i in range(0, len(sr), CHUNK_SIZE)
    ]
    # exec_wd_query is a generator: the API calls happen while counting below.
    final_results = [exec_wd_query(y) for y in chunks]
    print(sum(1 for y in final_results for _ in y))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment