Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Retrieves the latest revision of every Wikidata item returned by a SPARQL query
import requests
import PBB_login
import sys
import pprint
__author__ = 'Sebastian Burgstaller-Muehlbacher'
__license__ = 'AGPLv3'
def exec_wd_query(y):
    """Fetch revision metadata from the MediaWiki API for one batch of items.

    The request is authenticated with the cookies of the module-level
    ``login`` object.

    Args:
        y: Iterable of ``(qid, revision_id)`` pairs (the shape returned by
            ``extract_values``). Only the revision id at index 1 is sent to
            the API.

    Yields:
        ``(qid, rev_id, user_name, parent_id, timestamp)`` tuples, one for
        every revision the API returns.

    Raises:
        requests.HTTPError: If the API responds with an error status code.
    """
    # Stringify once: the ids are joined into the query string and later
    # compared against the integer ``revid`` values found in the response.
    rev_id_block = [str(x[1]) for x in y]
    if not rev_id_block:
        # Nothing to look up; avoid sending a request with an empty id list.
        return
    requested = set(rev_id_block)

    params = {
        'action': 'query',
        'revids': '|'.join(rev_id_block),
        'prop': 'revisions',
        'rvprop': 'ids|user|timestamp',
        'format': 'json',
    }

    # NOTE(review): the URL was blank in the source; restored to the standard
    # Wikidata MediaWiki API endpoint -- confirm this is the intended target.
    url = 'https://www.wikidata.org/w/api.php'

    r = requests.get(url, params=params, cookies=login.get_edit_cookie())
    r.raise_for_status()

    # 'pages' can be absent when none of the requested revision ids resolve.
    pages = r.json()['query'].get('pages', {})
    for z in pages.values():
        qid = z['title']
        for rev in z.get('revisions', ()):
            rev_id = rev['revid']
            if str(rev_id) not in requested:
                print('Warning: This reference was not requested')

            parent_id = rev['parentid']
            user_name = rev['user']
            timestamp = rev['timestamp']

            print('QID:', qid, 'rev_id:', rev_id, user_name, 'parent id:', parent_id, timestamp)
            yield (qid, rev_id, user_name, parent_id, timestamp)
def extract_values(x):
    """Reduce one SPARQL result binding to a ``(qid, revision_id)`` pair.

    Args:
        x: A single binding dict that has ``'gene'`` and ``'revision'``
            entries, each a dict carrying the actual data under ``'value'``.

    Returns:
        A 2-tuple: the last ``/``-separated segment of the gene value, and
        the revision value unchanged.
    """
    # Only the trailing path segment of the gene value is kept.
    gene_qid = x['gene']['value'].rsplit('/', 1)[-1]
    revision_id = x['revision']['value']
    return (gene_qid, revision_id)
# --- Script body -----------------------------------------------------------
# Runs a SPARQL query for (item, revision) pairs, then looks up the metadata
# of those revisions through the MediaWiki API in batches of 500.

# NOTE(review): the IRIs between <> were blank in the source; restored to the
# standard schema.org / Wikidata namespaces -- confirm.
prefix = '''
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
'''

# Selects every item that has a P351 statement and P703 = Q5, together with
# its schema:version value (presumably genes with an Entrez ID -- verify).
query = '''
select ?gene ?revision where {
?gene wdt:P351 ?entrez .
?gene wdt:P703 wd:Q5 .
?gene schema:version ?revision .
}
'''

headers = {
    'Accept': 'application/sparql-results+json',
}

params = {
    'query': prefix + query,
    'format': 'json',
}

# NOTE(review): the URL was blank in the source; restored to the public
# Wikidata SPARQL endpoint -- confirm this is the intended target.
url = 'https://query.wikidata.org/sparql'

# The bot password is taken from the first command-line argument.
if len(sys.argv) < 2:
    sys.exit('usage: {} BOT_PASSWORD'.format(sys.argv[0]))

# Must stay a module-level name: exec_wd_query reads `login` as a global.
login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

sparql_results = requests.get(url, params=params, headers=headers)
sparql_results.raise_for_status()
sr = sparql_results.json()['results']['bindings']

# Split the bindings into batches of at most 500 (qid, revision_id) pairs.
chunks = [[extract_values(x) for x in sr[i:i + 500]] for i in range(0, len(sr), 500)]

# exec_wd_query is a generator function, so no API request is sent until the
# generators are consumed by the count below.
final_results = [exec_wd_query(y) for y in chunks]

print(sum(1 for batch in final_results for _ in batch))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment