Created
February 18, 2016 06:22
-
-
Save sebotic/321c7b7bd3a5017a7d43 to your computer and use it in GitHub Desktop.
Retrieves the latest revision of every Wikidata item returned by a SPARQL query
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import PBB_login | |
import sys | |
import pprint | |
__author__ = 'Sebastian Burgstaller-Muehlbacher' | |
__license__ = 'AGPLv3' | |
def exec_wd_query(y):
    """Look up revision metadata on the Wikidata API for one batch of items.

    Parameters:
        y: iterable of pairs whose second element (index 1) is a revision
            id, e.g. the tuples returned by ``extract_values``. Only the
            revision id is sent to the API.

    Yields:
        One ``(qid, rev_id, user_name, parent_id, timestamp)`` tuple per
        revision found in the API response, where ``qid`` is the page
        title reported by the API.

    The request is authenticated with the cookies of the module-level
    ``login`` object, which must exist before the generator is consumed.
    Each yielded revision is also printed to stdout.
    """
    # Coerce to str so that '|'.join works even if a caller passes ints.
    rev_id_block = [str(x[1]) for x in y]
    if not rev_id_block:
        # Nothing to look up; skip the HTTP round trip entirely.
        return

    # Set membership keeps the per-revision sanity check constant-time.
    requested_ids = set(rev_id_block)

    params = {
        'action': 'query',
        'revids': '|'.join(rev_id_block),
        'prop': 'revisions',
        'rvprop': 'ids|user|timestamp',
        'format': 'json'
    }

    url = 'https://www.wikidata.org/w/api.php'

    r = requests.get(url, params=params, cookies=login.get_edit_cookie())

    for z in r.json()['query']['pages'].values():
        qid = z['title']
        for rev in z['revisions']:
            rev_id = rev['revid']
            # The API returned a revision that was not part of this batch.
            if str(rev_id) not in requested_ids:
                print('Warning: This reference was not requested')

            parent_id = rev['parentid']
            user_name = rev['user']
            timestamp = rev['timestamp']

            print('QID:', qid, 'rev_id:', rev_id, user_name, 'parent id:', parent_id, timestamp)
            yield (qid, rev_id, user_name, parent_id, timestamp)
def extract_values(x):
    """Reduce one SPARQL result binding to a ``(gene_qid, revision_id)`` pair.

    ``x`` is a mapping with ``'gene'`` and ``'revision'`` entries, each of
    which holds its payload under ``'value'``. The gene value is cut down
    to the text after its last ``/``; the revision value is passed through
    unchanged.
    """
    gene_uri = x['gene']['value']
    # Keep only the trailing path segment of the gene value.
    return (gene_uri.rsplit('/', 1)[-1], x['revision']['value'])
# Namespace declarations prepended to the SPARQL query below.
prefix = '''
PREFIX schema: <http://schema.org/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
'''

# Selects every ?gene matching the P351/P703 patterns together with its
# schema:version, bound as ?revision.
query = '''
select ?gene ?revision where {
?gene wdt:P351 ?entrez .
?gene wdt:P703 wd:Q5 .
?gene schema:version ?revision .
}
'''

headers = {'Accept': 'application/sparql-results+json'}
params = {'query': prefix + query, 'format': 'json'}

url = 'https://query.wikidata.org/sparql'
print(url)

# NOTE(review): sys.argv[1] is the value passed as `pwd` below, so this line
# echoes the bot password to stdout — confirm that this is intended.
print(sys.argv[1])
login = PBB_login.WDLogin(user='ProteinBoxBot', pwd=sys.argv[1])

sparql_results = requests.get(url, params=params, headers=headers)
sr = sparql_results.json()['results']['bindings']

# Split the bindings into batches of at most 500 (gene_qid, revision_id)
# pairs; presumably 500 matches the API's per-request id limit — confirm.
chunks = [
    list(map(extract_values, sr[start:start + 500]))
    for start in range(0, len(sr), 500)
]

# exec_wd_query is a generator function: no API request is made until the
# generators are drained by the counting expression below.
final_results = list(map(exec_wd_query, chunks))
print(sum(1 for batch in final_results for _ in batch))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment