Created
September 26, 2022 14:56
-
-
Save brawer/776ef9bd0870a8d2484629a16be40735 to your computer and use it in GitHub Desktop.
query Wikidata redirects
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

import requests
def _query(query):
    """Run a SPARQL query against the Wikidata Query Service.

    Args:
        query: SPARQL query text.

    Returns:
        The response body split into a list of lines. The request asks for
        tab-separated values; callers in this file treat the first line as
        the header row and the remaining lines as result rows.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
    """
    r = requests.post('https://query.wikidata.org/sparql',
                      params={'query': query},
                      headers={'Accept': 'text/tab-separated-values'})
    # Fail loudly on an error response instead of handing an error body to
    # callers that expect TSV rows.
    r.raise_for_status()
    return r.text.splitlines()
# Matches a whole result cell holding an entity IRI, e.g.
# <http://www.wikidata.org/entity/Q42>; group 1 is the bare ID ("Q42").
_entity_re = re.compile(r'^<http://www.wikidata.org/entity/([A-Z]\d+)>$')

# Matches a whole result cell holding an xsd:dateTime typed literal, e.g.
# "2022-09-26T14:56:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>;
# group 1 is the timestamp without the quotes and datatype suffix.
_timestamp_re = re.compile(
    r'^"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"'
    r'\^\^<http://www.w3.org/2001/XMLSchema#dateTime>$')
def _query_redirect_chunk(offset, limit):
    """Yield the redirects found in one slice of owl:sameAs statements.

    Args:
        offset: Integer substituted into the query as bd:slice.offset.
        limit: Integer substituted into the query as bd:slice.limit.

    Yields:
        (src, target, timestamp) string tuples: the redirecting entity ID,
        the entity ID it points to, and the schema:dateModified value of the
        redirecting entity.

    Raises:
        ValueError: If a result row does not consist of two entity IRIs
            followed by an xsd:dateTime literal.
    """
    query = '''
        SELECT ?a ?b ?modified WHERE {
          SERVICE bd:slice {
            ?a owl:sameAs ?b .
            bd:serviceParam bd:slice.offset %d .
            bd:serviceParam bd:slice.limit %d .
          }
          ?a schema:dateModified ?modified .
        }''' % (offset, limit)
    # The first line of the response is skipped as the TSV header row.
    for line in _query(query)[1:]:
        cells = line.split('\t')
        if len(cells) < 3:
            raise ValueError('unexpected result row: %r' % line)
        src = _entity_re.search(cells[0])
        target = _entity_re.search(cells[1])
        timestamp = _timestamp_re.search(cells[2])
        # Report the offending row rather than failing on None.group().
        if src is None or target is None or timestamp is None:
            raise ValueError('unexpected result row: %r' % line)
        yield (src.group(1), target.group(1), timestamp.group(1))
def _query_redirects(limit=100000):
    """Yield every redirect, fetched slice by slice.

    The total number of owl:sameAs statements is queried first and then
    walked in steps of ``limit`` so that no single request has to return
    the whole result set.

    Args:
        limit: Number of owl:sameAs statements requested per slice.

    Yields:
        (src, target, timestamp) tuples as produced by
        _query_redirect_chunk().
    """
    # Line 0 of the response is the header row; line 1 holds the count.
    count = int(_query('SELECT (COUNT(*) as ?c) WHERE {[] owl:sameAs [].}')[1])
    for off in range(0, count, limit):
        yield from _query_redirect_chunk(off, limit)
if __name__ == '__main__':
    # Print each redirect as it is yielded: source ID, target ID, timestamp.
    for src, target, timestamp in _query_redirects():
        print(src, target, timestamp)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment