Skip to content

Instantly share code, notes, and snippets.

@brawer
Created September 26, 2022 14:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save brawer/776ef9bd0870a8d2484629a16be40735 to your computer and use it in GitHub Desktop.
Save brawer/776ef9bd0870a8d2484629a16be40735 to your computer and use it in GitHub Desktop.
query Wikidata redirects
import re, requests
def _query(query):
    """Run a SPARQL query against the Wikidata Query Service.

    The query is POSTed to ``https://query.wikidata.org/sparql`` with an
    ``Accept: text/tab-separated-values`` header, so the body comes back
    as tab-separated text.

    Args:
        query: SPARQL query text.

    Returns:
        The response body as a list of lines (``str.splitlines``). Callers
        in this file treat element 0 as the header row and skip it.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if connecting or reading stalls for too long.
    """
    r = requests.post('https://query.wikidata.org/sparql',
                      params={'query': query},
                      headers={'Accept': 'text/tab-separated-values'},
                      # (connect, read) seconds; without a timeout a stalled
                      # connection would block the script forever.
                      timeout=(30, 300))
    # Fail loudly on throttling/server errors instead of handing an error
    # page to callers that would try to parse it as TSV rows.
    r.raise_for_status()
    return r.text.splitlines()
# Matches a Wikidata entity IRI in angle brackets, e.g.
# <http://www.wikidata.org/entity/Q42>; group 1 is the bare ID ("Q42").
# Dots in the host name are escaped so they match only a literal ".".
_entity_re = re.compile(r'^<http://www\.wikidata\.org/entity/([A-Z]\d+)>$')
# Matches an xsd:dateTime typed literal, e.g.
# "2022-09-26T14:56:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime>;
# group 1 is the timestamp text without quotes or datatype suffix.
_timestamp_re = re.compile(
    r'^"(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z)"'
    r'\^\^<http://www\.w3\.org/2001/XMLSchema#dateTime>$')
def _query_redirect_chunk(offset, limit):
    """Yield one slice of ``owl:sameAs`` pairs with their modification time.

    Builds a SPARQL query that selects ``?a owl:sameAs ?b`` inside a
    ``bd:slice`` service call bounded by ``offset`` and ``limit``, joins
    each ``?a`` with its ``schema:dateModified`` value, and parses the
    tab-separated rows returned by ``_query``.

    Args:
        offset: value substituted for ``bd:slice.offset`` (formatted as %d).
        limit: value substituted for ``bd:slice.limit`` (formatted as %d).

    Yields:
        ``(src, target, timestamp)`` tuples of strings: the entity IDs
        captured from columns 0 and 1, and the timestamp text captured
        from column 2.

    Raises:
        ValueError: if a non-blank result row does not have three columns
            in the expected entity/entity/dateTime form.
    """
    query = '''
SELECT ?a ?b ?modified WHERE {
SERVICE bd:slice {
?a owl:sameAs ?b .
bd:serviceParam bd:slice.offset %d .
bd:serviceParam bd:slice.limit %d .
}
?a schema:dateModified ?modified .
}''' % (offset, limit)
    # The first line of the response is the header row; skip it.
    for line in _query(query)[1:]:
        if not line.strip():
            # Tolerate stray blank lines rather than failing on them.
            continue
        c = line.split('\t')
        if len(c) < 3:
            raise ValueError('unexpected row in query result: %r' % (line,))
        src = _entity_re.search(c[0])
        target = _entity_re.search(c[1])
        timestamp = _timestamp_re.search(c[2])
        if src is None or target is None or timestamp is None:
            # Report the offending row instead of an AttributeError on None.
            raise ValueError('unexpected row in query result: %r' % (line,))
        yield (src.group(1), target.group(1), timestamp.group(1))
def _query_redirects(limit=100000):
    """Yield every ``owl:sameAs`` pair, fetched in fixed-size slices.

    First asks the endpoint how many ``owl:sameAs`` statements exist, then
    walks offsets ``0, limit, 2*limit, ...`` below that count and streams
    the rows of each slice from ``_query_redirect_chunk``.

    Args:
        limit: number of ``owl:sameAs`` statements requested per slice.
            Defaults to 100000, the previously hard-coded value.

    Yields:
        ``(src, target, timestamp)`` tuples exactly as produced by
        ``_query_redirect_chunk``.
    """
    # Line 0 of the response is the header; line 1 carries the count.
    count = int(_query('SELECT (COUNT(*) as ?c) WHERE {[] owl:sameAs [].}')[1])
    for off in range(0, count, limit):
        yield from _query_redirect_chunk(off, limit)
if __name__ == '__main__':
    # Script entry point: print each redirect as "src target timestamp",
    # one per line, separated by single spaces.
    for redirect in _query_redirects():
        print(*redirect)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment