Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
structured-data-usage.py
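# Counts how often each structured-data property appears in the statements
# of the most recently uploaded files on Wikimedia Commons.
# Written by gabrielchl on 12 May 2020.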
import requests
import threading
import queue
# --- State shared between the main (producer) thread and the workers ---

# Work queue; each item is a '|'-joined string of page titles.
q = queue.Queue()

# Flipped to True by the main thread once it has finished filling the queue;
# workers use it to decide whether an empty queue means "exit".
done_adding = False

# Running tally keyed by property id.
properties_count = dict()

# Not referenced anywhere else in the visible script.
processed_count = 0
# Serialises updates to properties_count, which every worker thread modifies.
_properties_count_lock = threading.Lock()


def thread_func():
    """Worker loop: count statement properties for queued batches of titles.

    Repeatedly takes a '|'-joined string of page titles from ``q``, asks the
    Commons ``wbgetentities`` API for the matching entities and adds one to
    ``properties_count[prop]`` for every property key found in each entity's
    ``statements`` mapping.

    The loop ends once the queue has been empty for two seconds and
    ``done_adding`` is true. A batch that fails (network error, invalid
    JSON, unexpected response shape) is reported and skipped; it never
    terminates the worker.
    """
    while True:
        print('Queue size: {}'.format(q.qsize()))
        try:
            titles = q.get(True, 2)
        except queue.Empty:
            if done_adding:
                print('thread exit.')
                return
            # The producer is still running; keep polling.
            continue

        try:
            params = {
                'action': 'wbgetentities',
                'format': 'json',
                'sites': 'commonswiki',
                'titles': titles
            }
            response = requests.get(
                'https://commons.wikimedia.org/w/api.php',
                params=params,
                timeout=30  # do not let one stalled connection block a worker
            ).json()

            # Tally locally first so the shared dict is locked only briefly.
            batch_counts = {}
            for entity in response['entities'].values():
                statements = (entity.get('statements')
                              if isinstance(entity, dict) else None)
                # Entities without a statements mapping contribute nothing.
                if not isinstance(statements, dict):
                    continue
                for prop in statements:
                    # First sighting counts as 1 (it was previously stored as 0).
                    batch_counts[prop] = batch_counts.get(prop, 0) + 1

            with _properties_count_lock:
                for prop, count in batch_counts.items():
                    properties_count[prop] = (
                        properties_count.get(prop, 0) + count)
        except Exception as exc:
            # Thread boundary: a dead worker would stall q.join(), so report
            # the failed batch and carry on with the next one.
            print('Failed to process batch {!r}: {}'.format(titles, exc))
        finally:
            # Always acknowledge the item, otherwise q.join() never returns.
            q.task_done()
# Build the pool of 15 workers, then set them all running.
threads = [threading.Thread(target=thread_func) for _ in range(15)]
for thread in threads:
    thread.start()
# Producer: page through the newest Commons uploads (500 per API request),
# queueing their titles for the workers in groups of 10, until about one
# million images have been listed or the listing runs out.
image_count = 0
continue_key = ''
try:
    while image_count < 1000000:
        allimages_params = {
            'action': 'query',
            'format': 'json',
            'list': 'allimages',
            'aisort': 'timestamp',
            'aidir': 'descending',
            'ailimit': '500'
        }
        if continue_key:
            allimages_params['aicontinue'] = continue_key
        allimages_response = requests.get(
            'https://commons.wikimedia.org/w/api.php',
            params=allimages_params,
            timeout=30  # fail instead of hanging forever on a stalled request
        ).json()

        # Extract the titles once per page rather than once per chunk.
        batch_titles = [image['title'] for image in
                        allimages_response['query']['allimages']]
        # Count what was actually returned; a page may hold fewer than 500.
        image_count += len(batch_titles)
        print('Got new batch of images. {}'.format(image_count))

        # Stepping over the real length never queues an empty title string.
        for start in range(0, len(batch_titles), 10):
            q.put('|'.join(batch_titles[start:start + 10]))

        # No continuation token in the response means there is nothing more
        # to fetch.
        continue_key = allimages_response.get('continue', {}).get(
            'aicontinue', '')
        if not continue_key:
            break
finally:
    # Set the flag even if the producer failed, so the (non-daemon) workers
    # can drain the queue and exit rather than polling forever.
    done_adding = True

# Wait until every queued batch has been acknowledged by a worker.
q.join()
print(properties_count)
print(image_count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment