# structured-data-usage.py
# written by gabrielchl
# on 12 May 2020
import queue
import threading

import requests
# Flipped to True by the producer once every batch of titles has been queued;
# a worker only exits on an empty queue after it sees this flag set.
done_adding = False
# Work queue of '|'-joined title batches consumed by thread_func.
q = queue.Queue()
# Maps each statement property key to the number of entities carrying it;
# filled in by the worker threads.
properties_count = {}
# NOTE(review): not referenced anywhere in the visible code -- confirm it is
# unused before removing.
processed_count = 0
# Serialises updates to properties_count: incrementing a dict entry is a
# read-modify-write, and every worker thread updates the same dict.
_properties_lock = threading.Lock()


def _tally_properties(entities, counts):
    """Add one to counts[prop] for each property present on each entity."""
    for entity in entities:
        statements = entity.get('statements') if isinstance(entity, dict) else None
        # Skip entities whose 'statements' is missing or not a mapping; the
        # earlier code skipped these through a broad `except Exception`.
        if not isinstance(statements, dict):
            continue
        for prop in statements:
            # First sighting counts as 1 (it used to be stored as 0, which
            # left every total one short).
            counts[prop] = counts.get(prop, 0) + 1


def thread_func():
    """Worker loop: fetch entities for queued title batches and count properties.

    Repeatedly takes a '|'-joined batch of titles from ``q`` (waiting up to
    two seconds), requests the matching entities through ``wbgetentities`` and
    increments ``properties_count`` for every key found under an entity's
    ``statements``.  Returns once the queue is empty and ``done_adding`` is
    set.  A batch whose request or JSON decoding fails is reported and
    skipped.
    """
    while True:
        print('Queue size: {}'.format(q.qsize()))
        try:
            titles = q.get(True, 2)
        except queue.Empty:
            if done_adding:
                print('thread exit.')
                return
            # The producer is still adding work; poll again.
            continue

        try:
            params = {
                'action': 'wbgetentities',
                'format': 'json',
                'sites': 'commonswiki',
                'titles': titles
            }
            response = requests.get(
                'https://commons.wikimedia.org/w/api.php',
                params=params
            ).json()
            # A response without 'entities' contributes nothing instead of
            # killing the worker with a KeyError.
            entities = response.get('entities', {})
            with _properties_lock:
                _tally_properties(entities.values(), properties_count)
        except (requests.RequestException, ValueError) as err:
            print('Skipping batch after failed request: {}'.format(err))
        finally:
            # Always acknowledge the batch, otherwise q.join() never returns
            # after a failure.
            q.task_done()
# Start the worker pool; each worker drains title batches from `q`.
threads = []
for i in range(15):
    thread = threading.Thread(target=thread_func)
    threads.append(thread)
    thread.start()

# Page through the most recent uploads (newest first) and queue their titles
# for the workers, ten titles per queue item.
image_count = 0
continue_key = ''
try:
    while image_count < 1000000:
        allimages_params = {
            'action': 'query',
            'format': 'json',
            'list': 'allimages',
            'aisort': 'timestamp',
            'aidir': 'descending',
            'ailimit': '500'
        }
        if continue_key != '':
            allimages_params['aicontinue'] = continue_key
        allimages_response = requests.get(
            'https://commons.wikimedia.org/w/api.php',
            params=allimages_params
        ).json()
        images = allimages_response['query']['allimages']
        # Count what was actually returned rather than assuming a full page.
        image_count += len(images)
        print('Got new batch of images. {}'.format(image_count))
        # Slice by the real page length so a short page never queues empty
        # title strings.
        for start in range(0, len(images), 10):
            titles = '|'.join(
                image['title'] for image in images[start:start + 10]
            )
            q.put(titles)
        # Read the continuation token only after the page has been queued; a
        # response without one ends the listing instead of raising KeyError.
        continue_key = allimages_response.get('continue', {}).get(
            'aicontinue', '')
        if continue_key == '':
            break
finally:
    # Set the flag even if listing failed, so the workers can drain the queue
    # and exit instead of polling forever.
    done_adding = True

# Wait until every queued batch has been acknowledged by a worker.
q.join()
print(properties_count)
print(image_count)
# (Web-page footer captured along with the gist; not part of the script:
# "Sign up for free to join this conversation on GitHub. Already have an
# account? Sign in to comment")