Skip to content

Instantly share code, notes, and snippets.

@morrisalp
Created September 10, 2023 16:43
Show Gist options
  • Save morrisalp/64d572bc61c91e647b8986b4127663d0 to your computer and use it in GitHub Desktop.
Save morrisalp/64d572bc61c91e647b8986b4127663d0 to your computer and use it in GitHub Desktop.
Simple API calls to get lists of files and (sub)categories from Wikimedia Commons
import json
from urllib.parse import urlencode

import pandas as pd
import requests
from tqdm.auto import tqdm
def kwargs2url(**kwargs):
    """Build a Wikimedia Commons API URL from keyword arguments.

    Each keyword becomes one ``key=value`` pair in the query string, in the
    order given. Keys and values are percent-encoded so that characters such
    as ``&``, ``=``, ``#`` or non-ASCII letters inside a value cannot corrupt
    the query string.

    Args:
        **kwargs: Query parameters; values are converted with ``str()``.

    Returns:
        The full request URL as a string. With no arguments this is the
        bare endpoint followed by ``?``.
    """
    base = 'https://commons.wikimedia.org/w/api.php?'
    # ':' is kept literal so titles such as 'Category:Foo' stay readable;
    # everything else that is unsafe in a query string gets escaped.
    return base + urlencode(kwargs, safe=':')
# Query parameters shared by every category-members request.
base_kwargs = dict(
    format='json',           # ask for a JSON response body
    action='query',
    list='categorymembers',  # enumerate the members of a category
    cmlimit='500',           # page size requested per call
)
def datagen(cat, cmtype):
    """Yield every member of a Wikimedia Commons category, page by page.

    Requests are repeated with the continuation token from the previous
    response until the API stops returning a ``continue`` section.

    Args:
        cat: Category name without the ``Category:`` prefix; spaces are
            replaced by underscores before the request is made.
        cmtype: Value for the API's ``cmtype`` parameter (this file uses
            ``'file'`` and ``'subcat'``).

    Yields:
        One dict per member, exactly as found under
        ``query.categorymembers`` in the API response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response body carries an API-level error.
    """
    # The title is the same for every page, so build it once.
    title = f'Category:{cat.replace(" ", "_")}'
    cont = ''
    while True:
        params = dict(base_kwargs, cmtype=cmtype, cmtitle=title)
        if cont:
            # Only send a continuation token once we actually have one.
            params['cmcontinue'] = cont
        # A timeout keeps a stalled connection from hanging the generator.
        res = requests.get(kwargs2url(**params), timeout=30)
        # Explicit check instead of `assert`, which is stripped under -O.
        res.raise_for_status()
        obj = res.json()
        if 'error' in obj:
            # The API reports failures in the body, not via the HTTP status.
            raise RuntimeError(f'Wikimedia API error: {obj["error"]}')
        yield from obj['query']['categorymembers']
        if 'continue' not in obj:
            break
        cont = obj['continue']['cmcontinue']
def get_filenames(cat):
    """Return a DataFrame listing the files in category ``cat``.

    Each row is one member dict yielded by ``datagen(cat, 'file')``; the
    columns are whatever fields the API returns for a member. A tqdm
    progress counter is shown while the pages are being fetched.
    """
    members = datagen(cat, 'file')
    rows = [member for member in tqdm(members)]
    return pd.DataFrame(rows)
def get_subcategories(cat):
    """Return a DataFrame listing the direct subcategories of ``cat``.

    Each row is one member dict yielded by ``datagen(cat, 'subcat')``; the
    columns are whatever fields the API returns for a member. A tqdm
    progress counter is shown while the pages are being fetched.
    """
    members = datagen(cat, 'subcat')
    rows = [member for member in tqdm(members)]
    return pd.DataFrame(rows)
# Try the following (each call returns a DataFrame):
# get_filenames('Tibetan Plateau')
# get_subcategories('Tibetan Plateau')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment