Skip to content

Instantly share code, notes, and snippets.

@ZanSara
Created November 28, 2022 13:38
Show Gist options
  • Save ZanSara/6383e4742865b2879a1bde8a4ccb5e2c to your computer and use it in GitHub Desktop.
Save ZanSara/6383e4742865b2879a1bde8a4ccb5e2c to your computer and use it in GitHub Desktop.
Script to download all Wikimedia images in a category
# From https://colab.research.google.com/drive/12jGo_tm2bAD7NRiqxvF-XfKfEWgKIx4X#scrollTo=sDL9EihTwBaC&uniqifier=1
# pip install lxml aiohttp nest_asyncio aiofiles   (asyncio ships with Python 3 and must not be pip-installed)
import shutil
from lxml import etree
from lxml import html
import aiohttp
import asyncio
import aiofiles
import nest_asyncio
import os
# Patch asyncio via nest_asyncio before any loop is used.
# NOTE(review): presumably needed because the script originated in a
# notebook, where an event loop is already running -- confirm.
nest_asyncio.apply()

# ---- Settings -------------------------------------------------------------
# Root category page where the crawl starts.
url = 'https://commons.wikimedia.org/wiki/Category:Animals_in_Jardim_Zoológico_de_Lisboa'
# Images are written to <storeDirectory><category>/<filename>.
storeDirectory = 'wikimedia_images/'
# When True, fetch_images also follows the subcategory links it finds.
checkForCategories = True

# ---- Shared crawl state (mutated by fetch_images / main) --------------------
# tasks:             pending fetches of individual image pages
# categoryTasks:     pending fetch_images() crawls of subcategories
# checkedCategories: subcategories that have already been queued
tasks, categoryTasks, checkedCategories = [], [], []
# Not referenced by the functions visible in this script.
categories = 0
# Incremented at the end of fetch_images(); starts at -1 so that the root
# crawl brings it to 0, matching an empty categoryTasks list.
completed = -1
# Progress counters: images discovered vs. images processed so far.
totalImages = completedImages = 0
async def fetch_page(session, url, cat=''):
    """Download *url* and parse the response body into an lxml HTML tree.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with an awaitable ``text()`` (``main``
            passes an ``aiohttp.ClientSession``).
        url: Address of the page to download.
        cat: Label returned unchanged alongside the parsed page so callers
            can tell which category the page belongs to.

    Returns:
        ``[cat, dom]`` where ``dom`` is the parsed document, or ``False``
        when the request timed out or the connection could not be
        established.
    """
    try:
        async with session.get(url) as resp:
            source = await resp.text()
    except (asyncio.TimeoutError, aiohttp.ClientConnectorError):
        # The exceptions must be given as a tuple: ``A or B`` evaluates to
        # just ``A``, which let connection errors escape this handler.
        return False
    return [cat, html.fromstring(source)]
async def fetch_images(session, url):
    """Scan one category page and queue its subcategories and image pages.

    For every not-yet-seen subcategory link a new ``fetch_images`` crawl is
    scheduled and appended to ``categoryTasks`` (only when
    ``checkForCategories`` is true). For every thumbnail link a
    ``fetch_page`` download of the image's page is scheduled and appended
    to ``tasks``, labelled with the category name taken from *url*.

    Args:
        session: Session object forwarded to ``fetch_page``.
        url: Category page address; must contain ``'Category:'`` when the
            page lists images, because the text after it names the category.

    Returns:
        None. Results are communicated through the module-level lists and
        the ``totalImages`` / ``completed`` counters.
    """
    global totalImages, completed
    try:
        page = await fetch_page(session, url)
        # fetch_page signals a timeout / connection failure with False.
        if page is False:
            return
        dom = page[1]
        images = dom.xpath('*//div[@class="thumb"]//a')
        subcategories = dom.xpath('*//div[@class="CategoryTreeItem"]//a')

        if checkForCategories:
            for category in subcategories:
                href = category.attrib['href']
                # Deduplicate by link target: element objects parsed from
                # different pages are never equal to one another, so
                # comparing elements would let the same category be crawled
                # again and again.
                if href in checkedCategories:
                    continue
                checkedCategories.append(href)
                categoryTasks.append(asyncio.ensure_future(
                    fetch_images(session, 'https://commons.wikimedia.org' + href)))
                print('Found category', href)

        if images:
            totalImages += len(images)
            print("Found", len(images), "images")
            # Same for every image on this page, so compute it once.
            cat = url.split('Category:')[1]
            for image in images:
                tasks.append(asyncio.ensure_future(fetch_page(
                    session,
                    'https://commons.wikimedia.org' + image.attrib['href'],
                    cat)))
    finally:
        # Count this crawl as finished on every exit path, including the
        # early return after a failed fetch; otherwise a caller waiting for
        # ``completed`` to reach ``len(categoryTasks)`` would wait forever.
        completed += 1
async def main(loop):
    """Crawl the root category, then download every image that was found.

    Steps:
      1. Crawl ``url`` with ``fetch_images`` and wait until no further
         subcategory crawls are being scheduled.
      2. Await all queued image-page fetches in ``tasks``.
      3. For each fetched page, take the ``src`` of the first ``<img>``
         inside ``div.fullImageLink`` and save that file to
         ``<storeDirectory><category>/<filename>``.

    Args:
        loop: Event loop handed to ``aiohttp.ClientSession``.

    Returns:
        None. Progress is printed and counted in ``completedImages``, which
        is incremented only for images that were actually saved.
    """
    global completedImages
    async with aiohttp.ClientSession(loop=loop) as session:
        await fetch_images(session, url)

        # Running crawls may append new entries to categoryTasks. Keep
        # gathering until one full pass finishes without the list growing;
        # at that point every scheduled crawl is done. This does not rely on
        # a completion counter, so a crawl that exits early cannot stall it.
        while True:
            pending = list(categoryTasks)
            await asyncio.gather(*pending)
            if len(categoryTasks) == len(pending):
                break

        pages = await asyncio.gather(*tasks)
        for page in pages:
            # fetch_page returns False on timeout / connection failure.
            if page is False:
                continue
            cat, source = page

            matches = source.xpath('*//div[@class="fullImageLink"]//img')
            if not matches:
                # No full-image element on this page; nothing to download.
                continue
            imgURL = matches[0].attrib['src']
            if imgURL.startswith('//'):
                # Protocol-relative link: give it an explicit scheme.
                imgURL = 'https:' + imgURL
            filename = imgURL.split('/')[-1]
            targetDir = storeDirectory + cat + '/'

            try:
                async with session.get(imgURL) as resp:
                    if resp.status != 200:
                        print('Skipping', imgURL, '- HTTP status', resp.status)
                        continue
                    data = await resp.read()
            except (asyncio.TimeoutError, aiohttp.ClientError):
                # Best effort, like fetch_page: one failed download must not
                # abort the remaining ones.
                print('Skipping', imgURL, '- download failed')
                continue

            # makedirs also creates storeDirectory itself when it is missing
            # and tolerates an already existing category folder.
            os.makedirs(targetDir, exist_ok=True)
            # The context manager closes the file even if the write fails.
            async with aiofiles.open(targetDir + filename, mode='wb') as f:
                await f.write(data)

            completedImages += 1
            print(completedImages, '/', totalImages)
# Main event loop: obtain the current loop and drive main() to completion.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    # Newer Python versions raise here instead of implicitly creating a loop
    # when none is set for the current thread; create and install one.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
loop.run_until_complete(main(loop))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment