Created
November 28, 2022 13:38
-
-
Save ZanSara/6383e4742865b2879a1bde8a4ccb5e2c to your computer and use it in GitHub Desktop.
Script to download all Wikimedia images in a category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://colab.research.google.com/drive/12jGo_tm2bAD7NRiqxvF-XfKfEWgKIx4X#scrollTo=sDL9EihTwBaC&uniqifier=1 | |
# pip install lxml aiohttp asyncio nest_asyncio aiofiles | |
import shutil | |
from lxml import etree | |
from lxml import html | |
import aiohttp | |
import asyncio | |
import aiofiles | |
import nest_asyncio | |
import os | |
nest_asyncio.apply()  # allow re-entrant event loops (needed in notebooks/Colab)
# Root Wikimedia Commons category to crawl, and local output directory.
url = 'https://commons.wikimedia.org/wiki/Category:Animals_in_Jardim_Zoológico_de_Lisboa'
storeDirectory = 'wikimedia_images/'
# When True, fetch_images also recurses into subcategories of the root.
checkForCategories = True
tasks = []  # pending image-description-page fetch futures (see fetch_images)
categories = 0  # NOTE(review): never read or written elsewhere — appears unused
categoryTasks = []  # pending subcategory-scan futures
checkedCategories = []  # subcategories already queued, to avoid re-crawling
completed = -1  # category pages fully scanned so far (starts at -1; root bumps it to 0)
totalImages = 0  # images discovered across all scanned pages
completedImages = 0  # images successfully downloaded
async def fetch_page(session, url, cat=''):
    """Fetch *url* with the shared aiohttp session and parse it with lxml.

    Returns ``[cat, dom]`` on success (``cat`` is passed through so callers
    can tag a page with its category), or ``False`` on a timeout or
    connection failure — callers check for ``False`` explicitly.
    """
    try:
        async with session.get(url) as resp:
            source = await resp.text()
            dom = html.fromstring(source)
            return [cat, dom]
    # BUG FIX: `except A or B:` evaluates `A or B` eagerly, which yields
    # just the first class — ClientConnectorError was never caught here.
    # A tuple of exception types catches both.
    except (asyncio.TimeoutError, aiohttp.ClientConnectorError):
        return False
async def fetch_images(session, url):
    """Scan one Wikimedia category page: queue a fetch for every image's
    description page and recurse into any subcategories found.

    Side effects on module globals: appends futures to ``tasks`` and
    ``categoryTasks``, records visited hrefs in ``checkedCategories``,
    and increments ``totalImages`` and ``completed``.
    """
    global totalImages
    global completed
    page = await fetch_page(session, url)
    # fetch_page returns False on timeout / connection error.
    if page is False:
        return
    dom = page[1]
    images = dom.xpath('*//div[@class="thumb"]//a')
    subcategories = dom.xpath('*//div[@class="CategoryTreeItem"]//a')
    if subcategories and checkForCategories:
        for category in subcategories:
            href = category.attrib['href']
            # BUG FIX: the original appended the lxml *element objects* to
            # checkedCategories; elements parsed from different pages never
            # compare equal, so the dedup check could never fire and the
            # same category could be crawled repeatedly. Dedup on the href
            # string instead.
            if href not in checkedCategories:
                checkedCategories.append(href)
                categoryTasks.append(asyncio.ensure_future(
                    fetch_images(session, 'https://commons.wikimedia.org' + href)))
                print('Found category', href)
    if images:
        totalImages += len(images)
        print("Found", len(images), "images")
        # The category name is a property of this page's URL, not of each
        # image — hoisted out of the loop (loop-invariant).
        cat = url.split('Category:')[1]
        # Queue a fetch of each image's description page, tagged with the
        # category so main() can sort downloads into folders.
        for image in images:
            tasks.append(asyncio.ensure_future(fetch_page(
                session, 'https://commons.wikimedia.org' + image.attrib['href'], cat)))
    # Mark this category page as fully scanned.
    completed += 1
async def main(loop):
    """Crawl the root category, wait for every discovered subcategory to be
    scanned, then download each found image into storeDirectory/<category>/.
    """
    global completedImages
    async with aiohttp.ClientSession(loop=loop) as session:
        await fetch_images(session, url)
        # fetch_images appends new futures to categoryTasks while earlier
        # ones run, so keep gathering until every discovered category page
        # has been scanned (completed catches up to the task count).
        while True:
            await asyncio.gather(*categoryTasks)
            if completed == len(categoryTasks):
                break
        pages = await asyncio.gather(*tasks)
        for page in pages:
            # fetch_page returns False on timeout / connection error.
            if page is False:
                continue
            cat = page[0]
            source = page[1]
            matches = source.xpath('*//div[@class="fullImageLink"]//img')
            # ROBUSTNESS FIX: the original indexed [0] unconditionally and
            # would raise IndexError — aborting the whole crawl — on any
            # page without a full-size image link. Skip such pages instead.
            if not matches:
                continue
            imgURL = matches[0].attrib['src']
            filename = imgURL.split('/')[-1]
            async with session.get(imgURL) as resp:
                if resp.status == 200:
                    # BUG FIX: os.mkdir raises FileNotFoundError when
                    # storeDirectory itself does not exist yet; makedirs
                    # creates the full path, and exist_ok=True replaces the
                    # racy isdir() pre-check.
                    os.makedirs(storeDirectory + cat + '/', exist_ok=True)
                    # Context manager guarantees the file is closed even if
                    # the write raises (the original leaked on error).
                    async with aiofiles.open(storeDirectory + cat + '/' + filename,
                                             mode='wb') as f:
                        await f.write(await resp.read())
                    completedImages += 1
                    print(completedImages, '/', totalImages)
# Entry point: run the crawl to completion on the current event loop.
# NOTE(review): asyncio.get_event_loop() is deprecated as a way to obtain a
# loop in newer Pythons, but nest_asyncio's patching relies on this pattern
# here — confirm before modernizing to asyncio.run().
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment