Created
November 28, 2022 13:38
-
-
Save ZanSara/6383e4742865b2879a1bde8a4ccb5e2c to your computer and use it in GitHub Desktop.
Script to download all Wikimedia images in a category
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From https://colab.research.google.com/drive/12jGo_tm2bAD7NRiqxvF-XfKfEWgKIx4X#scrollTo=sDL9EihTwBaC&uniqifier=1 | |
# pip install lxml aiohttp asyncio nest_asyncio aiofiles | |
import shutil | |
from lxml import etree | |
from lxml import html | |
import aiohttp | |
import asyncio | |
import aiofiles | |
import nest_asyncio | |
import os | |
nest_asyncio.apply()  # allow re-entrant event loops (needed in notebooks/Colab)
# Root Wikimedia Commons category to crawl, and local output directory.
url = 'https://commons.wikimedia.org/wiki/Category:Animals_in_Jardim_Zoológico_de_Lisboa'
storeDirectory = 'wikimedia_images/'
# When True, fetch_images also recurses into subcategories of the root.
checkForCategories = True
tasks = []  # pending image-description-page fetch futures (see fetch_images)
categories = 0  # NOTE(review): never read or written elsewhere — appears unused
categoryTasks = []  # pending subcategory-scan futures
checkedCategories = []  # subcategories already queued, to avoid re-crawling
completed = -1  # category pages fully scanned so far (starts at -1; root bumps it to 0)
totalImages = 0  # images discovered across all scanned pages
completedImages = 0  # images successfully downloaded
async def fetch_page(session, url, cat=''):
    """Fetch *url* with the shared aiohttp session and parse it with lxml.

    Returns ``[cat, dom]`` on success (``cat`` is passed through so callers
    can tag a page with its category), or ``False`` on a timeout or
    connection failure — callers check for ``False`` explicitly.
    """
    try:
        async with session.get(url) as resp:
            source = await resp.text()
            dom = html.fromstring(source)
            return [cat, dom]
    # BUG FIX: `except A or B:` evaluates `A or B` eagerly, which yields
    # just the first class — ClientConnectorError was never caught here.
    # A tuple of exception types catches both.
    except (asyncio.TimeoutError, aiohttp.ClientConnectorError):
        return False
async def fetch_images(session, url):
    """Scan one Wikimedia category page: queue a fetch for every image's
    description page and recurse into any subcategories found.

    Side effects on module globals: appends futures to ``tasks`` and
    ``categoryTasks``, records visited hrefs in ``checkedCategories``,
    and increments ``totalImages`` and ``completed``.
    """
    global totalImages
    global completed
    page = await fetch_page(session, url)
    # fetch_page returns False on timeout / connection error.
    if page is False:
        return
    dom = page[1]
    images = dom.xpath('*//div[@class="thumb"]//a')
    subcategories = dom.xpath('*//div[@class="CategoryTreeItem"]//a')
    if subcategories and checkForCategories:
        for category in subcategories:
            href = category.attrib['href']
            # BUG FIX: the original appended the lxml *element objects* to
            # checkedCategories; elements parsed from different pages never
            # compare equal, so the dedup check could never fire and the
            # same category could be crawled repeatedly. Dedup on the href
            # string instead.
            if href not in checkedCategories:
                checkedCategories.append(href)
                categoryTasks.append(asyncio.ensure_future(
                    fetch_images(session, 'https://commons.wikimedia.org' + href)))
                print('Found category', href)
    if images:
        totalImages += len(images)
        print("Found", len(images), "images")
        # The category name is a property of this page's URL, not of each
        # image — hoisted out of the loop (loop-invariant).
        cat = url.split('Category:')[1]
        # Queue a fetch of each image's description page, tagged with the
        # category so main() can sort downloads into folders.
        for image in images:
            tasks.append(asyncio.ensure_future(fetch_page(
                session, 'https://commons.wikimedia.org' + image.attrib['href'], cat)))
    # Mark this category page as fully scanned.
    completed += 1
async def main(loop):
    """Crawl the root category, wait for every discovered subcategory to be
    scanned, then download each found image into storeDirectory/<category>/.
    """
    global completedImages
    async with aiohttp.ClientSession(loop=loop) as session:
        await fetch_images(session, url)
        # fetch_images appends new futures to categoryTasks while earlier
        # ones run, so keep gathering until every discovered category page
        # has been scanned (completed catches up to the task count).
        while True:
            await asyncio.gather(*categoryTasks)
            if completed == len(categoryTasks):
                break
        pages = await asyncio.gather(*tasks)
        for page in pages:
            # fetch_page returns False on timeout / connection error.
            if page is False:
                continue
            cat = page[0]
            source = page[1]
            matches = source.xpath('*//div[@class="fullImageLink"]//img')
            # ROBUSTNESS FIX: the original indexed [0] unconditionally and
            # would raise IndexError — aborting the whole crawl — on any
            # page without a full-size image link. Skip such pages instead.
            if not matches:
                continue
            imgURL = matches[0].attrib['src']
            filename = imgURL.split('/')[-1]
            async with session.get(imgURL) as resp:
                if resp.status == 200:
                    # BUG FIX: os.mkdir raises FileNotFoundError when
                    # storeDirectory itself does not exist yet; makedirs
                    # creates the full path, and exist_ok=True replaces the
                    # racy isdir() pre-check.
                    os.makedirs(storeDirectory + cat + '/', exist_ok=True)
                    # Context manager guarantees the file is closed even if
                    # the write raises (the original leaked on error).
                    async with aiofiles.open(storeDirectory + cat + '/' + filename,
                                             mode='wb') as f:
                        await f.write(await resp.read())
                    completedImages += 1
                    print(completedImages, '/', totalImages)
# Entry point: run the crawl to completion on the current event loop.
# NOTE(review): asyncio.get_event_loop() is deprecated as a way to obtain a
# loop in newer Pythons, but nest_asyncio's patching relies on this pattern
# here — confirm before modernizing to asyncio.run().
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment