

@macloo
Last active November 16, 2023 15:58
Use Wikipedia API to download the main image for a page
"""
Find the main image on a Wikipedia page and download it.
Using a list of Wikipedia URLs, download the main image from each page in the list.
Name the downloaded file to match the page URL.
"""
import requests, os
# set the folder name where images will be stored
my_folder = 'wiki_images'
# create the folder in the current working directory
# in which to store the downloaded images
os.makedirs(my_folder, exist_ok=True)
# front part of each Wikipedia URL
base_url = 'https://en.wikipedia.org/wiki/'
# partial URLs for each desired Wikipedia page
my_list = ['Anaea_troglodyta',
           'Colias_eurytheme',
           'Euphilotes_battoides',
           'Great_spangled_fritillary',
           'Papilio_troilus']
# Wikipedia API query string to get the main image on a page
# (partial URL will be added to the end)
query = 'http://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles='
# get JSON data w/ API and extract image URL
def get_image_url(partial_url):
    try:
        api_res = requests.get(query + partial_url).json()
        first_part = api_res['query']['pages']
        # this is a way around not knowing the article id number
        for key, value in first_part.items():
            if (value['original']['source']):
                data = value['original']['source']
        return data
    except Exception as exc:
        print(exc)
        print("Partial URL: " + partial_url)
        data = None
        return data
# download one image with URL obtained from API
def download_image(the_url, the_page):
    res = requests.get(the_url)
    res.raise_for_status()
    # get original file extension for image
    # by splitting on . and getting the final segment
    file_ext = '.' + the_url.split('.')[-1].lower()
    # save the image to folder - binary file - with desired filename
    image_file = open(os.path.join(my_folder, os.path.basename(the_page + file_ext)), 'wb')
    # download the image file
    # HT to Automate the Boring Stuff with Python, chapter 12
    for chunk in res.iter_content(100000):
        image_file.write(chunk)
    image_file.close()
# loop to download main image for each page in list
counter = 1
for the_page in my_list:
    # get JSON data and extract image URL
    the_url = get_image_url(the_page)
    # if the URL is not None ...
    if (the_url):
        # tell us where we are for the heck of it
        print("Downloading image " + str(counter))
        # download that image
        download_image(the_url, the_page)
    else:
        print("No image file for " + the_page)
    counter += 1
print("All done!")

macloo commented Apr 6, 2021

[screenshot: the JSON returned by the API]

Here's what the API delivers.


macloo commented Apr 6, 2021

That screenshot (previous comment) probably needs more explanation.

If you were accessing the API with your web browser, you would put this into the address bar and press Return/Enter:

https://en.wikipedia.org/w/api.php?action=query&prop=pageimages&format=json&piprop=original&titles=Nathalis_iole

What you'd get back, in the browser window, would look like what is in the screenshot. Nathalis_iole (at the end of that string) is the page name for this Wikipedia page:

https://en.wikipedia.org/wiki/Nathalis_iole

In the Python script, you use that query string, NOT in the browser window, but directly through the Python code. As a result, you can loop over MANY page names and download LOTS of images.

The API returns a JSON-formatted object, so you need to extract the relevant URL (for the image file) from that JSON. That is what's happening in lines 33–39 in the Python script. The JSON is being read, as a Python dictionary would be read. The code is getting one thing: the image URL.

After you've got the image URL standing alone, you can download the image and save it on your computer.
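To make that concrete, here is a minimal sketch of the shape of the JSON and of the extraction step. The page id, title, and image URL below are made-up, illustrative values, not real API output.

# a sketch of the JSON the API returns for one title, written as a Python dict
# (the page id, title, and image URL are illustrative values, not real API output)
api_res = {
    'query': {
        'pages': {
            '1234567': {            # article id number -- not known in advance
                'title': 'Nathalis iole',
                'original': {
                    'source': 'https://upload.wikimedia.org/example/Nathalis_iole.jpg'
                }
            }
        }
    }
}

# because the article id is unknown, loop over the 'pages' dict
# and pull the image URL out of each entry
for key, value in api_res['query']['pages'].items():
    image_url = value['original']['source']
print(image_url)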

@nuganics

I'm getting a 403 (blocked) response; I guess the user agent now needs to comply with https://meta.wikimedia.org/wiki/User-Agent_policy#Python?


macloo commented Feb 18, 2023

@nuganics I'll bet adding the header shown there will fix the script. I will try it when I have time. Thanks for pointing out the error.

@nuganics

@macloo Thank you for the reply. I don't know Python, but I just read your Python course chapters 1 to 3. I added headers = {'User-Agent': 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)'} at line 18, under the base_url line, and changed line 49, res = requests.get(the_url), to res = requests.get(the_url, headers=headers)

It seems to work.

I learnt Pascal in school and know Excel a little but got there with your tutorial so thank you :)

I used a Chrome extension (Link Klipper) to get the Wikipedia URLs, then Excel to extract the page names with =TRIM(RIGHT(SUBSTITUTE(A1,"/",REPT(" ",255)),255)) and ="'"&TEXTJOIN("', '",1,F1:F1000)&"'" to join them for use in your script.
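For anyone who prefers to stay in Python, here is a minimal sketch of the same page-name extraction, assuming the full URLs are already in a list (the URLs below are just examples):

# a sketch of extracting page names from full Wikipedia URLs in Python,
# as an alternative to the Excel steps (the URLs below are just examples)
full_urls = [
    'https://en.wikipedia.org/wiki/Nathalis_iole',
    'https://en.wikipedia.org/wiki/Papilio_troilus',
]

# keep everything after the last slash -- the partial URL the script expects
my_list = [url.rsplit('/', 1)[-1] for url in full_urls]
print(my_list)  # ['Nathalis_iole', 'Papilio_troilus']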


macloo commented Feb 19, 2023

@nuganics You shouldn't use 'CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)' but should instead edit that to identify yourself. It is a way of saying: "Hi, I am not evil. Here's my identification." That is explained at https://meta.wikimedia.org/wiki/User-Agent_policy#Python, though maybe not explained clearly enough.
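A minimal sketch of what that change might look like with this script's query string; the bot name, URL, and email address are placeholders you would replace with your own details:

import requests

# identify yourself per the Wikimedia User-Agent policy
# (the bot name, URL, and email below are placeholders -- use your own)
headers = {
    'User-Agent': 'MyWikiImageScript/1.0 (https://example.org/my-page; me@example.org)'
}

query = ('https://en.wikipedia.org/w/api.php?action=query&prop=pageimages'
         '&format=json&piprop=original&titles=')

# same API call as in the script, now sending the identifying header;
# the image download (requests.get(the_url)) would get headers=headers too
api_res = requests.get(query + 'Nathalis_iole', headers=headers).json()
print(api_res['query']['pages'])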
