@pcote · Created August 15, 2012 01:32
An image-collecting screen scraper, built as an interesting example of Beautiful Soup usage. Written for Python 2.7.
"""
image_collector.py
A script to screen scrape an image archive and then download those images
one by one to a local folder. Interesting example usage of BeautifulSoup.
Modify as needed for the site you intend to scrape.
Use only with permission of site owner!!!
Geeky Note:
This is interesting example of using maps and list comprehensions to keep
state changes to a minimum.
"""
import functools
import random
import re
import time
import urllib

from bs4 import BeautifulSoup

def wait_a_few_secs(func):
    """Handy little decorator to use when you don't want to hammer
    a server with requests too fast."""
    @functools.wraps(func)
    def wrapper_func(*args):
        image_url = func(*args)
        # the random module seeds itself on import, so no explicit seed is needed
        min_wait_time, max_wait_time = 2, 10
        # +1 because randrange excludes its upper bound
        sleep_time = random.randrange(min_wait_time, max_wait_time + 1)
        print("stored image: %s \nNow sleeping for %d seconds\n" % (image_url, sleep_time))
        time.sleep(sleep_time)
        return image_url
    return wrapper_func
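
# Sketch of how the decorator composes (hypothetical function, not part of
# this scraper): wrap any function that returns the URL it just handled,
# and each call is followed by a randomized 2-10 second pause.
#
#   @wait_a_few_secs
#   def fetch_one(url):
#       ...            # do the download
#       return url     # the wrapper prints this, then sleeps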

def get_archive_image_links():
    """Collects and returns Beautiful Soup links to the image pages.
    Works on the assumption that the target contains all the links to the
    desired images on one archive page.
    """
    archive_url = "http://www.example.com/archive/"
    raw_data = urllib.urlopen(archive_url).read()
    soup = BeautifulSoup(raw_data, "lxml")
    all_links = soup.findAll("a")
    # image links for this case follow the <a href="/1234/">foo</a> pattern
    pattern = r"/[0-9]{1,4}/"
    # skip anchors with no href attribute before trying to match
    found_href = lambda x: x.has_attr("href") and re.match(pattern, x["href"])
    image_links = [x for x in all_links if found_href(x)]
    return image_links
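
# For reference, the href pattern above keeps short numeric paths and drops
# everything else (illustrative hrefs, not real site data):
#   re.match(r"/[0-9]{1,4}/", "/1234/")   # match -> link is kept
#   re.match(r"/[0-9]{1,4}/", "/about/")  # None  -> link is skipped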

@wait_a_few_secs
def download_image(link):
    """Uses the Beautiful Soup link to navigate to the page with the image.
    Then finds the image, downloads it, and sticks it in a local folder."""
    # image page url is the page that has the image file we want embedded in it.
    image_page_url = "http://www.example.com%s" % link["href"]
    raw_data = urllib.urlopen(image_page_url).read()
    image_page = BeautifulSoup(raw_data, "lxml")
    # the page the link takes you to should have one div tag IDed as 'example_id'
    found_id = lambda x: x.has_attr("id") and x["id"] == "example_id"
    image_div = [x for x in image_page.findAll("div") if found_id(x)][0]
    image_tag = image_div.findAll("img")[0]
    image_url = image_tag["src"]
    # download the image and stick it in the image_stash folder
    # (the folder must already exist)
    image_data = urllib.urlopen(image_url).read()
    image_name = "image_stash/%s" % image_url.split("/")[-1]
    with open(image_name, "wb") as image_file:
        image_file.write(image_data)
    return image_url

if __name__ == '__main__':
    image_links = get_archive_image_links()
    # map() is eager in Python 2, so this runs every download immediately
    map(download_image, image_links)
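
Note for anyone porting this to Python 3: urllib.urlopen no longer exists
there. A minimal sketch of the same fetch-and-parse step using
urllib.request instead (same assumed example.com URL as above, not part of
the original script):

    import urllib.request
    from bs4 import BeautifulSoup

    def get_archive_soup(archive_url="http://www.example.com/archive/"):
        # urllib.request.urlopen is the Python 3 replacement for urllib.urlopen
        raw_data = urllib.request.urlopen(archive_url).read()
        return BeautifulSoup(raw_data, "lxml")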