@pcote · Created August 15, 2012 01:32
An image-collecting screen scraper, built as an interesting example of Beautiful Soup usage. Written for Python 2.7.
"""
image_collector.py
A script to screen scrape an image archive and then download those images
one by one to a local folder. Interesting example usage of BeautifulSoup.
Modify as needed for the site you intend to scrape.
Use only with permission of site owner!!!
Geeky Note:
This is interesting example of using maps and list comprehensions to keep
state changes to a minimum.
"""
import functools
import random
import re
import time
import urllib

from bs4 import BeautifulSoup

def wait_a_few_secs(func):
    """Handy little decorator to use when you don't want to hammer
    a server with requests too fast."""
    @functools.wraps(func)
    def wrapper_func(*args):
        image_url = func(*args)
        # the random module seeds itself on import, so no explicit seed is needed
        min_wait_time, max_wait_time = 2, 10
        # +1 because randrange excludes its upper bound
        sleep_time = random.randrange(min_wait_time, max_wait_time + 1)
        print("stored image: %s \nNow sleeping for %d seconds\n" % (image_url, sleep_time))
        time.sleep(sleep_time)
        return image_url
    return wrapper_func
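
# Sketch of how the decorator composes (hypothetical function, not part of
# this scraper): wrap any function that returns the URL it just handled,
# and each call is followed by a randomized 2-10 second pause.
#
#   @wait_a_few_secs
#   def fetch_one(url):
#       ...            # do the download
#       return url     # the wrapper prints this, then sleeps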

def get_archive_image_links():
    """Collects and returns Beautiful Soup links to the image pages.
    Works on the assumption that the target contains all the links to the
    desired images on one archive page.
    """
    archive_url = "http://www.example.com/archive/"
    raw_data = urllib.urlopen(archive_url).read()
    soup = BeautifulSoup(raw_data, "lxml")
    all_links = soup.findAll("a")
    # image links for this case follow the <a href="/1234/">foo</a> pattern
    pattern = r"/[0-9]{1,4}/"
    # skip anchors with no href attribute before trying to match
    found_href = lambda x: x.has_attr("href") and re.match(pattern, x["href"])
    image_links = [x for x in all_links if found_href(x)]
    return image_links
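
# For reference, the href pattern above keeps short numeric paths and drops
# everything else (illustrative hrefs, not real site data):
#   re.match(r"/[0-9]{1,4}/", "/1234/")   # match -> link is kept
#   re.match(r"/[0-9]{1,4}/", "/about/")  # None  -> link is skipped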

@wait_a_few_secs
def download_image(link):
    """Uses the Beautiful Soup link to navigate to the page with the image.
    Then finds the image, downloads it, and sticks it in a local folder."""
    # image page url is the page that has the image file we want embedded in it.
    image_page_url = "http://www.example.com%s" % link["href"]
    raw_data = urllib.urlopen(image_page_url).read()
    image_page = BeautifulSoup(raw_data, "lxml")
    # the page the link takes you to should have one div tag IDed as 'example_id'
    found_id = lambda x: x.has_attr("id") and x["id"] == "example_id"
    image_div = [x for x in image_page.findAll("div") if found_id(x)][0]
    image_tag = image_div.findAll("img")[0]
    image_url = image_tag["src"]
    # download the image and stick it in the image_stash folder
    # (the folder must already exist)
    image_data = urllib.urlopen(image_url).read()
    image_name = "image_stash/%s" % image_url.split("/")[-1]
    with open(image_name, "wb") as image_file:
        image_file.write(image_data)
    return image_url

if __name__ == '__main__':
    image_links = get_archive_image_links()
    # map() is eager in Python 2, so this runs every download immediately
    map(download_image, image_links)
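
Note for anyone porting this to Python 3: urllib.urlopen no longer exists
there. A minimal sketch of the same fetch-and-parse step using
urllib.request instead (same assumed example.com URL as above, not part of
the original script):

    import urllib.request
    from bs4 import BeautifulSoup

    def get_archive_soup(archive_url="http://www.example.com/archive/"):
        # urllib.request.urlopen is the Python 3 replacement for urllib.urlopen
        raw_data = urllib.request.urlopen(archive_url).read()
        return BeautifulSoup(raw_data, "lxml")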