Skip to content

Instantly share code, notes, and snippets.

Created August 15, 2012 01:32
Show Gist options
  • Save pcote/3354583 to your computer and use it in GitHub Desktop.
Save pcote/3354583 to your computer and use it in GitHub Desktop.
An image collecting screen scraper built as an interesting example usage of Beautiful Soup. Done with Python 2.7
A script to screen scrape an image archive and then download those images
one by one to a local folder. Interesting example usage of BeautifulSoup.
Modify as needed for the site you intend to scrape.
Use only with permission of site owner!!!
Geeky Note:
This is an interesting example of using maps and list comprehensions to keep
state changes to a minimum.
import random
import time
from bs4 import BeautifulSoup
import urllib
import re
def wait_a_few_secs(func, min_wait_time=2, max_wait_time=10):
    """Decorate ``func`` so that every call is followed by a random pause.

    Handy little decorator to use for when you don't want to hammer a
    server with requests too fast.

    Args:
        func: The callable to wrap.  Its return value is reported as the
            stored image in the progress message.
        min_wait_time: Shortest pause in whole seconds (inclusive).
        max_wait_time: Longest pause in whole seconds (inclusive).

    Returns:
        A wrapper that calls ``func``, prints a progress message, sleeps
        for the chosen number of seconds and then returns whatever
        ``func`` returned.
    """
    # Local import: functools is not part of the file's import block.
    import functools

    @functools.wraps(func)
    def wrapper_func(*args, **kwargs):
        image_url = func(*args, **kwargs)
        # randint includes both ends, so the named bounds are really honored.
        sleep_time = random.randint(min_wait_time, max_wait_time)
        print("stored image: %s \nNow sleeping for %d seconds\n" % (image_url, sleep_time))
        # Actually pause; announcing the delay alone does not throttle anything.
        time.sleep(sleep_time)
        # Hand the wrapped function's result back to the caller.
        return image_url

    return wrapper_func
def get_archive_image_links(archive_url=""):
    """Collect and return the Beautiful Soup links to the image pages.

    Works on the assumption that the target contains all the links to the
    desired images on one archive page.

    Args:
        archive_url: Address of the archive page.  The default is the empty
            placeholder; set it to the site you have permission to scrape.

    Returns:
        A list of ``<a>`` tags whose ``href`` begins with the
        ``/<1-4 digits>/`` pattern.
    """
    response = urllib.urlopen(archive_url)
    try:
        raw_data = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()
    soup = BeautifulSoup(raw_data, "lxml")

    # image links for this case follow the <a href="/1234/">foo</a> pattern
    pattern = re.compile(r"/[0-9]{1,4}/")

    # Anchors without an href (e.g. named anchors) are skipped instead of
    # raising KeyError.
    return [
        link for link in soup.find_all("a")
        if pattern.match(link.get("href", ""))
    ]
def _read_url(url):
    """Return the raw bytes served at ``url`` and close the connection."""
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def download_image(link, base_url=""):
    """Follow a Beautiful Soup link to an image page and save its image.

    Navigates to the page the link points at, finds the image inside the
    ``<div id="example_id">`` element, downloads it, and sticks it in the
    local ``image_stash`` folder (the folder must already exist).

    Args:
        link: Beautiful Soup ``<a>`` tag whose ``href`` leads to the page
            that embeds the wanted image.
        base_url: Prefix put in front of the link's ``href`` to build the
            page address.  Defaults to the empty placeholder; set it to
            the site being scraped.

    Returns:
        The ``src`` URL of the image that was downloaded.

    Raises:
        IndexError: If the page has no ``<div id="example_id">`` or that
            div contains no ``<img>`` tag.
    """
    # image page url is the page that has the image file we want embedded in it.
    image_page_url = base_url + link['href']
    # Same parser as the archive page so both pages are parsed consistently.
    image_page = BeautifulSoup(_read_url(image_page_url), "lxml")

    # the page the link takes you to should have one div tag IDed as 'example_id'
    image_div = image_page.find("div", id="example_id")
    if image_div is None:
        raise IndexError("no div with id 'example_id' on %s" % image_page_url)
    image_tag = image_div.find("img")
    if image_tag is None:
        raise IndexError("no img inside div 'example_id' on %s" % image_page_url)
    image_url = image_tag["src"]

    # download the image and stick it in a folder.
    image_data = _read_url(image_url)
    image_name = "image_stash/%s" % image_url.split("/")[-1]
    # The with-block writes the bytes and guarantees the file gets closed.
    with open(image_name, "wb") as image_file:
        image_file.write(image_data)
    return image_url
if __name__ == '__main__':
    # Script entry point: collect the image page links, then download each one.
    # A plain loop is used instead of map() because the downloads are side
    # effects; on Python 3 map() is lazy, so nothing would be downloaded.
    for image_link in get_archive_image_links():
        download_image(image_link)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment