Created
August 15, 2012 01:32
-
-
Save pcote/3354583 to your computer and use it in GitHub Desktop.
An image-collecting screen scraper built as an example of Beautiful Soup usage. Written for Python 2.7.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
image_collector.py
A script that screen-scrapes an image archive and then downloads those images
one by one to a local folder. An interesting example of BeautifulSoup usage.
Modify as needed for the site you intend to scrape.
Use only with permission of the site owner!!!
Geeky Note:
This is an interesting example of using maps and list comprehensions to keep
state changes to a minimum.
""" | |
import contextlib
import functools
import os
import random
import re
import time
import urllib

from bs4 import BeautifulSoup
def wait_a_few_secs(func, min_wait_time=2, max_wait_time=10):
    """Decorate ``func`` so that every call is followed by a random pause.

    Handy for when you don't want to hammer a server with requests too fast.

    Args:
        func: The callable to wrap. Its return value is reported as the
            stored image URL and then handed back to the caller.
        min_wait_time: Exclusive lower bound of the pause, in whole seconds.
        max_wait_time: Exclusive upper bound of the pause, in whole seconds.

    Returns:
        A wrapper that calls ``func``, prints a progress message, sleeps and
        finally returns whatever ``func`` returned.

    Raises:
        ValueError: From ``random.randrange`` at call time when no whole
            number lies strictly between the two bounds.
    """
    @functools.wraps(func)
    def wrapper_func(*args, **kwargs):
        image_url = func(*args, **kwargs)
        # Both named bounds are excluded, so the defaults pause for 3..9 s.
        sleep_time = random.randrange(min_wait_time + 1, max_wait_time)
        print("stored image: %s \nNow sleeping for %d seconds\n" % (image_url, sleep_time))
        time.sleep(sleep_time)
        # Hand the result back so decorated functions keep their return value.
        return image_url
    return wrapper_func
def get_archive_image_links():
    """Collect the Beautiful Soup ``<a>`` tags that point at image pages.

    Works on the assumption that the target site lists every desired image
    page on one single archive page.

    Returns:
        A list of Beautiful Soup tags whose ``href`` begins with the
        ``/1234/`` pattern (one to four digits between slashes), in document
        order.
    """
    archive_url = "http://www.example.com/archive/"
    # urllib.urlopen is the Python 2 API; closing() releases the connection
    # even when read() fails.
    with contextlib.closing(urllib.urlopen(archive_url)) as response:
        raw_data = response.read()
    soup = BeautifulSoup(raw_data, "lxml")

    # image links for this case follow the <a href="/1234/">foo</a> pattern
    image_href = re.compile(r"/[0-9]{1,4}/")
    return [
        link for link in soup.find_all("a")
        # Anchors without an href (e.g. <a name="top">) are skipped rather
        # than raising KeyError.
        if image_href.match(link.get("href", ""))
    ]
@wait_a_few_secs
def download_image(link):
    """Download the image embedded in the page that ``link`` points to.

    Follows the Beautiful Soup link to the image page, locates the first
    ``<img>`` inside the ``<div id="example_id">`` element, downloads it and
    stores it in the local ``image_stash`` folder under the last path segment
    of the image URL.

    Args:
        link: A Beautiful Soup ``<a>`` tag whose ``href`` is a path on
            ``http://www.example.com``.

    Returns:
        The ``src`` URL of the image that was downloaded.

    Raises:
        IndexError: If the page has no ``<div id="example_id">`` or that div
            contains no ``<img>`` tag.
    """
    # image page url is the page that has the image file we want embedded in it.
    image_page_url = "http://www.example.com%s" % link['href']
    with contextlib.closing(urllib.urlopen(image_page_url)) as response:
        raw_data = response.read()
    # Name the parser explicitly so both pages of the site are parsed alike.
    image_page = BeautifulSoup(raw_data, "lxml")

    # the page the link takes you to should have one div tag IDed as 'example_id'
    image_div = image_page.find("div", id="example_id")
    if image_div is None:
        raise IndexError("no <div id='example_id'> found on %s" % image_page_url)
    image_tag = image_div.find("img")
    if image_tag is None:
        raise IndexError("no <img> inside <div id='example_id'> on %s" % image_page_url)
    image_url = image_tag["src"]

    # download the image and stick it in a folder.
    with contextlib.closing(urllib.urlopen(image_url)) as response:
        image_data = response.read()
    # Create the stash folder on first use instead of failing on open().
    if not os.path.isdir("image_stash"):
        os.makedirs("image_stash")
    image_name = "image_stash/%s" % image_url.split("/")[-1]
    # The with-block closes (and therefore flushes) the file even on error.
    with open(image_name, "wb") as image_file:
        image_file.write(image_data)
    return image_url
if __name__ == '__main__':
    # Script entry point: fetch every image-page link from the archive, then
    # download the images one at a time.
    image_links = get_archive_image_links()
    # A plain loop rather than map(): the calls are made for their side
    # effects only, and a lazy map() would never run them.
    for image_link in image_links:
        download_image(image_link)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment