Skip to content

Instantly share code, notes, and snippets.

@timbroder
Created February 6, 2013 22:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save timbroder/4726372 to your computer and use it in GitHub Desktop.
Save timbroder/4726372 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib2, urllib
import sys
import imghdr
import os
# Download every image from a user's "popular / all time" listing on imgspark.
# The script walks the listing one page at a time, opens the preview page
# behind each thumbnail, fetches the original-size file into the current
# working directory and appends an extension based on the sniffed image type.
root = "http://www.imgspark.com"
base = "%s/image/popular/sethwhitton/alltime/" % root
page = urllib2.urlopen(base).read()
soup = BeautifulSoup(page)
# Single-argument print(...) emits exactly the same text under Python 2.
print("pulling from %s" % base)

next_page = True
while next_page:
    for thumb in soup.findAll('img', {"class": "spark_image"}):
        # The element wrapping the thumbnail carries the preview-page link.
        preview_url = "%s%s" % (root, thumb.parent['href'])
        preview_page = urllib2.urlopen(preview_url).read()
        preview_soup = BeautifulSoup(preview_page)

        orig_size_tag = preview_soup.find('p', {"class": "original_size"})
        orig_size_url = orig_size_tag.find('a')['href']
        # The second-to-last path segment of the link is used as the local
        # file name for the download.
        orig_size = orig_size_url.split('/')[-2]
        foo_url = "%s%s" % (root, orig_size_url)
        urllib.urlretrieve(foo_url, orig_size)

        # Passing the path lets imghdr open the file itself in binary mode,
        # so no handle is left open and no text-mode translation is applied.
        file_type = imghdr.what(orig_size)
        if file_type is None:
            # Unrecognised image data: keep the download under its bare name
            # rather than renaming it to "<name>.None".
            print(orig_size)
            continue

        new_file = "%s.%s" % (orig_size, file_type)
        if not os.path.isfile(new_file):
            os.rename(orig_size, new_file)
            print(new_file)
        else:
            # Already saved earlier; drop the fresh duplicate so no
            # extensionless stray file is left behind.
            os.remove(orig_size)

    # find() returns None when the page has no "next" link, i.e. on the last
    # page; stop cleanly there instead of failing on next_page['href'].
    next_page = soup.find('a', {"id": "next-page-link"})
    if next_page is None or not next_page.get('href'):
        break
    next_page_url = "%s%s" % (root, next_page['href'])
    page = urllib2.urlopen(next_page_url).read()
    soup = BeautifulSoup(page)
    print("pulling from %s" % next_page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment