Skip to content

Instantly share code, notes, and snippets.

@wihoho
Created July 11, 2013 03:21
Show Gist options
  • Save wihoho/5972288 to your computer and use it in GitHub Desktop.
Crawl images from a tumblr blog
import urllib2
import urllib
from bs4 import BeautifulSoup
def processURL(url):
    """Fetch *url* and return the raw response body, or None on fetch failure.

    A desktop-browser User-Agent header is sent because the default
    urllib2 agent is commonly blocked by blog hosts.

    Args:
        url: absolute URL of the page to download.

    Returns:
        The response body as a byte string, or None if the request fails.
    """
    subEntryRequest = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'})
    try:
        return urllib2.urlopen(subEntryRequest).read()
    except urllib2.URLError:
        # Narrowed from a bare `except:`: only network/HTTP failures
        # (URLError and its HTTPError subclass) should mean "no data" —
        # a bare except would also swallow KeyboardInterrupt/SystemExit.
        return None
# Parse the HTML file
def parseHTML(data, count):
soup = BeautifulSoup(data)
images = soup.find_all(id = "entry")
for image in images:
imageURL = image.find("img")
if imageURL is not None:
realURL = imageURL.attrs["src"]
count += 1
f = open('Images/' +str(count)+ '.jpg','wb')
f.write(urllib.urlopen(realURL).read())
f.close()
print count
return count
# Walk the blog page by page, downloading images from each, until a
# page fails to load (processURL returns None), which marks the end.
domain = "http://inspirational-images.tumblr.com"

count = 0
page = 0
while True:
    page += 1
    # Page 1 is the bare domain; later pages live under /page/<n>.
    crawlURL = domain if page == 1 else domain + "/page/" + str(page)
    data = processURL(crawlURL)
    if data is None:
        break
    count = parseHTML(data, count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment