Skip to content

Instantly share code, notes, and snippets.

@wihoho
Created July 11, 2013 03:21
Show Gist options
  • Save wihoho/5972288 to your computer and use it in GitHub Desktop.
Crawl images from a tumblr blog
import urllib2
import urllib
from bs4 import BeautifulSoup
def processURL(url):
    """Fetch *url* and return the raw response body, or None on fetch failure.

    A desktop-browser User-Agent header is sent because the default
    urllib2 agent is commonly blocked by blog hosts.

    Args:
        url: absolute URL of the page to download.

    Returns:
        The response body as a byte string, or None if the request fails.
    """
    subEntryRequest = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'})
    try:
        return urllib2.urlopen(subEntryRequest).read()
    except urllib2.URLError:
        # Narrowed from a bare `except:`: only network/HTTP failures
        # (URLError and its HTTPError subclass) should mean "no data" —
        # a bare except would also swallow KeyboardInterrupt/SystemExit.
        return None
# Parse the HTML file
def parseHTML(data, count):
soup = BeautifulSoup(data)
images = soup.find_all(id = "entry")
for image in images:
imageURL = image.find("img")
if imageURL is not None:
realURL = imageURL.attrs["src"]
count += 1
f = open('Images/' +str(count)+ '.jpg','wb')
f.write(urllib.urlopen(realURL).read())
f.close()
print count
return count
# Walk the blog page by page, downloading images from each, until a
# page fails to load (processURL returns None), which marks the end.
domain = "http://inspirational-images.tumblr.com"

count = 0
page = 0
while True:
    page += 1
    # Page 1 is the bare domain; later pages live under /page/<n>.
    crawlURL = domain if page == 1 else domain + "/page/" + str(page)
    data = processURL(crawlURL)
    if data is None:
        break
    count = parseHTML(data, count)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment