# jharjono / Ruby script to scrape images from Tumblr

## Ruby script to scrape images from Tumblr
# Quick hack - really hacky, untested, and probably breaks a lot

require 'rubygems'
require 'mechanize'

module TumblrScraper

	# A single Tumblr post, identified by the permalink found in its markup.
	class TumblrPost

		attr_accessor :url, :post_id, :tumblr_id

		# Reads the permalink from the post's first direct child link and derives
		# the numeric post ID and the blog name from it by position in the URL.
		#
		# @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
		# @raise [ArgumentError] if the post has no direct child link with an href,
		#   or the href has no third "/"-separated segment to take the blog name from
		def initialize(post)
			link = post.search("./a").first
			@url = link && link.attr("href")
			if @url.nil?
				raise ArgumentError, "post has no permalink (no direct child <a> with an href)"
			end

			# "http://blog.tumblr.com/post/12345/slug".split("/")
			#   => ["http:", "", "blog.tumblr.com", "post", "12345", "slug"]
			segments = @url.split("/")
			# nil.to_i is 0, so a URL without an ID segment yields post_id 0
			@post_id = segments[4].to_i

			host = segments[2]
			if host.nil?
				raise ArgumentError, "cannot extract a tumblr id from #{@url.inspect}"
			end
			@tumblr_id = host.split(".").first

			puts "processing post_id #{@post_id} #{@url}"
		end
	end

	# A Tumblr post that may carry an image. @img_url stays nil when the post
	# has no usable permalink or contains no <img> element.
	class TumblrImagePost < TumblrPost

		# @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
		def initialize(post)
			@img_url = nil

			begin
				super(post)
			rescue StandardError
				# The base initializer raises when it cannot read the permalink;
				# such a post is treated as "nothing to download".
				puts "Not an image file!"
				return
			end

			# Explicit nil check instead of rescuing a NoMethodError on nil.
			img = post.search("img").first
			if img.nil?
				puts "Not an image file!"
			else
				@img_url = img.attr('src')
			end
		end

		# Fetches the image into destination_dir as "<tumblr_id>_<post_id>.jpg".
		# Does nothing when the post has no image URL.
		#
		# The fetch shells out to wget, which must be on the PATH. Arguments are
		# passed as an argv list (no shell), so a scraped URL cannot inject shell
		# syntax; "--" stops wget from reading a URL that starts with "-" as an option.
		#
		# @param [String] destination_dir - directory the file is written into
		# @return [nil]
		def download(destination_dir=Dir.pwd)
			return if @img_url.nil?

			out_fname = File.join(destination_dir, "#{@tumblr_id}_#{@post_id}.jpg")
			if system("wget", "-O", out_fname, "--", @img_url)
				puts "Downloaded #{@img_url} as #{out_fname}."
			else
				# system returns false on a non-zero exit and nil when wget cannot be run
				puts "Failed to download #{@img_url} as #{out_fname}."
			end
		end
	end

	# Scraper for Tumblr images
	class ImageScraper

		# @param [String] tumblr_id - blog name, i.e. the subdomain of tumblr.com
		def initialize(tumblr_id)
			@url = "http://#{tumblr_id}.tumblr.com"
			@agent = Mechanize.new
		end

		# Walks the blog's /page/N/ listing starting at page 1 and downloads every
		# post until it meets a post whose ID is <= limit or a page with no posts.
		#
		# @param [Integer] limit - post ID that we will stop scraping at - note that post ID for a tumblr user monotonically increase with time
		# @param [String] download_dir - directory handed to each post's download
		# @return [nil]
		def scrape(limit, download_dir=Dir.pwd)
			page_num = 1

			loop do
				page = @agent.get("#{@url}/page/#{page_num}/")
				posts = page.search(".post").map { |element| TumblrImagePost.new(element) }

				if posts.empty?
					# no more pages left
					puts "Reached end of archive. Aborting scraper..."
					break
				end

				posts.each do |post|
					# A post whose permalink could not be parsed has no ID; comparing
					# nil with the limit would raise, so skip it.
					next if post.post_id.nil?

					if post.post_id <= limit
						puts "Limit reached at post #{post.post_id} <= limit #{limit}. Aborting scraper..."
						return
					end

					post.download(download_dir)
				end

				# all image posts in this page downloaded, going backwards in history
				page_num += 1
			end
		end

	end
end

# Run the scraper only when this file is executed directly, not when it is
# required. The blog name is a placeholder; 1 is the post ID to stop at.
if $PROGRAM_NAME == __FILE__
	TumblrScraper::ImageScraper.new("tumblr-id-here").scrape(1)
end
	# Quick hack - really hacky, untested, and probably breaks a lot

	require 'rubygems'
	require 'mechanize'

	module TumblrScraper

	# A single Tumblr post, identified by the permalink found in its markup.
	class TumblrPost

		attr_accessor :url, :post_id, :tumblr_id

		# Reads the permalink from the post's first direct child link and derives
		# the numeric post ID and the blog name from it by position in the URL.
		#
		# @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
		# @raise [ArgumentError] if the post has no direct child link with an href,
		#   or the href has no third "/"-separated segment to take the blog name from
		def initialize(post)
			link = post.search("./a").first
			@url = link && link.attr("href")
			if @url.nil?
				raise ArgumentError, "post has no permalink (no direct child <a> with an href)"
			end

			# "http://blog.tumblr.com/post/12345/slug".split("/")
			#   => ["http:", "", "blog.tumblr.com", "post", "12345", "slug"]
			segments = @url.split("/")
			# nil.to_i is 0, so a URL without an ID segment yields post_id 0
			@post_id = segments[4].to_i

			host = segments[2]
			if host.nil?
				raise ArgumentError, "cannot extract a tumblr id from #{@url.inspect}"
			end
			@tumblr_id = host.split(".").first

			puts "processing post_id #{@post_id} #{@url}"
		end
	end

	# A Tumblr post that may carry an image. @img_url stays nil when the post
	# has no usable permalink or contains no <img> element.
	class TumblrImagePost < TumblrPost

		# @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
		def initialize(post)
			@img_url = nil

			begin
				super(post)
			rescue StandardError
				# The base initializer raises when it cannot read the permalink;
				# such a post is treated as "nothing to download".
				puts "Not an image file!"
				return
			end

			# Explicit nil check instead of rescuing a NoMethodError on nil.
			img = post.search("img").first
			if img.nil?
				puts "Not an image file!"
			else
				@img_url = img.attr('src')
			end
		end

		# Fetches the image into destination_dir as "<tumblr_id>_<post_id>.jpg".
		# Does nothing when the post has no image URL.
		#
		# The fetch shells out to wget, which must be on the PATH. Arguments are
		# passed as an argv list (no shell), so a scraped URL cannot inject shell
		# syntax; "--" stops wget from reading a URL that starts with "-" as an option.
		#
		# @param [String] destination_dir - directory the file is written into
		# @return [nil]
		def download(destination_dir=Dir.pwd)
			return if @img_url.nil?

			out_fname = File.join(destination_dir, "#{@tumblr_id}_#{@post_id}.jpg")
			if system("wget", "-O", out_fname, "--", @img_url)
				puts "Downloaded #{@img_url} as #{out_fname}."
			else
				# system returns false on a non-zero exit and nil when wget cannot be run
				puts "Failed to download #{@img_url} as #{out_fname}."
			end
		end
	end

	# Scraper for Tumblr images
	class ImageScraper

		# @param [String] tumblr_id - blog name, i.e. the subdomain of tumblr.com
		def initialize(tumblr_id)
			@url = "http://#{tumblr_id}.tumblr.com"
			@agent = Mechanize.new
		end

		# Walks the blog's /page/N/ listing starting at page 1 and downloads every
		# post until it meets a post whose ID is <= limit or a page with no posts.
		#
		# @param [Integer] limit - post ID that we will stop scraping at - note that post ID for a tumblr user monotonically increase with time
		# @param [String] download_dir - directory handed to each post's download
		# @return [nil]
		def scrape(limit, download_dir=Dir.pwd)
			page_num = 1

			loop do
				page = @agent.get("#{@url}/page/#{page_num}/")
				posts = page.search(".post").map { |element| TumblrImagePost.new(element) }

				if posts.empty?
					# no more pages left
					puts "Reached end of archive. Aborting scraper..."
					break
				end

				posts.each do |post|
					# A post whose permalink could not be parsed has no ID; comparing
					# nil with the limit would raise, so skip it.
					next if post.post_id.nil?

					if post.post_id <= limit
						puts "Limit reached at post #{post.post_id} <= limit #{limit}. Aborting scraper..."
						return
					end

					post.download(download_dir)
				end

				# all image posts in this page downloaded, going backwards in history
				page_num += 1
			end
		end

	end
	end

	# Run the scraper only when this file is executed directly, not when it is
	# required. The blog name is a placeholder; 1 is the post ID to stop at.
	if $PROGRAM_NAME == __FILE__
		TumblrScraper::ImageScraper.new("tumblr-id-here").scrape(1)
	end