# june29/tumblr-scraper.rb

## tumblr-scraper.rb
require "rubygems"
require "mechanize"
require "nokogiri"

module Tumblr
  # Screen scraper for the Tumblr web interface, driven through a Mechanize agent.
  #
  # Constructing an instance logs in right away; the other public methods reuse
  # that agent to read the dashboard, reblog a post, or follow a blog.
  class Scraper
    # email            - value typed into the login form's "email" field.
    # password         - value typed into the login form's "password" field.
    # trim_reblog_info - when truthy, #reblog cleans the pre-filled reblog text
    #                    before submitting it (defaults to true).
    def initialize(email, password, trim_reblog_info = true)
      @email            = email
      @password         = password
      @trim_reblog_info = trim_reblog_info

      @agent = Mechanize.new
      @agent.max_history = 1

      login
    end

    # Fetches the login page, fills in the stored credentials and submits the form.
    #
    # NOTE(review): the login URL is plain http, so the credentials are posted
    # to whatever that URL resolves to -- confirm the transport is acceptable.
    def login
      login_page = @agent.get("http://www.tumblr.com/login")
      login_form = login_page.forms[1] # the second form on the page
      form_field(login_form, "email").value    = @email
      form_field(login_form, "password").value = @password
      login_form.submit
    end

    # Scrapes one dashboard page and returns an Array of Post objects.
    #
    # page - dashboard page number interpolated into the URL (defaults to 1).
    #
    # A post without a "post_info" link is attributed to the user of the
    # previous post. A post that raises while being parsed is reported on
    # stdout and left out of the result.
    def dashboard(page = 1)
      doc = Nokogiri::HTML(@agent.get("http://www.tumblr.com/dashboard/#{page}/").body)

      posts = []
      prev_user = nil

      doc.xpath("//li[contains(concat(' ', normalize-space(@class), ' '), ' post ')]").reject { |post|
        # Reject 'video' and 'audio' temporary, because these are unavailable in mobile Safari
        /new_post|video|audio/ =~ post.attributes["class"].to_s
      }.each do |post|
        begin
          id        = post.attributes["id"].to_s.sub(/^post/, "")
          permalink = post.xpath(".//a[@title='Permalink']").attr("href")

          user      = extract_user(post) || prev_user
          prev_user = user

          built = build_post(post, id, permalink, user)
          posts << built if built
        rescue => e
          puts "!!!!! #{e} in ID #{id} !!!!!"
        end
      end

      posts
    end

    # Reblogs the post found at +url+.
    #
    # Loads the post page, follows its "tumblr_controls" iframe to find the
    # "/reblog/..." link, opens that page and submits its second form. When
    # the instance was built with a truthy trim_reblog_info, the pre-filled
    # text is cleaned up first.
    def reblog(url)
      doc = Nokogiri::HTML(@agent.get(url).body)

      url_of_controls = doc.xpath("//iframe[@id='tumblr_controls']").attr("src")
      doc_of_controls = Nokogiri::HTML(@agent.get(url_of_controls).body)

      href_of_reblog = doc_of_controls.xpath("//a[starts-with(@href, '/reblog/')]").attr("href")
      page_of_reblog = @agent.get("http://www.tumblr.com" + href_of_reblog)
      form_of_reblog = page_of_reblog.forms[1]

      trim_reblog_form(form_of_reblog, page_of_reblog) if @trim_reblog_info

      form_of_reblog.submit
    end

    # Follows the blog at +url+ by submitting the second form of the
    # "following" page with its "follow_this" field set to +url+.
    def follow(url)
      page = @agent.get("http://www.tumblr.com/following")
      follow_form = page.forms[1]

      form_field(follow_form, "follow_this").value = url
      follow_form.submit
    end

    private

    # Returns the first field of +form+ called +name+, or nil when there is none.
    def form_field(form, name)
      form.fields.find { |candidate| candidate.name == name }
    end

    # Builds a User from the post's "post_info" link and avatar style.
    # Returns nil when the post has no such link. Raises when the link is
    # present but no url('...') can be found in the avatar's style attribute.
    def extract_user(post)
      info_link = post.xpath("./div[@class='post_info']/a").first
      return nil if info_link.nil?

      user_id     = info_link.text
      user_url    = info_link.attributes["href"].to_s
      user_avatar = post.xpath(".//a[@class='post_avatar']").attr("style").to_s.scan(/url\('(.+)'\)/)[0][0]
      User.new(user_id, user_url, user_avatar)
    end

    # Builds the Post matching the node's class attribute, or returns nil when
    # no known type matches. The first matching pattern wins, so the order of
    # the branches is significant.
    def build_post(post, id, permalink, user)
      case post.attributes["class"].to_s
      when /regular/
        title, body = title_and_remaining_body(post)
        Post.new(id, permalink, user, :regular, title, body, nil)
      when /photo/
        body    = post.xpath("./div//img[@class='image']").attr("src")
        caption = inner_plain_html(post, ".//div[@class='caption']")
        Post.new(id, permalink, user, :photo, nil, body, caption)
      when /quote/
        body    = inner_plain_html(post, "./span[@class='quote']")
        caption = inner_plain_html(post, ".//td[@class='quote_source']")
        Post.new(id, permalink, user, :quote, nil, body, caption)
      when /link/
        title = inner_plain_html(post, ".//div[@class='post_title']")
        # Only the title is scraped for link posts; body and caption stay nil.
        Post.new(id, permalink, user, :link, title, nil, nil)
      when /conversation/
        title, body = title_and_remaining_body(post)
        Post.new(id, permalink, user, :conversation, title, body, nil)
      end
    end

    # Reads the stripped "post_title" text, then removes every direct <div>
    # child from the node and serializes what is left as the body.
    # The title must be read first because the removal mutates the document.
    def title_and_remaining_body(post)
      title = post.xpath("./div[@class='post_title']").text.strip
      post.xpath("./div").remove
      [title, post.xpath("./*|./text()").plain_html]
    end

    # Serializes the child elements and text nodes selected by +path+.
    def inner_plain_html(post, path)
      post.xpath("#{path}/*|#{path}/text()").plain_html
    end

    # Cleans the pre-filled text of the reblog form according to the post type
    # read from the page's "post[type]" input.
    def trim_reblog_form(form, page)
      # NodeSet#attr returns an attribute node, not a String; without to_s
      # none of the String `when` clauses below can match.
      post_type = Nokogiri::HTML(page.body).xpath("//input[@name='post[type]']").attr("value").to_s

      case post_type
      when "link"
        rewrite_field(form, "post[three]") { |text| trim(text) }
      when "regular", "photo", "video"
        rewrite_field(form, "post[two]") { |text| trim(text) }
      when "quote"
        rewrite_field(form, "post[two]") { |text| text.gsub(/ \(via <a.*?<\/a>\)/, "") }
      end
    end

    # Replaces the value of the field called +name+ with the block's result.
    # Does nothing when the field or its value is missing, so the reblog can
    # still be submitted untrimmed.
    def rewrite_field(form, name)
      field = form_field(form, name)
      return if field.nil? || field.value.nil?

      field.value = yield(field.value)
    end

    # Returns a copy of +str+ with empty paragraphs and "<p><a ...>name</a>:</p>"
    # paragraphs removed, blockquote wrappers unwrapped, and surrounding
    # whitespace stripped. The argument itself is left untouched.
    def trim(str)
      # Non-bang gsub: gsub! returns nil when nothing was replaced, which
      # would break the chain.
      cleaned = str.gsub(/<p><\/p>/, "").gsub(/<p><a[^<]+<\/a>:<\/p>/, "")
      trim_quote(cleaned).strip
    end

    # Replaces the span from the first <blockquote> to the last </blockquote>
    # with its own contents, recursing so nested wrappers are unwrapped too.
    def trim_quote(str)
      # With /m the dot already matches line breaks.
      str.sub(/<blockquote>(.+)<\/blockquote>/m) { trim_quote(Regexp.last_match(1)) }
    end
  end

  # Plain value holder for the author of a dashboard post.
  class User
    attr_reader :id, :url, :avatar_image_url

    # id               - the author's identifier.
    # url              - the author's URL.
    # avatar_image_url - URL of the author's avatar image.
    def initialize(id, url, avatar_image_url)
      @id, @url, @avatar_image_url = id, url, avatar_image_url
    end

    # Returns the three attributes as a Hash with String keys, in the order
    # "id", "url", "avatar_image_url".
    def to_h
      fields = {}
      fields["id"]               = @id
      fields["url"]              = @url
      fields["avatar_image_url"] = @avatar_image_url
      fields
    end
  end

  # Plain value holder for one scraped dashboard post.
  class Post
    attr_reader :id, :permalink, :user, :post_type, :title, :body, :caption

    # Stores the seven positional values unchanged; each is exposed through
    # the reader of the same name.
    def initialize(id, permalink, user, post_type, title, body, caption)
      @id, @permalink, @user     = id, permalink, user
      @post_type, @title         = post_type, title
      @body, @caption            = body, caption
    end

    # Returns the attributes as a Hash with Symbol keys, in declaration
    # order. The user entry is the result of calling to_h on the stored user.
    def to_h
      [:id, :permalink, :user, :post_type, :title, :body, :caption].inject({}) do |fields, key|
        value = instance_variable_get("@#{key}")
        fields[key] = (key == :user ? value.to_h : value)
        fields
      end
    end
  end
end

class Nokogiri::XML::Node
  # Serializes this node and its descendants to a compact markup string.
  #
  # * A node whose class is exactly Nokogiri::XML::Text yields its text with
  #   every run of two or more whitespace characters collapsed to one space.
  # * Any other node yields a tag built from its name and attributes:
  #   self-closing when it has no children, otherwise wrapping the
  #   serialized children.
  #
  # Attribute values are interpolated as-is; this method does no escaping.
  def plain_html
    return text.gsub(/\s{2,}/, " ") if instance_of?(Nokogiri::XML::Text)

    attr_markup = attributes.map { |attr_name, attr_node| " #{attr_name}=\"#{attr_node}\"" }.join("")
    return "<#{name}#{attr_markup} />" if children.empty?

    "<#{name}#{attr_markup}>#{children.plain_html}</#{name}>"
  end
end

class Nokogiri::XML::NodeSet
  # Concatenates the plain_html of every node in the set, in order.
  # An empty set yields an empty string.
  def plain_html
    map { |member| member.plain_html }.join("")
  end
end

class Nokogiri::XML::Element
  # Forwards straight to the inherited plain_html without adding anything
  # (presumably the Nokogiri::XML::Node version -- confirm the ancestry).
  def plain_html
    super()
  end
end
	require "rubygems"
	require "mechanize"
	require "nokogiri"

	module Tumblr
	# Screen scraper for the Tumblr web interface, driven through a Mechanize agent.
	#
	# Constructing an instance logs in right away; the other public methods reuse
	# that agent to read the dashboard, reblog a post, or follow a blog.
	class Scraper
	  # email            - value typed into the login form's "email" field.
	  # password         - value typed into the login form's "password" field.
	  # trim_reblog_info - when truthy, #reblog cleans the pre-filled reblog text
	  #                    before submitting it (defaults to true).
	  def initialize(email, password, trim_reblog_info = true)
	    @email            = email
	    @password         = password
	    @trim_reblog_info = trim_reblog_info

	    @agent = Mechanize.new
	    @agent.max_history = 1

	    login
	  end

	  # Fetches the login page, fills in the stored credentials and submits the form.
	  #
	  # NOTE(review): the login URL is plain http, so the credentials are posted
	  # to whatever that URL resolves to -- confirm the transport is acceptable.
	  def login
	    login_page = @agent.get("http://www.tumblr.com/login")
	    login_form = login_page.forms[1] # the second form on the page
	    form_field(login_form, "email").value    = @email
	    form_field(login_form, "password").value = @password
	    login_form.submit
	  end

	  # Scrapes one dashboard page and returns an Array of Post objects.
	  #
	  # page - dashboard page number interpolated into the URL (defaults to 1).
	  #
	  # A post without a "post_info" link is attributed to the user of the
	  # previous post. A post that raises while being parsed is reported on
	  # stdout and left out of the result.
	  def dashboard(page = 1)
	    doc = Nokogiri::HTML(@agent.get("http://www.tumblr.com/dashboard/#{page}/").body)

	    posts = []
	    prev_user = nil

	    doc.xpath("//li[contains(concat(' ', normalize-space(@class), ' '), ' post ')]").reject { |post|
	      # Reject 'video' and 'audio' temporary, because these are unavailable in mobile Safari
	      /new_post|video|audio/ =~ post.attributes["class"].to_s
	    }.each do |post|
	      begin
	        id        = post.attributes["id"].to_s.sub(/^post/, "")
	        permalink = post.xpath(".//a[@title='Permalink']").attr("href")

	        user      = extract_user(post) || prev_user
	        prev_user = user

	        built = build_post(post, id, permalink, user)
	        posts << built if built
	      rescue => e
	        puts "!!!!! #{e} in ID #{id} !!!!!"
	      end
	    end

	    posts
	  end

	  # Reblogs the post found at +url+.
	  #
	  # Loads the post page, follows its "tumblr_controls" iframe to find the
	  # "/reblog/..." link, opens that page and submits its second form. When
	  # the instance was built with a truthy trim_reblog_info, the pre-filled
	  # text is cleaned up first.
	  def reblog(url)
	    doc = Nokogiri::HTML(@agent.get(url).body)

	    url_of_controls = doc.xpath("//iframe[@id='tumblr_controls']").attr("src")
	    doc_of_controls = Nokogiri::HTML(@agent.get(url_of_controls).body)

	    href_of_reblog = doc_of_controls.xpath("//a[starts-with(@href, '/reblog/')]").attr("href")
	    page_of_reblog = @agent.get("http://www.tumblr.com" + href_of_reblog)
	    form_of_reblog = page_of_reblog.forms[1]

	    trim_reblog_form(form_of_reblog, page_of_reblog) if @trim_reblog_info

	    form_of_reblog.submit
	  end

	  # Follows the blog at +url+ by submitting the second form of the
	  # "following" page with its "follow_this" field set to +url+.
	  def follow(url)
	    page = @agent.get("http://www.tumblr.com/following")
	    follow_form = page.forms[1]

	    form_field(follow_form, "follow_this").value = url
	    follow_form.submit
	  end

	  private

	  # Returns the first field of +form+ called +name+, or nil when there is none.
	  def form_field(form, name)
	    form.fields.find { |candidate| candidate.name == name }
	  end

	  # Builds a User from the post's "post_info" link and avatar style.
	  # Returns nil when the post has no such link. Raises when the link is
	  # present but no url('...') can be found in the avatar's style attribute.
	  def extract_user(post)
	    info_link = post.xpath("./div[@class='post_info']/a").first
	    return nil if info_link.nil?

	    user_id     = info_link.text
	    user_url    = info_link.attributes["href"].to_s
	    user_avatar = post.xpath(".//a[@class='post_avatar']").attr("style").to_s.scan(/url\('(.+)'\)/)[0][0]
	    User.new(user_id, user_url, user_avatar)
	  end

	  # Builds the Post matching the node's class attribute, or returns nil when
	  # no known type matches. The first matching pattern wins, so the order of
	  # the branches is significant.
	  def build_post(post, id, permalink, user)
	    case post.attributes["class"].to_s
	    when /regular/
	      title, body = title_and_remaining_body(post)
	      Post.new(id, permalink, user, :regular, title, body, nil)
	    when /photo/
	      body    = post.xpath("./div//img[@class='image']").attr("src")
	      caption = inner_plain_html(post, ".//div[@class='caption']")
	      Post.new(id, permalink, user, :photo, nil, body, caption)
	    when /quote/
	      body    = inner_plain_html(post, "./span[@class='quote']")
	      caption = inner_plain_html(post, ".//td[@class='quote_source']")
	      Post.new(id, permalink, user, :quote, nil, body, caption)
	    when /link/
	      title = inner_plain_html(post, ".//div[@class='post_title']")
	      # Only the title is scraped for link posts; body and caption stay nil.
	      Post.new(id, permalink, user, :link, title, nil, nil)
	    when /conversation/
	      title, body = title_and_remaining_body(post)
	      Post.new(id, permalink, user, :conversation, title, body, nil)
	    end
	  end

	  # Reads the stripped "post_title" text, then removes every direct <div>
	  # child from the node and serializes what is left as the body.
	  # The title must be read first because the removal mutates the document.
	  def title_and_remaining_body(post)
	    title = post.xpath("./div[@class='post_title']").text.strip
	    post.xpath("./div").remove
	    [title, post.xpath("./*|./text()").plain_html]
	  end

	  # Serializes the child elements and text nodes selected by +path+.
	  def inner_plain_html(post, path)
	    post.xpath("#{path}/*|#{path}/text()").plain_html
	  end

	  # Cleans the pre-filled text of the reblog form according to the post type
	  # read from the page's "post[type]" input.
	  def trim_reblog_form(form, page)
	    # NodeSet#attr returns an attribute node, not a String; without to_s
	    # none of the String `when` clauses below can match.
	    post_type = Nokogiri::HTML(page.body).xpath("//input[@name='post[type]']").attr("value").to_s

	    case post_type
	    when "link"
	      rewrite_field(form, "post[three]") { |text| trim(text) }
	    when "regular", "photo", "video"
	      rewrite_field(form, "post[two]") { |text| trim(text) }
	    when "quote"
	      rewrite_field(form, "post[two]") { |text| text.gsub(/ \(via <a.*?<\/a>\)/, "") }
	    end
	  end

	  # Replaces the value of the field called +name+ with the block's result.
	  # Does nothing when the field or its value is missing, so the reblog can
	  # still be submitted untrimmed.
	  def rewrite_field(form, name)
	    field = form_field(form, name)
	    return if field.nil? || field.value.nil?

	    field.value = yield(field.value)
	  end

	  # Returns a copy of +str+ with empty paragraphs and "<p><a ...>name</a>:</p>"
	  # paragraphs removed, blockquote wrappers unwrapped, and surrounding
	  # whitespace stripped. The argument itself is left untouched.
	  def trim(str)
	    # Non-bang gsub: gsub! returns nil when nothing was replaced, which
	    # would break the chain.
	    cleaned = str.gsub(/<p><\/p>/, "").gsub(/<p><a[^<]+<\/a>:<\/p>/, "")
	    trim_quote(cleaned).strip
	  end

	  # Replaces the span from the first <blockquote> to the last </blockquote>
	  # with its own contents, recursing so nested wrappers are unwrapped too.
	  def trim_quote(str)
	    # With /m the dot already matches line breaks.
	    str.sub(/<blockquote>(.+)<\/blockquote>/m) { trim_quote(Regexp.last_match(1)) }
	  end
	end

	# Plain value holder for the author of a dashboard post.
	class User
	  attr_reader :id, :url, :avatar_image_url

	  # id               - the author's identifier.
	  # url              - the author's URL.
	  # avatar_image_url - URL of the author's avatar image.
	  def initialize(id, url, avatar_image_url)
	    @id, @url, @avatar_image_url = id, url, avatar_image_url
	  end

	  # Returns the three attributes as a Hash with String keys, in the order
	  # "id", "url", "avatar_image_url".
	  def to_h
	    fields = {}
	    fields["id"]               = @id
	    fields["url"]              = @url
	    fields["avatar_image_url"] = @avatar_image_url
	    fields
	  end
	end

	# Plain value holder for one scraped dashboard post.
	class Post
	  attr_reader :id, :permalink, :user, :post_type, :title, :body, :caption

	  # Stores the seven positional values unchanged; each is exposed through
	  # the reader of the same name.
	  def initialize(id, permalink, user, post_type, title, body, caption)
	    @id, @permalink, @user     = id, permalink, user
	    @post_type, @title         = post_type, title
	    @body, @caption            = body, caption
	  end

	  # Returns the attributes as a Hash with Symbol keys, in declaration
	  # order. The user entry is the result of calling to_h on the stored user.
	  def to_h
	    [:id, :permalink, :user, :post_type, :title, :body, :caption].inject({}) do |fields, key|
	      value = instance_variable_get("@#{key}")
	      fields[key] = (key == :user ? value.to_h : value)
	      fields
	    end
	  end
	end
	end

	class Nokogiri::XML::Node
	  # Serializes this node and its descendants to a compact markup string.
	  #
	  # * A node whose class is exactly Nokogiri::XML::Text yields its text with
	  #   every run of two or more whitespace characters collapsed to one space.
	  # * Any other node yields a tag built from its name and attributes:
	  #   self-closing when it has no children, otherwise wrapping the
	  #   serialized children.
	  #
	  # Attribute values are interpolated as-is; this method does no escaping.
	  def plain_html
	    return text.gsub(/\s{2,}/, " ") if instance_of?(Nokogiri::XML::Text)

	    attr_markup = attributes.map { |attr_name, attr_node| " #{attr_name}=\"#{attr_node}\"" }.join("")
	    return "<#{name}#{attr_markup} />" if children.empty?

	    "<#{name}#{attr_markup}>#{children.plain_html}</#{name}>"
	  end
	end

	class Nokogiri::XML::NodeSet
	  # Concatenates the plain_html of every node in the set, in order.
	  # An empty set yields an empty string.
	  def plain_html
	    map { |member| member.plain_html }.join("")
	  end
	end

	class Nokogiri::XML::Element
	  # Forwards straight to the inherited plain_html without adding anything
	  # (presumably the Nokogiri::XML::Node version -- confirm the ancestry).
	  def plain_html
	    super()
	  end
	end