Skip to content

Instantly share code, notes, and snippets.

@june29
Created February 5, 2011 19:59
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save june29/812737 to your computer and use it in GitHub Desktop.
Save june29/812737 to your computer and use it in GitHub Desktop.
tumblr-scraper.rb
require "rubygems"
require "mechanize"
require "nokogiri"
module Tumblr
class Scraper
  # Drives the logged-in Tumblr web UI through Mechanize: reads dashboard
  # posts, reblogs a post and follows a blog.

  # Builds the Mechanize agent and logs in immediately (performs HTTP
  # requests).
  #
  # email            - value typed into the login form's "email" field
  # password         - value typed into the login form's "password" field
  # trim_reblog_info - when true (the default), #reblog rewrites the reblog
  #                    form's text through #trim before submitting it
  def initialize(email, password, trim_reblog_info = true)
    @email = email
    @password = password
    @trim_reblog_info = trim_reblog_info
    @agent = Mechanize.new
    @agent.max_history = 1
    login
  end

  # Fetches the login page, fills the "email" and "password" fields of its
  # second form and submits it. Returns whatever the form submission returns.
  #
  # NOTE(review): the login page is requested over plain http; confirm that
  # the credentials are not posted in clear text.
  def login
    login_page = @agent.get("http://www.tumblr.com/login")
    login_form = login_page.forms[1]
    field_named(login_form, "email").value = @email
    field_named(login_form, "password").value = @password
    login_form.submit
  end

  # Scrapes one dashboard page.
  #
  # page - page number interpolated into the dashboard URL (default 1)
  #
  # Returns an Array of Post. A post that raises while being parsed is
  # reported on stdout and skipped. A post without its own author block
  # reuses the author of the previous post.
  def dashboard(page = 1)
    doc = Nokogiri::HTML(@agent.get("http://www.tumblr.com/dashboard/#{page}/").body)
    nodes = doc.xpath("//li[contains(concat(' ', normalize-space(@class), ' '), ' post ')]")
    # Skip the "new post" entry, and temporarily skip 'video' and 'audio'
    # posts because these are unavailable in mobile Safari.
    nodes = nodes.reject { |node| /new_post|video|audio/ =~ node.attributes["class"].to_s }

    posts = []
    prev_user = nil
    nodes.each do |post|
      begin
        id = post.attributes["id"].to_s.sub(/^post/, "")
        permalink = post.xpath(".//a[@title='Permalink']").attr("href")
        user = extract_user(post) || prev_user
        prev_user = user
        built = build_post(post, id, permalink, user)
        posts << built if built
      rescue => e
        puts "!!!!! #{e} in ID #{id} !!!!!"
      end
    end
    posts
  end

  # Reblogs the post shown at +url+.
  #
  # Follows the page's "tumblr_controls" iframe to find the first link whose
  # href starts with "/reblog/", opens that page, optionally trims the text
  # of its second form (see the trim_reblog_info constructor argument) and
  # submits the form. Returns whatever the form submission returns.
  def reblog(url)
    doc = Nokogiri::HTML(@agent.get(url).body)
    url_of_controls = doc.xpath("//iframe[@id='tumblr_controls']").attr("src")
    doc_of_controls = Nokogiri::HTML(@agent.get(url_of_controls).body)
    url_of_reblog = "http://www.tumblr.com" + doc_of_controls.xpath("//a[starts-with(@href, '/reblog/')]").attr("href")
    page_of_reblog = @agent.get(url_of_reblog)
    form_of_reblog = page_of_reblog.forms[1]
    trim_reblog_form(page_of_reblog, form_of_reblog) if @trim_reblog_info
    form_of_reblog.submit
  end

  # Follows the blog at +url+ by filling the "follow_this" field of the
  # second form on the "following" page and submitting it.
  def follow(url)
    page = @agent.get("http://www.tumblr.com/following")
    follow_form = page.forms[1]
    field_named(follow_form, "follow_this").value = url
    follow_form.submit
  end

  private

  # Returns the first field of +form+ whose name equals +name+, or nil.
  def field_named(form, name)
    form.fields.find { |candidate| candidate.name == name }
  end

  # Builds the User described by a post's "post_info" block, or returns nil
  # when the post has no such block. A missing or unparsable avatar style
  # yields a nil avatar URL instead of an exception, so the post itself is
  # not lost.
  def extract_user(post)
    info_link = post.xpath("./div[@class='post_info']/a").first
    return nil if info_link.nil?

    avatar_style = post.xpath(".//a[@class='post_avatar']").attr("style").to_s
    avatar_url = avatar_style[/url\('(.+)'\)/, 1]
    User.new(info_link.text, info_link.attributes["href"].to_s, avatar_url)
  end

  # Builds the Post for one dashboard node according to the post type found
  # in its class attribute. Returns nil for an unrecognised type. The order
  # of the branches matters: the first matching pattern wins.
  def build_post(post, id, permalink, user)
    case post.attributes["class"].to_s
    when /regular/
      titled_text_post(post, id, permalink, user, :regular)
    when /photo/
      body = post.xpath("./div//img[@class='image']").attr("src")
      caption = post.xpath(".//div[@class='caption']/*|.//div[@class='caption']/text()").plain_html
      Post.new(id, permalink, user, :photo, nil, body, caption)
    when /quote/
      body = post.xpath("./span[@class='quote']/*|./span[@class='quote']/text()").plain_html
      caption = post.xpath(".//td[@class='quote_source']/*|.//td[@class='quote_source']/text()").plain_html
      Post.new(id, permalink, user, :quote, nil, body, caption)
    when /link/
      title = post.xpath(".//div[@class='post_title']/*|.//div[@class='post_title']/text()").plain_html
      # Link posts carry only a title; body and caption are not scraped.
      Post.new(id, permalink, user, :link, title, nil, nil)
    when /conversation/
      titled_text_post(post, id, permalink, user, :conversation)
    end
  end

  # Shared shape of 'regular' and 'conversation' posts: a title div followed
  # by free-form content.
  def titled_text_post(post, id, permalink, user, type)
    title = post.xpath("./div[@class='post_title']").text.strip
    # The title must be read before this line: it removes every direct div
    # child so that only the content remains for the body.
    post.xpath("./div").remove
    body = post.xpath("./*|./text()").plain_html
    Post.new(id, permalink, user, type, title, body, nil)
  end

  # Rewrites the text field of the reblog form in place, depending on the
  # post type announced by the reblog page.
  def trim_reblog_form(page, form)
    doc = Nokogiri::HTML(page.body)
    # to_s is required: comparing the raw attribute node with a String in
    # the case below would never match.
    post_type = doc.xpath("//input[@name='post[type]']").attr("value").to_s
    case post_type
    when "link"
      field = field_named(form, "post[three]")
      field.value = trim(field.value)
    when "regular", "photo", "video"
      field = field_named(form, "post[two]")
      field.value = trim(field.value)
    when "quote"
      field = field_named(form, "post[two]")
      field.value = field.value.to_s.gsub(/ \(via <a.*?<\/a>\)/, "")
    end
  end

  # Removes empty paragraphs and "<p><a ...>name</a>:</p>" attribution
  # paragraphs, unwraps blockquotes and strips surrounding whitespace.
  # Returns a new String; +str+ is never modified. nil is treated as "".
  def trim(str)
    # Non-destructive gsub on purpose: gsub! returns nil when nothing was
    # replaced, so it cannot be chained.
    cleaned = str.to_s.gsub(/<p><\/p>/, "").gsub(/<p><a[^<]+<\/a>:<\/p>/, "")
    trim_quote(cleaned).strip
  end

  # Replaces the span from the first "<blockquote>" to the last
  # "</blockquote>" with its inner content, recursing into that content.
  def trim_quote(str)
    str.sub(/<blockquote>(.+)<\/blockquote>/m) { trim_quote($1) }
  end
end
class User
  # Read-only record describing the author of a post: the link text of the
  # author link, the link target and the avatar image URL.
  attr_reader :id, :url, :avatar_image_url

  def initialize(id, url, avatar_image_url)
    @id, @url, @avatar_image_url = id, url, avatar_image_url
  end

  # Returns the three attributes as a Hash with String keys.
  def to_h
    { "id" => id, "url" => url, "avatar_image_url" => avatar_image_url }
  end
end
class Post
  # Read-only record for one scraped post. title, body and caption may be
  # nil depending on the post type.
  attr_reader :id, :permalink, :user, :post_type, :title, :body, :caption

  def initialize(id, permalink, user, post_type, title, body, caption)
    @id, @permalink, @user = id, permalink, user
    @post_type = post_type
    @title, @body, @caption = title, body, caption
  end

  # Returns the post as a Hash with Symbol keys; the user is converted with
  # its own to_h.
  def to_h
    result = {}
    result[:id] = id
    result[:permalink] = permalink
    result[:user] = user.to_h
    result[:post_type] = post_type
    result[:title] = title
    result[:body] = body
    result[:caption] = caption
    result
  end
end
end
class Nokogiri::XML::Node
  # HTML void elements: the only elements an HTML parser accepts in
  # self-closing form. Any other childless element needs an explicit end tag.
  PLAIN_HTML_VOID_ELEMENTS = %w[
    area base br col embed hr img input link meta param source track wbr
  ].freeze

  # Serializes this node as compact HTML.
  #
  # Text nodes: runs of two or more whitespace characters are collapsed to a
  # single space and special characters are re-escaped, because #text returns
  # the decoded content.
  # Other nodes: "<name attr="value"...>children</name>", with attribute
  # values escaped. Only childless void elements are written as "<name />".
  #
  # Returns a String.
  def plain_html
    return encode_special_chars(text.gsub(/\s{2,}/, " ")) if text?

    attrs = attributes.map { |attr_name, attr|
      " #{attr_name}=\"#{encode_special_chars(attr.value)}\""
    }.join("")

    if children.empty? && PLAIN_HTML_VOID_ELEMENTS.include?(name)
      "<#{name}#{attrs} />"
    else
      # An empty node set renders as "", giving "<name></name>".
      "<#{name}#{attrs}>#{children.plain_html}</#{name}>"
    end
  end
end
class Nokogiri::XML::NodeSet
  # Renders every node of the set with its own plain_html and concatenates
  # the results in order, without a separator. An empty set yields "".
  def plain_html
    fragments = map { |node| node.plain_html }
    fragments.join("")
  end
end
class Nokogiri::XML::Element
  # Elements add nothing of their own: this override only forwards to the
  # plain_html implementation found further up the ancestor chain.
  def plain_html
    super()
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment