require "rubygems"
require "date"
require "hpricot"
require "json"

# Scrape reviews out of StumbleUpon list view favourites.
#
# Reads an HTML page from STDIN and prints a JSON array with one object per
# review found in the page.

# Turn the plain text of a review's ".stats" element into a Time.
#
# The text is cleaned up in the same steps the page format requires: the first
# "[...]" segment is dropped, anything following "am"/"pm" on a line is cut
# off, commas become spaces and runs of spaces are collapsed.
#
# @param stats_text [String] plain text of the ".stats" element
# @return [Time] built with Time.utc from the parsed year/month/day and
#   hour/minute; the parsed components are used as-is (no zone conversion)
# @raise [ArgumentError] if no "<digits/colons>am|pm" time is present, or if
#   DateTime.parse rejects the remaining text
def parse_su_timestamp(stats_text)
  cleaned = stats_text.sub(/\[[^\]]+\]/, "")
  cleaned = cleaned.gsub(/(am|pm).*$/) { Regexp.last_match(1) }
  # Non-mutating gsub on purpose: gsub! returns nil when the text contains no
  # comma, which would break the rest of the chain.
  cleaned = cleaned.strip.gsub(",", " ").gsub(/ +/, " ")

  # Try and parse something useful out of the SU date string: the time of day
  # and the rest of the string are parsed separately and then recombined.
  time_re = /([0-9:]+(?:am|pm))/
  time_text = cleaned[time_re, 1]
  raise ArgumentError, "no time of day found in #{stats_text.inspect}" unless time_text

  date = DateTime.parse(cleaned.gsub(time_re, ""))
  time = DateTime.parse(time_text)
  Time.utc(date.year, date.month, date.day, time.hour, time.min)
end

# Extract one review from its element in the parsed page.
#
# @param review [Hpricot::Elem] element for a single review; it must contain a
#   <dt> whose second child is the title link, and a ".stats" element
# @return [Hash] with string keys:
#   "url"         - href of the title link
#   "description" - first child node of the title link
#   "tags"        - space-separated tag names taken from links whose href
#                   contains "/tag/", followed by "via:stumbleupon"
#   "dt"          - timestamp formatted as "%Y-%m-%dT%H:%M:%SZ"
#   "extended"    - text nodes of the first <dd> whose id matches
#                   /blog_contents/, joined by newlines (only set when such
#                   a <dd> exists)
def parse_review(review)
  title_elem = review.search("//dt")[0].children[1]
  href = title_elem["href"]
  title = title_elem.children[0]

  tag_hrefs = review.search("//a").map { |a| a["href"] }.grep(%r{/tag/})
  tags = tag_hrefs.map { |h| h.gsub("/tag/", "").gsub("/", "") }
  tags << "via:stumbleupon"

  blog_dds = review.search("//dd").select { |dd| dd["id"] =~ /blog_contents/ }
  contents = nil
  unless blog_dds.empty?
    contents = blog_dds[0].children.select { |c| c.text? }.join("\n")
  end

  dt = parse_su_timestamp(review.search(".stats")[0].to_plain_text)

  it = {
    "url" => href,
    "description" => title,
    "tags" => tags.join(" "),
    "dt" => dt.strftime("%Y-%m-%dT%H:%M:%SZ")
  }
  it["extended"] = contents if contents
  it
end

if __FILE__ == $0
  su = Hpricot(STDIN)

  # Emit the array one element at a time, with a "," line between elements.
  puts "["
  su.search("//dl.dlBlog").each_with_index do |rev, index|
    puts "," unless index.zero?
    puts parse_review(rev).to_json
  end
  puts "]"
end