DRMacIver (owner)

Revisions

gist: 129023 Download_button fork
public
Public Clone URL: git://gist.github.com/129023.git
Embed All Files: show embed
suparser.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
require "rubygems"
require "date"
require "hpricot"
require "json"
 
# Scrape reviews out of StumbleUpon list view favourites
 
# Turn the plain text of a review's ".stats" element into a UTC Time.
#
# The first "[...]" section and everything following the am/pm marker are
# dropped, commas become spaces, and the date part and the time-of-day part
# are parsed separately before being combined.
#
# Raises ArgumentError when no "<digits/colons>(am|pm)" time is present;
# DateTime.parse raises on its own if the remaining text is not a date.
def parse_su_timestamp(stats_text)
  date_string = stats_text.sub(/\[[^\]]+\]/, "").gsub(/(am|pm).*$/) { $1 }.strip
  # Non-mutating calls on purpose: gsub! returns nil when nothing is
  # replaced, which broke the chain for date strings without a comma.
  date_string = date_string.tr(",", " ").squeeze(" ")

  # Try and parse something useful out of the SU date string.
  time_re = /([0-9:]+(?:am|pm))/
  time_string = date_string[time_re, 1]
  raise ArgumentError, "no time of day found in #{stats_text.inspect}" unless time_string

  date = DateTime.parse(date_string.gsub(time_re, ""))
  time = DateTime.parse(time_string)
  Time.utc(date.year, date.month, date.day, time.hour, time.min)
end

# Convert one review element into a bookmark-style hash.
#
# review - an object responding to #search (the script passes each Hpricot
#          match for "dl.dlBlog").
#
# Returns a Hash with string keys:
#   "url"         - href of the second child of the first <dt>
#   "description" - first child of that same element
#   "tags"        - space-separated tag names taken from "/tag/" links,
#                   always followed by "via:stumbleupon"
#   "dt"          - timestamp formatted as "%Y-%m-%dT%H:%M:%SZ"
#   "extended"    - text children of the "blog_contents" <dd>, joined by
#                   newlines; only present when such a <dd> exists
def parse_review(review)
  title_elem = review.search("//dt")[0].children[1]

  href = title_elem["href"]
  title = title_elem.children[0]

  # Keep only links pointing at a tag page and reduce them to the bare name.
  tags = review.search("//a").map { |a| a["href"] }.grep(%r{/tag/}).map do |h|
    h.gsub("/tag/", "").delete("/")
  end
  tags << "via:stumbleupon"

  blog_dd = review.search("//dd").find { |dd| dd["id"] =~ /blog_contents/ }
  contents = blog_dd && blog_dd.children.select(&:text?).join("\n")

  dt = parse_su_timestamp(review.search(".stats")[0].to_plain_text)

  it = {
    "url" => href,
    "description" => title,
    "tags" => tags.join(" "),
    "dt" => dt.strftime("%Y-%m-%dT%H:%M:%SZ")
  }

  it["extended"] = contents if contents

  it
end
 
# When run as a script: read a StumbleUpon favourites page from standard
# input and print a JSON array with one object per "dl.dlBlog" element.
if __FILE__ == $PROGRAM_NAME
  document = Hpricot(STDIN)

  puts "["
  document.search("//dl.dlBlog").each_with_index do |entry, index|
    # Separator goes before every element except the first, so the array
    # never ends with a trailing comma.
    puts "," unless index.zero?
    puts parse_review(entry).to_json
  end
  puts "]"
end