wtnabe (owner)

Revisions

gist: 218535 Download_button fork
public
Public Clone URL: git://gist.github.com/218535.git
Embed All Files: show embed
rtm-printplanner-scraper.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#! /usr/bin/env ruby
 
require 'rubygems'
require 'nokogiri'
 
#
# RTM `printplanner' scraper
#
 
#TARGET = 'http://www.rememberthemilk.com/printplanner/USER/'
TARGET = 'rtm-weekly.html'
 
#
# Build a list of task lists from the printplanner page in TARGET.
#
# The expression evaluates to an array holding one single-entry hash per
# <h1> whose first following-sibling <ul> contains at least one <li>:
#
#   [{h1.inner_text => [ li, li, li, li, ...]},
#    {h1.inner_text => [ li, li, li, li, ...]},
#    ...
#   ]
#
# each li
#
#   li == { 'name'  => li's own text nodes, joined and stripped,
#           'list'  => list,
#           'limit' => due
#         }
#
# 'list' and 'limit' are taken from the li's `.tasklist' text after a
# leading `(' and a trailing `)' are removed and the rest is split on
# commas: the first piece is the list, the second is the limit.
# Either value is nil when its piece is absent (for example when the
# `.tasklist' text contains no comma).
#
# File.read is used so the file handle is closed after reading; it only
# handles local paths.
#
Nokogiri( File.read( TARGET ) ).search( '//h1[following-sibling::ul]' ).map { |h1|
  items = h1.search( './following-sibling::ul[1]/li' )
  # Headings without tasks yield nil here and are removed by compact below.
  next if items.empty?

  {
    h1.inner_text => items.map { |item|
      label = item.search( '.tasklist' ).inner_text
      list, limit = label.sub( /\A\(/, '' ).sub( /\)\z/, '' ).split( /,/ )

      # Keep only the li's direct text nodes so that the text of child
      # elements (such as the .tasklist element) is not part of the name.
      name = item.children.select { |n| n.node_name == 'text' }.join.strip

      {
        'name'  => name,
        # list / limit are nil when the tasklist text lacks that piece;
        # guard so a missing piece does not raise NoMethodError.
        'list'  => list && list.strip,
        'limit' => limit && limit.strip
      }
    }
  }
}.compact