moriyoshi (owner)

Revisions

gist: 217436 Download_button fork
public
Public Clone URL: git://gist.github.com/217436.git
Embed All Files: show embed
atnd_scrape.py #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
# encoding: utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
import urllib2
import re
import datetime
 
# Fallback text encoding, used by urlread() when the HTTP response's
# Content-Type header carries no charset parameter.
DEFAULT_CHARSET = 'utf-8'
 
def urlread(url):
    """Fetch *url* and return its body decoded to a unicode string.

    The encoding is taken from the ``charset`` parameter of the response's
    Content-Type header; when the header has no such parameter (or it is
    empty), DEFAULT_CHARSET is used instead.

    Raises whatever ``urllib2.urlopen`` raises on network/HTTP failure,
    ``LookupError`` if the server names a codec Python does not know, and
    ``UnicodeDecodeError`` if the body is not valid in the chosen encoding.
    """
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        # Release the connection even when the read fails part-way.
        response.close()
    # getparam() strips surrounding quotes, so a header such as
    # 'text/html; charset="utf-8"' yields a usable codec name.
    charset = response.headers.getparam('charset') or DEFAULT_CHARSET
    return unicode(content, charset)
 
def textify(nodelist):
    """Flatten an iterable of BeautifulSoup nodes into plain text.

    Comments are dropped, ``<br>`` tags become newlines, any other tag is
    replaced by the flattened text of its children, and text nodes are
    copied through as unicode.  Nodes of any other type are ignored.
    """
    pieces = []
    for node in nodelist:
        # Comment must be tested first: it is excluded from the output even
        # though the text-node branch below would otherwise be reached.
        if isinstance(node, Comment):
            continue
        if isinstance(node, Tag):
            if node.name == 'br':
                pieces.append("\n")
            else:
                # Iterating a tag walks its children, so recurse into it.
                pieces.append(textify(node))
        elif isinstance(node, NavigableString):
            pieces.append(unicode(node))
    return ''.join(pieces)
 
def parse_event_table(t):
    retval = []
    for r in t.tbody.findAll('tr'):
        cols = r.findAll('td')
        title_tag = cols[0].h4.a
        m = re.match(ur'/events/(\d+)', title_tag['href'])
        if m is not None:
            event_id = m.group(1)
        else:
            event_id = None
        m = re.match(ur'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2}) to (\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})', textify(cols[1]))
        if m is not None:
            period = (
                datetime.datetime(
                    year=int(m.group(1)),
                    month=int(m.group(2)),
                    day=int(m.group(3)),
                    hour=int(m.group(4)),
                    minute=int(m.group(5))
                    ),
                datetime.datetime(
                    year=int(m.group(6)),
                    month=int(m.group(7)),
                    day=int(m.group(8)),
                    hour=int(m.group(9)),
                    minute=int(m.group(10))
                    )
                )
        else:
            period = (None, None)
        m = re.match(ur'作成日:(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})\s+更新日:(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})', textify(cols[4]))
        if m is not None:
            ctime = datetime.datetime(
                year=int(m.group(1)),
                month=int(m.group(2)),
                day=int(m.group(3)),
                hour=int(m.group(4)),
                minute=int(m.group(5))
                )
            utime = datetime.datetime(
                year=int(m.group(6)),
                month=int(m.group(7)),
                day=int(m.group(8)),
                hour=int(m.group(9)),
                minute=int(m.group(10))
                )
        else:
            ctime = None
            utime = None
 
        retval.append(
            {
                'title': textify(title_tag),
                'id': event_id,
                'period': period,
                'place': textify(cols[2]),
                'ctime': ctime,
                'utime': utime
                }
            )
    return retval
 
def scrape_my_page(url):
    root = BeautifulSoup(urlread(url)).html.body.find('div', id='wrapper').find('div', id='main')
    name = ''
    m = re.match(ur'(.*)さんのマイページ', textify(root.find('div', id='main_title').h1))
    if m:
        name = m.group(1)
 
    managing_events = []
    participating_events = []
 
    secs = root.findAll('div', 'title_hd')
    for s in secs:
        if s.h2 is not None and \
           textify(s.h2).strip().startswith(u'管理しているイベント'):
            managing_events = parse_event_table(s.findNextSibling('table', 'tb_list'))
        if s.h2 is not None and \
           textify(s.h2).strip().startswith(u'参加しているイベント'):
            participating_events = parse_event_table(s.findNextSibling('table', 'tb_list'))
 
 
    return {
        'name': name,
        'managing_events': managing_events,
        'participating_events': participating_events
        }
 
if __name__ == '__main__':
    s = scrape_my_page('http://atnd.org/users/4604')
    print s