# encoding: utf-8
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment
import urllib2
import re
import datetime
# Charset used to decode a response that does not declare one itself.
DEFAULT_CHARSET = 'utf-8'
def urlread(url):
    """Fetch *url* and return the response body decoded to unicode.

    The codec is taken from the ``charset`` parameter of the response
    headers; DEFAULT_CHARSET is used when that parameter is absent or empty.

    Raises whatever ``urllib2.urlopen`` raises for a failed request, and
    LookupError / UnicodeDecodeError when the declared charset is unknown
    or does not fit the payload.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
        # getparam() matches the parameter name case-insensitively and strips
        # surrounding quotes, so a header such as charset="UTF-8" yields a
        # codec name that unicode() accepts.
        charset = f.headers.getparam('charset') or DEFAULT_CHARSET
    finally:
        # Release the connection even when reading the body fails.
        f.close()
    return unicode(content, charset)
def textify(nodelist):
    """Flatten an iterable of parsed nodes into plain text.

    Comment nodes are dropped, ``<br>`` tags become a newline, every other
    tag is flattened recursively through its children, and text nodes are
    copied as unicode.  Nodes of any other kind contribute nothing.
    """
    pieces = []
    for node in nodelist:
        # Comments are tested first so they never reach the text-node branch.
        if isinstance(node, Comment):
            continue
        if isinstance(node, Tag):
            pieces.append("\n" if node.name == 'br' else textify(node))
        elif isinstance(node, NavigableString):
            pieces.append(unicode(node))
    return ''.join(pieces)
def parse_event_table(t):
retval = []
for r in t.tbody.findAll('tr'):
cols = r.findAll('td')
title_tag = cols[0].h4.a
m = re.match(ur'/events/(\d+)', title_tag['href'])
if m is not None:
event_id = m.group(1)
else:
event_id = None
m = re.match(ur'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2}) to (\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})', textify(cols[1]))
if m is not None:
period = (
datetime.datetime(
year=int(m.group(1)),
month=int(m.group(2)),
day=int(m.group(3)),
hour=int(m.group(4)),
minute=int(m.group(5))
),
datetime.datetime(
year=int(m.group(6)),
month=int(m.group(7)),
day=int(m.group(8)),
hour=int(m.group(9)),
minute=int(m.group(10))
)
)
else:
period = (None, None)
m = re.match(ur'作成日:(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})\s+更新日:(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})', textify(cols[4]))
if m is not None:
ctime = datetime.datetime(
year=int(m.group(1)),
month=int(m.group(2)),
day=int(m.group(3)),
hour=int(m.group(4)),
minute=int(m.group(5))
)
utime = datetime.datetime(
year=int(m.group(6)),
month=int(m.group(7)),
day=int(m.group(8)),
hour=int(m.group(9)),
minute=int(m.group(10))
)
else:
ctime = None
utime = None
retval.append(
{
'title': textify(title_tag),
'id': event_id,
'period': period,
'place': textify(cols[2]),
'ctime': ctime,
'utime': utime
}
)
return retval
def scrape_my_page(url):
root = BeautifulSoup(urlread(url)).html.body.find('div', id='wrapper').find('div', id='main')
name = ''
m = re.match(ur'(.*)さんのマイページ', textify(root.find('div', id='main_title').h1))
if m:
name = m.group(1)
managing_events = []
participating_events = []
secs = root.findAll('div', 'title_hd')
for s in secs:
if s.h2 is not None and \
textify(s.h2).strip().startswith(u'管理しているイベント'):
managing_events = parse_event_table(s.findNextSibling('table', 'tb_list'))
if s.h2 is not None and \
textify(s.h2).strip().startswith(u'参加しているイベント'):
participating_events = parse_event_table(s.findNextSibling('table', 'tb_list'))
return {
'name': name,
'managing_events': managing_events,
'participating_events': participating_events
}
if __name__ == '__main__':
s = scrape_my_page('http://atnd.org/users/4604')
print s