# encoding: utf-8
import urllib2
from urlparse import urljoin
import datetime
import re
# Codec name used as the default value of urlread()'s ``default_charset``
# parameter.
DEFAULT_CHARSET = 'utf-8'
def urlread(url, default_charset=DEFAULT_CHARSET):
    """Fetch *url* and return its body decoded to ``unicode``.

    The codec is taken from the ``charset`` parameter of the response's
    Content-Type header when one is declared; otherwise *default_charset*
    is used.  Bytes that cannot be decoded are dropped (``'ignore'``).

    :param url: URL handed to ``urllib2.urlopen``.
    :param default_charset: codec name used when the response declares no
        charset, or declares one Python does not know.
    :returns: the response body as a ``unicode`` string.

    Errors raised by ``urllib2.urlopen`` or by reading the response
    propagate unchanged.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
        # getparam() returns the unquoted value (or None), so a header such
        # as ``charset="utf-8"`` yields a usable codec name.
        declared = f.headers.getparam('charset')
    finally:
        # Release the connection even if reading the body fails.
        f.close()

    charset = declared or default_charset
    # Content labelled Shift_JIS is decoded as cp932, the superset codec
    # that also covers the vendor extension characters.
    if charset.lower() in ('shift_jis', 'shift-jis', 'sjis', 'x-sjis'):
        charset = 'cp932'
    try:
        return unicode(content, charset, 'ignore')
    except LookupError:
        # The server declared a codec Python does not know; fall back to the
        # caller's default rather than failing the whole fetch.
        return unicode(content, default_charset, 'ignore')
def get_topic_list(url):
retval = []
title_re = re.compile(ur'\s*\(\d+\)\s*$')
for i in urlread(urljoin(url + '/', 'subject.txt'), 'cp932').split(u"\n"):
i = i.strip()
if i == '':
continue
dat_url, title = i.split(u"<>")
retval.append(
{
'url': urljoin(url + '/dat/', dat_url),
'title': title_re.sub(u'', title),
}
)
return retval
def get_topic(url):
date_re = re.compile(ur'(?P<aprilfool>皇紀)(?P<year>(?:\d{2}|\d{4}))/(?P<month>\d{2})/(?P<day>\d{2})(?:\([^)]+\))? (?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2})(?:\.(?P<microsecond>\d{2}))?)?\s+(?:<a href="[^"]*">[^<]+</a>\s+)?ID:(?P<token>.+)')
content_re = re.compile(ur'\s*<br>\s*')
retval = []
for i in urlread(url, 'cp932').split(u"\n"):
i = i.strip()
if i == '':
continue
name, mail_address, date_and_id, content, title = i.split(u"<>")
if date_and_id == u'あぼーん':
continue
try:
gd = date_re.match(date_and_id).groupdict()
except:
raise RuntimeError("Wrong date_and_id format", date_and_id)
year = int(gd['year'])
if year < 50:
year += 2000
elif year < 100:
year += 1900
if gd['aprilfool']:
year -= 660
ctime = datetime.datetime(
year=year,
month=int(gd['month']),
day=int(gd['day']),
hour=int(gd['hour']),
minute=int(gd['minute']),
second=int(gd['second'] or 0),
microsecond=int(gd['microsecond'] or 0)
)
retval.append(
{
'name': name,
'mail_address': mail_address,
'ctime': ctime,
'token': gd[u'token'],
'content': content_re.sub(u"\n", content),
'title': title,
}
)
return retval
if __name__ == '__main__':
topics = get_topic_list('http://gimpo.2ch.net/nissin')
print topics[0]['title']
print get_topic(topics[0]['url'])[0]['content']