moriyoshi (owner)

Revisions

gist: 217433 Download_button fork
public
Public Clone URL: git://gist.github.com/217433.git
Embed All Files: show embed
ch2_scrape.py #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# encoding: utf-8
import urllib2
from urlparse import urljoin
import datetime
import re
 
DEFAULT_CHARSET = 'utf-8'
 
def urlread(url, default_charset=DEFAULT_CHARSET):
    """Fetch *url* and return the response body decoded to ``unicode``.

    The charset is taken from the ``charset`` parameter of the response's
    Content-Type header when one is present and non-empty; otherwise
    *default_charset* is used.  A charset of ``shift_jis`` (any case) is
    decoded with the ``cp932`` codec instead.  Bytes that cannot be decoded
    are silently dropped (``'ignore'`` error handler).

    Raises whatever ``urllib2.urlopen`` raises on network/HTTP errors, and
    ``LookupError`` if the resulting charset names an unknown codec.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
        params = f.headers.plist
    finally:
        # Release the connection even when reading the body fails.
        f.close()
    charset = default_charset
    for pp in params:
        p = pp.split('=', 1)
        if len(p) == 2 and p[0].strip().lower() == 'charset':
            # Parameter values may be sent as quoted strings
            # (charset="utf-8"); the codec lookup needs the bare name.
            value = p[1].strip().strip('"\'')
            if value:
                charset = value
    if charset.lower() == 'shift_jis':
        charset = 'cp932'
    return unicode(content, charset, 'ignore')
 
def get_topic_list(url):
    retval = []
    title_re = re.compile(ur'\s*\(\d+\)\s*$')
    for i in urlread(urljoin(url + '/', 'subject.txt'), 'cp932').split(u"\n"):
        i = i.strip()
        if i == '':
            continue
        dat_url, title = i.split(u"<>")
        retval.append(
            {
                'url': urljoin(url + '/dat/', dat_url),
                'title': title_re.sub(u'', title),
                }
            )
    return retval
 
def get_topic(url):
    date_re = re.compile(ur'(?P<aprilfool>皇紀)(?P<year>(?:\d{2}|\d{4}))/(?P<month>\d{2})/(?P<day>\d{2})(?:\([^)]+\))? (?P<hour>\d{2}):(?P<minute>\d{2})(?::(?P<second>\d{2})(?:\.(?P<microsecond>\d{2}))?)?\s+(?:<a href="[^"]*">[^<]+</a>\s+)?ID:(?P<token>.+)')
    content_re = re.compile(ur'\s*<br>\s*')
    retval = []
    for i in urlread(url, 'cp932').split(u"\n"):
        i = i.strip()
        if i == '':
            continue
        name, mail_address, date_and_id, content, title = i.split(u"<>")
        if date_and_id == u'あぼーん':
            continue
        try:
            gd = date_re.match(date_and_id).groupdict()
        except:
            raise RuntimeError("Wrong date_and_id format", date_and_id)
        year = int(gd['year'])
        if year < 50:
            year += 2000
        elif year < 100:
            year += 1900
        if gd['aprilfool']:
            year -= 660
        ctime = datetime.datetime(
            year=year,
            month=int(gd['month']),
            day=int(gd['day']),
            hour=int(gd['hour']),
            minute=int(gd['minute']),
            second=int(gd['second'] or 0),
            microsecond=int(gd['microsecond'] or 0)
            )
        retval.append(
            {
                'name': name,
                'mail_address': mail_address,
                'ctime': ctime,
                'token': gd[u'token'],
                'content': content_re.sub(u"\n", content),
                'title': title,
                }
            )
    return retval
 
if __name__ == '__main__':
    topics = get_topic_list('http://gimpo.2ch.net/nissin')
    print topics[0]['title']
    print get_topic(topics[0]['url'])[0]['content']