moriyoshi (owner)

Revisions

gist: 216686 Download_button fork
public
Public Clone URL: git://gist.github.com/216686.git
Embed All Files: show embed
komachi_scrape.py #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from BeautifulSoup import BeautifulSoup, NavigableString, Tag, Comment
import urllib2
import re
from urlparse import urljoin
 
# Listing page URL template; the %d placeholder is filled with the ``kind``
# argument of scrape_recent_entries() (presumably a category id -- TODO confirm).
TOP_URL = 'http://komachi.yomiuri.co.jp/?g=%d'
# Ranking page fetched by scrape_ranking_page(); also the base for its links.
RANKING_URL = 'http://komachi.yomiuri.co.jp/ranking/'
# Charset urlread() falls back to when the response does not declare one.
DEFAULT_CHARSET = 'utf-8'
 
def urlread(url, default_charset=DEFAULT_CHARSET):
    """Fetch *url* and return the response body as a unicode string.

    The body is decoded with the ``charset`` parameter found in the
    response header parameter list (``f.headers.plist``); when no such
    parameter is present, *default_charset* is used instead.

    Raises whatever ``urllib2.urlopen`` raises for network/HTTP failures,
    and ``LookupError`` / ``UnicodeDecodeError`` when the body cannot be
    decoded with the selected charset.
    """
    f = urllib2.urlopen(url)
    try:
        content = f.read()
        params = f.headers.plist
    finally:
        # Release the connection even if read() fails part-way through.
        f.close()
    charset = default_charset
    for pp in params:
        p = pp.split('=')
        if len(p) == 2 and p[0].strip().lower() == 'charset':
            # The value may arrive quoted (charset="utf-8"); passing the
            # quotes on to unicode() would raise LookupError.
            value = p[1].strip().strip('"\'')
            if value:
                charset = value
    return unicode(content, charset)
 
def textify(nodelist):
    """Flatten an iterable of parse-tree nodes into a single text string.

    Comment nodes are skipped, ``br`` tags contribute a newline, every
    other tag is flattened by recursing into the tag itself, and text
    nodes are appended as unicode.  Nodes of any other type are ignored.
    An empty *nodelist* yields ``''``.
    """
    pieces = []
    for node in nodelist:
        # Comments are checked first so they are never treated as text
        # (matters if Comment is a NavigableString subclass -- verify).
        if isinstance(node, Comment):
            continue
        if isinstance(node, Tag):
            pieces.append("\n" if node.name == 'br' else textify(node))
        elif isinstance(node, NavigableString):
            pieces.append(unicode(node))
    return ''.join(pieces)
 
def scrape_ranking_page(kind):
    """Scrape one ranking table from RANKING_URL.

    *kind* selects the ``div`` whose id is ``rankAccess<kind>``.  Returns a
    list of dicts with the keys ``rank`` (int), ``title`` and ``url``
    (absolute, resolved against RANKING_URL), one per table row.

    Any layout element that is missing surfaces as an AttributeError on
    ``None``; a row without exactly two ``td`` cells raises ValueError.
    """
    page = BeautifulSoup(urlread(RANKING_URL))

    # Descend through the fixed page layout one container at a time.
    reader = page.html.body.find('div', id='d2-reader')
    ranking = reader.find('div', id='d3-ge0').find('div', id='d4-ranking')
    main_cell = ranking.find('table', 'layout').tr.find('td', 'main')
    access_box = main_cell.find('div', id=('rankAccess%d' % kind))
    rows = access_box.find('table', 'topicslist').findAll('tr')

    entries = []
    for row in rows:
        rank_cell, title_cell = row.findAll('td')
        link = title_cell.a
        entries.append({
            'rank': int(rank_cell.string),
            'title': link.string,
            'url': urljoin(RANKING_URL, link['href']),
        })
    return entries
 
def scrape_recent_entris_page(url):
    """Scrape a single topic-listing page.

    Returns a dict with two keys:

    ``pages``  -- one item per tag found in the pager: the absolute URL for
                  ``a`` tags, ``None`` for any other tag (presumably the
                  marker for the current page -- TODO confirm).
    ``topics`` -- a list of ``{'title': ..., 'url': ...}`` dicts, one per
                  table row that has exactly five ``td`` cells.

    The function name keeps its historical spelling so existing callers
    continue to work.
    """
    soup = BeautifulSoup(urlread(url))
    reader = soup.html.body.find('div', id='d2-reader')
    layout = reader.find('div', id='d3-ge0').div.find('table', 'layout')
    main_cell = layout.tr.find('td', 'main')

    # Pager: only Tag children count; bare text between links is skipped.
    pager = main_cell.find('table', 'topiclisttitle').tr.find('td', 'pagelist').div
    page_urls = [
        urljoin(url, item['href']) if item.name == 'a' else None
        for item in pager.contents
        if isinstance(item, Tag)
    ]

    topic_list = []
    for row in main_cell.find('table', 'topicslist').findAll('tr'):
        cells = row.findAll('td')
        # Rows with any other cell count are not topic rows; skip them.
        if len(cells) != 5:
            continue
        anchor = cells[0].a
        topic_list.append({
            'title': anchor.string,
            'url': urljoin(url, anchor['href']),
        })

    return {'pages': page_urls, 'topics': topic_list}
 
def scrape_recent_entries(kind=0):
    """Collect the topics of the listing for *kind* across its pager links.

    The first page (``TOP_URL % kind``) is scraped for both its topics and
    its pager; every pager entry that carries a URL is then scraped and
    its topics appended.  Pager links found on those later pages are not
    followed.  Returns the combined list of topic dicts.
    """
    first = scrape_recent_entris_page(TOP_URL % kind)
    collected = first['topics']
    for page_url in first['pages']:
        # None entries are pager items without a link; nothing to fetch.
        if page_url is not None:
            collected.extend(scrape_recent_entris_page(page_url)['topics'])
    return collected
 
def scrape_topic(url):
    """Scrape a single topic page and return its title and body text.

    Returns ``{'title': ..., 'content': ...}``; the title is stripped of
    surrounding whitespace, the content is returned as produced by
    textify().
    """
    # Rewrite every "</scr'" occurrence before parsing (presumably a closing
    # script tag inside an inline JS string that would otherwise confuse the
    # HTML parser -- TODO confirm).
    script_close = re.compile(r"</scr'")
    html = script_close.sub("<' + '/scr'", urlread(url))

    body = BeautifulSoup(html).html.body
    res = body.find('div', id='d2-reader').div.div.div.find('div', id='d4-res')
    main_cell = res.find('td', 'main')

    heading_row = main_cell.find('div', 'topicbox').find('div', 'inr').find('table', 'topichd').tr
    title = textify(heading_row.findAll('td')[0].h1).strip()

    content_row = main_cell.find('table', id='topiccontent').tr
    content = textify(content_row.findAll('td')[1].p)

    return {'title': title, 'content': content}
 
if __name__ == '__main__':
    # Script entry point: print the list of recent topics for the default kind.
    print(scrape_recent_entries())
    # Disabled alternative demo: print the title and body of each ranked topic.
    #entries = scrape_ranking_page(0)
    #for entry in entries:
    # print entry['title']
    # print scrape_topic(entry['url'])['content']