# Gist 216686 by @moriyoshi, created October 23, 2009 05:52
from BeautifulSoup import BeautifulSoup, NavigableString, Tag, Comment
import urllib2
import re
from urlparse import urljoin
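# Note: this script targets Python 2 and BeautifulSoup 3 (the BeautifulSoup
# module); the imports above and the print statements below will not run
# unmodified on Python 3 / bs4.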
TOP_URL = 'http://komachi.yomiuri.co.jp/?g=%d'
RANKING_URL = 'http://komachi.yomiuri.co.jp/ranking/'
DEFAULT_CHARSET = 'utf-8'
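# TOP_URL's %d slot fills the listing's "g" query parameter (apparently a
# genre/category id; 0 is used as the default below). RANKING_URL is the
# access ranking page.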
def urlread(url, default_charset=DEFAULT_CHARSET):
    # Fetch a URL and decode the body, honoring the charset parameter of
    # the Content-Type response header when one is present.
    f = urllib2.urlopen(url)
    charset = default_charset
    content = f.read()
    f.close()
    for pp in f.headers.plist:
        p = pp.split('=')
        if len(p) == 2 and p[0].lower() == 'charset':
            # Tolerate quoted parameter values (charset="utf-8").
            charset = p[1].strip('"')
    return unicode(content, charset)
def textify(nodelist):
    # Recursively flatten a parsed node (sub)tree into plain text, turning
    # <br> into newlines and dropping HTML comments. Comment is tested
    # before NavigableString because it is a subclass of it.
    retval = ''
    for n in nodelist:
        if isinstance(n, Comment):
            pass
        elif isinstance(n, Tag):
            if n.name == 'br':
                retval += "\n"
            else:
                retval += textify(n)
        elif isinstance(n, NavigableString):
            retval += unicode(n)
    return retval
def scrape_ranking_page(kind):
    # Scrape the access ranking table for the given ranking kind
    # (the rankAccess<kind> block on the ranking page).
    soup = BeautifulSoup(urlread(RANKING_URL))
    retval = []
    for e in soup.html.body.find('div', id='d2-reader').find('div', id='d3-ge0').find('div', id='d4-ranking').find('table', 'layout').tr.find('td', 'main').find('div', id=('rankAccess%d' % kind)).find('table', 'topicslist').findAll('tr'):
        # Each row holds a rank-number cell and a linked-title cell.
        no, q = e.findAll('td')
        retval.append(
            {
                'rank': int(no.string),
                'title': q.a.string,
                'url': urljoin(RANKING_URL, q.a['href'])
            }
        )
    return retval
def scrape_recent_entries_page(url):
    # Scrape a single listing page: its pager links and the topics on it.
    soup = BeautifulSoup(urlread(url))
    topics = []
    pages = []
    list_root = soup.html.body.find('div', id='d2-reader').find('div', id='d3-ge0').div.find('table', 'layout').tr.find('td', 'main')
    # Pager entries that are not links (i.e. the current page) are
    # recorded as None so that callers can skip them.
    for e in list_root.find('table', 'topiclisttitle').tr.find('td', 'pagelist').div.contents:
        if isinstance(e, Tag):
            if e.name == 'a':
                pages.append(urljoin(url, e['href']))
            else:
                pages.append(None)
    for e in list_root.find('table', 'topicslist').findAll('tr'):
        cols = e.findAll('td')
        # Topic rows have exactly five columns; anything else is a header
        # or separator row.
        if len(cols) == 5:
            topics.append(
                {
                    'title': cols[0].a.string,
                    'url': urljoin(url, cols[0].a['href'])
                }
            )
    return {
        'pages': pages,
        'topics': topics,
    }
def scrape_recent_entries(kind=0):
    # Scrape the first listing page, then follow every pager link.
    url = TOP_URL % kind
    n = scrape_recent_entries_page(url)
    topics = n['topics']
    pages = n['pages']
    for p in pages:
        if p is None:
            continue  # None marks the page we already scraped
        n = scrape_recent_entries_page(p)
        topics += n['topics']
    return topics
def scrape_topic(url):
    # The regexp splits literal "</scr'" sequences found in inline
    # JavaScript so that BeautifulSoup's parser does not treat them as
    # the end of the enclosing <script> element.
    topic_top = BeautifulSoup(re.compile(r"</scr'").sub("<' + '/scr'", urlread(url))).html.body.find('div', id='d2-reader').div.div.div.find('div', id='d4-res').find('td', 'main')
    return {
        'title': textify(topic_top.find('div', 'topicbox').find('div', 'inr').find('table', 'topichd').tr.findAll('td')[0].h1).strip(),
        'content': textify(topic_top.find('table', id='topiccontent').tr.findAll('td')[1].p)
    }
if __name__ == '__main__':
    print scrape_recent_entries()
    #entries = scrape_ranking_page(0)
    #for entry in entries:
    #    print entry['title']
    #    print scrape_topic(entry['url'])['content']