Skip to content

Instantly share code, notes, and snippets.

@binux
Created March 8, 2014 03:15
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save binux/9424801 to your computer and use it in GitHub Desktop.
Save binux/9424801 to your computer and use it in GitHub Desktop.
demo script for douban.com
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-02-28 00:15:53
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    '''
    Sample handler that crawls movie.douban.com subject pages.

    Flow: on_start -> index_page -> detail_page, where detail_page keeps
    following subject links it finds and returns one result dict per page.
    '''

    # Selector for links pointing at movie subject (detail) pages.
    SUBJECT_LINK_SELECTOR = 'a[href^="http://movie.douban.com/subject"]'

    # Selector template for a label element inside the "#info" box.
    INFO_LABEL_SELECTOR = u'#info .pl:contains("%s")'

    def on_start(self):
        '''Schedule the site front page; its response goes to index_page.'''
        self.crawl('http://movie.douban.com/', callback=self.index_page)

    def index_page(self, response):
        '''Queue every subject link found on the front page.'''
        self._crawl_subject_links(response)

    def detail_page(self, response):
        '''
        Queue further subject links and extract the fields of this page.

        Returns a dict with the title, directors, genres, writers, cast,
        producing country and episode count. The last two are None when
        the page has no such label; the episode count is also None when
        the text after the label is not an integer (previously any of
        these cases raised and the whole result was lost).
        '''
        self._crawl_subject_links(response)

        doc = response.doc
        return {
            "title": doc('*[property="v:itemreviewed"]').text(),
            "directedBy": [x.text() for x in doc('*[rel="v:directedBy"]').items()],
            "genre": [x.text() for x in doc('*[property="v:genre"]').items()],
            "编剧": [x.text() for x in doc(u'#info .pl:contains("编剧") ~ a').items()],
            "主演": [x.text() for x in doc('*[rel="v:starring"]').items()],
            # NOTE: the trailing space in the next two keys is kept as-is so
            # that existing consumers of the result keep working.
            "制片国家 ": self._info_tail(response, u"制片国家"),
            "集数 ": self._to_int(self._info_tail(response, u"集数")),
        }

    def on_result(self, result):
        '''Pretty-print a non-empty result; empty results are ignored.'''
        if result:
            pprint(result)

    def _crawl_subject_links(self, response):
        '''Queue all subject links in response, handled by detail_page.'''
        links = [x.attr.href
                 for x in response.doc(self.SUBJECT_LINK_SELECTOR).items()]
        self.crawl(links, callback=self.detail_page)

    def _info_tail(self, response, label):
        '''
        Return the text following the "#info" label containing `label`.

        The value is the `.tail` of the first matching element, unmodified.
        Returns None when no element matches.
        '''
        matches = response.doc(self.INFO_LABEL_SELECTOR % label)
        try:
            return matches[0].tail
        except IndexError:
            # Label not present on this page.
            return None

    @staticmethod
    def _to_int(text):
        '''Return int(text), or None when text is None or not an integer.'''
        try:
            return int(text)
        except (TypeError, ValueError):
            return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment