Skip to content

Instantly share code, notes, and snippets.

@binux
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save binux/e5f382facfc48470f96a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 12:05:42
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl douban.com local event listings: discover cities, walk each
    city's weekly event index, and extract one record per event page."""

    def on_start(self):
        # Seed with the national city index page.
        self.crawl('http://www.douban.com/location/china/', callback=self.location_page)

    @config(age=60)
    def location_page(self, response):
        """If the page already lists events, treat it as an index page;
        otherwise follow every city link to its weekly event listing."""
        event_titles = response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>UL.events-list>LI.list-entry>DIV.info>DIV.title>A')
        if event_titles:
            return self.index_page(response)
        for link in response.doc('DIV.location>A').items():
            href = link.attr.href
            # The city slug is either a path segment ("/location/<city>/")
            # or a subdomain ("http://<city>.douban.com/..."); [7:] strips
            # the leading "http://".
            if 'douban.com/location/' in href:
                city = href.split('/')[-2]
            else:
                city = href.split('.')[0][7:]
            self.crawl('http://www.douban.com/location/%s/events/week-all' % city,
                       callback=self.location_page)

    @config(age=24*60*60)
    def index_page(self, response):
        """Queue every event detail page and follow the paginator."""
        for entry in response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>UL.events-list>LI.list-entry>DIV.info>DIV.title>A').items():
            self.crawl(entry.attr.href, callback=self.detail_page)
        for page_link in response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>DIV.paginator>A').items():
            self.crawl(page_link.attr.href, callback=self.index_page)

    @config(age=24*60*60)
    def detail_page(self, response):
        """Extract one event's fields from its detail page."""
        stage_times = [t.text() for t in response.doc("DIV.article>DIV.related_info DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.buy-tickets-itemcon.tickets-con-stage").items()]
        calendar = response.doc("DIV.article>DIV.eventwrap>DIV#event-info>DIV.event-info>DIV.event-detail>UL.calendar-strs>LI.calendar-str-item")
        # Times are emitted only when the calendar element exists; stage
        # times alone are deliberately ignored without it.
        if calendar:
            times = stage_times or [calendar.text()]
        else:
            times = []
        ticket_prices = [p.text() for p in response.doc("DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.tickets-con-price>A.buy-tickets-info").items()]
        fee_label = response.doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("费用")')
        # Same gating for prices: the fee label must be present; the price
        # text is the tail text following that label element.
        if fee_label:
            prices = ticket_prices or [fee_label[0].tail.strip()]
        else:
            prices = []
        return {
            "url": response.url,
            "title": response.doc('HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>H1').text(),
            "place": response.doc("HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.micro-address").text(),
            "time": times,
            "price": prices,
            "mcid": response.doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("类型") ~ a').text(),
            "city": response.doc('DIV.nav-primary>DIV.local-label>A.label').text(),
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-25 14:31:24
import re
import json
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl the douban "haixiuzu" group, extract image posts, and mirror
    each post into a duoshuo thread via its import API."""

    def on_start(self):
        # Seed with the group's discussion listing.
        self.crawl('http://www.douban.com/group/haixiuzu/discussion', callback=self.index_page)

    @every(0, 30)
    def on_cronjob(self):
        # Periodic re-seed of the listing page (pyspider cron hook).
        self.on_start()

    @config(age=10)
    def index_page(self, response):
        """Queue every absolute-URL topic link found in the listing."""
        for each in response.doc('.title a[href^="http://"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(age=30*24*60*60)
    def detail_page(self, response):
        """Extract title, author, author link and inline image URLs."""
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
        }

    def on_result(self, result):
        """Forward every result that carries images to the duoshuo import API."""
        # .get() avoids a KeyError on results produced by other callbacks
        # that lack an 'imgs' key.
        if not result or not result.get('imgs'):
            return
        # The topic id is the digit run in the topic URL.
        post_id = re.search(r"\d+", self.response.url).group(0)
        # SECURITY: the duoshuo API secret is hard-coded below; move it to
        # configuration before reusing this script.
        self.crawl("https://api.duoshuo.com/posts/import.json#" + post_id, method="POST",
                   data={
                       "short_name": "database",
                       "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                       "posts[0][post_key]": post_id,
                       "posts[0][thread_key]": "haixiuzu",
                       # Python 2 base64 codec; strip newlines so the
                       # payload is a single line.
                       "posts[0][message]": json.dumps(result).encode("base64").replace("\n", "")
                   }, callback=self.post_to_duoshuo)

    def post_to_duoshuo(self, response=None):
        # BUG FIX: pyspider invokes crawl callbacks with the fetched
        # response; the original zero-argument signature raised TypeError
        # on every callback. The default keeps direct calls compatible.
        pass
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 16:23:29
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl huodongxing.com event listing pages and extract one record
    per event detail page."""

    # Number of listing pages seeded up-front (was hard-coded as range(1, 34)).
    LIST_PAGES = 33

    def on_start(self):
        for page in range(1, self.LIST_PAGES + 1):
            self.crawl('http://www.huodongxing.com/events?type=0&show=list&d=t1&page=%d' % page, callback=self.index_page)

    @every(60)
    def cronjob(self):
        # Periodic re-seed of the listing pages.
        self.on_start()

    @config(age=60*60)
    def index_page(self, response):
        """Queue each event's detail page from the listing."""
        for each in response.doc('#container > div.article > ul > li > h3 > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(age=24*60*60)
    def detail_page(self, response):
        """Extract title, venue, time and ticket prices from one event page."""
        # Hoisted: the time element is queried once instead of twice.
        time_node = response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV>em.icon-time")
        # Ticket data is embedded in the page as a JS literal:
        # "var eventTicketsJson = ...;"
        tickets_match = re.search(r'var eventTicketsJson\s*=\s*([^;]+)', response.text)
        return {
            "url": response.url,
            "title": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>H2.media-heading").text(),
            "place": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV.address>A").text(),
            "time": [time_node.parents()[-1].text_content().strip()] if time_node else None,
            # BUG FIX: guard the regex miss — the original called .group(1)
            # unconditionally and raised AttributeError on pages without
            # the eventTicketsJson variable.
            "price": json.loads(tickets_match.group(1)) if tickets_match else None,
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 17:13:25
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl the 228.com.cn "this week" JSON listing and emit one message
    per product; messages pass through on_message as results."""

    def on_start(self):
        # Seed with the first page of the weekly listing API.
        self.crawl('http://www.228.com.cn/s/thisWeek-update/?j=1&p=1',
                   callback=self.index_page)

    @every(minutes=10)
    def on_cron_job(self):
        # Periodic re-seed of the first listing page.
        self.on_start()

    @config(age=10*60)
    def index_page(self, response):
        """Fan out to the remaining pages, then emit every product listed
        on this page as a project message."""
        payload = response.json
        for page_no in range(2, payload['pageSize'] + 1):
            self.crawl('http://www.228.com.cn/s/thisWeek-update/?j=1&p=%d' % page_no,
                       callback=self.index_page)
        for idx, product in enumerate(payload['products']):
            message = {
                "title": product['name'],
                "ccid": product['typeaname'],
                'mcid': product['typebname'],
                'city': product['cityname'],
                'place': product['vname'],
                'time': product['likedata'].split(' '),
                'price': product['prices'].split(','),
                'url': "http://www.228.com.cn/ticket-%s.html" % product['productid'],
            }
            # The fragment makes each message's url unique per product.
            self.send_message(self.project_name, message, url="%s#%s" % (response.url, idx))

    def on_message(self, project, msg):
        # Messages are returned unchanged so they become results.
        return msg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment