Skip to content

Instantly share code, notes, and snippets.

@binux
Last active August 29, 2015 14:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save binux/e5f382facfc48470f96a to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 12:05:42
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl douban.com local event listings: discover cities, walk each
    city's weekly event index, and extract one record per event page."""

    def on_start(self):
        # Seed with the national city index page.
        self.crawl('http://www.douban.com/location/china/', callback=self.location_page)

    @config(age=60)
    def location_page(self, response):
        """If the page already lists events, treat it as an index page;
        otherwise follow every city link to its weekly event listing."""
        event_titles = response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>UL.events-list>LI.list-entry>DIV.info>DIV.title>A')
        if event_titles:
            return self.index_page(response)
        for link in response.doc('DIV.location>A').items():
            href = link.attr.href
            # The city slug is either a path segment ("/location/<city>/")
            # or a subdomain ("http://<city>.douban.com/..."); [7:] strips
            # the leading "http://".
            if 'douban.com/location/' in href:
                city = href.split('/')[-2]
            else:
                city = href.split('.')[0][7:]
            self.crawl('http://www.douban.com/location/%s/events/week-all' % city,
                       callback=self.location_page)

    @config(age=24*60*60)
    def index_page(self, response):
        """Queue every event detail page and follow the paginator."""
        for entry in response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>UL.events-list>LI.list-entry>DIV.info>DIV.title>A').items():
            self.crawl(entry.attr.href, callback=self.detail_page)
        for page_link in response.doc('HTML>BODY>DIV#wrapper>DIV#content>DIV.grid-free.clearfix>DIV.article>DIV#db-events-list>DIV.paginator>A').items():
            self.crawl(page_link.attr.href, callback=self.index_page)

    @config(age=24*60*60)
    def detail_page(self, response):
        """Extract one event's fields from its detail page."""
        stage_times = [t.text() for t in response.doc("DIV.article>DIV.related_info DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.buy-tickets-itemcon.tickets-con-stage").items()]
        calendar = response.doc("DIV.article>DIV.eventwrap>DIV#event-info>DIV.event-info>DIV.event-detail>UL.calendar-strs>LI.calendar-str-item")
        # Times are emitted only when the calendar element exists; stage
        # times alone are deliberately ignored without it.
        if calendar:
            times = stage_times or [calendar.text()]
        else:
            times = []
        ticket_prices = [p.text() for p in response.doc("DIV.buy-tickets-bd-con>DIV.buy-tickets-bd>DIV.buy-tickets-item>DIV.tickets-con-price>A.buy-tickets-info").items()]
        fee_label = response.doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("费用")')
        # Same gating for prices: the fee label must be present; the price
        # text is the tail text following that label element.
        if fee_label:
            prices = ticket_prices or [fee_label[0].tail.strip()]
        else:
            prices = []
        return {
            "url": response.url,
            "title": response.doc('HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>H1').text(),
            "place": response.doc("HTML>BODY>DIV#wrapper>DIV#content DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.micro-address").text(),
            "time": times,
            "price": prices,
            "mcid": response.doc(u'DIV.article DIV#event-info>DIV.event-info>DIV.event-detail>SPAN.pl:contains("类型") ~ a').text(),
            "city": response.doc('DIV.nav-primary>DIV.local-label>A.label').text(),
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-10-25 14:31:24
import re
import json
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl the douban "haixiuzu" group, extract image posts, and mirror
    each post into a duoshuo thread via its import API."""

    def on_start(self):
        # Seed with the group's discussion listing.
        self.crawl('http://www.douban.com/group/haixiuzu/discussion', callback=self.index_page)

    @every(0, 30)
    def on_cronjob(self):
        # Periodic re-seed of the listing page (pyspider cron hook).
        self.on_start()

    @config(age=10)
    def index_page(self, response):
        """Queue every absolute-URL topic link found in the listing."""
        for each in response.doc('.title a[href^="http://"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(age=30*24*60*60)
    def detail_page(self, response):
        """Extract title, author, author link and inline image URLs."""
        return {
            "url": response.url,
            "title": response.doc("#content h1").text(),
            "author": response.doc(".topic-content .from a").text(),
            "author_url": response.doc("DIV.topic-doc>H3>SPAN.from>A").attr.href,
            "imgs": [x.attr.src for x in response.doc('.topic-doc img').items()]
        }

    def on_result(self, result):
        """Forward every result that carries images to the duoshuo import API."""
        # .get() avoids a KeyError on results produced by other callbacks
        # that lack an 'imgs' key.
        if not result or not result.get('imgs'):
            return
        # The topic id is the digit run in the topic URL.
        post_id = re.search(r"\d+", self.response.url).group(0)
        # SECURITY: the duoshuo API secret is hard-coded below; move it to
        # configuration before reusing this script.
        self.crawl("https://api.duoshuo.com/posts/import.json#" + post_id, method="POST",
                   data={
                       "short_name": "database",
                       "secret": "8e5a5be8873ad7e9a59147c3cfd10e73",
                       "posts[0][post_key]": post_id,
                       "posts[0][thread_key]": "haixiuzu",
                       # Python 2 base64 codec; strip newlines so the
                       # payload is a single line.
                       "posts[0][message]": json.dumps(result).encode("base64").replace("\n", "")
                   }, callback=self.post_to_duoshuo)

    def post_to_duoshuo(self, response=None):
        # BUG FIX: pyspider invokes crawl callbacks with the fetched
        # response; the original zero-argument signature raised TypeError
        # on every callback. The default keeps direct calls compatible.
        pass
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 16:23:29
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl huodongxing.com event listing pages and extract one record
    per event detail page."""

    # Number of listing pages seeded up-front (was hard-coded as range(1, 34)).
    LIST_PAGES = 33

    def on_start(self):
        for page in range(1, self.LIST_PAGES + 1):
            self.crawl('http://www.huodongxing.com/events?type=0&show=list&d=t1&page=%d' % page, callback=self.index_page)

    @every(60)
    def cronjob(self):
        # Periodic re-seed of the listing pages.
        self.on_start()

    @config(age=60*60)
    def index_page(self, response):
        """Queue each event's detail page from the listing."""
        for each in response.doc('#container > div.article > ul > li > h3 > a').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(age=24*60*60)
    def detail_page(self, response):
        """Extract title, venue, time and ticket prices from one event page."""
        # Hoisted: the time element is queried once instead of twice.
        time_node = response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV>em.icon-time")
        # Ticket data is embedded in the page as a JS literal:
        # "var eventTicketsJson = ...;"
        tickets_match = re.search(r'var eventTicketsJson\s*=\s*([^;]+)', response.text)
        return {
            "url": response.url,
            "title": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>H2.media-heading").text(),
            "place": response.doc("HTML>BODY>DIV#container>DIV.jumbotron.media>DIV.media-body>DIV.address>A").text(),
            "time": [time_node.parents()[-1].text_content().strip()] if time_node else None,
            # BUG FIX: guard the regex miss — the original called .group(1)
            # unconditionally and raised AttributeError on pages without
            # the eventTicketsJson variable.
            "price": json.loads(tickets_match.group(1)) if tickets_match else None,
        }
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-09-10 17:13:25
import os
import re
import json
import datetime
from libs.pprint import pprint
from libs.base_handler import *
class Handler(BaseHandler):
    """Crawl the 228.com.cn "this week" JSON listing and emit one message
    per product; messages pass through on_message as results."""

    def on_start(self):
        # Seed with the first page of the weekly listing API.
        self.crawl('http://www.228.com.cn/s/thisWeek-update/?j=1&p=1',
                   callback=self.index_page)

    @every(minutes=10)
    def on_cron_job(self):
        # Periodic re-seed of the first listing page.
        self.on_start()

    @config(age=10*60)
    def index_page(self, response):
        """Fan out to the remaining pages, then emit every product listed
        on this page as a project message."""
        payload = response.json
        for page_no in range(2, payload['pageSize'] + 1):
            self.crawl('http://www.228.com.cn/s/thisWeek-update/?j=1&p=%d' % page_no,
                       callback=self.index_page)
        for idx, product in enumerate(payload['products']):
            message = {
                "title": product['name'],
                "ccid": product['typeaname'],
                'mcid': product['typebname'],
                'city': product['cityname'],
                'place': product['vname'],
                'time': product['likedata'].split(' '),
                'price': product['prices'].split(','),
                'url': "http://www.228.com.cn/ticket-%s.html" % product['productid'],
            }
            # The fragment makes each message's url unique per product.
            self.send_message(self.project_name, message, url="%s#%s" % (response.url, idx))

    def on_message(self, project, msg):
        # Messages are returned unchanged so they become results.
        return msg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment