Skip to content

Instantly share code, notes, and snippets.

@lloydzhou
Last active August 28, 2015 09:03
Show Gist options
  • Save lloydzhou/4fe51b6da798e90ca2c1 to your computer and use it in GitHub Desktop.
Save lloydzhou/4fe51b6da798e90ca2c1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding=utf-8
import bs4
import tornado.ioloop
import tornado.options
from tornado.httpclient import AsyncHTTPClient
from adesk.db import ipai_mongo
import datetime
import os
# Collection handle taken from the project's ipai_mongo module;
# BudejieSpider.handle_duanzi inserts one document per scraped post into it.
duanzi = ipai_mongo.aipai['duanzi1']
class Queue(object):
    """Throttle HTTP fetches so that at most ``max`` run at the same time.

    Requests that cannot start immediately wait in ``self.queue`` and are
    started in the order they were added (FIFO).  When the last in-flight
    request finishes and nothing is waiting, the HTTP client is closed and
    the IOLoop is stopped.
    """

    def __init__(self, max=1):
        """Create an empty queue.

        ``max`` is the concurrency limit; values below 1 are raised to 1.
        The parameter name shadows the builtin but is kept so existing
        callers (positional or keyword) keep working.
        """
        self.client = AsyncHTTPClient()
        self.queue = []
        self.max = 1 if max < 1 else max
        self.running = 0

    def add(self, request, handler):
        """Enqueue ``request`` and start it right away if a slot is free.

        ``handler`` is passed to the client as the fetch callback; it is
        expected to call ``done()`` once it has processed the response.
        """
        self.queue.append((request, handler))
        self.next()

    def done(self):
        """Release one slot; start the next request or shut everything down."""
        self.running -= 1
        if not self.queue and self.running < 1:
            # Nothing waiting and nothing in flight: the crawl is finished.
            self.client.close()
            tornado.ioloop.IOLoop.instance().stop()
        else:
            self.next()

    def next(self):
        """Start the oldest waiting request if the concurrency limit allows."""
        if self.running < self.max and self.queue:
            self.running += 1
            # pop(0), not pop(): serve requests first-in first-out so early
            # requests are not starved by ones added later.
            request, handler = self.queue.pop(0)
            # Single-argument form prints the same text on Python 2 and
            # also parses on Python 3.
            print("%s %s" % (request, self.running))
            self.client.fetch(request, handler)
class Spider(object):
def __init__(self, queue, url, **kwargs):
self.request = tornado.httpclient.HTTPRequest(url, **kwargs)
self.queue = queue
def handler(self, response):
if response.error:
print "Error", response.error
else:
print "end", response
self.queue.done()
def run(self):
self.queue.add(self.request, self.handler)
class FileSpider(Spider):
def __init__(self, queue, url, name='out', **kwargs):
self.name = name
self.fh = open(self.name, 'w')
kwargs['streaming_callback'] = self.streaming_callback
Spider.__init__(self, queue, url, **kwargs)
def streaming_callback(self, data):
print "==== length of data: %d write to %s ====" % (len(data), self.name)
self.fh.write(data)
def handler(self, response):
self.fh.flush()
self.fh.close()
if response.error:
print "Error", response.error
os.unlink(self.name)
else:
print "end", response
self.queue.done()
class GridfsSpider(FileSpider):
def __init__(self, queue, url, name=None, **kwargs):
self.fh = ipai_mongo.filefs.new_file(filename=name)
kwargs['streaming_callback'] = self.streaming_callback
Spider.__init__(self, queue, url, **kwargs)
def streaming_callback(self, data):
print "==== length of data: %d write to %s ====" % (len(data), self.fh.name or str(self.fh._id))
self.fh.write(data)
def handler(self, response):
self.fh.close()
if response.error:
print "Error", response.error
ipai_mongo.filefs.delete(self.fh._id)
else:
print "end", response, self.fh._id
self.queue.done()
class BudejieSpider(Spider):
def __init__(self, queue, typ='video', count=1):
self.count = count
self.handle_response = getattr(self, 'handle_%s' % typ)
self.url = "http://www.budejie.com/%s" % typ
Spider.__init__(self, queue, self.url)
def handler(self, response):
if response.error:
print "Error", response.error
else:
# print "end", response
soup = bs4.BeautifulSoup(response.body)
posts = soup.select('.web_left')
for p in posts:
self.handle_response(p)
self.queue.done()
def run(self):
for i in range(self.count):
# print "start fetch", i
url = "%s/%d" % (self.url, i)
self.queue.add(tornado.httpclient.HTTPRequest(url), self.handler)
def handle_video(self, p):
user_info = p.select_one('.user_info')
avatar = user_info.select_one('img').attrs.get('src')
username = user_info.select_one('li.user_name').select_one('p').get_text()
atime = user_info.select_one('p.time').get_text()
content = p.select_one('p.web_size').get_text()
video_info = p.select_one('div.jp-video-player').attrs
video = video_info.get('data-src')
poster = video_info.get('data-poster')
other = p.select_one('.budejie_mutual')
up = int(other.select_one('.no_love').get_text())
down = int(other.select_one('.no_cai').get_text())
poster_spider = FileSpider(self.queue, poster)
poster_spider.run()
video_spider = FileSpider(self.queue, video)
video_spider.run()
print (avatar, username, content, video, poster, up, down, atime)
def handle_duanzi(self, p):
user_info = p.select_one('.user_info')
avatar = user_info.select_one('img').attrs.get('src')
username = user_info.select_one('li.user_name').select_one('p').get_text()
atime = user_info.select_one('p.time').get_text()
content = p.select_one('p.web_size').get_text()
other = p.select_one('.budejie_mutual')
up = int(other.select_one('.no_love').get_text())
down = int(other.select_one('.no_cai').get_text())
print duanzi.insert({'avatar': avatar, 'username': username, 'content': content, 'atime': datetime.datetime.now()})
print (avatar, username, content, up, down, atime)
if __name__ == "__main__":
    tornado.options.parse_command_line()

    # Allow at most four requests in flight at any time.
    download_queue = Queue(4)

    # Alternative jobs, kept for reference:
    #   BudejieSpider(download_queue, 'duanzi', 202).run()
    #   FileSpider(download_queue, 'http://wimg.spriteapp.cn/profile/large/2015/07/24/55b1db99bbbda_mini.jpg', 'out.jpg').run()
    job = GridfsSpider(
        download_queue,
        'http://wvideo.spriteapp.cn/video/2015/0810/55c81937974da_wpd.mp4',
        'out.mp4'
    )
    job.run()

    # Runs until the queue stops the loop after its last request completes.
    tornado.ioloop.IOLoop.instance().start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment