Skip to content

Instantly share code, notes, and snippets.

@cuimuxi
Created September 14, 2012 02:54
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save cuimuxi/3719535 to your computer and use it in GitHub Desktop.
gevent crawler test
from gcrawler import GCrawler, Downloader
import unittest
import urllib2
import logging
import traceback
from datetime import datetime
import re
# Verbose logging so the crawler's progress is visible during the test run.
logging.basicConfig(level=logging.DEBUG)

# Portal front pages used as fetch targets by the test spiders below.
urls = [
    'http://www.163.com',
    'http://www.qq.com',
    'http://www.sina.com.cn',
    'http://www.sohu.com',
    'http://www.yahoo.com',
    'http://www.baidu.com',
    'http://www.google.com',
    'http://www.microsoft.com',
]
class DummySpider:
    """Minimal spider: fetches each URL in the module-level ``urls`` list
    and reports the size of every page downloaded."""

    def scheduler(self):
        """Yield every target URL; GCrawler pulls work items from here."""
        for u in urls:
            yield u

    def worker(self, item):
        """Download one URL.

        Returns ``(url, byte_count)`` on success, or ``None`` on any fetch
        error (best-effort: the error is logged, never propagated).
        """
        try:
            f = urllib2.urlopen(item)
            try:
                data = f.read()
            finally:
                # Always release the connection, even if read() raises.
                f.close()
            return (item, len(data))
        except Exception:
            # Narrowed from a bare ``except:`` so SystemExit and
            # KeyboardInterrupt still propagate; fetch failures stay
            # best-effort, matching the original intent.
            traceback.print_exc()
            return None

    def pipeline(self, results):
        """Print a summary line for each successful download."""
        for r in results:
            print("Downloaded : %s(%s)" % r)
class DummyDownloader(Downloader):
    """Downloader variant that annotates each result with its payload size."""

    def pipeline(self, results):
        # Attach the byte count to every result dict, then report it.
        for result in results:
            result['datasize'] = len(result['data'])
            print("Data fetched : %(url)s(%(datasize)s)" % result)
class ImagesDownloader(Downloader):
    """Downloader that extracts <img> sources from a fetched page and
    queues them as follow-up download items."""

    # Matches the src attribute inside a single <img> tag.  ``[^>]*``
    # (rather than the original greedy ``.*``) keeps the match inside one
    # tag, so a line holding several <img> tags yields every src instead
    # of only the last one.  Compiled once instead of on every parse call.
    _IMG_SRC_RE = re.compile(u'<img[^>]*src="([^"]*)', re.I | re.U)

    def parse(self, result):
        """Extract unique, non-empty image URLs from ``result['data']``.

        Returns ``(new_items, result)`` where ``new_items`` is a list of
        ``{'url': ..., 'parse': None}`` dicts for the crawler to fetch next.
        """
        imgs = list(set(self._IMG_SRC_RE.findall(result['data'])))
        print("Images count %s" % len(imgs))
        new_items = [dict(url=i, parse=None) for i in imgs if i != ""]
        return new_items, result

    def pipeline(self, results):
        """Timestamp the batch and report each image payload's size."""
        print("== %s" % datetime.now())
        for r in results:
            r['datasize'] = len(r['data'])
            print("Image Data fetched : %(url)s(%(datasize)s)" % r)
class TestGCrawler(unittest.TestCase):
    """Exercises GCrawler against the sample downloaders above.

    These are integration tests: they hit live web sites, so they need
    network access and their runtime varies with it.
    """

    # NOTE(review): the tests below are disabled via a bare string literal
    # (a no-op expression).  Left untouched to preserve behavior; consider
    # @unittest.skip(...) so the runner reports them as skipped.
    '''
    def testSpider(self):
        spider = DummySpider()
        crawler = GCrawler(spider, workers_count = 3)
        crawler.start()

    def testDownloader(self):
        spider = DummyDownloader(urls)
        crawler = GCrawler(spider, workers_count = 3)
        crawler.start()
    '''

    def testBatch(self):
        """Crawl one page and download every image it references."""
        print("start at : %s" % datetime.now())
        spider = ImagesDownloader(['http://f1.163.com'])
        crawler = GCrawler(spider, workers_count=3)
        crawler.start()


# Guard the entry point so merely importing this module no longer runs the
# whole suite as a side effect; ``python thisfile.py`` behaves as before.
if __name__ == '__main__':
    unittest.main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment