Last active
August 29, 2015 14:13
-
-
Save tomotaka/68d194738f799fd70642 to your computer and use it in GitHub Desktop.
concurrent crawler using gevent
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging

import gevent
import gevent.queue as gq
class ConcurrentCrawler(object):
    """Run a crawl function over queued queries with a pool of gevent greenlets.

    Queries handed to :meth:`crawl` are put on a bounded queue and consumed by
    ``concurrency`` worker greenlets, each of which calls ``func(query_obj)``.

    Typical use::

        crawler = ConcurrentCrawler(func=fetch, concurrency=20)
        crawler.start_workers()
        for query in queries:
            crawler.crawl(query)
        crawler.join()
    """

    def __init__(self, func=crawl_all, concurrency=10, q_max=100):
        """Store the configuration and create the (still idle) work queue.

        Args:
            func: callable invoked by a worker as ``func(query_obj)`` for every
                queued query.
            concurrency: number of worker greenlets that ``start_workers``
                spawns.
            q_max: ``maxsize`` of the underlying gevent queue.
        """
        self._func = func
        self._concurrency = concurrency
        self._q_max = q_max
        self._queue = gq.Queue(maxsize=self._q_max)
        # None while stopped; the list of worker greenlets while running.
        self._workers = None

    def crawl(self, query_obj):
        """Queue *query_obj* to be passed to the crawl function by a worker.

        The queue is bounded by ``q_max``, so this call waits for a free slot
        when the queue is full.
        """
        # Queue entries are (payload, is_shutdown) pairs.
        self._queue.put((query_obj, False))

    def start_workers(self):
        """Spawn ``concurrency`` worker greenlets.

        Raises:
            AssertionError: if the workers have already been started and not
                yet joined.
        """
        if self._workers is not None:
            # Raised explicitly (not via ``assert``) so that the check is not
            # stripped when Python runs with -O.
            raise AssertionError('workers are already started')
        self._workers = [
            gevent.spawn(self._run_worker, worker_number)
            for worker_number in range(self._concurrency)
        ]
        logging.debug('[ConcurrentCrawler] started %d crawling workers',
                      self._concurrency)

    def _run_worker(self, worker_number):
        """Worker loop: crawl queued queries until a shutdown signal arrives."""
        logging.debug('[crawl-worker:%d] started', worker_number)
        while True:
            query_obj, is_shutdown = self._queue.get()
            if is_shutdown:
                break  # received shutdown signal
            try:
                self._func(query_obj)  # crawl
            except Exception:
                # Log and carry on. Re-raising here would kill this greenlet,
                # permanently shrinking the pool; once every worker has died
                # nothing drains the queue and crawl()/join() block forever.
                # Only Exception is caught, so GreenletExit and
                # KeyboardInterrupt still terminate the worker.
                logging.exception('[crawl-worker:%d] something bad happened',
                                  worker_number)

    def join(self):
        """Signal every worker to stop and wait until all of them have exited.

        Queries queued before this call are processed first, because the
        shutdown signals are queued behind them. Does nothing when the workers
        are not running. Afterwards ``start_workers`` may be called again.
        """
        workers = self._workers
        if workers is None:
            # Nothing to stop; queueing signals here would leave stale
            # shutdown entries for a later start_workers() to trip over.
            return
        # send finish signal: one per worker, each worker consumes exactly one
        for _ in workers:
            self._queue.put((None, True))
        # join all workers
        for worker in workers:
            worker.join()
        self._workers = None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment