Skip to content

Instantly share code, notes, and snippets.

@tomotaka
Last active August 29, 2015 14:13
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tomotaka/68d194738f799fd70642 to your computer and use it in GitHub Desktop.
concurrent crawler using gevent
import logging

import gevent
import gevent.queue as gq
class ConcurrentCrawler(object):
    """Crawl query objects concurrently with a pool of gevent worker greenlets.

    Producers push work through a bounded queue via :meth:`crawl` (which
    blocks when the queue is full, giving natural back-pressure); each
    worker pops items and hands them to ``func``.  :meth:`join` sends one
    shutdown marker per worker and waits for every greenlet to exit.
    """

    def __init__(self, func=None, concurrency=10, q_max=100):
        """Create the crawler (workers are not started until start_workers()).

        :param func: callable invoked as ``func(query_obj)`` for each item;
            defaults to the module-level ``crawl_all``.
        :param concurrency: number of worker greenlets to spawn.
        :param q_max: maximum queue size before ``crawl`` blocks.
        """
        # Resolve the default lazily: evaluating ``crawl_all`` in the
        # signature would raise NameError at class-definition time if
        # crawl_all is declared later in the module.
        self._func = crawl_all if func is None else func
        self._concurrency = concurrency
        self._q_max = q_max
        self._queue = gq.Queue(maxsize=self._q_max)
        self._workers = None  # becomes a list of greenlets in start_workers()

    def crawl(self, query_obj):
        """Enqueue one query object; blocks while the queue is full."""
        # Second element is the shutdown flag: False = normal work item.
        self._queue.put([query_obj, False])

    def start_workers(self):
        """Spawn the worker greenlets.  Must be called exactly once."""
        assert self._workers is None

        def _worker(_self, _worker_number):
            queue = _self._queue
            logging.debug('[crawl-worker:%d] started', _worker_number)
            while True:
                query_obj, is_shutdown = queue.get()
                if is_shutdown:
                    break  # received shutdown signal
                try:
                    _self._func(query_obj)  # crawl
                except Exception:
                    # Narrowed from a bare except so BaseException-derived
                    # signals (GreenletExit, KeyboardInterrupt) are not
                    # mislogged as crawl errors.  Still re-raised: a failing
                    # worker dies, matching the original behavior.
                    logging.exception('[crawl-worker:%d] something bad happened',
                                      _worker_number)
                    raise

        workers = []
        for i in range(self._concurrency):
            workers.append(gevent.spawn(_worker, self, i))
        self._workers = workers
        logging.debug('[ConcurrentCrawler] started %d crawling workers',
                      self._concurrency)

    def join(self):
        """Signal shutdown to every worker and wait until all have exited."""
        assert self._workers is not None  # start_workers() must run first
        # One shutdown marker per worker; each worker consumes exactly one
        # and breaks out of its loop.
        for _ in range(self._concurrency):
            self._queue.put([None, True])
        # join all workers
        for worker in self._workers:
            worker.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment