Skip to content

Instantly share code, notes, and snippets.

@venkat
Created May 30, 2014 22:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save venkat/8037fe12095f10f9dd5a to your computer and use it in GitHub Desktop.
Using gevent to crawl 100 links given a seed link, based off of http://blog.hownowstephen.com/post/50743415449/gevent-tutorial
# monkey-patch
import gevent.monkey
gevent.monkey.patch_all()
import gevent.pool
import gevent.queue
import sys
import re
import requests
# Prepare a pool for 5 workers and a messaging queue
pool = gevent.pool.Pool(5)          # at most 5 crawler greenlets run at once
queue = gevent.queue.Queue()        # URLs waiting to be fetched
crawled = 0      # total links enqueued so far; crawler() stops adding at 100
crawler_id = 0   # monotonically increasing id handed to each spawned worker
def crawler(_id):
'''A very simple queued gevent web crawler'''
print 'starting crawler...', _id
global crawled
while 1:
try:
#print 'queue size', queue.qsize()
u = queue.get(timeout=0)
response = requests.get(u)
print _id, crawled, response.status_code, u
# Extract some links to follow
for link in re.findall('<a href="(http.*?)"', response.content):
# Limit to 10 pages (ignores links when the pool is already full)
if crawled < 100:
crawled += 1
queue.put(link)
except gevent.queue.Empty:
print 'queue empty', _id
break
print 'stopping crawler...', _id
queue.put(sys.argv[1])
pool.spawn(crawler, crawler_id)
crawler_id += 1
while 1:
if queue.empty() and pool.free_count() == 5:
print 'no more links left and nothing running', crawler_id
break
#print 'qsize', queue.qsize(), 'free count', pool.free_count()
for x in xrange(0, min(queue.qsize(), pool.free_count())):
print 'spawning'
pool.spawn(crawler, crawler_id)
crawler_id += 1
gevent.sleep(0.1)
#print 'qsize', queue.qsize(), 'free count', pool.free_count()
# Wait for everything to complete
pool.join()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment