@jkal
Created May 12, 2011 11:43
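Two versions of a minimal concurrent web crawler in Python 2. Each fetches a page, extracts its links with BeautifulSoup, and spawns a green thread for every URL it has not seen yet. The first uses eventlet; the second uses gevent.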
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

import eventlet
from eventlet.green import urllib2  # cooperative (green) drop-in for urllib2
from BeautifulSoup import BeautifulSoup as bs
import urlparse


class Crawler(object):
    def __init__(self, start_url, callback):
        self.start_url = start_url
        self.callback = callback

    def _fetch(self, url, seen, pool):
        print 'fetching', url
        data = ''
        # 2 second timeout; the False argument suppresses the Timeout
        # exception, leaving data empty if the request is too slow.
        with eventlet.Timeout(2, False):
            data = urllib2.urlopen(url).read()
        self.callback(data)
        # Extract every link and schedule unseen URLs on the pool.
        for link in bs(data).findAll('a', href=True):
            url_new = urlparse.urljoin(url, link['href'])
            if url_new not in seen:
                seen.add(url_new)
                pool.spawn_n(self._fetch, url_new, seen, pool)

    def crawl(self):
        pool = eventlet.GreenPool()
        seen = set()
        self._fetch(self.start_url, seen, pool)
        pool.waitall()  # block until every spawned green thread finishes
        return seen


def my_cb(data):
    # do something with the data
    print data
    sys.stdout.flush()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        c = Crawler(sys.argv[1], my_cb)
    else:
        c = Crawler('http://localhost:8000', my_cb)
    seen = c.crawl()
    print len(seen)
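The gevent port of the same crawler. Instead of importing a green urllib2 from the framework, it calls monkey.patch_all() so the standard urllib2 module becomes cooperative, and it waits for outstanding greenlets with pool.join().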
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys

from BeautifulSoup import BeautifulSoup as bs
import urlparse
import urllib2

try:
    import gevent
    import gevent.pool
    from gevent import monkey
    # Patch the standard library so plain urllib2 becomes cooperative.
    monkey.patch_all()
except ImportError:
    sys.exit('Gevent library is required: http://www.gevent.org/')


class Crawler(object):
    def __init__(self, start_url, callback):
        self.start_url = start_url
        self.callback = callback

    def _fetch(self, url, seen, pool):
        print 'fetching', url
        sys.stdout.flush()
        data = ''
        # 2 second timeout; the False argument suppresses the Timeout
        # exception, leaving data empty if the request is too slow.
        with gevent.Timeout(2, False):
            data = urllib2.urlopen(url).read()
        self.callback(data)
        # Extract every link and schedule unseen URLs on the pool.
        for link in bs(data).findAll('a', href=True):
            url_new = urlparse.urljoin(url, link['href'])
            if url_new not in seen:
                seen.add(url_new)
                pool.spawn(self._fetch, url_new, seen, pool)

    def crawl(self):
        pool = gevent.pool.Pool()
        seen = set()
        self._fetch(self.start_url, seen, pool)
        # join() blocks until every spawned greenlet finishes;
        # wait_available() would only wait for a free pool slot.
        pool.join()
        return seen


def my_cb(data):
    # do something with the data
    print len(data)
    sys.stdout.flush()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        c = Crawler(sys.argv[1], my_cb)
    else:
        c = Crawler('http://localhost:8000', my_cb)
    seen = c.crawl()
    print len(seen)
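Both scripts create an unbounded pool, so a link-heavy page can fire off a large burst of simultaneous requests. A minimal sketch of bounding concurrency, assuming the same Crawler class as above; the limit of 20 is an arbitrary choice:

# Bounded variant: at most 20 requests in flight at once.
# gevent's Pool accepts a size; spawn() blocks while all slots are busy.
import gevent.pool
pool = gevent.pool.Pool(20)

# The eventlet equivalent takes a size argument as well:
# import eventlet
# pool = eventlet.GreenPool(20)

With a bound in place, crawl() still returns the full seen set; the pool simply throttles how many fetches run at a time.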