Created May 12, 2011 11:43
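Two takes on a small recursive link crawler built on BeautifulSoup: one on eventlet, one on gevent. The eventlet version comes first; it pulls a cooperative urllib2 from eventlet.green instead of patching the standard library.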
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import urlparse

import eventlet
from eventlet.green import urllib2
from BeautifulSoup import BeautifulSoup as bs


class Crawler(object):

    def __init__(self, start_url, callback):
        self.start_url = start_url
        self.callback = callback

    def _fetch(self, url, seen, pool):
        print 'fetching', url
        data = ''
        # 2 sec timeout; data stays '' if the fetch is cut off
        with eventlet.Timeout(2, False):
            data = urllib2.urlopen(url).read()
        self.callback(data)
        # spawn a fetch for every link we have not seen yet
        for link in bs(data).findAll('a', href=True):
            url_new = urlparse.urljoin(url, link['href'])
            if url_new not in seen:
                seen.add(url_new)
                pool.spawn_n(self._fetch, url_new, seen, pool)

    def crawl(self):
        pool = eventlet.GreenPool()  # defaults to 1000 green threads
        seen = set()
        self._fetch(self.start_url, seen, pool)
        pool.waitall()  # block until every spawned fetch has finished
        return seen


def my_cb(data):
    # do something with the data
    print data
    sys.stdout.flush()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        c = Crawler(sys.argv[1], my_cb)
    else:
        c = Crawler('http://localhost:8000', my_cb)
    seen = c.crawl()
    print len(seen)
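A minimal usage sketch for the class above; the target URL and the byte-counting callback are illustrative, not part of the gist:

def count_bytes(data):
    print len(data)

c = Crawler('http://example.com/', count_bytes)
urls = c.crawl()
print len(urls), 'unique URLs discovered'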
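The same crawler ported to gevent. Here nothing green is imported directly for I/O: monkey.patch_all() rewires the standard library so the plain urllib2 cooperates with the event loop.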
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import urlparse
import urllib2

from BeautifulSoup import BeautifulSoup as bs

try:
    import gevent
    import gevent.pool
    from gevent import monkey
    monkey.patch_all()  # makes the stdlib, urllib2 included, cooperative
except ImportError:
    sys.exit('Gevent library is required: http://www.gevent.org/')


class Crawler(object):

    def __init__(self, start_url, callback):
        self.start_url = start_url
        self.callback = callback

    def _fetch(self, url, seen, pool):
        print 'fetching', url
        sys.stdout.flush()
        data = ''
        # 2 sec timeout; data stays '' if the fetch is cut off
        with gevent.Timeout(2, False):
            data = urllib2.urlopen(url).read()
        self.callback(data)
        # spawn a fetch for every link we have not seen yet
        for link in bs(data).findAll('a', href=True):
            url_new = urlparse.urljoin(url, link['href'])
            if url_new not in seen:
                seen.add(url_new)
                pool.spawn(self._fetch, url_new, seen, pool)

    def crawl(self):
        pool = gevent.pool.Pool()
        seen = set()
        self._fetch(self.start_url, seen, pool)
        pool.join()  # block until every spawned fetch has finished
        return seen


def my_cb(data):
    print len(data)
    sys.stdout.flush()


if __name__ == '__main__':
    if len(sys.argv) > 1:
        c = Crawler(sys.argv[1], my_cb)
    else:
        c = Crawler('http://localhost:8000', my_cb)
    seen = c.crawl()
    print len(seen)
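A note on the pool: an unbounded gevent Pool never blocks spawn(), wait_available() returns immediately, and free_count() always reports a free slot, so join() is the call that actually waits for outstanding greenlets. Giving the Pool a size caps concurrency and makes those calls meaningful. A minimal sketch, with an arbitrary size of 20 and a toy task standing in for a page fetch:

import gevent
import gevent.pool

def task(n):
    gevent.sleep(0.01)  # stand-in for a network round trip
    return n * n

pool = gevent.pool.Pool(20)  # at most 20 greenlets run at once
for i in range(100):
    pool.spawn(task, i)      # blocks while all 20 slots are taken
pool.join()                  # wait for the stragglers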