@gurisko
Created August 21, 2015 08:33
Simple crawler written in Python
# config.py (imported by the crawler below as "config")
# default starting URL
seed = "http://opera.com/"
# number of worker greenlets
workers = 20
# maximum number of discovered URLs, or None for no limit
limit = 10000
# crawler name, used for the log file name
name = 'crawlerlog'
# allowed URL schemes
allowed_schemes = ['http', 'https']
# HTML tags whose listed attribute may hold a URL to follow
urltags = {'a': 'href', 'form': 'action', 'frame': 'src', 'iframe': 'src'}

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Simple web crawler built on gevent (settings live in config.py above)."""
import urllib2
import urlparse
import logging
from robotparser import RobotFileParser
from HTMLParser import HTMLParser
from gevent import Timeout, Greenlet
import gevent.pool as GeventPool
import gevent.queue as GeventQueue
import gevent.event as GeventEvent
import gevent.monkey
import config

gevent.monkey.patch_all()
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
                    filename=config.name + '.log', level=logging.INFO)
log = logging.getLogger()

# URLs seen so far (shared by all greenlets)
discovered = set()
# cached robots.txt parsers, keyed by domain
permissions = {}


class LogException(Exception):
    def __init__(self, val, url):
        self.value = val
        self.url = url

    def __str__(self):
        return self.value + ': ' + self.url


class ProcessPage(HTMLParser):
    """HTML parser that pushes newly discovered links onto the frontier queue."""

    def __init__(self, url, queue):
        HTMLParser.__init__(self)
        self.url = url
        self.queue = queue
        log.info('Parsing ' + self.url)

    def normalize_url(self, url):
        # resolve relative links against the current page and drop fragments
        url = urlparse.urljoin(self.url, url)
        url = urlparse.urldefrag(url)[0].lower()
        return url

    def can_follow(self, attrs):
        # honour <meta name="robots" content="nofollow">
        attrs = dict(attrs)
        if attrs.get('name') == 'robots' and 'content' in attrs:
            content = [x.strip() for x in attrs['content'].split(',')]
            if 'nofollow' in content and 'follow' not in content:
                return False
        return True

    def has_allowed_scheme(self, url):
        return urlparse.urlparse(url).scheme in config.allowed_schemes

    def handle_starttag(self, tag, attrs):
        if tag == 'meta' and not self.can_follow(attrs):
            raise LogException('Nofollow', self.url)
        if tag not in config.urltags:
            return
        attrs = dict(attrs)
        if config.urltags[tag] not in attrs:
            return
        url = self.normalize_url(attrs[config.urltags[tag]])
        if self.has_allowed_scheme(url) and url not in discovered:
            log.info(self.url + ' -> ' + url)
            discovered.add(url)
            self.queue.put(url)


class Extractor(Greenlet):
    """Fetches a single URL and feeds its HTML to ProcessPage."""

    def __init__(self, url, queue):
        Greenlet.__init__(self)
        self.url = url
        self.queue = queue

    def can_read(self):
        # consult robots.txt, caching one parser per domain
        domain = urlparse.urlparse(self.url).netloc
        robot_url = urlparse.urljoin('http://' + domain, 'robots.txt')
        try:
            if domain not in permissions:
                robot = RobotFileParser()
                robot.set_url(robot_url)
                robot.read()
                permissions[domain] = robot
            res = permissions[domain].can_fetch('*', self.url)
        except Exception:
            raise LogException('RobotError', robot_url)
        return res

    def get_source_code(self):
        if not self.can_read():
            raise LogException('ProtectedAddress', self.url)
        try:
            opener = urllib2.urlopen(self.url)
        except urllib2.HTTPError:
            raise LogException('HTTPError', self.url)
        except urllib2.URLError:
            raise LogException('URLError', self.url)
        opener_type = opener.info().gettype()
        if opener_type != "text/html":
            raise LogException('InvalidFormatException', self.url)
        content = opener.read().decode('ascii', 'ignore')
        return content

    def extract_links(self):
        # give each page at most ten seconds to download and parse
        with Timeout(10):
            data = self.get_source_code()
            if not data:
                return set()
            parser = ProcessPage(self.url, self.queue)
            parser.feed(data)
            parser.close()


class Crawler(object):
    def __init__(self):
        self.pool = GeventPool.Pool(config.workers)
        self.frontier = GeventQueue.Queue()
        self.flag_finished = GeventEvent.Event()
        self.start()

    def start(self):
        discovered.add(config.seed)
        self.frontier.put(config.seed)
        gevent.spawn(self.scheduler).join()

    def scheduler(self):
        url = None
        while True:
            try:
                url = self.frontier.get_nowait()
            except GeventQueue.Empty:
                if self.pool.free_count() != self.pool.size:
                    # workers still running: wait for one to finish,
                    # then check the frontier again
                    self.flag_finished.wait()
                    self.flag_finished.clear()
                else:
                    # frontier empty and no worker running: crawl is done
                    self.pool.join()
                    return
            if url is not None and (config.limit is None or
                                    len(discovered) <= config.limit):
                self.pool.spawn(self.worker, url)
            url = None

    def worker(self, url):
        try:
            Extractor(url, self.frontier).extract_links()
        except LogException as ex:
            log.warn(ex)
        except Timeout as t:
            log.info('Timeout: ' + str(t))
        self.flag_finished.set()


def main():
    log.info('[-- Hello, world! --]')
    try:
        print '[-- Crawling in progress --]'
        Crawler()
    except KeyboardInterrupt:
        log.info('KeyboardInterrupt')
    except Exception as ex:
        log.exception(ex)
    finally:
        log.info('[-- Goodbye, world! --]\n')


if __name__ == "__main__":
    main()
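
A minimal usage sketch (not part of the gist), assuming the script above is saved as crawler.py next to config.py; the module name crawler and the overridden values below are illustrative assumptions. Because the crawler reads its settings from the shared config module at run time, a small Python 2 driver script could narrow the crawl before starting it:

# run_small_crawl.py -- hypothetical driver script, not part of the gist
import config

# override the defaults from config.py before the crawl starts
config.seed = 'http://example.com/'
config.limit = 100
config.workers = 5

import crawler   # assumed filename of the crawler script above
crawler.main()   # blocks until the frontier is exhausted (or Ctrl+C)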