A web spider that checks site URL consistency, built on top of Tornado and Toro.
# coding: utf-8
import HTMLParser
import time
import urlparse
from datetime import timedelta
from tornado import httpclient, gen, ioloop
import tornado.options
from tornado.options import define, options
from tornado.httpclient import HTTPError
import toro
import logging

logger = logging.getLogger('spider')

define(
    "url", default="", help="Base URL", type=str
)
define(
    "concurrency", default=10, help="Concurrency", type=int
)
define(
    "timeout", default=300, help="Timeout in seconds", type=int
)
define(
    "encoding", default="utf-8", help="Encoding of documents", type=str
)


@gen.coroutine
def spider(base_url, concurrency, timeout, encoding):
    # Work queue of URLs to visit and a semaphore capping concurrent fetches.
    q = toro.JoinableQueue()
    sem = toro.BoundedSemaphore(concurrency)

    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        # Pull one URL off the queue, fetch it, and enqueue any new in-site links.
        current_url = yield q.get()
        try:
            if current_url in fetching:
                return

            logger.debug("fetching: {0}".format(current_url))
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url, encoding)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            q.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            yield sem.acquire()
            # Launch a subtask
            fetch_url()

    q.put(base_url)

    # Start worker, then wait for the work queue to be empty.
    worker()
    yield q.join(deadline=timedelta(seconds=timeout))
    assert fetching == fetched
    print 'Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched))


@gen.coroutine
def get_links_from_url(url, encoding):
    """Download the page at `url` and parse it for links. Returned links have
    had the fragment after `#` removed, and have been made absolute so, e.g.
    the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        logger.debug("fetched: {0}".format(url))
        urls = [urlparse.urljoin(url, remove_fragment(new_url))
                for new_url in get_links(url, response.body, encoding)]
    except HTTPError as err:
        logger.error("HTTP {0}: {1}".format(err.code, url))
        raise gen.Return([])
    except Exception as e:
        logger.exception(e)
        raise gen.Return([])

    raise gen.Return(urls)


def remove_fragment(url):
    # Strip the '#fragment' part of the URL, keeping everything else intact.
    scheme, netloc, url, params, query, fragment = urlparse.urlparse(url)
    return urlparse.urlunparse((scheme, netloc, url, params, query, ''))


def get_links(url, html, encoding):
    # Collect href values from all <a> tags in the document.
    class URLSeeker(HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    try:
        decoded_html = html.decode(encoding)
    except UnicodeDecodeError:
        logger.error("Encoding error: {0}".format(url))
        return []
    url_seeker.feed(decoded_html)
    return url_seeker.urls


def stop(fut):
    # Stop the IOLoop and re-raise any exception raised inside the spider.
    loop.stop()
    fut.result()


if __name__ == '__main__':
    tornado.options.parse_command_line()
    logger.setLevel(getattr(logging, options.logging.upper(), 'INFO'))

    loop = ioloop.IOLoop.current()
    future = spider(options.url, options.concurrency, options.timeout, options.encoding)
    future.add_done_callback(stop)
    try:
        loop.start()
    except KeyboardInterrupt:
        pass
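The script is driven entirely by the tornado.options flags defined at the top (--url, --concurrency, --timeout, --encoding), plus Tornado's built-in --logging switch used in the __main__ block. A rough usage sketch, assuming the file is saved as spider.py (a name the gist itself does not specify):

    python spider.py --url=http://www.tornadoweb.org/en/stable/ --concurrency=10 --logging=debug

Because new links are only followed when they start with the base URL (the new_url.startswith(base_url) check), the exact form of --url, including any trailing slash, determines which pages get crawled.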