Web spider to check site URL consistency built on top of Tornado and Toro
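Example invocation (a sketch, assuming the script is saved as spider.py; the URL and flag values here are arbitrary):

    python spider.py --url=http://www.tornadoweb.org/en/stable/ --concurrency=20 --logging=debug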
# coding: utf-8
import HTMLParser
import time
import urlparse
from datetime import timedelta
from tornado import httpclient, gen, ioloop
import tornado.options
from tornado.options import define, options
from tornado.httpclient import HTTPError
import toro
import logging
logger = logging.getLogger('spider')

define("url", default="", help="Base URL", type=str)
define("concurrency", default=10, help="Concurrency", type=int)
define("timeout", default=300, help="Timeout in seconds", type=int)
define("encoding", default="utf-8", help="Encoding of documents", type=str)


@gen.coroutine
def spider(base_url, concurrency, timeout, encoding):
    q = toro.JoinableQueue()
    sem = toro.BoundedSemaphore(concurrency)

    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            # Skip URLs another subtask is already fetching.
            if current_url in fetching:
                return

            logger.debug("fetching: {0}".format(current_url))
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url, encoding)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            # Mark the item done even for duplicates, so q.join()
            # completes once every enqueued URL has been handled.
            q.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            # The bounded semaphore caps concurrent fetches.
            yield sem.acquire()
            # Launch a subtask
            fetch_url()

    q.put(base_url)

    # Start worker, then wait for the work queue to be empty.
    worker()
    yield q.join(deadline=timedelta(seconds=timeout))
    assert fetching == fetched
    print 'Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched))


@gen.coroutine
def get_links_from_url(url, encoding):
    """Download the page at `url` and parse it for links. Returned links have
    had the fragment after `#` removed, and have been made absolute so, e.g.
    the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        logger.debug("fetched: {0}".format(url))
        urls = [urlparse.urljoin(url, remove_fragment(new_url))
                for new_url in get_links(url, response.body, encoding)]
    except HTTPError as err:
        logger.error("HTTP {0}: {1}".format(err.code, url))
        raise gen.Return([])
    except Exception as e:
        logger.exception(e)
        raise gen.Return([])

    raise gen.Return(urls)


def remove_fragment(url):
    # urlparse returns (scheme, netloc, path, params, query, fragment);
    # rebuild the URL with an empty fragment.
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    return urlparse.urlunparse((scheme, netloc, path, params, query, ''))


def get_links(url, html, encoding):
    class URLSeeker(HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    try:
        decoded_html = html.decode(encoding)
    except UnicodeDecodeError:
        logger.error("Encoding error: {0}".format(url))
        return []
    url_seeker.feed(decoded_html)
    return url_seeker.urls


def stop(fut):
    loop.stop()
    # Re-raise any exception the spider coroutine terminated with.
    fut.result()


if __name__ == '__main__':
    tornado.options.parse_command_line()
    logger.setLevel(getattr(logging, options.logging.upper(), logging.INFO))
    loop = ioloop.IOLoop.current()
    future = spider(options.url, options.concurrency, options.timeout, options.encoding)
    future.add_done_callback(stop)
    try:
        loop.start()
    except KeyboardInterrupt:
        pass
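
Note: this script targets Python 2 and the standalone toro package. Toro's synchronization primitives were merged into Tornado itself in version 4.2, so on a modern Tornado the same pattern would use tornado.queues and tornado.locks. A minimal sketch of the substitution (everything else above stays the same):

    # Tornado >= 4.2 equivalents of the toro primitives used above:
    from tornado.queues import Queue          # has put/get/task_done/join
    from tornado.locks import BoundedSemaphore

    q = Queue()
    sem = BoundedSemaphore(concurrency)
    # toro's q.join(deadline=...) becomes q.join(timeout=...)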