Web spider to check site URL consistency built on top of Tornado and Toro
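Example invocation (a sketch, assuming the script is saved as spider.py; the URL and flag values here are arbitrary):

    python spider.py --url=http://www.tornadoweb.org/en/stable/ --concurrency=20 --logging=debug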
# coding: utf-8
import HTMLParser
import time
import urlparse
from datetime import timedelta
from tornado import httpclient, gen, ioloop
import tornado.options
from tornado.options import define, options
from tornado.httpclient import HTTPError
import toro
import logging
logger = logging.getLogger('spider')

define("url", default="", help="Base URL", type=str)
define("concurrency", default=10, help="Concurrency", type=int)
define("timeout", default=300, help="Timeout in seconds", type=int)
define("encoding", default="utf-8", help="Encoding of documents", type=str)


@gen.coroutine
def spider(base_url, concurrency, timeout, encoding):
    q = toro.JoinableQueue()
    sem = toro.BoundedSemaphore(concurrency)

    start = time.time()
    fetching, fetched = set(), set()

    @gen.coroutine
    def fetch_url():
        current_url = yield q.get()
        try:
            # Skip URLs another subtask is already fetching.
            if current_url in fetching:
                return

            logger.debug("fetching: {0}".format(current_url))
            fetching.add(current_url)
            urls = yield get_links_from_url(current_url, encoding)
            fetched.add(current_url)

            for new_url in urls:
                # Only follow links beneath the base URL
                if new_url.startswith(base_url):
                    yield q.put(new_url)
        finally:
            # Mark the item done even for duplicates, so q.join()
            # completes once every enqueued URL has been handled.
            q.task_done()
            sem.release()

    @gen.coroutine
    def worker():
        while True:
            # The bounded semaphore caps concurrent fetches.
            yield sem.acquire()
            # Launch a subtask
            fetch_url()

    q.put(base_url)

    # Start worker, then wait for the work queue to be empty.
    worker()
    yield q.join(deadline=timedelta(seconds=timeout))
    assert fetching == fetched
    print 'Done in %d seconds, fetched %s URLs.' % (
        time.time() - start, len(fetched))


@gen.coroutine
def get_links_from_url(url, encoding):
    """Download the page at `url` and parse it for links. Returned links have
    had the fragment after `#` removed, and have been made absolute so, e.g.
    the URL 'gen.html#tornado.gen.coroutine' becomes
    'http://www.tornadoweb.org/en/stable/gen.html'.
    """
    try:
        response = yield httpclient.AsyncHTTPClient().fetch(url)
        logger.debug("fetched: {0}".format(url))
        urls = [urlparse.urljoin(url, remove_fragment(new_url))
                for new_url in get_links(url, response.body, encoding)]
    except HTTPError as err:
        logger.error("HTTP {0}: {1}".format(err.code, url))
        raise gen.Return([])
    except Exception as e:
        logger.exception(e)
        raise gen.Return([])

    raise gen.Return(urls)


def remove_fragment(url):
    # urlparse returns (scheme, netloc, path, params, query, fragment);
    # rebuild the URL with an empty fragment.
    scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
    return urlparse.urlunparse((scheme, netloc, path, params, query, ''))


def get_links(url, html, encoding):
    class URLSeeker(HTMLParser.HTMLParser):
        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.urls = []

        def handle_starttag(self, tag, attrs):
            href = dict(attrs).get('href')
            if href and tag == 'a':
                self.urls.append(href)

    url_seeker = URLSeeker()
    try:
        decoded_html = html.decode(encoding)
    except UnicodeDecodeError:
        logger.error("Encoding error: {0}".format(url))
        return []
    url_seeker.feed(decoded_html)
    return url_seeker.urls


def stop(fut):
    loop.stop()
    # Re-raise any exception the spider coroutine terminated with.
    fut.result()


if __name__ == '__main__':
    tornado.options.parse_command_line()
    logger.setLevel(getattr(logging, options.logging.upper(), logging.INFO))
    loop = ioloop.IOLoop.current()
    future = spider(options.url, options.concurrency, options.timeout, options.encoding)
    future.add_done_callback(stop)
    try:
        loop.start()
    except KeyboardInterrupt:
        pass
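
Note: this script targets Python 2 and the standalone toro package. Toro's synchronization primitives were merged into Tornado itself in version 4.2, so on a modern Tornado the same pattern would use tornado.queues and tornado.locks. A minimal sketch of the substitution (everything else above stays the same):

    # Tornado >= 4.2 equivalents of the toro primitives used above:
    from tornado.queues import Queue          # has put/get/task_done/join
    from tornado.locks import BoundedSemaphore

    q = Queue()
    sem = BoundedSemaphore(concurrency)
    # toro's q.join(deadline=...) becomes q.join(timeout=...)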