Skip to content

Instantly share code, notes, and snippets.

@AstraLuma
Created July 10, 2013 21:49
Show Gist options
  • Save AstraLuma/5970631 to your computer and use it in GitHub Desktop.
Save AstraLuma/5970631 to your computer and use it in GitHub Desktop.
A basic spider to aggregate statuses
#!/usr/bin/python -i
import argparse
import collections
import Queue
import logging
import requests
import threading
import urlparse
from bs4 import BeautifulSoup
# Command-line interface: one or more root URLs to start crawling from,
# plus an optional worker-thread count.
parser = argparse.ArgumentParser(description='Spider a site looking for errors')
parser.add_argument(
    'url',
    help='Root URLs',
    nargs='+',
)
parser.add_argument(
    '--threads',
    help='How many threads to use',
    type=int,
    default=4,
)
# Crawl state shared by every worker thread.
# urls: maps each URL seen so far to None while it is being fetched, then to
# a (status_code, referer) tuple once download() has a response.
urls = {}
# statuses: tally of how many responses came back with each HTTP status code.
statuses = collections.Counter()
# session: single requests session that download() issues every GET through.
session = requests.Session()
def download(url, referer=None, timeout=30):
    """Fetch *url*, record the outcome, and return the parsed page if any.

    The outcome is stored in the module-level ``urls`` mapping as
    ``(status, referer)`` and tallied in ``statuses``.  ``status`` is the
    HTTP status code, or the exception class name when the request itself
    failed (unsupported scheme such as ``mailto:``, connection error,
    timeout, ...).

    Args:
        url: Address to request through the shared ``session``.
        referer: URL of the page that linked here; kept only for reporting.
        timeout: Seconds to wait on the server before giving up.  Without
            one, requests waits indefinitely and a single stalled host
            would block its worker thread forever.

    Returns:
        A ``BeautifulSoup`` document for a 200 response, otherwise ``None``.
    """
    logging.info("-> %s", url)
    try:
        r = session.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # A failed request must not take the calling worker thread down
        # with it; report it like any other non-200 outcome instead.
        failure = type(exc).__name__
        urls[url] = (failure, referer)
        statuses[failure] += 1
        logging.warning("<- %s %s", url, failure)
        return None

    urls[url] = (r.status_code, referer)
    logging.info("<- %s %i", url, r.status_code)
    statuses[r.status_code] += 1
    if r.status_code == 200:
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; link extraction may differ between machines.
        return BeautifulSoup(r.content)
    return None
def worker(queue):
    """Consume ``(url, referer)`` jobs from *queue* forever, crawling each.

    For every job the URL is stripped of its fragment, skipped if it is
    already present in the module-level ``urls`` mapping, and otherwise
    fetched with ``download``.  Each ``<a>`` link on a successfully parsed
    page is resolved against the page URL and queued with that page as its
    referer.

    Args:
        queue: Job queue shared by all workers.  ``task_done()`` is called
            exactly once per job so ``queue.join()`` can detect completion.

    The loop never returns; callers are expected to run it in a daemon
    thread.
    """
    while True:
        url, referer = queue.get()
        try:
            url, _ = urlparse.urldefrag(url)
            if url in urls:
                continue
            # Placeholder so other workers skip this URL while it is in flight.
            # NOTE(review): the check above and this store are not atomic, so
            # two workers can still occasionally fetch the same URL.
            urls[url] = None
            doc = download(url, referer)
            if doc is None:
                continue
            for link in doc.find_all('a'):
                href = link.get('href')
                if not href:
                    # Anchors without a target resolve to the current page.
                    continue
                # Strip the fragment before the seen-check so "#section"
                # variants of a known page are not queued again.
                target, _ = urlparse.urldefrag(urlparse.urljoin(url, href))
                if urlparse.urlparse(target).scheme not in ('http', 'https'):
                    # mailto:, javascript:, tel: ... cannot be fetched.
                    continue
                if target not in urls:
                    queue.put((target, url))
        except Exception:
            # One bad page must not kill the thread: with every worker dead
            # nothing would drain the queue and queue.join() would hang.
            logging.exception("Failed to process %s (linked from %s)",
                              url, referer)
        finally:
            queue.task_done()
# Script entry: parse the command line, seed the queue with the root URLs,
# crawl with a pool of daemon worker threads, then print the status tally.
args = parser.parse_args()
if args.threads < 1:
    # With no workers nothing would ever call task_done(), so the
    # queue.join() below would block forever.
    parser.error('--threads must be at least 1')

logging.basicConfig(
    format='%(asctime)s %(name)s %(levelname)-8s %(message)s',
    level=logging.INFO,
)
# Only let errors through from the 'requests' logger so the crawl log
# stays readable.
logging.getLogger('requests').setLevel(logging.ERROR)

queue = Queue.Queue()
for url in args.url:
    queue.put((url, None))  # root URLs have no referer

for i in range(args.threads):
    t = threading.Thread(target=worker, args=(queue,), name='Worker %u' % i)
    # Workers loop forever; daemon threads do not keep the process alive.
    t.daemon = True
    t.start()

# Returns once every queued job has been marked done by a worker.
queue.join()
print(statuses)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment