# Gevent Spider
# Source: GitHub Gist wolf0403/ca15111d91705742a295 (created September 5, 2014 19:10).
import gevent
import logging
import requests
import simplejson as json
import json
from gevent.pool import Group, Pool
from gevent.queue import Queue, Empty
from itertools import groupby
from lxml import html
from pprint import pprint
from urlparse import urlparse
def _p(f):
def _w(*a, **kw):
logging.debug('filter_url: %s', a[0])
r = f(*a, **kw)
print r
return r
return _w
# Base URL of the site to crawl; the __main__ block replaces it with the
# value given on the command line.
target_host = 'http://localhost:8000'
# Host name only (urlparse's .hostname carries neither scheme nor port);
# filter_url compares the host of each link against it.
_target_host = urlparse(target_host).hostname
def filter_url(url, refer):
    """Decide whether a link found on page ``refer`` should be crawled.

    Args:
        url: Raw ``href`` value of the link (may be ``None`` or empty).
        refer: Path of the page the link was found on.

    Returns:
        The path to enqueue, or ``None`` when the link must be skipped:
        empty href, non-HTTP scheme, neither path nor fragment, a referring
        page that itself carries a fragment, a fragment link found on an
        already escaped-fragment page, or a link to another host.
    """
    if not url:
        return None
    link = urlparse(url)
    if link.scheme not in ('http', 'https', ''):
        return None
    if not (link.path or link.fragment):
        return None
    page = urlparse(refer)  # parsed once; the original parsed it twice
    if page.fragment:
        return None
    if link.hostname not in (_target_host, None):
        return None

    if link.netloc:
        # Absolute same-host link: reduce it to a host-relative path.
        # Previously it was returned as refer + 'http://host/...'.
        url = link._replace(scheme='', netloc='').geturl()

    if link.fragment:
        escaped = '_escaped_fragment_='
        if escaped in page.query:
            # The referring page is already an escaped-fragment view; do not
            # expand fragments found on it again.
            return None
        # The rewritten URL needs '&' when it already has a query string:
        # the link's own, or the page's when the link is fragment-only and
        # will be appended to ``refer`` below.
        has_query = link.query or (not link.path and page.query)
        url = url.replace('#', ('&' if has_query else '?') + escaped)

    # NOTE: relative links are still resolved by plain concatenation.
    if not url.startswith('/'):
        return refer + url
    return url
# Unbounded work queue of (url, referring page) pairs waiting to be fetched.
urls = Queue(maxsize=None)
# Maps each fetched URL to the HTTP status code it returned,
# e.g. {'#': 200}.
history = dict()
# Link edges collected during the crawl, stored as 2-tuples.
graph = list()
# Set for links that are not crawled (presumably off-site ones -- confirm
# against do_url).
external = set()
def _u(path, refer):
    """Build the absolute URL to request for ``path``.

    Args:
        path: Site-relative path, normally starting with ``/``.
        refer: Path of the referring page; used as the base when ``path``
            does not start with ``/``.

    Returns:
        ``target_host`` followed by the resolved path.
    """
    # startswith (not path[0]) so an empty path no longer raises IndexError.
    if path.startswith('/'):
        return target_host + path
    # rstrip avoids a doubled slash when refer already ends with '/'.
    return target_host + refer.rstrip('/') + '/' + path
def do_url(url, refer, R=requests):
    """Fetch ``url``, record its status code and enqueue the links it contains.

    Args:
        url: Site-relative path to fetch. ``None`` or a path already present
            in ``history`` is ignored.
        refer: Path of the page that linked to ``url``; sent as the
            ``referer`` request header.
        R: Object exposing ``get(url, headers=...)`` like ``requests``.

    Side effects:
        Stores the response status in ``history``, appends
        ``(page, link)`` edges to ``graph``, adds rejected links to
        ``external`` and puts accepted, unvisited links on ``urls``.
        Any exception raised while fetching or parsing is logged with its
        traceback and not propagated.
    """
    # Nothing to do for a missing or already-visited URL.
    if url is None or url in history:
        return
    logging.debug('start: %s | %s', url, refer)
    try:
        resp = R.get(_u(url, refer), headers={'referer': refer})
        history[url] = resp.status_code
        tree = html.fromstring(resp.text)
        for anchor in tree.xpath('//a'):
            href = anchor.attrib.get('href')
            if not href:
                continue  # <a> without a target: nothing to record
            target = filter_url(href, url)
            if not target:
                # Edge source is the current page, not the (falsy) filter
                # result that used to be stored here.
                graph.append((url, href))
                external.add(href)
                logging.debug('External: %s', href)
            elif target not in history:
                urls.put((target, url))
                graph.append((url, target))
                logging.debug('New URL : %s', target)
    except Exception:
        # Best-effort crawl: one failing page must not stop the worker.
        logging.exception('F: %s', url)
def worker(wid, R=requests):
    """Drain the shared ``urls`` queue, crawling each not-yet-visited entry.

    Args:
        wid: Worker identifier supplied by the caller; the loop itself does
            not use it.
        R: HTTP client object handed through to ``do_url``.

    Returns once the queue is observed empty.
    """
    while not urls.empty():
        url, refer = urls.get()
        if url is None:
            continue
        # Internal sanity check on what was queued (Python 2 string types).
        assert isinstance(url, (str, unicode)), repr(url)
        if url in history:
            continue  # already fetched by this or another worker
        do_url(url, refer, R)
        logging.debug('do_url done for %s', url[:20])
if __name__ == '__main__':
    # Script entry point: crawl the target site, then print the status
    # history and the per-page link map.
    import sys

    # Optional first argument overrides the default crawl target.
    if len(sys.argv) > 1:
        target_host = sys.argv[1]
    if '//' not in target_host:
        # urlparse only recognises the host part when '//' precedes it;
        # a bare 'host:port' would otherwise leave hostname as None.
        target_host = 'http://' + target_host
    p = urlparse(target_host)
    # netloc keeps an explicit port; hostname alone would drop e.g. ':8000'.
    target_host = (p.scheme or 'http') + '://' + p.netloc
    _target_host = p.hostname

    # Start path defaults to the site root when the argument has no path.
    start = p.path or '/'
    if p.query:
        start += '?' + p.query
    if p.fragment:
        start += '#' + p.fragment
    print('target:  %s' % target_host)
    print('start :  %s' % start)
    urls.put((start, start))

    pool_size = 10
    try:
        # Workers return when the queue looks empty; start a new batch for
        # as long as earlier workers left URLs behind.
        while not urls.empty():
            pool = Pool(size=pool_size)
            pool.map(worker, range(pool_size))
    except Empty:
        logging.debug('URL queue reported Empty; stopping crawl')

    print(json.dumps(history, indent=4))

    # Group edges by source page; groupby needs its input sorted by the
    # same key.
    edges = sorted(graph, key=lambda edge: edge[0])
    link_map = {
        source: sorted({edge[1] for edge in group})
        for source, group in groupby(edges, key=lambda edge: edge[0])
    }
    pprint(link_map)
# (End of gist; the GitHub page footer "Sign up for free to join this conversation on GitHub" is not part of the script.)