Skip to content

Instantly share code, notes, and snippets.

@wolf0403
Created September 5, 2014 19:10
Show Gist options
  • Save wolf0403/ca15111d91705742a295 to your computer and use it in GitHub Desktop.
Save wolf0403/ca15111d91705742a295 to your computer and use it in GitHub Desktop.
Gevent Spider
import gevent
import logging
import requests
try:
import simplejson as json
except:
import json
from gevent.pool import Group, Pool
from gevent.queue import Queue, Empty
from itertools import groupby
from lxml import html
from pprint import pprint
from urlparse import urlparse
# Log at DEBUG level so the per-URL trace messages emitted by the crawler
# functions below are shown.
logging.basicConfig(level=logging.DEBUG)
def _p(f):
    """Debugging decorator: log the first positional argument, print the result.

    The log prefix is hard-coded to ``filter_url``, the function this
    decorator is meant to trace. The wrapped function's return value is
    passed through unchanged.

    :param f: callable to wrap.
    :returns: a wrapper that forwards all arguments to *f* and keeps its
        name and docstring.
    """
    # Imported here so the helper does not rely on a module-level import.
    import functools

    @functools.wraps(f)
    def _w(*a, **kw):
        # A keyword-only call has no a[0]; log None instead of raising
        # IndexError before f is even invoked.
        logging.debug('filter_url: %s', a[0] if a else None)
        r = f(*a, **kw)
        # The single-argument form prints the same text under both the
        # print statement and the print function.
        print(r)
        return r
    return _w
# Base URL of the site to crawl; _u() prepends it to every requested path.
# The __main__ block replaces it when a URL is given on the command line.
target_host = 'http://localhost:8000'
# Hostname component of target_host; filter_url() compares the hostname of
# each link against it to decide whether the link stays on the target site.
_target_host = urlparse(target_host).hostname
#@_p
def filter_url(url, refer):
    """Decide whether a link found on page *refer* should be crawled.

    A link is accepted only when it is http(s) or scheme-less, carries a
    path or a fragment, and points either at no host at all or at the
    crawl target (``_target_host``). A fragment is rewritten into an
    ``_escaped_fragment_=`` query parameter so that the target is fetched
    as a URL of its own.

    :param url: raw ``href`` value; may be ``None`` or empty.
    :param refer: site-relative URL of the page the link was found on.
    :returns: the site-relative URL to enqueue, or ``None`` when the link
        must not be followed.
    """
    if not url:
        return None

    link = urlparse(url)
    if link.scheme not in ('http', 'https', ''):
        return None
    if not (link.path or link.fragment):
        return None

    page = urlparse(refer)
    # Links found on a page whose own URL still carries a fragment are not
    # followed.
    if page.fragment:
        return None

    if link.fragment:
        escaped = '_escaped_fragment_='
        # The referring page is already an escaped-fragment URL; do not
        # expand fragments again from there.
        if escaped in page.query:
            return None
        # A bare '#frag' link is appended to the referring URL below, so the
        # separator depends on that URL's query. Any other link keeps its
        # own path/query, so its own query decides.
        bare_fragment = not (link.netloc or link.path or link.query)
        carrier_query = page.query if bare_fragment else link.query
        separator = '&' if carrier_query else '?'
        # Only the first '#' delimits the fragment; leave later ones alone.
        url = url.replace('#', separator + escaped, 1)

    if link.hostname is None:
        if url.startswith('/'):
            return url
        # Relative links are appended to the referring URL verbatim (no
        # path resolution); this is what bare-fragment links rely on.
        return refer + url

    if link.hostname != _target_host:
        return None

    # Absolute (or protocol-relative) link to the target site: reduce it to
    # a site-relative URL instead of gluing the full URL onto *refer*.
    rewritten = urlparse(url)
    site_path = rewritten.path or '/'
    if rewritten.query:
        site_path += '?' + rewritten.query
    return site_path
# Crawl frontier: unbounded queue of (url, referer) pairs waiting to be fetched.
urls = Queue(maxsize=None)
# Maps every fetched URL to the HTTP status code it returned.
history = {}  # {'#': 200}
# (source, destination) link pairs collected while crawling.
graph = []
# Raw href values that filter_url() rejected.
external = set()
def _u(path, refer):
    """Build the absolute URL to request for *path*.

    :param path: site-relative URL; when it does not start with ``/`` it is
        placed under *refer*.
    :param refer: site-relative URL of the referring page.
    :returns: ``target_host`` followed by the resolved path.
    """
    # startswith() also copes with an empty path, where path[0] would raise
    # IndexError.
    if path.startswith('/'):
        return target_host + path
    return target_host + refer + '/' + path
def do_url(url, refer, R=requests):
    """Fetch one page and enqueue the links on it that pass ``filter_url``.

    The response status is stored in ``history``. Every ``<a href>`` of the
    page is run through ``filter_url``: accepted links that are not in
    ``history`` yet are pushed onto ``urls`` and recorded in ``graph``;
    rejected links are added to ``external`` and recorded in ``graph`` with
    the filter result as their source.

    :param url: site-relative URL to fetch; ``None`` or an already fetched
        URL is skipped.
    :param refer: site-relative URL of the page that linked to *url*; sent
        as the ``referer`` header.
    :param R: object providing ``get(url, headers=...)``; defaults to the
        ``requests`` module.
    :returns: ``None``. Exceptions raised while fetching or parsing are
        logged and swallowed.
    """
    if url is None or url in history:
        return
    logging.debug('start: %s | %s', url, refer)
    try:
        resp = R.get(_u(url, refer), headers={'referer': refer})
        # Store the status before raise_for_status() so that failing pages
        # are remembered too and not requested again.
        history[url] = resp.status_code
        resp.raise_for_status()
        tree = html.fromstring(resp.text)
        for anchor in tree.xpath('//a'):
            href = anchor.attrib.get('href')
            if href is None:
                # An <a> without href (e.g. a named anchor) is not a link;
                # do not record None as an external URL.
                continue
            target = filter_url(href, url)
            if not target:
                external.add(href)
                # target is falsy here (filter_url returns None for
                # rejected links), so these edges share that source key.
                graph.append((target, href))
                logging.debug('External: %s', href)
            elif target not in history:
                urls.put((target, url))
                graph.append((url, target))
                logging.debug('New URL : %s', target)
    except Exception:
        # Deliberately not a bare except: KeyboardInterrupt, SystemExit and
        # greenlet kills must propagate instead of being logged away.
        logging.exception('F: ' + url)
def worker(wid, R=requests):
    """Take (url, referer) pairs off ``urls`` and crawl them with ``do_url``.

    Returns as soon as the queue is found empty.

    :param wid: worker index handed in by the pool; not used.
    :param R: HTTP client object forwarded to ``do_url``.
    """
    while not urls.empty():
        url, refer = urls.get()
        assert isinstance(url, (str, unicode)), repr(url)
        # Still needed when assertions are stripped (python -O).
        if url is None or url in history:
            continue
        do_url(url, refer, R)
        logging.debug('do_url done for %s', url[:20])
        # Give other greenlets a chance to run before taking the next URL.
        gevent.sleep()
if __name__ == '__main__':
    # Usage: script.py [START_URL]
    # Crawls the site of START_URL (default: target_host, starting at '/'),
    # then prints the status of every fetched URL as JSON followed by the
    # collected link graph.
    import sys

    if len(sys.argv) > 1:
        parsed_target = urlparse(sys.argv[1])
        if not parsed_target.hostname:
            # Without a host no base URL can be built; fail with a message
            # instead of a TypeError from concatenating None.
            sys.exit('target must be a full URL, e.g. http://localhost:8000/')
        # netloc (not hostname) keeps an explicit port such as ':8000'.
        target_host = (parsed_target.scheme or 'http') + '://' + parsed_target.netloc
        _target_host = parsed_target.hostname
        # A URL without a path still has to start the crawl at the root.
        start = parsed_target.path or '/'
        if parsed_target.query:
            start += '?' + parsed_target.query
        if parsed_target.fragment:
            start += '#' + parsed_target.fragment
    else:
        start = '/'
    print('target:  %s' % target_host)
    print('start :  %s' % start)

    urls.put((start, start))
    pool_size = 10
    # NOTE(review): no gevent monkey-patching is visible in this file;
    # confirm that the HTTP calls actually yield to other greenlets.
    try:
        # Workers return whenever they find the queue empty, so keep
        # starting fresh pools until no URLs are left.
        while not urls.empty():
            pool = Pool(size=pool_size)
            pool.map(worker, range(pool_size))
            pool.join()
    except Empty:
        pass

    print(json.dumps(history, indent=4))
    # Group the collected edges by source: {source: sorted unique targets}.
    adjacency = {}
    for source, destination in graph:
        adjacency.setdefault(source, set()).add(destination)
    pprint({source: sorted(found) for source, found in adjacency.items()})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment