Created
September 5, 2014 19:10
-
-
Save wolf0403/ca15111d91705742a295 to your computer and use it in GitHub Desktop.
Gevent Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import logging
from itertools import groupby
from pprint import pprint
from urlparse import urljoin, urlparse

import gevent
import requests
from gevent.pool import Group, Pool
from gevent.queue import Queue, Empty
from lxml import html

try:
    import simplejson as json
except ImportError:
    import json
# Configure the root logger at DEBUG level so the crawler's debug records
# (start / New URL / External messages) are emitted.
logging.basicConfig(level=logging.DEBUG)
def _p(f):
    """Debugging decorator: log the URL argument and echo the return value.

    The log message is hard-wired to ``filter_url``, the function this
    decorator is meant to wrap.

    Args:
        f: Callable to wrap.

    Returns:
        A wrapper with the same metadata as *f* that logs the first
        positional argument (or the ``url`` keyword when called without
        positional arguments), prints the result and returns it unchanged.
    """
    @functools.wraps(f)
    def _w(*a, **kw):
        # Guard against keyword-only calls, which used to raise IndexError.
        logging.debug('filter_url: %s', a[0] if a else kw.get('url'))
        r = f(*a, **kw)
        print(r)
        return r
    return _w
# Base URL (scheme + authority) prepended to every path that is fetched.
# The __main__ section overrides it when a command-line argument is given.
target_host = 'http://localhost:8000'
# Bare hostname of the target; filter_url compares link hosts against it.
_target_host = urlparse(target_host).hostname
# @_p  # uncomment to log every call and echo its result while debugging
def filter_url(url, refer):
    """Decide whether a link found on page *refer* should be crawled.

    Fragment links (``#...``) are rewritten to the ``_escaped_fragment_=``
    query form so that the fragment content is sent to the server.

    Args:
        url: Raw ``href`` value of an anchor; may be ``None`` or empty.
        refer: Site-relative URL of the page the link was found on.

    Returns:
        The site-relative URL to fetch, or ``None`` when the link must not
        be followed: empty href, scheme other than http/https, neither a
        path nor a fragment, a referrer that itself has a fragment, a
        fragment link on a page whose query already contains
        ``_escaped_fragment_=``, or a host other than ``_target_host``.
    """
    if not url:
        return None
    link = urlparse(url)
    if link.scheme not in ('http', 'https', ''):
        return None
    if not (link.path or link.fragment):
        return None
    page = urlparse(refer)
    if page.fragment:
        return None
    if link.hostname not in (_target_host, None):
        return None

    if link.scheme or link.netloc:
        # Absolute / protocol-relative link to the target host: drop scheme
        # and authority so callers always receive a site-relative URL
        # (previously the full URL was glued onto the referrer).
        url = link.path or ('/' if link.netloc else '')
        if link.params:
            url += ';' + link.params
        if link.query:
            url += '?' + link.query
        if link.fragment:
            url += '#' + link.fragment

    if link.fragment:
        escaped = '_escaped_fragment_='
        # NOTE(review): the source's indentation was lost; this "give up"
        # branch is assumed to belong to the escaped-fragment check (do not
        # expand fragments found on an already-escaped page) -- confirm.
        if escaped in page.query:
            return None
        # A bare "#frag" extends the referring page, so the separator depends
        # on the referrer's query; otherwise it depends on the link's own.
        standalone = bool(link.netloc or link.path or link.query)
        has_query = link.query if standalone else page.query
        separator = '&' if has_query else '?'
        # Only the first '#' delimits the fragment.
        url = url.replace('#', separator + escaped, 1)
        if not standalone:
            return refer + url

    if url.startswith('/'):
        return url
    # Resolve relative paths against the referring page ('/a/b' + 'c' is
    # '/a/c', not '/a/bc' as plain concatenation produced).
    return urljoin(refer, url)
# Work queue of (url, referrer) pairs waiting to be fetched.
# maxsize=None -- presumably "no size limit"; confirm against gevent docs.
urls = Queue(None)
# Maps each requested URL to the HTTP status code it returned,
# e.g. {'#': 200}.
history = {}
# Link edges recorded while crawling, as 2-tuples.
graph = []
# hrefs for which filter_url returned nothing (not followed).
external = set()
def _u(path, refer):
    """Build the absolute URL to request for *path*.

    Args:
        path: URL to fetch. When it does not start with ``/`` it is treated
            as relative and placed under *refer*.
        refer: Site-relative URL of the referring page.

    Returns:
        ``target_host`` followed by *path*, with ``refer + '/'`` inserted
        in between for relative paths.
    """
    # startswith() instead of path[0] so an empty path cannot raise IndexError.
    prefix = '' if path.startswith('/') else refer + '/'
    return target_host + prefix + path
def do_url(url, refer, R=requests):
    """Fetch one page, record its status and queue the links it contains.

    Args:
        url: Site-relative URL to fetch; ignored when ``None`` or already
            present in ``history``.
        refer: URL of the page that linked here; sent as the ``referer``
            header.
        R: Object providing ``get(url, headers=...)``; defaults to the
            ``requests`` module.

    Side effects:
        Stores the response status in ``history``; adds rejected hrefs to
        ``external``; puts accepted, not-yet-fetched links on ``urls``; and
        appends ``(page, link)`` edges to ``graph``. Any exception raised
        while fetching or parsing is logged and swallowed so one bad page
        does not stop the crawl.
    """
    if url is None or url in history:
        return
    logging.debug('start: %s | %s', url, refer)
    try:
        resp = R.get(_u(url, refer), headers={'referer': refer})
        history[url] = resp.status_code
        # Error responses raise here, so their bodies are not parsed.
        resp.raise_for_status()
        tree = html.fromstring(resp.text)
        for anchor in tree.xpath('//a'):
            href = anchor.attrib.get('href')
            if not href:
                # Anchors without a target are not links at all.
                continue
            target = filter_url(href, url)
            if not target:
                external.add(href)
                # Record the edge from the current page; the original stored
                # the (empty) filter result as the source instead.
                graph.append((url, href))
                logging.debug('External: %s', href)
            elif target not in history:
                urls.put((target, url))
                graph.append((url, target))
                logging.debug('New URL : %s', target)
    except Exception:
        # Not a bare except: KeyboardInterrupt / GreenletExit must propagate.
        logging.exception('F: %s', url)
def worker(wid, R=requests):
    """Drain the ``urls`` queue, crawling each entry with ``do_url``.

    Args:
        wid: Worker index supplied by the pool's ``map`` call; not used.
        R: HTTP client object forwarded to ``do_url``.

    The loop ends as soon as the queue is observed empty; the caller is
    responsible for starting new workers if more URLs appear afterwards.
    """
    while not urls.empty():
        url, refer = urls.get()
        # Internal invariant: only string URLs are ever queued.
        assert isinstance(url, (str, unicode)), repr(url)
        if url is None or url in history:
            continue
        do_url(url, refer, R)
        logging.debug('do_url done for %s', url[:20])
        # Yield so the other greenlets get a chance to run.
        gevent.sleep()
if __name__ == '__main__':
    # Usage: script [START_URL]
    # Crawls the target site starting at START_URL (default: target_host + '/'),
    # then prints the status of every fetched URL and the link graph.
    import sys

    if len(sys.argv) > 1:
        arg = sys.argv[1]
        # Accept "host:port/path" and "//host/path" as well as full URLs;
        # without a scheme urlparse() finds no hostname at all.
        if arg.startswith('//'):
            arg = 'http:' + arg
        elif '://' not in arg:
            arg = 'http://' + arg
        p = urlparse(arg)
        # netloc (not hostname) so an explicit port is kept in the base URL.
        target_host = (p.scheme or 'http') + '://' + p.netloc
        _target_host = p.hostname
        # A URL without a path ("http://host") must still start at the root.
        start = p.path or '/'
        if p.query:
            start += '?' + p.query
        if p.fragment:
            start += '#' + p.fragment
    else:
        start = '/'

    # Same output as the Python 2 statements "print 'target: ', target_host".
    print('target:  %s' % target_host)
    print('start :  %s' % start)

    urls.put((start, start))
    pool_size = 10
    try:
        # Workers exit when the queue looks empty, but pages still being
        # fetched may add more URLs, so keep starting pools until none remain.
        while not urls.empty():
            pool = Pool(size=pool_size)
            pool.map(worker, range(pool_size))
            pool.join()
    except Empty:
        pass

    print(json.dumps(history, indent=4))

    # Group edges by source page: {source: sorted unique targets}.
    # groupby() needs its input sorted by the same key.
    edges = sorted(graph, key=lambda edge: edge[0])
    grouped = groupby(edges, key=lambda edge: edge[0])
    g = {
        source: sorted(set(edge[1] for edge in group))
        for source, group in grouped
    }
    pprint(g)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment