wolf0403/gspider.py

## gspider.py
import functools
import logging
from itertools import groupby
from pprint import pprint
from urlparse import urljoin, urlparse

import gevent
import requests
from gevent.pool import Group, Pool
from gevent.queue import Queue, Empty
from lxml import html

try:
    import simplejson as json
except:
    import json

# Configure the root logger at DEBUG level so the logging.debug() calls made
# throughout this script are emitted.
logging.basicConfig(level=logging.DEBUG)


def _p(f):
    def _w(*a, **kw):
        logging.debug('filter_url: %s', a[0])
        r = f(*a, **kw)
        print r
        return r
    return _w

# Base URL that _u() prefixes to every crawled path; replaced by the
# command-line argument when the script is run with one.
target_host = 'http://localhost:8000'
# Hostname part of the target; filter_url() compares link hosts against it.
_target_host = urlparse(target_host).hostname

#@_p
def filter_url(url, refer):
    """Turn an href found on page ``refer`` into a crawlable site-relative URL.

    Args:
        url: Raw ``href`` value; may be absolute, relative or fragment-only.
        refer: Site-relative URL of the page the link was found on.

    Returns:
        A URL starting with ``/`` (path plus optional params and query) in
        which any fragment has been rewritten into an ``_escaped_fragment_=``
        query argument, or None when the link must not be followed: empty
        href, scheme other than http(s), neither path nor fragment, a host
        other than ``_target_host``, a referring page that itself carries a
        fragment, or a fragment link found on a page that was already
        requested through ``_escaped_fragment_``.
    """
    EF = '_escaped_fragment_='
    if not url:
        return None
    p = urlparse(url)
    if p.scheme not in ('http', 'https', ''):
        return None
    if not (p.path or p.fragment):
        return None
    q = urlparse(refer)
    if q.fragment:
        return None
    if p.fragment and EF in q.query:
        return None
    if p.hostname not in (_target_host, None):
        return None

    # Resolve against the referring page instead of concatenating strings:
    # this handles same-host absolute URLs, '//host/...' links, '../' segments
    # and sibling-relative paths, all of which plain 'refer + url' mangled.
    resolved = urlparse(urljoin(refer, url))
    path = resolved.path
    if not path.startswith('/'):
        # Only reachable when refer is empty/relative; keep the result rooted.
        path = '/' + path
    if resolved.params:
        path += ';' + resolved.params
    if resolved.query:
        path += '?' + resolved.query
    if resolved.fragment:
        # The separator depends on the query of the URL being built, not on
        # the referrer's query.
        path += ('&' if resolved.query else '?') + EF + resolved.fragment
    return path


# Work queue of (url, refer) tuples waiting to be fetched by the workers.
urls = Queue(None)

# Maps each requested URL to the HTTP status code of its response.
history = {} #{'#': 200}
# 2-tuples appended by do_url() for every link it inspects; grouped by their
# first element and pretty-printed at the end of the run.
graph = []
# Raw href values that filter_url() rejected.
external = set([])

def _u(path, refer):
    s = ''
    if path[0] != '/':
        s = refer + '/'
    return target_host + s + path


def do_url(url, refer, R=requests):
    """Fetch one page and queue every same-site link found on it.

    Args:
        url: Site-relative URL to fetch; ignored when None or already present
            in ``history``.
        refer: URL of the page that linked to ``url``; sent as the
            ``referer`` header and passed to ``_u`` to build the request URL.
        R: Object exposing a requests-style ``get(url, headers=...)``.

    Side effects:
        Stores the response status in ``history[url]``. For each ``<a href>``
        on the page: hrefs rejected by ``filter_url`` are added to
        ``external``; accepted links not yet in ``history`` are put on
        ``urls``. Both kinds are appended to ``graph``.

    Errors raised while fetching or parsing (including HTTP error statuses
    via ``raise_for_status``) are logged with a traceback and not re-raised.
    """
    if url is None or url in history:
        return

    logging.debug('start: %s | %s', url, refer)
    try:
        resp = R.get(_u(url, refer), headers={'referer': refer})
        history[url] = resp.status_code
        resp.raise_for_status()

        tree = html.fromstring(resp.text)
        for anchor in tree.xpath('//a'):
            href = anchor.attrib.get('href')
            if not href:
                # An <a> without href (e.g. a named anchor) is not a link;
                # previously it was recorded as the external link None.
                continue

            target = filter_url(href, url)
            if not target:
                external.add(href)
                # NOTE(review): target is always falsy here, so this edge has
                # no source page -- presumably (url, href) was intended;
                # confirm before changing the report format.
                graph.append((target, href))
                logging.debug('External: %s', href)
            elif target not in history:
                urls.put((target, url))
                graph.append((url, target))
                logging.debug('New URL : %s', target)
    except Exception:
        # Not a bare except: KeyboardInterrupt, SystemExit and gevent's
        # GreenletExit must still be able to stop the crawler.
        logging.exception('F: %s', url)


def worker(wid, R=requests):
    """Drain the shared ``urls`` queue, crawling each entry with ``do_url``.

    Args:
        wid: Value supplied by ``Pool.map`` (the worker index); not used.
        R: requests-style client forwarded to ``do_url``.

    Returns when ``urls`` is empty at the moment it is checked.
    """
    while not urls.empty():
        url, refer = urls.get()
        # Skip before asserting: with the assert first, the None guard could
        # never be reached.
        if url is None or url in history:
            continue
        assert isinstance(url, (str, unicode)), repr(url)
        do_url(url, refer, R)
        logging.debug('do_url done for %s', url[:20])
        # Hand control to the other greenlets in the pool between pages.
        gevent.sleep()
if __name__ == '__main__':
    import sys
    if len(sys.argv) > 1:
        target_host = sys.argv[1]
        p = urlparse(target_host)
        target_host = (p.scheme or 'http') + '://' + p.hostname
        _target_host = p.hostname
        start = p.path
        if p.query:
            start += ('?' + p.query)
        if p.fragment:
            start += ('#' + p.fragment)
    else:
        start = '/'
    print 'target: ', target_host
    print 'start : ', start
    urls.put((start, start))
    pool_size = 10
    try:
        while not urls.empty():
            g = Pool(size=pool_size)
            g.map(worker, range(pool_size))
            g.join()
    except Empty:
        pass
    print (json.dumps(history, indent=4))
    g = sorted(graph, key=lambda x: x[0])
    g = groupby(g, key=lambda x: x[0])
    g = {k: sorted(set([p[1] for p in v])) for k, v in g}
    pprint(g)
	import gevent
	import logging
	import requests

	try:
	import simplejson as json
	except:
	import json

	from gevent.pool import Group, Pool
	from gevent.queue import Queue, Empty
	from itertools import groupby
	from lxml import html
	from pprint import pprint
	from urlparse import urlparse

	# Configure the root logger at DEBUG level so the logging.debug() calls made
	# throughout this script are emitted.
	logging.basicConfig(level=logging.DEBUG)


	def _p(f):
	def _w(a, *kw):
	logging.debug('filter_url: %s', a[0])
	r = f(a, *kw)
	print r
	return r
	return _w

	# Base URL that _u() prefixes to every crawled path; replaced by the
	# command-line argument when the script is run with one.
	target_host = 'http://localhost:8000'
	# Hostname part of the target; filter_url() compares link hosts against it.
	_target_host = urlparse(target_host).hostname

	#@_p
	def filter_url(url, refer):
	    """Turn an href found on page ``refer`` into a crawlable site-relative URL.

	    Args:
	        url: Raw ``href`` value; may be absolute, relative or fragment-only.
	        refer: Site-relative URL of the page the link was found on.

	    Returns:
	        A URL starting with ``/`` (path plus optional params and query) in
	        which any fragment has been rewritten into an ``_escaped_fragment_=``
	        query argument, or None when the link must not be followed: empty
	        href, scheme other than http(s), neither path nor fragment, a host
	        other than ``_target_host``, a referring page that itself carries a
	        fragment, or a fragment link found on a page that was already
	        requested through ``_escaped_fragment_``.
	    """
	    EF = '_escaped_fragment_='
	    if not url:
	        return None
	    p = urlparse(url)
	    if p.scheme not in ('http', 'https', ''):
	        return None
	    if not (p.path or p.fragment):
	        return None
	    q = urlparse(refer)
	    if q.fragment:
	        return None
	    if p.fragment and EF in q.query:
	        return None
	    if p.hostname not in (_target_host, None):
	        return None

	    # Resolve against the referring page instead of concatenating strings:
	    # this handles same-host absolute URLs, '//host/...' links, '../' segments
	    # and sibling-relative paths, all of which plain 'refer + url' mangled.
	    resolved = urlparse(urljoin(refer, url))
	    path = resolved.path
	    if not path.startswith('/'):
	        # Only reachable when refer is empty/relative; keep the result rooted.
	        path = '/' + path
	    if resolved.params:
	        path += ';' + resolved.params
	    if resolved.query:
	        path += '?' + resolved.query
	    if resolved.fragment:
	        # The separator depends on the query of the URL being built, not on
	        # the referrer's query.
	        path += ('&' if resolved.query else '?') + EF + resolved.fragment
	    return path


	# Work queue of (url, refer) tuples waiting to be fetched by the workers.
	urls = Queue(None)

	# Maps each requested URL to the HTTP status code of its response.
	history = {} #{'#': 200}
	# 2-tuples appended by do_url() for every link it inspects; grouped by their
	# first element and pretty-printed at the end of the run.
	graph = []
	# Raw href values that filter_url() rejected.
	external = set([])

	def _u(path, refer):
	s = ''
	if path[0] != '/':
	s = refer + '/'
	return target_host + s + path


	def do_url(url, refer, R=requests):
	    """Fetch one page and queue every same-site link found on it.

	    Args:
	        url: Site-relative URL to fetch; ignored when None or already present
	            in ``history``.
	        refer: URL of the page that linked to ``url``; sent as the
	            ``referer`` header and passed to ``_u`` to build the request URL.
	        R: Object exposing a requests-style ``get(url, headers=...)``.

	    Side effects:
	        Stores the response status in ``history[url]``. For each ``<a href>``
	        on the page: hrefs rejected by ``filter_url`` are added to
	        ``external``; accepted links not yet in ``history`` are put on
	        ``urls``. Both kinds are appended to ``graph``.

	    Errors raised while fetching or parsing (including HTTP error statuses
	    via ``raise_for_status``) are logged with a traceback and not re-raised.
	    """
	    if url is None or url in history:
	        return

	    logging.debug('start: %s | %s', url, refer)
	    try:
	        resp = R.get(_u(url, refer), headers={'referer': refer})
	        history[url] = resp.status_code
	        resp.raise_for_status()

	        tree = html.fromstring(resp.text)
	        for anchor in tree.xpath('//a'):
	            href = anchor.attrib.get('href')
	            if not href:
	                # An <a> without href (e.g. a named anchor) is not a link;
	                # previously it was recorded as the external link None.
	                continue

	            target = filter_url(href, url)
	            if not target:
	                external.add(href)
	                # NOTE(review): target is always falsy here, so this edge has
	                # no source page -- presumably (url, href) was intended;
	                # confirm before changing the report format.
	                graph.append((target, href))
	                logging.debug('External: %s', href)
	            elif target not in history:
	                urls.put((target, url))
	                graph.append((url, target))
	                logging.debug('New URL : %s', target)
	    except Exception:
	        # Not a bare except: KeyboardInterrupt, SystemExit and gevent's
	        # GreenletExit must still be able to stop the crawler.
	        logging.exception('F: %s', url)


	def worker(wid, R=requests):
	    """Drain the shared ``urls`` queue, crawling each entry with ``do_url``.

	    Args:
	        wid: Value supplied by ``Pool.map`` (the worker index); not used.
	        R: requests-style client forwarded to ``do_url``.

	    Returns when ``urls`` is empty at the moment it is checked.
	    """
	    while not urls.empty():
	        url, refer = urls.get()
	        # Skip before asserting: with the assert first, the None guard could
	        # never be reached.
	        if url is None or url in history:
	            continue
	        assert isinstance(url, (str, unicode)), repr(url)
	        do_url(url, refer, R)
	        logging.debug('do_url done for %s', url[:20])
	        # Hand control to the other greenlets in the pool between pages.
	        gevent.sleep()

	if __name__ == '__main__':
	import sys
	if len(sys.argv) > 1:
	target_host = sys.argv[1]
	p = urlparse(target_host)
	target_host = (p.scheme or 'http') + '://' + p.hostname
	_target_host = p.hostname
	start = p.path
	if p.query:
	start += ('?' + p.query)
	if p.fragment:
	start += ('#' + p.fragment)
	else:
	start = '/'
	print 'target: ', target_host
	print 'start : ', start
	urls.put((start, start))
	pool_size = 10
	try:
	while not urls.empty():
	g = Pool(size=pool_size)
	g.map(worker, range(pool_size))
	g.join()
	except Empty:
	pass
	print (json.dumps(history, indent=4))
	g = sorted(graph, key=lambda x: x[0])
	g = groupby(g, key=lambda x: x[0])
	g = {k: sorted(set([p[1] for p in v])) for k, v in g}
	pprint(g)