@mdipierro
Created October 11, 2013 02:09
A simple crawler to index and search your web pages.
# author: Massimo Di Pierro
# license: BSD
# Requires Python 2 and BeautifulSoup 3 (the old `BeautifulSoup` package).
from BeautifulSoup import BeautifulSoup
import collections
import cPickle
import urllib
import shelve
import re

# links are followed only if their path ends in one of these extensions, or has none at all
EXTENSIONS = ('asp', 'aspx', 'php', 'html', 'htm')
# strips <script> blocks, <style> blocks and HTML comments before indexing
regex_remove = re.compile(
    r'\<script.*?\</script\>|\<style.*?\</style\>|\<!--.*?--\>', re.DOTALL)
# a word is a run of two or more letters or hyphens
regex_words = re.compile(r'[a-zA-Z\-][a-zA-Z\-]+')
# splits an absolute URL into scheme, host and path
regex_url = re.compile(r'(?P<scheme>\w+)\://(?P<host>.*?)(?P<path_info>/.*)')

def cached_download(url):
    """Download a page, caching the raw HTML on disk in a shelve file."""
    d = shelve.open('cached.pages')
    try:
        data = d[url]                                  # cache hit: reuse the stored page
    except KeyError:
        data = d[url] = urllib.urlopen(url).read()     # cache miss: fetch and store
    finally:
        d.close()
    return data

def parse_words(html):
    """Return the set of lowercased words found in the page body."""
    texts = html.find('body').findAll(text=True)
    texts = map(lambda e: e.string.strip().lower(), texts)
    texts = filter(lambda e: e, texts)
    # union of the words found in each text node; set() guards against empty bodies
    words = reduce(lambda a, b: a | b,
                   [set(regex_words.findall(t)) for t in texts], set())
    return words

def normalize_url(new, scheme, host, path_info):
    """Resolve a raw href into an absolute URL, or return None to skip it."""
    new = new.encode('utf8')
    extension = new.split('?', 1)[0].split('#', 1)[0].rsplit('.')[-1].lower()
    if (new.startswith('mailto:') or new.startswith('#') or
            (not '/' in extension and not extension in EXTENSIONS) or
            ('://' in new and not new.split('://')[0] in ('http', 'https'))):
        return None                                    # mail link, fragment, non-page file or non-HTTP scheme
    elif '://' in new:
        return new                                     # already absolute
    elif new.startswith('//'):
        return scheme + ':' + new                      # protocol-relative
    elif new.startswith('/'):
        return '%s://%s%s' % (scheme, host, new)       # host-relative
    else:
        relative = path_info.rsplit('/', 1)[0]         # relative to the current directory
        return '%s://%s%s/%s' % (scheme, host, relative, new)

def find_links(html, url='http://cdm.depaul.edu/'):
    """Return the set of absolute links on the page that stay on the same host."""
    match = regex_url.match(url)
    (scheme, host, path_info) = match.groups()
    start = '/'.join(url.split('/')[:3])               # the scheme://host prefix
    links = html.findAll('a')
    links = map(lambda link: link.get('href'), links)
    links = filter(lambda e: e, links)
    links = map(lambda link: normalize_url(link, scheme, host, path_info), links)
    links = filter(lambda e: e and e.startswith(start), links)
    urls = set(links)
    return urls

def crawl(url='http://web2py.com/', max_links=100, filename='maps.pickle'):
    """Crawl up to max_links pages from url and pickle a word -> set-of-URLs index."""
    maps = collections.defaultdict(set)
    discovered = set([url])
    queue = [url]
    n = 0
    while queue and n < max_links:
        url = queue.pop()
        print url
        data = regex_remove.sub('', cached_download(url))  # drop scripts, styles and comments
        html = BeautifulSoup(data)
        urls = find_links(html, url)
        words = parse_words(html)
        for word in words:
            maps[word].add(url)
        for new in urls:
            if not new in discovered:
                queue.append(new)
                discovered.add(new)
        print len(maps)                                # number of distinct words indexed so far
        n += 1
    cPickle.dump(maps, open(filename, 'wb'))

def search(words, filename='maps.pickle'):
    """Return the set of crawled URLs whose body contains every query word."""
    words = filter(lambda w: len(w) > 2, words.lower().split())
    maps = cPickle.load(open(filename, 'rb'))
    return reduce(lambda a, b: a & b, [maps[word] for word in words])

if __name__ == '__main__':
    crawl('http://web2py.com/', max_links=300)         # writes the index to maps.pickle
    print search('Python programming')
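
A minimal usage sketch, assuming the script above is saved as crawler.py (a hypothetical file name) and run under Python 2 with BeautifulSoup 3 installed; the URL and query are just examples:

from crawler import crawl, search

# build the index: pages are cached in cached.pages, the word index in maps.pickle
crawl('http://web2py.com/', max_links=300)

# intersect the URL sets of every query word longer than two letters
print search('python programming')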