secret
Created

A simple crawler to index and search your web pages.

  • Download Gist
gistfile1.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89
# author: Massimo Di Pierro
# license: BSD
from BeautifulSoup import BeautifulSoup
import collections
import cPickle
import urllib
import shelve
import re
 
# File extensions treated as crawlable HTML-like pages.
EXTENSIONS = ('asp', 'aspx', 'php', 'html', 'htm')
# Strips <script> blocks, <style> blocks and HTML comments.
# NOTE(review): defined but not used by any function visible in this file.
# fix: raw strings — '\<' etc. are invalid string escapes in non-raw
# literals (DeprecationWarning, and a future SyntaxError); the compiled
# patterns are byte-identical.
regex_remove = re.compile(
    r'\<script.*?\</script\>|\<style.*?\</style\>|\<!--.*?--\>', re.DOTALL)
# A "word" is a run of two or more ASCII letters/hyphens.
regex_words = re.compile(r'[a-zA-Z\-][a-zA-Z\-]+')
# Splits an absolute URL into scheme, host and path_info groups.
regex_url = re.compile(r'(?P<scheme>\w+)\://(?P<host>.*?)(?P<path_info>/.*)')
 
def cached_download(url):
    """Return the body of the page at *url*, caching it in 'cached.pages'.

    The first request for a URL hits the network; later requests are
    served from the local shelve database.
    """
    d = shelve.open('cached.pages')
    try:
        try:
            data = d[url]
        except KeyError:
            # Not cached yet: fetch and store (shelve persists on assignment).
            data = d[url] = urllib.urlopen(url).read()
    finally:
        # fix: the shelf was never closed, leaking the database file handle
        # (and, on some dbm backends, risking an unsynced cache file).
        d.close()
    return data
 
def parse_words(html):
    """Return the set of words in the <body> of a parsed page.

    *html* is a BeautifulSoup document.  Words are the lower-cased matches
    of ``regex_words`` over every text node inside <body>.
    """
    texts = html.find('body').findAll(text=True)
    # fix: the original reduce() over per-text sets raised TypeError on a
    # page whose <body> holds no non-empty text nodes; accumulating into
    # an initially-empty set returns set() instead.
    words = set()
    for node in texts:
        text = node.string.strip().lower()
        if text:
            words |= set(regex_words.findall(text))
    return words
 
def normalize_url(new, scheme, host, path_info):
    """Resolve the href *new* into an absolute URL, or return None to skip it.

    Rejected links: mailto:, bare fragments, paths whose file extension is
    not an HTML-like page, and absolute URLs with a non-http(s) scheme.
    *scheme*, *host* and *path_info* describe the page the link came from.
    """
    new = new.encode('utf8')
    # Extension of the path component, ignoring query string and fragment.
    path_only = new.split('?', 1)[0].split('#', 1)[0]
    extension = path_only.rsplit('.')[-1].lower()
    reject = (
        new.startswith('mailto:') or
        new.startswith('#') or
        (not '/' in extension and not extension in EXTENSIONS) or
        ('://' in new and not new.split('://')[0] in ('http', 'https'))
    )
    if reject:
        return None
    if '://' in new:
        return new                              # already absolute
    if new.startswith('//'):
        return scheme + ':' + new               # protocol-relative
    if new.startswith('/'):
        return '%s://%s%s' % (scheme, host, new)    # host-relative
    # Plain relative link: resolve against the current page's directory.
    relative = path_info.rsplit('/', 1)[0]
    return '%s://%s%s/%s' % (scheme, host, relative, new)
 
def find_links(html, url='http://cdm.depaul.edu/'):
    """Return the set of absolute URLs linked from *html* that stay on the
    same site (scheme://host prefix) as *url*.

    *html* is a BeautifulSoup document for the page fetched from *url*.
    """
    scheme, host, path_info = regex_url.match(url).groups()
    site_prefix = '/'.join(url.split('/')[:3])      # "scheme://host"
    hrefs = [anchor.get('href') for anchor in html.findAll('a')]
    absolute = [
        normalize_url(href, scheme, host, path_info)
        for href in hrefs if href
    ]
    return set(u for u in absolute if u and u.startswith(site_prefix))
 
def crawl(url = 'http://web2py.com/', max_links=100, filename='maps.pickle'):
maps = collections.defaultdict(set)
discovered = set([url])
queue = [url]
n = 0
while queue and n<max_links:
url = queue.pop()
print url
data = cached_download(url)
html = BeautifulSoup(data)
urls = find_links(html,url)
words = parse_words(html)
for word in words:
maps[word].add(url)
for new in urls:
if not new in discovered:
queue.append(new)
discovered.add(new)
print len(maps)
n += 1
cPickle.dump(maps,open(filename,'w'))
 
def search(words, filename='maps.pickle'):
    """Return the set of URLs containing every word of the query string
    *words*; words of length <= 2 are ignored.

    Loads the inverted index pickled by crawl() from *filename*.  Unknown
    words yield an empty set (the index is a defaultdict(set)), so any
    unmatched term makes the intersection empty.
    """
    terms = [w for w in words.lower().split() if len(w) > 2]
    if not terms:
        # fix: reduce() over an empty list raised TypeError when the query
        # contained only short words; an empty query matches nothing.
        return set()
    f = open(filename)
    try:
        maps = cPickle.load(f)
    finally:
        f.close()   # fix: the pickle file handle was never closed
    result = maps[terms[0]]
    for term in terms[1:]:
        result = result & maps[term]
    return result
 
if __name__=='__main__':
print crawl('http://web2py.com/',max_links=300)
print search('Python programming')

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.