@mdipierro
Created October 11, 2013 02:09
A simple crawler to index and search your web pages.
# author: Massimo Di Pierro
# license: BSD
# Requires Python 2 and BeautifulSoup 3 (the old `BeautifulSoup` package).
from BeautifulSoup import BeautifulSoup
import collections
import cPickle
import urllib
import shelve
import re

# links are followed only if their path ends in one of these extensions, or has none at all
EXTENSIONS = ('asp', 'aspx', 'php', 'html', 'htm')
# strips <script> blocks, <style> blocks and HTML comments before indexing
regex_remove = re.compile(
    r'\<script.*?\</script\>|\<style.*?\</style\>|\<!--.*?--\>', re.DOTALL)
# a word is a run of two or more letters or hyphens
regex_words = re.compile(r'[a-zA-Z\-][a-zA-Z\-]+')
# splits an absolute URL into scheme, host and path
regex_url = re.compile(r'(?P<scheme>\w+)\://(?P<host>.*?)(?P<path_info>/.*)')

def cached_download(url):
    """Download a page, caching the raw HTML on disk in a shelve file."""
    d = shelve.open('cached.pages')
    try:
        data = d[url]                                  # cache hit: reuse the stored page
    except KeyError:
        data = d[url] = urllib.urlopen(url).read()     # cache miss: fetch and store
    finally:
        d.close()
    return data

def parse_words(html):
    """Return the set of lowercased words found in the page body."""
    texts = html.find('body').findAll(text=True)
    texts = map(lambda e: e.string.strip().lower(), texts)
    texts = filter(lambda e: e, texts)
    # union of the words found in each text node; set() guards against empty bodies
    words = reduce(lambda a, b: a | b,
                   [set(regex_words.findall(t)) for t in texts], set())
    return words

def normalize_url(new, scheme, host, path_info):
    """Resolve a raw href into an absolute URL, or return None to skip it."""
    new = new.encode('utf8')
    extension = new.split('?', 1)[0].split('#', 1)[0].rsplit('.')[-1].lower()
    if (new.startswith('mailto:') or new.startswith('#') or
            (not '/' in extension and not extension in EXTENSIONS) or
            ('://' in new and not new.split('://')[0] in ('http', 'https'))):
        return None                                    # mail link, fragment, non-page file or non-HTTP scheme
    elif '://' in new:
        return new                                     # already absolute
    elif new.startswith('//'):
        return scheme + ':' + new                      # protocol-relative
    elif new.startswith('/'):
        return '%s://%s%s' % (scheme, host, new)       # host-relative
    else:
        relative = path_info.rsplit('/', 1)[0]         # relative to the current directory
        return '%s://%s%s/%s' % (scheme, host, relative, new)

def find_links(html, url='http://cdm.depaul.edu/'):
    """Return the set of absolute links on the page that stay on the same host."""
    match = regex_url.match(url)
    (scheme, host, path_info) = match.groups()
    start = '/'.join(url.split('/')[:3])               # the scheme://host prefix
    links = html.findAll('a')
    links = map(lambda link: link.get('href'), links)
    links = filter(lambda e: e, links)
    links = map(lambda link: normalize_url(link, scheme, host, path_info), links)
    links = filter(lambda e: e and e.startswith(start), links)
    urls = set(links)
    return urls

def crawl(url='http://web2py.com/', max_links=100, filename='maps.pickle'):
    """Crawl up to max_links pages from url and pickle a word -> set-of-URLs index."""
    maps = collections.defaultdict(set)
    discovered = set([url])
    queue = [url]
    n = 0
    while queue and n < max_links:
        url = queue.pop()
        print url
        data = regex_remove.sub('', cached_download(url))  # drop scripts, styles and comments
        html = BeautifulSoup(data)
        urls = find_links(html, url)
        words = parse_words(html)
        for word in words:
            maps[word].add(url)
        for new in urls:
            if not new in discovered:
                queue.append(new)
                discovered.add(new)
        print len(maps)                                # number of distinct words indexed so far
        n += 1
    cPickle.dump(maps, open(filename, 'wb'))

def search(words, filename='maps.pickle'):
    """Return the set of crawled URLs whose body contains every query word."""
    words = filter(lambda w: len(w) > 2, words.lower().split())
    maps = cPickle.load(open(filename, 'rb'))
    return reduce(lambda a, b: a & b, [maps[word] for word in words])

if __name__ == '__main__':
    crawl('http://web2py.com/', max_links=300)         # writes the index to maps.pickle
    print search('Python programming')
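
A minimal usage sketch, assuming the script above is saved as crawler.py (a hypothetical file name) and run under Python 2 with BeautifulSoup 3 installed; the URL and query are just examples:

from crawler import crawl, search

# build the index: pages are cached in cached.pages, the word index in maps.pickle
crawl('http://web2py.com/', max_links=300)

# intersect the URL sets of every query word longer than two letters
print search('python programming')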