A simple crawler to index and search your web pages.
# author: Massimo Di Pierro
# license: BSD
from BeautifulSoup import BeautifulSoup
import collections
import cPickle
import urllib
import shelve
import re
EXTENSIONS = ('asp','aspx','php','html','htm')
# regex_remove = re.compile(...)  # pattern truncated in the source and unused below
regex_words = re.compile('[a-zA-Z\-][a-zA-Z\-]+')
regex_url = re.compile('(?P<scheme>\w+)\://(?P<host>.*?)(?P<path_info>/.*)')
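# Illustrative sketch (not executed): regex_words picks out runs of two or more
# letters/hyphens, and regex_url splits an absolute URL into scheme, host and
# path; the strings below are examples only.
#
#   regex_words.findall('indexing web pages with a tiny crawler')
#       # -> ['indexing', 'web', 'pages', 'with', 'tiny', 'crawler']
#   regex_url.match('http://example.com/docs/index.html').groups()
#       # -> ('http', 'example.com', '/docs/index.html')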
def cached_download(url):
    # download a page, caching the raw HTML on disk in a shelve file
    d = shelve.open('cached.pages')
    try:
        data = d[url]
    except KeyError:
        data = d[url] = urllib.urlopen(url).read()
    d.close()
    return data
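# Illustrative sketch (not executed): the first call for a URL fetches the page
# over HTTP and stores the raw HTML in the 'cached.pages' shelve file; later
# calls with the same URL are served from disk. The URL is an example only.
#
#   page = cached_download('http://example.com/')  # network fetch, then cached
#   page = cached_download('http://example.com/')  # read back from cached.pages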
def parse_words(html):
    # collect the distinct lower-cased words from the visible text of the page body
    texts = html.find('body').findAll(text=True)
    texts = map(lambda e: e.string.strip().lower(), texts)
    texts = filter(lambda e: e, texts)
    words = reduce(lambda a, b: a | b,
                   [set(regex_words.findall(t)) for t in texts])
    return words
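# Illustrative sketch (not executed): parse_words returns the distinct
# lower-cased words (two or more letters/hyphens) found in the body text.
# The snippet is an example only.
#
#   html = BeautifulSoup('<html><body><p>Hello Web, hello-world!</p></body></html>')
#   parse_words(html)  # -> set(['hello', 'web', 'hello-world'])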
def normalize_url(new, scheme, host, path_info):
    # turn an href found on a page into an absolute http(s) URL,
    # or return None if it should not be crawled
    new = new.encode('utf8')
    extension = new.split('?', 1)[0].split('#', 1)[0].rsplit('.')[-1].lower()
    if (new.startswith('mailto:') or new.startswith('#') or
            (not '/' in extension and not extension in EXTENSIONS) or
            ('://' in new and not new.split('://')[0] in ('http', 'https'))):
        return None
    elif '://' in new:
        return new
    elif new.startswith('//'):
        return scheme + ':' + new
    elif new.startswith('/'):
        return '%s://%s%s' % (scheme, host, new)
    relative = path_info.rsplit('/', 1)[0]
    return '%s://%s%s/%s' % (scheme, host, relative, new)
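# Illustrative sketch (not executed): how hrefs found on a page at
# http://example.com/docs/index.html would be resolved (example URLs only).
#
#   normalize_url('page.html', 'http', 'example.com', '/docs/index.html')
#       # -> 'http://example.com/docs/page.html'
#   normalize_url('/about.html', 'http', 'example.com', '/docs/index.html')
#       # -> 'http://example.com/about.html'
#   normalize_url('//example.com/a.html', 'http', 'example.com', '/docs/index.html')
#       # -> 'http://example.com/a.html'
#   normalize_url('mailto:me@example.com', 'http', 'example.com', '/docs/index.html')
#       # -> None (mailto links, fragments, foreign schemes and non-page extensions are skipped)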
def find_links(html, url=''):
    # return the set of same-site page URLs linked from this page
    match = regex_url.match(url)
    (scheme, host, path_info) = match.groups()
    start = '/'.join(url.split('/')[:3])
    links = html.findAll('a')
    links = map(lambda link: link.get('href'), links)
    links = filter(lambda e: e, links)
    links = map(lambda link: normalize_url(link, scheme, host, path_info), links)
    links = filter(lambda e: e and e.startswith(start), links)
    urls = set(links)
    return urls
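# Illustrative sketch (not executed): only links that resolve to the same site
# as the page being parsed are kept; the markup and URLs are examples only.
#
#   html = BeautifulSoup('<a href="page.html"></a><a href="http://other.org/x.html"></a>')
#   find_links(html, 'http://example.com/docs/index.html')
#       # -> set(['http://example.com/docs/page.html'])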
def crawl(url='', max_links=100, filename='maps.pickle'):
    # crawl up to max_links pages of the same site starting from `url`,
    # build a word -> set of URLs index, and pickle it to `filename`
    maps = collections.defaultdict(set)
    discovered = set([url])
    queue = [url]
    n = 0
    while queue and n < max_links:
        url = queue.pop()
        print url
        data = cached_download(url)
        html = BeautifulSoup(data)
        urls = find_links(html, url)
        words = parse_words(html)
        for word in words:
            maps[word].add(url)
        for new in urls:
            if not new in discovered:
                discovered.add(new)
                queue.append(new)
        print len(maps)
        n += 1
    cPickle.dump(dict(maps), open(filename, 'wb'))
def search(words, filename='maps.pickle'):
    # return the URLs indexed under every query word (words shorter than 3 letters are ignored)
    words = filter(lambda w: len(w) > 2, words.lower().split())
    maps = cPickle.load(open(filename, 'rb'))
    return reduce(lambda a, b: a & b, [maps[word] for word in words])
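# Illustrative sketch (not executed): search() intersects the URL sets of every
# query word longer than two letters, so a page matches only if it contains all
# of them. With a hypothetical index
#
#   {'python':      set(['http://example.com/a.html', 'http://example.com/b.html']),
#    'programming': set(['http://example.com/b.html'])}
#
# search('Python programming') would return set(['http://example.com/b.html']).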
if __name__ == '__main__':
    # the start URL was left blank in the source; point it at the site you want to index
    crawl('', max_links=300)
    print search('Python programming')
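# Usage sketch (assumptions: Python 2 with BeautifulSoup 3 installed, and a
# reachable start URL of your choice): run crawl() once to build maps.pickle,
# then query it as often as you like.
#
#   crawl('http://example.com/', max_links=50)
#   print search('python crawler')  # -> set of indexed URLs containing both words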