# Simple implementation of a web crawler and search engine
# pull the content of a url (uses Python 2's urllib; returns "" on any failure)
def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except:
        return ""
# find the next link target within a page, returning (url, end position)
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
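# Illustrative example (sketch) of parsing the first link target out of a
# fragment of HTML; the url is only a placeholder:
#
# >>> get_next_target('<a href="http://example.com/page.html">link</a>')
# ('http://example.com/page.html', 37)   # 37 is the index of the closing quote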
# add every element of q that is not already in p (in-place union)
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
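# union mutates its first argument in place rather than returning a new list:
#
# >>> p = ['a', 'b']
# >>> union(p, ['b', 'c'])
# >>> p
# ['a', 'b', 'c']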
# find all the links in a page
def get_all_links(page):
    links = []
    while True:
        # get_next_target returns two values: the first is assigned to url,
        # the second to endpos
        url, endpos = get_next_target(page)
        if url:
            # we found another url - add it to our list of links
            links.append(url)
            # start the next search from the position the last url was found
            page = page[endpos:]
        else:
            break
    return links
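# Illustrative example (sketch) with two links in one page; the urls are
# placeholders:
#
# >>> get_all_links('<a href="http://a.com/x.html">x</a> <a href="http://b.com/y.html">y</a>')
# ['http://a.com/x.html', 'http://b.com/y.html']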
# We start with a seed page and keep track of the depth and what has / hasn't been crawled.
def crawl_web(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    index = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            # chop up the content of the page into keywords and add them to the
            # index in the format [[<keyword>, [<url1>, <url2>, ...]], ...]
            add_page_to_index(index, page, content)
            # add this page's links to the next depth
            union(next_depth, get_all_links(content))
            crawled.append(page)
        # once we have finished this depth, move on to the next one
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled
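# Illustrative call (sketch): this depth-limited crawler hits the live web, so
# the exact result depends on the pages reachable from the seed; example.com is
# only a placeholder seed.
#
# >>> crawled = crawl_web('http://www.example.com', 1)
# >>> crawled
# ['http://www.example.com', ...pages linked from the seed, crawled at depth 1...]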
index = []
# check if the keyword exists - if yes, append the url to its entry;
# if not, add a new [keyword, [url]] entry to the index
def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            if url in entry[1]:
                # don't add the same url twice
                return
            else:
                entry[1].append(url)
                return
    # not found, add new keyword to index
    index.append([keyword, [url]])
# return the urls that match the keyword
def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
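# Illustrative use of the list-based index (sketch, placeholder urls):
#
# >>> idx = []
# >>> add_to_index(idx, 'hummus', 'http://a.com')
# >>> add_to_index(idx, 'hummus', 'http://b.com')
# >>> idx
# [['hummus', ['http://a.com', 'http://b.com']]]
# >>> lookup(idx, 'hummus')
# ['http://a.com', 'http://b.com']
# >>> lookup(idx, 'tahini')
# []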
# take the string of words associated with a url, chop them up and add them to the index
def add_page_to_index(index, url, content):
    # split up the words
    words = content.split()
    for keyword in words:
        add_to_index(index, keyword, url)
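# Illustrative example (sketch): every whitespace-separated word of the content
# becomes a keyword pointing back at the url, with duplicates ignored:
#
# >>> idx = []
# >>> add_page_to_index(idx, 'http://a.com', 'hummus recipe hummus')
# >>> idx
# [['hummus', ['http://a.com']], ['recipe', ['http://a.com']]]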
# Modify the crawl_web procedure so that instead of just returning the
# index, it returns an index and a graph. The graph should be a
# dictionary where the key:value entries are:
# url: [list of pages the url links to]
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            # store the outlinks as a flat list (not wrapped in another list)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph
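# Schematic shape of the two return values (the driver at the bottom of the
# file prints one real graph entry):
#
# >>> index, graph = crawl_web(some_seed_url)
# index:  {'keyword': ['url1', 'url2', ...], ...}
# graph:  {'url': ['outlink1', 'outlink2', ...], ...}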
cache = {
'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
# Offline version of get_page: look the url up in the cache above instead of
# fetching it over the network.
def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
# The same helper procedures as above, repeated so the cached version is self-contained.
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

# dictionary-based index: keyword -> [list of urls]
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]

def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None
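# Illustrative use of the dictionary-based index (sketch, placeholder urls):
#
# >>> idx = {}
# >>> add_to_index(idx, 'hummus', 'http://a.com')
# >>> add_to_index(idx, 'hummus', 'http://b.com')
# >>> lookup(idx, 'hummus')
# ['http://a.com', 'http://b.com']
# >>> lookup(idx, 'tahini')   # a missing keyword returns None in this version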
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
if 'http://udacity.com/cs101x/urank/index.html' in graph:
    print graph['http://udacity.com/cs101x/urank/index.html']
# >>> ['http://udacity.com/cs101x/urank/hummus.html',
#      'http://udacity.com/cs101x/urank/arsenic.html',
#      'http://udacity.com/cs101x/urank/kathleen.html',
#      'http://udacity.com/cs101x/urank/nickel.html',
#      'http://udacity.com/cs101x/urank/zinc.html']
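# A keyword lookup against the index built above (illustrative; 'Hummus'
# appears as a standalone word on several of the cached pages, so a list of
# those urls should be printed).
if 'Hummus' in index:
    print lookup(index, 'Hummus')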