Simple implementation of a web crawler and search engine
# pull the content of a url
def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except Exception:
        return ""
# find the next link target in a page, and the position to resume the search from
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
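# Quick illustration (not part of the original gist): on a small snippet,
# get_next_target returns the first href value and the index of its closing quote.
example_snippet = '<a href="http://example.com">x</a>'
example_target = get_next_target(example_snippet)
# example_target is now ('http://example.com', 27)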
# add every element of q that is not already in p to p (an in-place union)
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
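# Quick illustration (not part of the original gist): union modifies its first
# argument in place and returns nothing.
example_p = ['a.html', 'b.html']
union(example_p, ['b.html', 'c.html'])
# example_p is now ['a.html', 'b.html', 'c.html']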
# find all the links in a page
def get_all_links(page):
    links = []
    while True:
        # get_next_target returns two values: the first is assigned to url,
        # the second to endpos
        url, endpos = get_next_target(page)
        if url:
            # we found another url - add it to our list of links
            links.append(url)
            # start the next search from the position the last url was found
            page = page[endpos:]
        else:
            break
    return links
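# Quick illustration (not part of the original gist): both hrefs in this snippet
# are extracted, in document order.
example_page = '<a href="http://a.com">a</a> and <a href="http://b.com">b</a>'
example_links = get_all_links(example_page)
# example_links is now ['http://a.com', 'http://b.com']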
# We start with a seed page and keep track of the depth and what has / hasn't been crawled.
def crawl_web(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    index = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            # chop up the content of the page into keywords and add them to the
            # index in the format [[<keyword>, [<url1>, <url2>, ...]], ...]
            add_page_to_index(index, page, content)
            # add this page's links to the next depth
            union(next_depth, get_all_links(content))
            crawled.append(page)
        # once we have finished this depth, move on to the next one
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled
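# Usage sketch (not run here, since this version of get_page fetches pages over
# the network):
#
#   crawled = crawl_web('http://example.com/index.html', 2)
#   # crawled is the list of urls visited within a link depth of 2 of the seed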
index = []
# check if the keyword already exists in the index: if yes, append the url to its
# entry; if not, add a new [keyword, [url]] entry to the index
def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            if url in entry[1]:
                # don't add the same url twice
                return
            else:
                entry[1].append(url)
            return
    # keyword not found, add a new entry to the index
    index.append([keyword, [url]])
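# Quick illustration (not part of the original gist) of the list-based index
# structure [[<keyword>, [<url1>, <url2>, ...]], ...]:
example_index = []
add_to_index(example_index, 'hummus', 'http://a.com')
add_to_index(example_index, 'hummus', 'http://b.com')
add_to_index(example_index, 'hummus', 'http://a.com')  # duplicate url, ignored
# example_index is now [['hummus', ['http://a.com', 'http://b.com']]]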
# return the urls that match the keyword
def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
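# Quick illustration (not part of the original gist): lookup returns the url list
# for a known keyword and an empty list for anything else.
example_lookup = lookup([['hummus', ['http://a.com']]], 'hummus')
# example_lookup is now ['http://a.com']
example_miss = lookup([['hummus', ['http://a.com']]], 'falafel')
# example_miss is now []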
# take a string of words associated with a url, chop it up, and add the words to the index
def add_page_to_index(index, url, content):
    # split the content into words
    words = content.split()
    for keyword in words:
        add_to_index(index, keyword, url)
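# Quick illustration (not part of the original gist): every whitespace-separated
# token of the content becomes a keyword, including any attached HTML tags.
example_doc_index = []
add_page_to_index(example_doc_index, 'http://a.com', 'best hummus recipe')
# example_doc_index is now [['best', ['http://a.com']],
#                           ['hummus', ['http://a.com']],
#                           ['recipe', ['http://a.com']]]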
# Modify the crawl_web procedure so that instead of just returning the
# index, it returns an index and a graph. The graph should be a
# dictionary where the key:value entries are:
# url: [list of pages url links to]
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph
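# Illustration of the intended graph shape (urls invented for the example):
#
#   {'http://a.com': ['http://b.com', 'http://c.com'],
#    'http://b.com': []}
#
# The keyword index is built exactly as before, but as a dictionary once
# add_to_index is redefined below.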
cache = {
    'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]
def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None
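# Quick illustration (not part of the original gist): with the dictionary-based
# index, repeated keywords extend the url list for that key (unlike the list
# version above, duplicate urls are not filtered out), and lookups on unknown
# keywords return None rather than an empty list.
example_dict_index = {}
add_to_index(example_dict_index, 'hummus', 'http://a.com')
add_to_index(example_dict_index, 'hummus', 'http://b.com')
# example_dict_index is now {'hummus': ['http://a.com', 'http://b.com']}
# lookup(example_dict_index, 'hummus') -> ['http://a.com', 'http://b.com']
# lookup(example_dict_index, 'pizza')  -> None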
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
if 'http://udacity.com/cs101x/urank/index.html' in graph:
    print(graph['http://udacity.com/cs101x/urank/index.html'])
#>>> ['http://udacity.com/cs101x/urank/hummus.html',
#     'http://udacity.com/cs101x/urank/arsenic.html',
#     'http://udacity.com/cs101x/urank/kathleen.html',
#     'http://udacity.com/cs101x/urank/nickel.html',
#     'http://udacity.com/cs101x/urank/zinc.html']
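# A keyword lookup against the same crawl (illustrative; the exact url order
# depends on the order pages were popped from tocrawl):
#
#   print(lookup(index, 'Hummus'))
#   # a list of the cached urls whose text contains the standalone token 'Hummus'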