# Returning the pages matched by a keyword lookup, ordered by rank
# Triple Gold Star
# Only A Little Lucky
# The Feeling Lucky question (from the regular homework) assumed it was enough
# to find the best-ranked page for a given query. For most queries, though, we
# don't just want the best page (according to the page ranking algorithm), we
# want a list of many pages that match the query, ordered from the most likely
# to be useful to the least likely.
# Your goal for this question is to define a procedure, ordered_search(index,
# ranks, keyword), that takes the same inputs as lucky_search from Question 5,
# but returns an ordered list of all the URLs that match the query.
# To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare in
# 1959. Quicksort provides a way to sort any list of data, using an expected
# number of comparisons that scales as n log n where n is the number of elements
# in the list.
# The idea of quicksort is quite simple:
# If the list has zero or one elements, it is already sorted.
# Otherwise, pick a pivot element, and split the list into two partitions: one
# contains all the elements equal to or lower than the value of the pivot
# element, and the other contains all the elements that are greater than the
# pivot element. Recursively sort each of the sub-lists, and then return the
# result of concatenating the sorted left sub-list, the pivot element, and the
# sorted right sub-list.
# For simplicity, use the first element in the list as your pivot element (this
# is not usually a good choice, since it means if the input list is already
# nearly sorted, the actual work will be much worse than expected).
def ordered_search(index, ranks, keyword):
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    output = []
    # Sort the ranks of the matching pages, then walk them from highest to
    # lowest, emitting each page once. The "not in output" guard avoids
    # duplicate entries when two pages happen to share the same rank.
    keyword_page_ranks = [ranks[page] for page in keyword_pages]
    sorted_page_ranks = quicksort(keyword_page_ranks)
    for rank in sorted_page_ranks[::-1]:
        for page in keyword_pages:
            if ranks[page] == rank and page not in output:
                output.append(page)
    return output
# Quicksort function
def quicksort(big_list):
    if len(big_list) <= 1:
        return big_list
    pivot = big_list[0]
    less_list, more_list = [], []
    for element in big_list[1:]:
        if element > pivot:
            more_list.append(element)
        else:
            less_list.append(element)
    return quicksort(less_list) + [pivot] + quicksort(more_list)
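# A quick sanity check with made-up values (not part of the assignment):
# pivot 3 splits [1, 4, 1, 5] into less_list [1, 1] and more_list [4, 5],
# and the recursive calls stitch the result back together.
assert quicksort([3, 1, 4, 1, 5]) == [1, 1, 3, 4, 5]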
def lucky_search(index, ranks, keyword):
    # Return the single best-ranked page matching the keyword, or None.
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    best_page = keyword_pages[0]
    for page in keyword_pages:
        if ranks[page] > ranks[best_page]:
            best_page = page
    return best_page
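# For comparison with ordered_search: lucky_search returns only the single
# best-ranked match. Given the crawl below, lucky_search(index, ranks,
# 'Hummus') would return 'http://udacity.com/cs101x/urank/kathleen.html',
# the first entry of the ordered_search result shown at the bottom.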
cache = {
'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
def get_page(url):
    if url in cache:
        return cache[url]
    return ""
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
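# A small sanity check for the link extractor, using a hypothetical HTML
# snippet that is not part of the crawled corpus:
assert get_all_links(
    '<a href="http://a.com">a</a> <a href="http://b.com">b</a>'
) == ['http://a.com', 'http://b.com']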
def union(a, b):
    # Add each element of b to a in place, skipping duplicates.
    for e in b:
        if e not in a:
            a.append(e)
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]
def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None
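# The index maps each whitespace-separated word to the list of URLs whose
# pages contain it (built by add_page_to_index above); lookup returns that
# list, or None when the keyword was never indexed.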
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph
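# After the crawl below, graph['http://udacity.com/cs101x/urank/index.html']
# holds the five chef/recipe URLs that page links to, and index maps each
# word in the cached pages to the URLs containing it.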
def compute_ranks(graph):
    d = 0.8  # damping factor
    numloops = 10
    ranks = {}
    npages = len(graph)
    for page in graph:
        ranks[page] = 1.0 / npages
    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / npages
            for node in graph:
                if page in graph[node]:
                    newrank = newrank + d * (ranks[node] / len(graph[node]))
            newranks[page] = newrank
        ranks = newranks
    return ranks
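# Each loop applies the simplified PageRank update
#
#   rank(p) = (1 - d) / N  +  d * sum(rank(q) / |outlinks(q)|)
#
# where the sum runs over every page q that links to p, N is the number of
# crawled pages, and d is the damping factor above.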
# Here are some examples showing what ordered_search should do:
# Observe that the result list is sorted so the highest-ranking site is at the
# beginning of the list.
# Note: the intent of this question is for students to write their own sorting
# code, not to use the built-in sort procedure.
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print(ordered_search(index, ranks, 'Hummus'))
#>>> ['http://udacity.com/cs101x/urank/kathleen.html',
# 'http://udacity.com/cs101x/urank/nickel.html',
# 'http://udacity.com/cs101x/urank/arsenic.html',
# 'http://udacity.com/cs101x/urank/hummus.html',
# 'http://udacity.com/cs101x/urank/index.html']
print(ordered_search(index, ranks, 'the'))
#>>> ['http://udacity.com/cs101x/urank/nickel.html',
# 'http://udacity.com/cs101x/urank/arsenic.html',
# 'http://udacity.com/cs101x/urank/hummus.html',
# 'http://udacity.com/cs101x/urank/index.html']
print(ordered_search(index, ranks, 'babaganoush'))
#>>> None