Created
January 21, 2017 21:51
-
-
Save Per48edjes/ea2ab19755a13eabd9a373a803d99dec to your computer and use it in GitHub Desktop.
Returns the pages matched by a keyword lookup, ordered from highest to lowest rank.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Triple Gold Star | |
# Only A Little Lucky | |
# The Feeling Lucky question (from the regular homework) assumed it was enough | |
# to find the best-ranked page for a given query. For most queries, though, we | |
# don't just want the best page (according to the page ranking algorithm), we | |
# want a list of many pages that match the query, ordered from the most likely | |
# to be useful to the least likely. | |
# Your goal for this question is to define a procedure, ordered_search(index, | |
# ranks, keyword), that takes the same inputs as lucky_search from Question 5, | |
# but returns an ordered list of all the URLs that match the query. | |
# To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare in | |
# 1959. Quicksort provides a way to sort any list of data, using an expected | |
# number of comparisons that scales as n log n where n is the number of elements | |
# in the list. | |
# The idea of quicksort is quite simple: | |
# If the list has zero or one elements, it is already sorted. | |
# Otherwise, pick a pivot element, and split the list into two partitions: one | |
# contains all the elements equal to or lower than the value of the pivot | |
# element, and the other contains all the elements that are greater than the | |
# pivot element. Recursively sort each of the sub-lists, and then return the | |
# result of concatenating the sorted left sub-list, the pivot element, and the | |
# sorted right sub-list. | |
# For simplicity, use the first element in the list as your pivot element (this | |
# is not usually a good choice, since it means if the input list is already | |
# nearly sorted, the actual work will be much worse than expected). | |
def ordered_search(index, ranks, keyword):
    """Return every URL matching keyword, ordered from highest to lowest rank.

    Same inputs as lucky_search: the keyword index, the dict of page ranks,
    and the query keyword.  Returns None when the keyword is not indexed.
    Sorting uses the quicksort procedure defined below, per the assignment.
    """
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    sorted_ranks = quicksort([ranks[page] for page in keyword_pages])
    output = []
    # Walk ranks from highest to lowest.  The membership guard prevents a
    # page from being appended once per duplicate rank value when several
    # pages happen to share the same rank (bug in the original version).
    for rank in sorted_ranks[::-1]:
        for page in keyword_pages:
            if ranks[page] == rank and page not in output:
                output.append(page)
    return output
# Quicksort function | |
def quicksort(values):
    """Sort values ascending with Hoare's quicksort, first element as pivot.

    Elements equal to the pivot go into the left partition, matching the
    assignment's specification.
    """
    if len(values) <= 1:
        return values
    pivot = values[0]
    rest = values[1:]
    smaller = [v for v in rest if v <= pivot]
    larger = [v for v in rest if v > pivot]
    return quicksort(smaller) + [pivot] + quicksort(larger)
def lucky_search(index, ranks, keyword):
    """Return the highest-ranked URL matching keyword, or None if no match.

    Fixes from the original: replaces the Python-2-only `<>` operator,
    removes the pointless `[keyword_page][0]` indirection, and avoids a
    NameError when the match list is empty.  Ties keep the first page
    encountered (strict > comparison), as before.
    """
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    best_page = None
    best_rank = 0
    for page in keyword_pages:
        # Strict comparison keeps the earliest page on equal ranks.
        if ranks[page] > best_rank:
            best_rank = ranks[page]
            best_page = page
    return best_page
# Static page cache standing in for live HTTP fetches: maps each URL of the
# simulated "urank" sample site to its raw HTML source.  get_page() reads
# from this dict instead of performing network I/O.
cache = {
'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
def get_page(url):
    """Return the cached HTML for url, or the empty string if not cached."""
    return cache.get(url, "")
def get_next_target(page):
    """Find the first '<a href=' link in page.

    Returns (url, position of the closing quote), or (None, 0) when the
    page contains no further links.
    """
    anchor = page.find('<a href=')
    if anchor == -1:
        return None, 0
    quote_open = page.find('"', anchor)
    quote_close = page.find('"', quote_open + 1)
    return page[quote_open + 1:quote_close], quote_close
def get_all_links(page):
    """Collect every link URL found in page, in document order."""
    found = []
    url, end = get_next_target(page)
    while url:
        found.append(url)
        # Continue scanning from just past the link we consumed.
        page = page[end:]
        url, end = get_next_target(page)
    return found
def union(a, b):
    """Append to a, in place, each element of b not already present in a.

    Preserves the order of b and returns None (mutates a).
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of content under url."""
    for word in content.split():
        add_to_index(index, word, url)
def add_to_index(index, keyword, url):
    """Record that url contains keyword, appending to the keyword's URL list."""
    index.setdefault(keyword, []).append(url)
def lookup(index, keyword):
    """Return the list of URLs indexed under keyword, or None if absent."""
    return index.get(keyword)
def crawl_web(seed):
    """Crawl the cached site starting at seed.

    Returns (index, graph) where index maps keyword -> list of URLs and
    graph maps each URL -> list of URLs it links out to.
    """
    to_visit = [seed]
    visited = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        html = get_page(url)
        add_page_to_index(index, url, html)
        outlinks = get_all_links(html)
        graph[url] = outlinks
        union(to_visit, outlinks)
        visited.append(url)
    return index, graph
def compute_ranks(graph):
    """Run 10 iterations of damped PageRank over the link graph.

    graph maps URL -> list of outlinked URLs.  Returns a dict mapping each
    URL to its rank score.
    """
    damping = 0.8
    iterations = 10
    total = len(graph)
    # Every page starts with an equal share of rank.
    ranks = dict((url, 1.0 / total) for url in graph)
    for _ in range(iterations):
        updated = {}
        for url in graph:
            score = (1 - damping) / total
            # Gather rank contributions from every page linking to url.
            for source in graph:
                if url in graph[source]:
                    score = score + damping * (ranks[source] / len(graph[source]))
            updated[url] = score
        ranks = updated
    return ranks
# Here are some example showing what ordered_search should do: | |
# Observe that the result list is sorted so the highest-ranking site is at the | |
# beginning of the list. | |
# Note: the intent of this question is for students to write their own sorting | |
# code, not to use the built-in sort procedure. | |
# Demo: crawl the six-page sample site from the cache, compute ranks, and
# run ordered_search queries.  Python 2 print statements; the expected
# output for each query is shown in the #>>> comments below.
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print ordered_search(index, ranks, 'Hummus')
#>>> ['http://udacity.com/cs101x/urank/kathleen.html',
#    'http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']
print ordered_search(index, ranks, 'the')
#>>> ['http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']
# A keyword with no matches returns None.
print ordered_search(index, ranks, 'babaganoush')
#>>> None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment