Created
January 21, 2017 21:51
-
-
Save Per48edjes/ea2ab19755a13eabd9a373a803d99dec to your computer and use it in GitHub Desktop.
Returns the pages matched by a keyword lookup, ordered from highest to lowest rank.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Triple Gold Star | |
# Only A Little Lucky | |
# The Feeling Lucky question (from the regular homework) assumed it was enough | |
# to find the best-ranked page for a given query. For most queries, though, we | |
# don't just want the best page (according to the page ranking algorithm), we | |
# want a list of many pages that match the query, ordered from the most likely | |
# to be useful to the least likely. | |
# Your goal for this question is to define a procedure, ordered_search(index, | |
# ranks, keyword), that takes the same inputs as lucky_search from Question 5, | |
# but returns an ordered list of all the URLs that match the query. | |
# To order the pages, use the quicksort algorithm, invented by Sir Tony Hoare in | |
# 1959. Quicksort provides a way to sort any list of data, using an expected | |
# number of comparisons that scales as n log n where n is the number of elements | |
# in the list. | |
# The idea of quicksort is quite simple: | |
# If the list has zero or one elements, it is already sorted. | |
# Otherwise, pick a pivot element, and split the list into two partitions: one | |
# contains all the elements equal to or lower than the value of the pivot | |
# element, and the other contains all the elements that are greater than the | |
# pivot element. Recursively sort each of the sub-lists, and then return the | |
# result of concatenating the sorted left sub-list, the pivot element, and the | |
# sorted right sub-list. | |
# For simplicity, use the first element in the list as your pivot element (this | |
# is not usually a good choice, since it means if the input list is already | |
# nearly sorted, the actual work will be much worse than expected). | |
def ordered_search(index, ranks, keyword):
    """Return every URL matching keyword, ordered from highest to lowest rank.

    Same inputs as lucky_search: the keyword index, the dict of page ranks,
    and the query keyword.  Returns None when the keyword is not indexed.
    Sorting uses the quicksort procedure defined below, per the assignment.
    """
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    sorted_ranks = quicksort([ranks[page] for page in keyword_pages])
    output = []
    # Walk ranks from highest to lowest.  The membership guard prevents a
    # page from being appended once per duplicate rank value when several
    # pages happen to share the same rank (bug in the original version).
    for rank in sorted_ranks[::-1]:
        for page in keyword_pages:
            if ranks[page] == rank and page not in output:
                output.append(page)
    return output
# Quicksort function | |
def quicksort(values):
    """Sort values ascending with Hoare's quicksort, first element as pivot.

    Elements equal to the pivot go into the left partition, matching the
    assignment's specification.
    """
    if len(values) <= 1:
        return values
    pivot = values[0]
    rest = values[1:]
    smaller = [v for v in rest if v <= pivot]
    larger = [v for v in rest if v > pivot]
    return quicksort(smaller) + [pivot] + quicksort(larger)
def lucky_search(index, ranks, keyword):
    """Return the highest-ranked URL matching keyword, or None if no match.

    Fixes from the original: replaces the Python-2-only `<>` operator,
    removes the pointless `[keyword_page][0]` indirection, and avoids a
    NameError when the match list is empty.  Ties keep the first page
    encountered (strict > comparison), as before.
    """
    keyword_pages = lookup(index, keyword)
    if keyword_pages is None:
        return None
    best_page = None
    best_rank = 0
    for page in keyword_pages:
        # Strict comparison keeps the earliest page on equal ranks.
        if ranks[page] > best_rank:
            best_rank = ranks[page]
            best_page = page
    return best_page
# Static page cache standing in for live HTTP fetches: maps each URL of the
# simulated "urank" sample site to its raw HTML source.  get_page() reads
# from this dict instead of performing network I/O.
cache = {
'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
def get_page(url):
    """Return the cached HTML for url, or the empty string if not cached."""
    return cache.get(url, "")
def get_next_target(page):
    """Find the first '<a href=' link in page.

    Returns (url, position of the closing quote), or (None, 0) when the
    page contains no further links.
    """
    anchor = page.find('<a href=')
    if anchor == -1:
        return None, 0
    quote_open = page.find('"', anchor)
    quote_close = page.find('"', quote_open + 1)
    return page[quote_open + 1:quote_close], quote_close
def get_all_links(page):
    """Collect every link URL found in page, in document order."""
    found = []
    url, end = get_next_target(page)
    while url:
        found.append(url)
        # Continue scanning from just past the link we consumed.
        page = page[end:]
        url, end = get_next_target(page)
    return found
def union(a, b):
    """Append to a, in place, each element of b not already present in a.

    Preserves the order of b and returns None (mutates a).
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of content under url."""
    for word in content.split():
        add_to_index(index, word, url)
def add_to_index(index, keyword, url):
    """Record that url contains keyword, appending to the keyword's URL list."""
    index.setdefault(keyword, []).append(url)
def lookup(index, keyword):
    """Return the list of URLs indexed under keyword, or None if absent."""
    return index.get(keyword)
def crawl_web(seed):
    """Crawl the cached site starting at seed.

    Returns (index, graph) where index maps keyword -> list of URLs and
    graph maps each URL -> list of URLs it links out to.
    """
    to_visit = [seed]
    visited = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while to_visit:
        url = to_visit.pop()
        if url in visited:
            continue
        html = get_page(url)
        add_page_to_index(index, url, html)
        outlinks = get_all_links(html)
        graph[url] = outlinks
        union(to_visit, outlinks)
        visited.append(url)
    return index, graph
def compute_ranks(graph):
    """Run 10 iterations of damped PageRank over the link graph.

    graph maps URL -> list of outlinked URLs.  Returns a dict mapping each
    URL to its rank score.
    """
    damping = 0.8
    iterations = 10
    total = len(graph)
    # Every page starts with an equal share of rank.
    ranks = dict((url, 1.0 / total) for url in graph)
    for _ in range(iterations):
        updated = {}
        for url in graph:
            score = (1 - damping) / total
            # Gather rank contributions from every page linking to url.
            for source in graph:
                if url in graph[source]:
                    score = score + damping * (ranks[source] / len(graph[source]))
            updated[url] = score
        ranks = updated
    return ranks
# Here are some example showing what ordered_search should do: | |
# Observe that the result list is sorted so the highest-ranking site is at the | |
# beginning of the list. | |
# Note: the intent of this question is for students to write their own sorting | |
# code, not to use the built-in sort procedure. | |
# Demo: crawl the six-page sample site from the cache, compute ranks, and
# run ordered_search queries.  Python 2 print statements; the expected
# output for each query is shown in the #>>> comments below.
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
ranks = compute_ranks(graph)
print ordered_search(index, ranks, 'Hummus')
#>>> ['http://udacity.com/cs101x/urank/kathleen.html',
#    'http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']
print ordered_search(index, ranks, 'the')
#>>> ['http://udacity.com/cs101x/urank/nickel.html',
#    'http://udacity.com/cs101x/urank/arsenic.html',
#    'http://udacity.com/cs101x/urank/hummus.html',
#    'http://udacity.com/cs101x/urank/index.html']
# A keyword with no matches returns None.
print ordered_search(index, ranks, 'babaganoush')
#>>> None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment