Simple implementation of a web crawler and search engine
# pull the content of a url
def get_page(url):
    try:
        import urllib
        return urllib.urlopen(url).read()
    except Exception:
        return ""
# find the next link target in a page, and the position to resume the search from
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
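# Quick illustration (not part of the original gist): on a small snippet,
# get_next_target returns the first href value and the index of its closing quote.
example_snippet = '<a href="http://example.com">x</a>'
example_target = get_next_target(example_snippet)
# example_target is now ('http://example.com', 27)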
# add every element of q that is not already in p to p (an in-place union)
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
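# Quick illustration (not part of the original gist): union modifies its first
# argument in place and returns nothing.
example_p = ['a.html', 'b.html']
union(example_p, ['b.html', 'c.html'])
# example_p is now ['a.html', 'b.html', 'c.html']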
# find all the links in a page
def get_all_links(page):
    links = []
    while True:
        # get_next_target returns two values: the first is assigned to url,
        # the second to endpos
        url, endpos = get_next_target(page)
        if url:
            # we found another url - add it to our list of links
            links.append(url)
            # start the next search from the position the last url was found
            page = page[endpos:]
        else:
            break
    return links
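# Quick illustration (not part of the original gist): both hrefs in this snippet
# are extracted, in document order.
example_page = '<a href="http://a.com">a</a> and <a href="http://b.com">b</a>'
example_links = get_all_links(example_page)
# example_links is now ['http://a.com', 'http://b.com']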
# We start with a seed page and keep track of the depth and what has / hasn't been crawled.
def crawl_web(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    index = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            # chop up the content of the page into keywords and add them to the
            # index in the format [[<keyword>, [<url1>, <url2>, ...]], ...]
            add_page_to_index(index, page, content)
            # add this page's links to the next depth
            union(next_depth, get_all_links(content))
            crawled.append(page)
        # once we have finished this depth, move on to the next one
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled
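# Usage sketch (not run here, since this version of get_page fetches pages over
# the network):
#
#   crawled = crawl_web('http://example.com/index.html', 2)
#   # crawled is the list of urls visited within a link depth of 2 of the seed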
index = []
# check if the keyword already exists in the index: if yes, append the url to its
# entry; if not, add a new [keyword, [url]] entry to the index
def add_to_index(index, keyword, url):
    for entry in index:
        if entry[0] == keyword:
            if url in entry[1]:
                # don't add the same url twice
                return
            else:
                entry[1].append(url)
            return
    # keyword not found, add a new entry to the index
    index.append([keyword, [url]])
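# Quick illustration (not part of the original gist) of the list-based index
# structure [[<keyword>, [<url1>, <url2>, ...]], ...]:
example_index = []
add_to_index(example_index, 'hummus', 'http://a.com')
add_to_index(example_index, 'hummus', 'http://b.com')
add_to_index(example_index, 'hummus', 'http://a.com')  # duplicate url, ignored
# example_index is now [['hummus', ['http://a.com', 'http://b.com']]]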
# return the urls that match the keyword
def lookup(index, keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
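# Quick illustration (not part of the original gist): lookup returns the url list
# for a known keyword and an empty list for anything else.
example_lookup = lookup([['hummus', ['http://a.com']]], 'hummus')
# example_lookup is now ['http://a.com']
example_miss = lookup([['hummus', ['http://a.com']]], 'falafel')
# example_miss is now []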
# take a string of words associated with a url, chop it up, and add the words to the index
def add_page_to_index(index, url, content):
    # split the content into words
    words = content.split()
    for keyword in words:
        add_to_index(index, keyword, url)
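# Quick illustration (not part of the original gist): every whitespace-separated
# token of the content becomes a keyword, including any attached HTML tags.
example_doc_index = []
add_page_to_index(example_doc_index, 'http://a.com', 'best hummus recipe')
# example_doc_index is now [['best', ['http://a.com']],
#                           ['hummus', ['http://a.com']],
#                           ['recipe', ['http://a.com']]]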
# Modify the crawl_web procedure so that instead of just returning the
# index, it returns an index and a graph. The graph should be a
# dictionary where the key:value entries are:
# url: [list of pages url links to]
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = [seed]
    crawled = []
    graph = {}  # <url>: [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page not in crawled:
            content = get_page(page)
            add_page_to_index(index, page, content)
            outlinks = get_all_links(content)
            graph[page] = outlinks
            union(tocrawl, outlinks)
            crawled.append(page)
    return index, graph
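# Illustration of the intended graph shape (urls invented for the example):
#
#   {'http://a.com': ['http://b.com', 'http://c.com'],
#    'http://b.com': []}
#
# The keyword index is built exactly as before, but as a dictionary once
# add_to_index is redefined below.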
cache = {
    'http://udacity.com/cs101x/urank/index.html': """<html>
<body>
<h1>Dave's Cooking Algorithms</h1>
<p>
Here are my favorite recipies:
<ul>
<li> <a href="http://udacity.com/cs101x/urank/hummus.html">Hummus Recipe</a>
<li> <a href="http://udacity.com/cs101x/urank/arsenic.html">World's Best Hummus</a>
<li> <a href="http://udacity.com/cs101x/urank/kathleen.html">Kathleen's Hummus Recipe</a>
</ul>
For more expert opinions, check out the
<a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>
and <a href="http://udacity.com/cs101x/urank/zinc.html">Zinc Chef</a>.
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/zinc.html': """<html>
<body>
<h1>The Zinc Chef</h1>
<p>
I learned everything I know from
<a href="http://udacity.com/cs101x/urank/nickel.html">the Nickel Chef</a>.
</p>
<p>
For great hummus, try
<a href="http://udacity.com/cs101x/urank/arsenic.html">this recipe</a>.
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/nickel.html': """<html>
<body>
<h1>The Nickel Chef</h1>
<p>
This is the
<a href="http://udacity.com/cs101x/urank/kathleen.html">
best Hummus recipe!
</a>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/kathleen.html': """<html>
<body>
<h1>
Kathleen's Hummus Recipe
</h1>
<p>
<ol>
<li> Open a can of garbonzo beans.
<li> Crush them in a blender.
<li> Add 3 tablesppons of tahini sauce.
<li> Squeeze in one lemon.
<li> Add salt, pepper, and buttercream frosting to taste.
</ol>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/arsenic.html': """<html>
<body>
<h1>
The Arsenic Chef's World Famous Hummus Recipe
</h1>
<p>
<ol>
<li> Kidnap the <a href="http://udacity.com/cs101x/urank/nickel.html">Nickel Chef</a>.
<li> Force her to make hummus for you.
</ol>
</body>
</html>
""",
    'http://udacity.com/cs101x/urank/hummus.html': """<html>
<body>
<h1>
Hummus Recipe
</h1>
<p>
<ol>
<li> Go to the store and buy a container of hummus.
<li> Open it.
</ol>
</body>
</html>
""",
}
def get_page(url):
    if url in cache:
        return cache[url]
    else:
        return None
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
def union(a, b):
    for e in b:
        if e not in a:
            a.append(e)
def add_page_to_index(index, url, content):
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
def add_to_index(index, keyword, url):
    if keyword in index:
        index[keyword].append(url)
    else:
        index[keyword] = [url]
def lookup(index, keyword):
    if keyword in index:
        return index[keyword]
    else:
        return None
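# Quick illustration (not part of the original gist): with the dictionary-based
# index, repeated keywords extend the url list for that key (unlike the list
# version above, duplicate urls are not filtered out), and lookups on unknown
# keywords return None rather than an empty list.
example_dict_index = {}
add_to_index(example_dict_index, 'hummus', 'http://a.com')
add_to_index(example_dict_index, 'hummus', 'http://b.com')
# example_dict_index is now {'hummus': ['http://a.com', 'http://b.com']}
# lookup(example_dict_index, 'hummus') -> ['http://a.com', 'http://b.com']
# lookup(example_dict_index, 'pizza')  -> None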
index, graph = crawl_web('http://udacity.com/cs101x/urank/index.html')
if 'http://udacity.com/cs101x/urank/index.html' in graph:
    print(graph['http://udacity.com/cs101x/urank/index.html'])
#>>> ['http://udacity.com/cs101x/urank/hummus.html',
#     'http://udacity.com/cs101x/urank/arsenic.html',
#     'http://udacity.com/cs101x/urank/kathleen.html',
#     'http://udacity.com/cs101x/urank/nickel.html',
#     'http://udacity.com/cs101x/urank/zinc.html']
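# A keyword lookup against the same crawl (illustrative; the exact url order
# depends on the order pages were popped from tocrawl):
#
#   print(lookup(index, 'Hummus'))
#   # a list of the cached urls whose text contains the standalone token 'Hummus'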