# THREE GOLD STARS #
# Yes, we really mean it! This is really tough (but doable) unless
# you have some programming experience from before this course.
# Modify the crawl_web procedure to take a second parameter,
# max_depth, that limits the depth of the search. We can
# define the depth of a page as the number of links that must
# be followed to reach that page starting from the seed page,
# that is, the length of the shortest path from the seed to
# the page. No pages whose depth exceeds max_depth should be
# included in the crawl.
#
# For example, if max_depth is 0, the only page that should
# be crawled is the seed page. If max_depth is 1, the pages
# that should be crawled are the seed page and every page that
# it links to directly. If max_depth is 2, the crawl should
# also include all pages that are linked to by these pages.
#
# Note that the pages in the crawl may be in any order.
#
# The following definition of get_page provides an interface
# to the test website found at http://www.udacity.com/cs101x/index.html.
# The order of the pages in the output does not affect grading.
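#
# In a real crawler, get_page would fetch the page over HTTP. A minimal
# sketch of that (an assumption for illustration, not part of the original
# exercise) using only the standard library:
#
#     from urllib.request import urlopen
#
#     def get_page(url):
#         try:
#             return urlopen(url).read().decode('utf-8')
#         except Exception:
#             return ""
#
# The stub below instead returns canned HTML for a fixed set of test URLs,
# so the exercise is deterministic and gradable offline.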
def get_page(url):
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return ('<html> <body> This is a test page for learning to crawl! '
                    '<p> It is a good idea to '
                    '<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
                    'crawl</a> before you try to '
                    '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
                    'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
                    '</p> </body> </html> ')
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return ('<html> <body> I have not learned to crawl yet, but I '
                    'am quite good at '
                    '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return ('<html> <body> I cant get enough '
                    '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return ('<html> <body> The magic words are Squeamish Ossifrage! '
                    '</body> </html>')
        elif url == "http://top.contributors/velak.html":
            return ('<a href="http://top.contributors/jesyspa.html">'
                    '<a href="http://top.contributors/forbiddenvoid.html">')
        elif url == "http://top.contributors/jesyspa.html":
            return ('<a href="http://top.contributors/elssar.html">'
                    '<a href="http://top.contributors/kilaws.html">')
        elif url == "http://top.contributors/forbiddenvoid.html":
            return ('<a href="http://top.contributors/charlzz.html">'
                    '<a href="http://top.contributors/johang.html">'
                    '<a href="http://top.contributors/graemeblake.html">')
        elif url == "http://top.contributors/kilaws.html":
            return ('<a href="http://top.contributors/tomvandenbosch.html">'
                    '<a href="http://top.contributors/mathprof.html">')
        elif url == "http://top.contributors/graemeblake.html":
            return ('<a href="http://top.contributors/dreyescat.html">'
                    '<a href="http://top.contributors/angel.html">')
        elif url == "A1":
            return '<a href="B1"> <a href="C1"> '
        elif url == "B1":
            return '<a href="E1">'
        elif url == "C1":
            return '<a href="D1">'
        elif url == "D1":
            return '<a href="E1"> '
        elif url == "E1":
            return '<a href="F1"> '
    except Exception:
        return ""
    return ""

# Find the next link in page; return its URL and the index of the
# closing quote, or (None, 0) if there are no more links.
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
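
# For example (hypothetical page text, shown only to illustrate the
# return values):
#
#     get_next_target('<a href="http://example.com/a.html"> more text')
#     # -> ('http://example.com/a.html', 34)
#
# where 34 is the index of the closing quote, so page[endpos:] resumes
# the scan just past the link that was found.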

# Append each element of q to p if it is not already there
# (an in-place set union on lists).
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
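
# Note that union mutates its first argument in place rather than
# returning a new list. For example (illustrative values):
#
#     p = ['a', 'b']
#     union(p, ['b', 'c'])
#     # p is now ['a', 'b', 'c']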

# Collect every link URL that appears in page.
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
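
# A quick check of get_all_links against one of the canned test pages:
#
#     get_all_links(get_page("http://top.contributors/velak.html"))
#     # -> ['http://top.contributors/jesyspa.html',
#     #     'http://top.contributors/forbiddenvoid.html']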

# Crawl breadth-first from seed, one depth level at a time. tocrawl holds
# the current level's pages; links found there are gathered in next_depth,
# which becomes the new tocrawl once the current level is exhausted.
# Pages deeper than max_depth are never crawled.
def crawl_web(seed, max_depth):
    tocrawl = [seed]
    crawled = []
    next_depth = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled

print(crawl_web("http://www.udacity.com/cs101x/index.html",0))
#>>> ['http://www.udacity.com/cs101x/index.html']
print(crawl_web("http://www.udacity.com/cs101x/index.html",1))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html']
print(crawl_web("http://www.udacity.com/cs101x/index.html",50))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html',
#>>> 'http://www.udacity.com/cs101x/kicking.html']
print(crawl_web("http://top.contributors/forbiddenvoid.html",2))
#>>> ['http://top.contributors/forbiddenvoid.html',
#>>> 'http://top.contributors/graemeblake.html',
#>>> 'http://top.contributors/angel.html',
#>>> 'http://top.contributors/dreyescat.html',
#>>> 'http://top.contributors/johang.html',
#>>> 'http://top.contributors/charlzz.html']
print(crawl_web("A1",3))
#>>> ['A1', 'C1', 'B1', 'E1', 'D1', 'F1']
# (May be in any order)
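
# An equivalent way to bound the depth (a sketch, not part of the original
# exercise) is to queue (page, depth) pairs instead of tracking whole
# frontier lists; popping from the left keeps the search breadth-first,
# so each page is first seen at its shortest-path depth:
#
#     from collections import deque
#
#     def crawl_web_alt(seed, max_depth):
#         tocrawl = deque([(seed, 0)])
#         crawled = []
#         while tocrawl:
#             page, depth = tocrawl.popleft()
#             if page not in crawled:
#                 crawled.append(page)
#                 if depth < max_depth:
#                     for link in get_all_links(get_page(page)):
#                         tocrawl.append((link, depth + 1))
#         return crawled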