Web crawler with a depth limit (Udacity CS101 exercise)
#
# This question explores a different way (from the previous question)
# to limit the pages that the crawler can reach.
#
#####################
# THREE GOLD STARS  #
#####################
# Yes, we really mean it! This is really tough (but doable) unless
# you have some prior programming experience.
#
# Modify the crawl_web procedure to take a second parameter,
# max_depth, that limits the depth of the search. We can
# define the depth of a page as the number of links that must
# be followed to reach that page starting from the seed page,
# that is, the length of the shortest path from the seed to
# the page. No pages whose depth exceeds max_depth should be
# included in the crawl.
#
# For example, if max_depth is 0, the only page that should
# be crawled is the seed page. If max_depth is 1, the pages
# that should be crawled are the seed page and every page that
# it links to directly. If max_depth is 2, the crawl should
# also include all pages that are linked to by those pages.
#
# Note that the pages in the crawl may be returned in any order;
# the output order does not affect grading.
#
# The following definition of get_page provides an interface
# to the test website found at http://www.udacity.com/cs101x/index.html
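#
# For the test site served by get_page below, the depths from the
# seed page http://www.udacity.com/cs101x/index.html work out to
# (added here for illustration; not part of the original exercise):
#   depth 0: index.html
#   depth 1: crawling.html, walking.html, flying.html
#   depth 2: kicking.html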
# get_page simulates the web: it returns canned HTML for the known
# test URLs and an empty string for anything else.
def get_page(url):
    if url == "http://www.udacity.com/cs101x/index.html":
        return ('<html> <body> This is a test page for learning to crawl! '
                '<p> It is a good idea to '
                '<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
                'crawl</a> before you try to '
                '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
                'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
                '</p> </body> </html> ')
    elif url == "http://www.udacity.com/cs101x/crawling.html":
        return ('<html> <body> I have not learned to crawl yet, but I '
                'am quite good at '
                '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
                '</body> </html>')
    elif url == "http://www.udacity.com/cs101x/walking.html":
        return ('<html> <body> I cant get enough '
                '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
                '</body> </html>')
    elif url == "http://www.udacity.com/cs101x/flying.html":
        return ('<html> <body> The magic words are Squeamish Ossifrage! '
                '</body> </html>')
    elif url == "http://top.contributors/velak.html":
        return ('<a href="http://top.contributors/jesyspa.html">'
                '<a href="http://top.contributors/forbiddenvoid.html">')
    elif url == "http://top.contributors/jesyspa.html":
        return ('<a href="http://top.contributors/elssar.html">'
                '<a href="http://top.contributors/kilaws.html">')
    elif url == "http://top.contributors/forbiddenvoid.html":
        return ('<a href="http://top.contributors/charlzz.html">'
                '<a href="http://top.contributors/johang.html">'
                '<a href="http://top.contributors/graemeblake.html">')
    elif url == "http://top.contributors/kilaws.html":
        return ('<a href="http://top.contributors/tomvandenbosch.html">'
                '<a href="http://top.contributors/mathprof.html">')
    elif url == "http://top.contributors/graemeblake.html":
        return ('<a href="http://top.contributors/dreyescat.html">'
                '<a href="http://top.contributors/angel.html">')
    elif url == "A1":
        return '<a href="B1"> <a href="C1"> '
    elif url == "B1":
        return '<a href="E1">'
    elif url == "C1":
        return '<a href="D1">'
    elif url == "D1":
        return '<a href="E1"> '
    elif url == "E1":
        return '<a href="F1"> '
    return ""
# get_next_target finds the first link on the page and returns its
# target URL together with the index of the closing quote, so the
# caller can resume scanning from there (or None, 0 if no link).
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
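
# Quick sanity check (added for illustration; not part of the original
# gist): the first href is 'B1', and its closing quote sits at index 11.
assert get_next_target('<a href="B1"> <a href="C1">') == ('B1', 11)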
# union appends to p (in place) every element of q not already in p.
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
def crawl_web(seed, max_depth):
    tocrawl = [seed]    # pages at the current depth
    crawled = []
    depth = 0
    next_depth = []     # links discovered at depth + 1
    while tocrawl and depth <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(next_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            # The current depth level is exhausted; move one level deeper.
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
    return crawled
print(crawl_web("http://www.udacity.com/cs101x/index.html", 0))
#>>> ['http://www.udacity.com/cs101x/index.html']

print(crawl_web("http://www.udacity.com/cs101x/index.html", 1))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html']

print(crawl_web("http://www.udacity.com/cs101x/index.html", 50))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html',
#>>> 'http://www.udacity.com/cs101x/kicking.html']

print(crawl_web("http://top.contributors/forbiddenvoid.html", 2))
#>>> ['http://top.contributors/forbiddenvoid.html',
#>>> 'http://top.contributors/graemeblake.html',
#>>> 'http://top.contributors/angel.html',
#>>> 'http://top.contributors/dreyescat.html',
#>>> 'http://top.contributors/johang.html',
#>>> 'http://top.contributors/charlzz.html']

#print(crawl_web("A1", 3))
#>>> ['A1', 'C1', 'B1', 'E1', 'D1', 'F1']
# (The pages may be listed in any order.)
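
# ----------------------------------------------------------------------
# A minimal alternative sketch (added for illustration; not part of the
# original gist): the same depth-limited crawl expressed as a standard
# breadth-first search that carries a (url, depth) pair per page instead
# of swapping whole depth levels. It reuses get_page and get_all_links
# from above; crawl_web_bfs is a hypothetical name.
from collections import deque

def crawl_web_bfs(seed, max_depth):
    crawled = []
    queue = deque([(seed, 0)])   # FIFO of (url, depth) pairs
    while queue:
        page, depth = queue.popleft()
        if page in crawled:
            continue
        crawled.append(page)
        # Because the queue is FIFO, a page is first dequeued at its
        # shortest-path depth, matching the exercise's depth definition.
        if depth < max_depth:
            for link in get_all_links(get_page(page)):
                queue.append((link, depth + 1))
    return crawled

# Should agree with crawl_web up to ordering:
print(crawl_web_bfs("http://www.udacity.com/cs101x/index.html", 1))
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/flying.html']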