Skip to content

Instantly share code, notes, and snippets.

@mafriend
Last active January 12, 2016 17:11
Show Gist options
  • Save mafriend/8fab01228a08ae040e4a to your computer and use it in GitHub Desktop.
Save mafriend/8fab01228a08ae040e4a to your computer and use it in GitHub Desktop.
Web Crawler
def get_next_target(page):
    """Locate the first double-quoted ``<a href=`` target in *page*.

    The opening quote is the first ``"`` found at or after the start of the
    ``<a href=`` tag; the URL is the text between it and the next ``"``.

    Returns:
        A ``(url, end_quote)`` tuple, where ``end_quote`` is the index in
        *page* of the closing quote so a caller can resume scanning from
        there. Returns ``(None, 0)`` when *page* contains no ``<a href=``
        tag, or when the tag is not followed by a complete pair of double
        quotes.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # Unquoted href with no later quote: without this check the -1 would
        # make the search below restart from index 0 and slice garbage.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated quote: slicing up to -1 would return a truncated URL
        # and a negative resume position.
        return None, 0
    return page[start_quote + 1:end_quote], end_quote
def union(p, q):
    """Extend list *p* in place with the items of *q* it does not yet hold.

    Items are appended in the order they occur in *q*; an item repeated in
    *q* is added at most once. Nothing is returned.
    """
    for candidate in q:
        if candidate in p:
            continue
        p.append(candidate)
def get_all_links(page):
    """Collect every link target that ``get_next_target`` finds in *page*.

    The page text is scanned repeatedly: after each hit, the text up to the
    reported end position is dropped and the remainder is scanned again.

    Returns:
        A list of the non-empty URL strings, in order of appearance.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        # Only None means "no more links". An empty string is an empty href
        # and must not end the scan, or every later link would be lost.
        # A non-positive end position cannot advance the scan, so stop rather
        # than risk looping forever on malformed markup.
        if url is None or endpos <= 0:
            break
        if url:
            links.append(url)
        page = page[endpos:]
    return links
def crawl_web(seed):
    """Crawl outward from *seed* and return the pages visited.

    Keeps a stack of pages still to visit. Each page popped from the stack
    that has not been seen before is fetched with ``get_page`` (not defined
    in this file; presumably returns the page's HTML text, to be confirmed).
    Its links are merged into the stack and the page is recorded as crawled.

    Returns:
        The list of crawled pages, in the order they were processed.
    """
    pending = [seed]
    visited = []
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        outgoing = get_all_links(get_page(current))
        union(pending, outgoing)
        visited.append(current)
    return visited
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment