CS101 - Simple Web Crawler (week 3)
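
A toy crawler from Udacity's CS101 (week 3): fetch a page, pull out every href, and keep following links until the frontier is empty.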
import urllib2

def web_request(url):
    # Fetch a page and return its HTML source as a string.
    response = urllib2.urlopen(url)
    return response.read()
def get_next_link(page):
    # Find the first <a href="..."> in the page; return the URL and the
    # position of its closing quote, or (None, -1) if no link is left.
    href = '<a href="'
    start_pos = page.find(href)
    if start_pos != -1:  # find() returns -1 on no match; 0 is a valid hit
        url_start = start_pos + len(href)
        url_end = page.find('"', url_start)
        url = page[url_start:url_end]
        return url, url_end
    return None, -1
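
# Example (hypothetical snippet): get_next_link('<p><a href="http://a.com">a</a></p>')
# returns ('http://a.com', 24) -- the URL and the index of its closing quote.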
def get_links(page):
    # Collect every link by repeatedly taking the first remaining link
    # and slicing the page past it.
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if not url:
            break
        links.append(url)
        page = page[end_pos:]
    return links
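
# Example: get_links('<a href="http://a.com">a</a> <a href="http://b.com">b</a>')
# returns ['http://a.com', 'http://b.com'].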
def crawl(seed):
    # Depth-first crawl: pop a URL off the frontier, fetch it, queue its
    # links, and record it so each page is visited at most once.
    to_crawl = [seed]
    crawled = []
    print 'Crawling started...'
    while to_crawl:
        current = to_crawl.pop()
        if current not in crawled:
            page = web_request(current)
            to_crawl.extend(get_links(page))
            crawled.append(current)
    print 'Crawling done...'
    return crawled
c = crawl('http://www.udacity.com/cs101x/index.html')
print c
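
The gist targets Python 2 (urllib2, print statements). On a modern interpreter, a minimal Python 3 sketch of the same crawler might look like the following; the only substantive changes are urllib.request in place of urllib2, decoding the response bytes (UTF-8 with replacement is an assumption, not something the original handles), and print():

import urllib.request

def web_request(url):
    # read() returns bytes in Python 3, so decode before string searching
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', errors='replace')

def get_next_link(page):
    href = '<a href="'
    start_pos = page.find(href)
    if start_pos != -1:
        url_start = start_pos + len(href)
        url_end = page.find('"', url_start)
        return page[url_start:url_end], url_end
    return None, -1

def get_links(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if not url:
            break
        links.append(url)
        page = page[end_pos:]
    return links

def crawl(seed):
    to_crawl = [seed]
    crawled = []
    print('Crawling started...')
    while to_crawl:
        current = to_crawl.pop()
        if current not in crawled:
            to_crawl.extend(get_links(web_request(current)))
            crawled.append(current)
    print('Crawling done...')
    return crawled

print(crawl('http://www.udacity.com/cs101x/index.html'))

Note that the original CS101 seed URL may no longer resolve, so expect a urllib.error.URLError unless you substitute a live page.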