@alexbeletsky
Created May 20, 2012 11:37
CS101 - Simple Web Crawler (week 4)
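
A minimal web crawler from Udacity's CS101 (week 4). Starting from a seed URL, it downloads each page with urllib2, strips the HTML tags from the page body, adds every remaining word to an index of [keyword, [urls]] entries, and follows each <a href="..."> link it finds until there is nothing left to crawl.
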
# Python 2 script: uses urllib2 and the print statement.
import urllib2

def get_html_body(html):
    # Return only the text between <body> and </body>.
    openTag = '<body>'
    closeTag = '</body>'
    bodyBegins = html.find(openTag) + len(openTag)
    bodyEnds = html.find(closeTag, bodyBegins)
    return html[bodyBegins:bodyEnds]

def get_all_html_tags(body):
    # Collect every distinct '<...>' tag string that appears in the body.
    tokens = []
    pos = 0
    word = ''
    start_capture = False
    while pos < len(body):
        symbol = body[pos]
        if symbol == '<':
            start_capture = True
        if start_capture and symbol == '>':
            # Tag is complete: close it, record it once, and reset.
            start_capture = False
            word += body[pos]
            if word not in tokens:
                tokens.append(word)
            word = ''
        if start_capture:
            word += body[pos]
        pos += 1
    return tokens

def remove_all_html_tags(body):
    # Strip every tag found by get_all_html_tags from the body text.
    tags = get_all_html_tags(body)
    for tag in tags:
        body = body.replace(tag, '')
    return body

def clean_up(html):
    body = get_html_body(html)
    return remove_all_html_tags(body)

def get_page_content(url):
    return urllib2.urlopen(url).read()

def get_next_link(page):
    # Find the first '<a href="...">' on the page and return (url, end position),
    # or (None, -1) when there are no more links.
    href = '<a href="'
    start_pos = page.find(href)
    if start_pos != -1:
        url_start = start_pos + len(href)
        url_end = page.find('"', url_start)
        url = page[url_start:url_end]
        return url, url_end
    return None, -1

def get_links(page):
    # Collect all link targets on the page by repeatedly taking the next link
    # and continuing the search from where the previous one ended.
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if not url:
            break
        links.append(url)
        page = page[end_pos:]
    return links

def lookup(index, keyword):
    # Return the list of URLs indexed under keyword, or [] if it is unknown.
    element = find_in_index(index, keyword)
    if element is not None:
        return element[1]
    return []

def find_in_index(index, keyword):
    for e in index:
        if e[0] == keyword:
            return e
    return None

def add_to_index(index, keyword, url):
    # The index is a list of [keyword, [url, ...]] entries.
    element = find_in_index(index, keyword)
    if element is None:
        element = [keyword, []]
        index.append(element)
    urls = element[1]
    if url not in urls:
        urls.append(url)

def add_page_to_index(index, url, page):
    # Index every whitespace-separated word of the cleaned-up page text.
    content = clean_up(page)
    words = content.split()
    for word in words:
        add_to_index(index, word, url)

def crawl(seed):
    # Crawl out from the seed URL (the list is used as a stack, so the newest
    # links are followed first), indexing each page and skipping any URL that
    # was already crawled.
    to_crawl = [seed]
    crawled = []
    index = []
    while to_crawl:
        url = to_crawl.pop()
        if url not in crawled:
            page = get_page_content(url)
            add_page_to_index(index, url, page)
            links = get_links(page)
            if len(links) > 0:
                to_crawl.extend(links)
            crawled.append(url)
    return index

index = crawl('http://www.udacity.com/cs101x/index.html')
print index
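
# Usage sketch: lookup() returns every URL a word was seen on (an empty list
# if the word never appeared). The keyword below is only an illustration and
# depends on what the crawled pages actually contain.
print lookup(index, 'crawler')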