Created May 20, 2012 11:37
CS101 - Simple Web Crawler (week 4)
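A minimal web crawler for Udacity's CS101 (week 4): starting from a seed page, it downloads each page, strips the HTML markup, adds every word to a keyword-to-URLs index, and follows all outgoing links.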
# Python 2 script: urllib2 and the print statement below are Python 2 only.
import urllib2

def get_html_body(html):
    # Return the text between <body> and </body>.
    openTag = '<body>'
    closeTag = '</body>'
    bodyBegins = html.find(openTag) + len(openTag)
    bodyEnds = html.find(closeTag, bodyBegins)
    return html[bodyBegins:bodyEnds]

def get_all_html_tags(body):
    # Scan the body character by character, collecting every
    # distinct '<...>' tag that appears in it.
    tokens = []
    pos = 0
    word = ''
    start_capture = False
    while pos < len(body):
        symbol = body[pos]
        if symbol == '<':
            start_capture = True
        if start_capture and symbol == '>':
            start_capture = False
            word += body[pos]
            if word not in tokens:
                tokens.append(word)
            word = ''
        if start_capture:
            word += body[pos]
        pos += 1
    return tokens
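
# Example (hypothetical input):
#   get_all_html_tags('<p>Hi <b>there</b></p>')
#   => ['<p>', '<b>', '</b>', '</p>']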

def remove_all_html_tags(body):
    tags = get_all_html_tags(body)
    for tag in tags:
        body = body.replace(tag, '')
    return body

def clean_up(html):
    body = get_html_body(html)
    return remove_all_html_tags(body)

def get_page_content(url):
    return urllib2.urlopen(url).read()

def get_next_link(page):
    # Find the first '<a href="...">' and return its target URL
    # together with the position just past it.
    href = '<a href="'
    start_pos = page.find(href)
    if start_pos != -1:  # find() returns -1 when no link is left; 0 is a valid hit
        url_start = start_pos + len(href)
        url_end = page.find('"', url_start)
        url = page[url_start:url_end]
        return url, url_end
    return None, -1

def get_links(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if not url:
            break
        links.append(url)
        page = page[end_pos:]
    return links
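
# Example (hypothetical page fragment):
#   get_links('<a href="/a.html">A</a> <a href="/b.html">B</a>')
#   => ['/a.html', '/b.html']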

def lookup(index, keyword):
    element = find_in_index(index, keyword)
    if element is not None:
        return element[1]
    return []

def find_in_index(index, keyword):
    for e in index:
        if e[0] == keyword:
            return e
    return None

def add_to_index(index, keyword, url):
    element = find_in_index(index, keyword)
    if element is None:
        element = [keyword, []]
        index.append(element)
    urls = element[1]
    if url not in urls:
        urls.append(url)

def add_page_to_index(index, url, page):
    content = clean_up(page)
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
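
# The index is a list of [keyword, [url, ...]] pairs; after indexing a
# single page it might look like this (URLs are illustrative):
#   [['Hello', ['http://example.com/']], ['world', ['http://example.com/']]]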

def crawl(seed):
    to_crawl = [seed]
    crawled = []
    index = []
    while to_crawl:
        url = to_crawl.pop()  # pop() takes the most recent URL, so this crawls depth-first
        if url not in crawled:
            page = get_page_content(url)
            add_page_to_index(index, url, page)
            links = get_links(page)
            if len(links) > 0:
                to_crawl.extend(links)
            crawled.append(url)
    return index

index = crawl('http://www.udacity.com/cs101x/index.html')
print index
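
To query the finished index, the lookup helper above can be called like this (the keyword is chosen for illustration, not taken from the original gist):

print lookup(index, 'crawling')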