Created May 20, 2012 11:37
CS101 - Simple Web Crawler (week 4)
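A minimal web crawler for Udacity's CS101 (week 4): starting from a seed page, it downloads each page, strips the HTML markup, adds every word to a keyword-to-URLs index, and follows all outgoing links.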
# Python 2 script: urllib2 and the print statement below are Python 2 only.
import urllib2

def get_html_body(html):
    # Return the text between <body> and </body>.
    openTag = '<body>'
    closeTag = '</body>'
    bodyBegins = html.find(openTag) + len(openTag)
    bodyEnds = html.find(closeTag, bodyBegins)
    return html[bodyBegins:bodyEnds]

def get_all_html_tags(body):
    # Scan the body character by character, collecting every
    # distinct '<...>' tag that appears in it.
    tokens = []
    pos = 0
    word = ''
    start_capture = False
    while pos < len(body):
        symbol = body[pos]
        if symbol == '<':
            start_capture = True
        if start_capture and symbol == '>':
            start_capture = False
            word += body[pos]
            if word not in tokens:
                tokens.append(word)
            word = ''
        if start_capture:
            word += body[pos]
        pos += 1
    return tokens
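
# Example (hypothetical input):
#   get_all_html_tags('<p>Hi <b>there</b></p>')
#   => ['<p>', '<b>', '</b>', '</p>']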

def remove_all_html_tags(body):
    tags = get_all_html_tags(body)
    for tag in tags:
        body = body.replace(tag, '')
    return body

def clean_up(html):
    body = get_html_body(html)
    return remove_all_html_tags(body)

def get_page_content(url):
    return urllib2.urlopen(url).read()

def get_next_link(page):
    # Find the first '<a href="...">' and return its target URL
    # together with the position just past it.
    href = '<a href="'
    start_pos = page.find(href)
    if start_pos != -1:  # find() returns -1 when no link is left; 0 is a valid hit
        url_start = start_pos + len(href)
        url_end = page.find('"', url_start)
        url = page[url_start:url_end]
        return url, url_end
    return None, -1

def get_links(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if not url:
            break
        links.append(url)
        page = page[end_pos:]
    return links
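
# Example (hypothetical page fragment):
#   get_links('<a href="/a.html">A</a> <a href="/b.html">B</a>')
#   => ['/a.html', '/b.html']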

def lookup(index, keyword):
    element = find_in_index(index, keyword)
    if element is not None:
        return element[1]
    return []

def find_in_index(index, keyword):
    for e in index:
        if e[0] == keyword:
            return e
    return None

def add_to_index(index, keyword, url):
    element = find_in_index(index, keyword)
    if element is None:
        element = [keyword, []]
        index.append(element)
    urls = element[1]
    if url not in urls:
        urls.append(url)

def add_page_to_index(index, url, page):
    content = clean_up(page)
    words = content.split()
    for word in words:
        add_to_index(index, word, url)
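
# The index is a list of [keyword, [url, ...]] pairs; after indexing a
# single page it might look like this (URLs are illustrative):
#   [['Hello', ['http://example.com/']], ['world', ['http://example.com/']]]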

def crawl(seed):
    to_crawl = [seed]
    crawled = []
    index = []
    while to_crawl:
        url = to_crawl.pop()  # pop() takes the most recent URL, so this crawls depth-first
        if url not in crawled:
            page = get_page_content(url)
            add_page_to_index(index, url, page)
            links = get_links(page)
            if len(links) > 0:
                to_crawl.extend(links)
            crawled.append(url)
    return index

index = crawl('http://www.udacity.com/cs101x/index.html')
print index
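
To query the finished index, the lookup helper above can be called like this (the keyword is chosen for illustration, not taken from the original gist):

print lookup(index, 'crawling')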