#http://www.diveintopython.net/html_processing/extracting_data.html
#https://docs.python.org/2/library/robotparser.html
import robotparser
import urllib
import csv
from urlparse import urlparse


def get_page(url):
    """Fetch a URL and return its HTML source."""
    sock = urllib.urlopen(url)
    htmlSource = sock.read()
    sock.close()
    return htmlSource
#https://www.udacity.com/course/viewer#!/c-cs101/l-48727569/e-48718374/m-48719196
def get_next_target(page):
    """Find the next href attribute in page; return (url, position after its closing quote)."""
    start_link = page.find('href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)
def get_all_links(page):
    """Collect every outgoing link on the page and the dictionary terms it contains."""
    links = []
    # Scan for terms first, before the loop below consumes the page string.
    terms = extract_dictionary_terms(page)
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links, terms
def get_title(page):
    start_title = page.find('<title>')
    if start_title == -1:
        return 'None'
    end_title = page.find('</title>')
    title = page[start_title + 7:end_title]
    return title
def crawl_web(seed, max_depth, max_pages):
    """Breadth-first crawl from seed, honoring robots.txt, bounded by max_depth and max_pages."""
    tocrawl = [seed]
    crawled = []
    next_depth = []
    title = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page_url = tocrawl.pop()
        url_split = urlparse(page_url)
        rp = robotparser.RobotFileParser()
        # robots.txt must be fetched with a full URL, scheme included
        rp.set_url(url_split.scheme + "://" + url_split.netloc + "/robots.txt")
        rp.read()
        if (page_url not in crawled) and rp.can_fetch("*", page_url):
            links, terms = get_all_links(get_page(page_url))
            union(next_depth, links)
            # keep the frontier within the page budget
            while len(tocrawl) > max_pages:
                tocrawl.pop()
            #union(title, get_title(get_page(page_url)))
            write_to_file(page_url, terms)
            crawled.append(page_url)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
def extract_dictionary_terms(page):
    """Return the dictionary terms that actually appear in the page."""
    found = []
    for term in import_dictionary():
        if page.find(term) >= 0:
            found.append(term)
    return found
def import_dictionary():
    """Load the search terms, one per line, with trailing newlines stripped."""
    with open("dictionary.txt", "r") as dictionaryfile:
        dictionary = [line.strip() for line in dictionaryfile]
    return dictionary
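# dictionary.txt is assumed to hold one search term per line; the sample terms
# below are illustrative only:
#   python
#   crawler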
def write_to_file(url, terms):
    """Append one quoted CSV row per crawled page: the URL followed by its matched terms."""
    with open("crawled_file.csv", "ab") as crawled_file:
        wr = csv.writer(crawled_file, quoting=csv.QUOTE_ALL)
        wr.writerow([url] + terms)
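# Example of a resulting crawled_file.csv row (URL and terms are illustrative):
# "http://example.com/","python","crawler"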
#print "Hello, welcome to my web crawler."
seed = raw_input("Please enter the seed website URL: ")
#tocrawl, seed =
crawl_web(seed,500,500)
#while tocrawl > 0:
# crawl_web(seed,4,10)