@heaven00
Created January 22, 2013 09:34
Web Link Crawler
import mechanize

"""Mechanize will open the link given to it and extract its links without breaking a sweat.
I am not sure what kind of repository setup you have, so I am just going to use a global list
named master to store all the successfully crawled links. I also added a failed_links list
that stores the links that mechanize was unable to open for further crawling."""

"""I wasn't sure which links the program needs to handle or how they should be handled, so
this program assumes that the link given to it is a valid link, and I tried to make the
program such that it does not fail without warning."""

"""To form the repository of sites, the master and failed_links lists need to be stored.
Personally I would prefer storing the data in Mongo in the structure below (a sketch follows
the list definitions):
{'site': site,
 'depth': depth,
 'successful_crawls': [master] list,
 'unsuccessful_crawls': [failed_links] list}"""
master = []
failed_links = []
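"""A minimal sketch (not part of the original gist) of how the crawl results could be saved
to Mongo in the structure described above; the pymongo usage and the 'crawler'/'sites'
database and collection names are assumptions."""
def save_crawl(site, depth):
    # imported lazily so the crawler itself does not need pymongo installed
    from pymongo import MongoClient
    client = MongoClient()  # assumes a MongoDB server on localhost:27017, pymongo 3.x
    client['crawler']['sites'].insert_one({
        'site': site,
        'depth': depth,
        'successful_crawls': master,
        'unsuccessful_crawls': failed_links,
    })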
#the crawler function extracts the links from a page using the mechanize library and returns those links in a list
def crawler(link):
    """mechanize.Browser() opens a browser instance"""
    br = mechanize.Browser()
    try:
        br.open(link)
        #br._factory.is_html = True is added to avoid the error
        #'mechanize._mechanize.BrowserStateError: not viewing HTML'
        br._factory.is_html = True
    except Exception, Error:
        failed_links.append(link)
        print "Couldn't open %s --------- %s" % (link, Error)
        #nothing was opened, so there is nothing to scrape from this page
        return []
    link_list = []
    """in this try/except the links are scraped from the page, if possible"""
    try:
        for links in br.links():
            link = links.absolute_url
            if link not in master:
                link_list.append(link)
        return link_list
    except Exception, Error:
        print "could not get link ", Error
        return link_list
#this function adds a list of links to the global master list
def add_to_master(tmp_list):
    """tmp_list holds the links scraped from a single page; each link is appended to the master list"""
    if tmp_list is not None:
        for link in tmp_list:
            master.append(link)

#this function gets the next link from master
def get_link(i):
    try:
        return master[i]
    except IndexError:
        return None
#this is the main function where the program takes shape
def main(depth, link):
    """The depth defines how deep the program needs to dig for links; link is the initial
    link from which the crawling begins."""
    temp_list = [link]
    add_to_master(temp_list)
    for i in range(0, depth):
        link = get_link(i)
        if link is not None:
            try:
                print 'passing link=============>', link
                temp_list = crawler(link)
                add_to_master(temp_list)
            except Exception, Error:
                print "unable to handle link", Error
        else:
            print "ran out of new links to crawl (most probably requests were disallowed by robots.txt)"
            break
    print "number of successfully crawled links", len(master)
    print "number of failed links", len(failed_links)

"""you can change the variables here and test-run the script for various scenarios, or in any way you like."""
if __name__ == '__main__':
    main(1000, "http://python.org")
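"""A minimal sketch (not part of the original gist) of reading the depth and the start link
from the command line instead of editing the call above; it is left commented out so the
script's behaviour is unchanged, and the argument handling is an assumption."""
# import sys
#
# if __name__ == '__main__':
#     # e.g.  python crawler.py 50 http://example.com
#     main(int(sys.argv[1]), sys.argv[2])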
sente commented Jan 23, 2013

What sites did you build this to crawl?
