A web crawler and graph builder
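Usage (a sketch, assuming the script is saved as crawler.py; the gist itself does not name the file): python crawler.py -r http://www.clemson.edu -d 2 -p beautifulsoup, with Python 2, networkx, matplotlib and BeautifulSoup 3 installed, or -p lynx if the lynx browser is available. The crawler writes the unique host IPs to targets.list in the current directory and saves the link graph as a .png and a .dot file.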
#!/usr/bin/env python
import subprocess
import urlparse
import urllib2
from urllib import urlencode
import socket
import Queue
import threading
import getopt
import sys
import os

# Non-standard modules for html parsing and graph creation
import networkx as nx
import BeautifulSoup

# Needed to plot the graph, otherwise skip
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

max_threads = 50
next_url = Queue.Queue()
crawled_urls = []
def check_link(url):
    '''Check that the url is in the clemson.edu domain and that it is not a pdf file'''
    domain = '.'.join(urlparse.urlparse(url).netloc.split('.')[-2:])
    filetype = urlparse.urlparse(url).path.split('/')[-1].split('.')[-1]
    return domain == 'clemson.edu' and filetype != 'pdf'

def get_host(url):
    '''Return the IP address of the host serving the page'''
    return socket.gethostbyname(urlparse.urlparse(url).netloc)
def get_links_from_page(url):
    '''Extract a list of urls from a page.
    Uses a flag to choose between html parsers...more can be implemented
    '''
    global parser_flag
    urllist = []
    if parser_flag == 'lynx':
        res = subprocess.Popen('lynx -dump ' + url + ' | grep http | awk \'{print $2}\' | uniq',
                               shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (pstdout, pstderr) = res.communicate()
        urllist = pstdout.split("\n")
    elif parser_flag == 'beautifulsoup':
        try:
            # This may fail due to unicode issues, needs to be checked
            res = urllib2.urlopen(url)
            htmlpage = res.read()
        except:
            return urllist
        try:
            page = BeautifulSoup.BeautifulSoup(htmlpage)
        except:
            return urllist
        refs = page.findAll("a")
        for a in refs:
            try:
                link = a['href']
                if link[:4] == 'http':
                    urllist.append(link)
            except:
                pass
    else:
        print "Do not know how to parse the html !!! Specify a parser_flag"
    return urllist
def find_links(url_tuple, graph):
    '''Crawl to a given depth using a tuple structure to tag urls with their depth'''
    global crawled_urls, next_url, max_depth
    print url_tuple
    print len(crawled_urls)
    url = url_tuple[0]
    depth = url_tuple[1]
    if depth < max_depth and check_link(url):
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, add it to the queue with an incremented depth
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return
class crawler_thread(threading.Thread):
    '''Consumer thread that gets a url from the queue and finds the links in that page'''
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph
    def run(self):
        while not self.to_be_crawled.empty():
            find_links(self.to_be_crawled.get(), self.graph)
def draw_graph(graph, graph_file_name):
    '''Draw the graph and save the .dot and .png files'''
    nx.draw(graph, with_labels=False)
    nx.write_dot(graph, os.getcwd() + '/' + graph_file_name + '.dot')
    plt.savefig(os.getcwd() + '/' + graph_file_name + '.png')
def usage():
    '''Print the usage to stdout'''
    print '-r specifies the root url'
    print '-d specifies the depth'
    print '-p specifies the parser'
def main():
    '''Initiate the queue by putting the root url in it,
    then iterate until the queue is empty.
    A simple threaded version starts crawler_thread to empty the queue.
    Speed up seems limited and suspicious :), to be checked
    '''
    next_url.put((root_url, 0))
    crawled_urls.append(root_url)
    ip_list = []
    g = nx.Graph()
    g.add_node(root_url)
    thread_list = []
    for i in range(max_threads):
        t = crawler_thread(next_url, g)
        t.daemon = True
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    for url in crawled_urls:
        try:
            ip_list.append(socket.gethostbyname(urlparse.urlparse(url).netloc))
        except socket.error:
            # Skip hosts that do not resolve
            pass
    ip_list = list(set(ip_list))
    print "Unique Host: %s " % len(ip_list)
    fh = open(os.getcwd() + '/targets.list', 'w')
    for ip in ip_list:
        fh.write(str(ip) + '\n')
    fh.close()
    nodesize = [g.degree(n) * 10 for n in g]
    pos = nx.spring_layout(g, iterations=20)
    #pos=nx.graphviz_layout(g,prog='neato')
    #pos=nx.spectral_layout(g)
    nx.draw(g, with_labels=False)
    nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
    nx.draw_networkx_edges(g, pos)
    plt.savefig("/Users/runseb/Desktop/crawl.png")
    nx.write_dot(g, "/Users/runseb/Desktop/crawl.dot")
    plt.show()
if __name__ == '__main__':
    try:
        options, remainder = getopt.getopt(sys.argv[1:], 'r:p:d:')
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    # Set defaults
    root_url = 'http://www.clemson.edu'
    parser_flag = 'beautifulsoup'
    max_depth = 2
    for opt, arg in options:
        if opt == '-r':
            root_url = arg
        elif opt == '-p':
            parser_flag = arg
        elif opt == '-d':
            max_depth = int(arg)
        else:
            usage()
            sys.exit(2)
    sys.exit(main())
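
For reference, a minimal standalone sketch of the spring-layout drawing step used in main() above. Assumptions: networkx and matplotlib are installed, the Agg backend is used so no display is needed, and the node names and output file name are illustrative, not taken from the gist.

import networkx as nx
import matplotlib
matplotlib.use('Agg')            # headless backend; swap for TkAgg if you want plt.show()
import matplotlib.pyplot as plt

# Toy link graph standing in for the crawled urls
g = nx.Graph()
g.add_edge('http://a.example', 'http://b.example')
g.add_edge('http://a.example', 'http://c.example')
g.add_edge('http://b.example', 'http://c.example')

pos = nx.spring_layout(g, iterations=20)          # same layout call as in main()
nodesize = [g.degree(n) * 10 for n in g]          # scale node size by degree
nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
nx.draw_networkx_edges(g, pos)
plt.savefig('crawl_example.png')                  # illustrative output file

Scaling node size by degree, as in main(), makes heavily linked pages stand out in the rendered graph.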