A web crawler and graph builder
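Usage (a sketch, assuming the script is saved as crawler.py; the gist itself does not name the file): python crawler.py -r http://www.clemson.edu -d 2 -p beautifulsoup, with Python 2, networkx, matplotlib and BeautifulSoup 3 installed, or -p lynx if the lynx browser is available. The crawler writes the unique host IPs to targets.list in the current directory and saves the link graph as a .png and a .dot file.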
#!/usr/bin/env python
import subprocess
import urlparse
import urllib2
from urllib import urlencode
import socket
import Queue
import threading
import getopt
import sys
import os

# Non-standard modules for html parsing and graph creation
import networkx as nx
import BeautifulSoup

# Needed to plot the graph, otherwise skip
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt

max_threads = 50
next_url = Queue.Queue()
crawled_urls = []
def check_link(url):
    '''Check that the url is in the clemson.edu domain and that it is not a pdf file'''
    domain = '.'.join(urlparse.urlparse(url).netloc.split('.')[-2:])
    filetype = urlparse.urlparse(url).path.split('/')[-1].split('.')[-1]
    return domain == 'clemson.edu' and filetype != 'pdf'

def get_host(url):
    '''Return the IP address of the host serving the page'''
    return socket.gethostbyname(urlparse.urlparse(url).netloc)
def get_links_from_page(url):
    '''Extract a list of urls from a page.
    Uses a flag to choose between html parsers...more can be implemented
    '''
    global parser_flag
    urllist = []
    if parser_flag == 'lynx':
        res = subprocess.Popen('lynx -dump ' + url + ' | grep http | awk \'{print $2}\' | uniq',
                               shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        (pstdout, pstderr) = res.communicate()
        urllist = pstdout.split("\n")
    elif parser_flag == 'beautifulsoup':
        try:
            # This may fail due to unicode issues, needs to be checked
            res = urllib2.urlopen(url)
            htmlpage = res.read()
        except:
            return urllist
        try:
            page = BeautifulSoup.BeautifulSoup(htmlpage)
        except:
            return urllist
        refs = page.findAll("a")
        for a in refs:
            try:
                link = a['href']
                if link[:4] == 'http':
                    urllist.append(link)
            except:
                pass
    else:
        print "Do not know how to parse the html !!! Specify a parser_flag"
    return urllist
def find_links(url_tuple, graph):
    '''Crawl to a given depth using a tuple structure to tag urls with their depth'''
    global crawled_urls, next_url, max_depth
    print url_tuple
    print len(crawled_urls)
    url = url_tuple[0]
    depth = url_tuple[1]
    if depth < max_depth and check_link(url):
        links = get_links_from_page(url)
        for link in links:
            # These two lines create the graph
            graph.add_node(link)
            graph.add_edge(url, link)
            # If the link has not been crawled yet, add it to the queue with an incremented depth
            if link not in crawled_urls:
                next_url.put((link, depth + 1))
                crawled_urls.append(link)
    return
class crawler_thread(threading.Thread):
    '''Consumer thread that gets a url from the queue and finds the links in that page'''
    def __init__(self, queue, graph):
        threading.Thread.__init__(self)
        self.to_be_crawled = queue
        self.graph = graph
    def run(self):
        while not self.to_be_crawled.empty():
            find_links(self.to_be_crawled.get(), self.graph)
def draw_graph(graph, graph_file_name):
    '''Draw the graph and save the .dot and .png files'''
    nx.draw(graph, with_labels=False)
    nx.write_dot(graph, os.getcwd() + '/' + graph_file_name + '.dot')
    plt.savefig(os.getcwd() + '/' + graph_file_name + '.png')
def usage():
    '''Print the usage to stdout'''
    print '-r specifies the root url'
    print '-d specifies the depth'
    print '-p specifies the parser'
def main():
    '''Initiate the queue by putting the root url in it,
    then iterate until the queue is empty.
    A simple threaded version starts crawler_thread to empty the queue.
    Speed up seems limited and suspicious :), to be checked
    '''
    next_url.put((root_url, 0))
    crawled_urls.append(root_url)
    ip_list = []
    g = nx.Graph()
    g.add_node(root_url)
    thread_list = []
    for i in range(max_threads):
        t = crawler_thread(next_url, g)
        t.daemon = True
        t.start()
        thread_list.append(t)
    for t in thread_list:
        t.join()
    for url in crawled_urls:
        try:
            ip_list.append(socket.gethostbyname(urlparse.urlparse(url).netloc))
        except socket.error:
            # Skip hosts that do not resolve
            pass
    ip_list = list(set(ip_list))
    print "Unique Host: %s " % len(ip_list)
    fh = open(os.getcwd() + '/targets.list', 'w')
    for ip in ip_list:
        fh.write(str(ip) + '\n')
    fh.close()
    nodesize = [g.degree(n) * 10 for n in g]
    pos = nx.spring_layout(g, iterations=20)
    #pos=nx.graphviz_layout(g,prog='neato')
    #pos=nx.spectral_layout(g)
    nx.draw(g, with_labels=False)
    nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
    nx.draw_networkx_edges(g, pos)
    plt.savefig("/Users/runseb/Desktop/crawl.png")
    nx.write_dot(g, "/Users/runseb/Desktop/crawl.dot")
    plt.show()
if __name__ == '__main__':
    try:
        options, remainder = getopt.getopt(sys.argv[1:], 'r:p:d:')
    except getopt.GetoptError, err:
        print str(err)
        usage()
        sys.exit(2)
    # Set defaults
    root_url = 'http://www.clemson.edu'
    parser_flag = 'beautifulsoup'
    max_depth = 2
    for opt, arg in options:
        if opt == '-r':
            root_url = arg
        elif opt == '-p':
            parser_flag = arg
        elif opt == '-d':
            max_depth = int(arg)
        else:
            usage()
            sys.exit(2)
    sys.exit(main())
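
For reference, a minimal standalone sketch of the spring-layout drawing step used in main() above. Assumptions: networkx and matplotlib are installed, the Agg backend is used so no display is needed, and the node names and output file name are illustrative, not taken from the gist.

import networkx as nx
import matplotlib
matplotlib.use('Agg')            # headless backend; swap for TkAgg if you want plt.show()
import matplotlib.pyplot as plt

# Toy link graph standing in for the crawled urls
g = nx.Graph()
g.add_edge('http://a.example', 'http://b.example')
g.add_edge('http://a.example', 'http://c.example')
g.add_edge('http://b.example', 'http://c.example')

pos = nx.spring_layout(g, iterations=20)          # same layout call as in main()
nodesize = [g.degree(n) * 10 for n in g]          # scale node size by degree
nx.draw_networkx_nodes(g, pos, node_size=nodesize, node_color='r')
nx.draw_networkx_edges(g, pos)
plt.savefig('crawl_example.png')                  # illustrative output file

Scaling node size by degree, as in main(), makes heavily linked pages stand out in the rendered graph.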