#! /usr/bin/env python
import sys
import re
import time
import urllib2
import optparse
import urlparse

from BeautifulSoup import BeautifulSoup
from traceback import format_exc
from cgi import escape
from Queue import Queue, Empty as QueueEmpty

__author__ = "Vinit Kumar"
__version__ = "0.1"
__license__ = "MIT"

Version = "0.1"
Usage = """
This crawler starts with a target URL, fetches the web page at that URL,
parses all the links on that page and stores them in a repo. Next it takes
a URL from the repo and repeats the same process. This goes on until the
specified number of links has been fetched or the maximum depth is reached.

To use:
$ ./crawler -d5 <url>

Here -d5 makes it crawl to a depth of 5 and <url> is the target URL to
start crawling from.
"""
Agent = "%s/%s" % (__name__, __version__)


class Webcrawler(object):

    def __init__(self, root, depth, locked=True):
        self.root = root        # start of the crawling
        self.depth = depth      # depth up to which it traverses
        self.locked = locked    # restrict crawling to the start host
        self.links = 0          # number of links found
        self.followed = 0       # number of links followed
        self.urls = []          # the repo of discovered URLs
        self.host = urlparse.urlparse(root)[1]   # netloc of the root URL
    def crawl(self):
        """
        Fetch all links of the root page and arrange them in a queue.
        The queued URLs are then taken one by one and the crawling
        operation is repeated (a usage sketch in comments follows this
        class).
        """
        page = Linkfetcher(self.root)
        page.linkfetch()
        queue = Queue()
        for url in page.urls:
            queue.put(url)
        followed = [self.root]
        n = 0
        while True:
            try:
                url = queue.get(block=False)
            except QueueEmpty:
                break
            n += 1   # counts processed URLs; used to stop at the depth limit
            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        self.followed += 1
                        page = Linkfetcher(url)
                        page.linkfetch()
                        for link in page:
                            if link not in self.urls:
                                self.links += 1
                                queue.put(link)
                                self.urls.append(link)
                        if n > self.depth and self.depth > 0:
                            break
                except Exception as e:
                    print "ERROR: The URL '%s' can't be processed due to (%s)" % (url, e)
                    print format_exc()
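
# A minimal sketch of driving Webcrawler from Python instead of the command
# line (the URL and depth below are placeholders, not values from this gist):
#   crawler = Webcrawler("http://example.com/", depth=2)
#   crawler.crawl()
#   print "\n".join(crawler.urls)   # the repo of discovered links
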
class Linkfetcher(object):
    """This class, as its name suggests, fetches the links of a page
    (a usage sketch in comments follows this class)."""

    def __init__(self, url):
        self.url = url
        self.urls = []

    def _addHeaders(self, request):
        request.add_header("User-Agent", Agent)

    def __getitem__(self, x):
        return self.urls[x]

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)
    def linkfetch(self):
        result = self.open()
        if result is None:
            return
        request, handle = result
        self._addHeaders(request)
        try:
            content = unicode(handle.open(request).read(), "utf-8",
                              errors="replace")
            soup = BeautifulSoup(content)
            tags = soup('a')
        except urllib2.HTTPError as error:
            if error.code == 404:
                print >> sys.stderr, "ERROR: %s -> %s" % (error, self.url)
            else:
                print >> sys.stderr, "ERROR: %s" % error
            tags = []
        except urllib2.URLError as error:
            print >> sys.stderr, "ERROR: %s" % error
            tags = []
        for tag in tags:
            href = tag.get("href")
            if href is not None:
                url = urlparse.urljoin(self.url, escape(href))
                if url not in self:
                    self.urls.append(url)
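
# A minimal sketch of using Linkfetcher on its own (this mirrors getlinks()
# below; the URL is just a placeholder):
#   page = Linkfetcher("http://example.com/")
#   page.linkfetch()
#   for i, link in enumerate(page):
#       print "%d ==> %s" % (i, link)
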
def option_parser():
    # This gives a cleaner interface for taking options and arguments
    # than raw sys.argv[].
    parser = optparse.OptionParser(usage=Usage, version=Version)
    parser.add_option("-l", "--links",
                      action="store_true", default=False, dest="links",
                      help="Get links for target URL only")
    parser.add_option("-d", "--depth",
                      action="store", type="int", default=30, dest="depth",
                      help="Maximum depth to traverse")
    opts, args = parser.parse_args()
    if len(args) < 1:
        parser.print_help()
        raise SystemExit(1)
    return opts, args
def getlinks(url):
    """Print only the links found on the target URL."""
    page = Linkfetcher(url)
    page.linkfetch()
    for i, link in enumerate(page):
        print "%d ==> %s" % (i, link)
def main():
    opts, args = option_parser()
    url = args[0]   # target URL
    if opts.links:
        getlinks(url)
        raise SystemExit(0)
    depth = opts.depth
    sTime = time.time()   # start time
    print "Crawler started for %s, will crawl up to depth %d" % (url, depth)
    print "==============================================================="
    webcrawler = Webcrawler(url, depth)
    webcrawler.crawl()
    print "\n".join(webcrawler.urls)
    eTime = time.time()    # end time
    tTime = eTime - sTime  # time taken for crawling
    print "\n"
    print "Crawler Statistics"
    print "=================="
    print "No. of links found: %d" % webcrawler.links
    print "No. of links followed: %d" % webcrawler.followed
    print "Time stats: found all links after %0.2fs" % tTime
if __name__ == "__main__":
    main()