Skip to content

Instantly share code, notes, and snippets.

@zstix
Last active July 30, 2019 16:16
Show Gist options
  • Save zstix/6c057637da65de0c10f0fd1ef3a28bfc to your computer and use it in GitHub Desktop.
Save zstix/6c057637da65de0c10f0fd1ef3a28bfc to your computer and use it in GitHub Desktop.
import sys
import urllib2
from HTMLParser import HTMLParser
# TODO: unique list
# TODO: kick out external urls?
# TODO: output file
# TODO: better CLI interface
# Extends the HTMLParser class to collect the url of every link on a page.
# Optionally, a container selector (tag type, attribute name, attribute
# value) restricts collection to links seen after that container opens.
class SiteMapParser(HTMLParser):
    """Parser that accumulates link hrefs in ``self.urls``."""

    def __init__(self, tag='', name='', value=''):
        HTMLParser.__init__(self)
        self.targetTag = tag
        self.targetName = name
        self.targetValue = value
        self.urls = []
        # BUG FIX: the original inverted this condition (match was True
        # when a selector WAS supplied), so with no selector nothing was
        # ever collected and with a selector every link was.  Intent per
        # the original comment: no selector -> get everything; selector
        # -> collect only once the selector element has been seen.
        if tag != '' and name != '' and value != '':
            self.match = False
        else:
            self.match = True

    def handle_starttag(self, tag, attrs):
        """First, look for the start of the selector. If set, and we're looking
        at a link, add the url for the link to the list"""
        if tag == self.targetTag:
            for name, value in attrs:
                if name == self.targetName and value == self.targetValue:
                    self.match = True
        if tag == "a" and self.match:
            for name, value in attrs:
                if name == "href":
                    self.urls.append(value)
def check_url(url, text, base = ""):
    """Check if text exists on a given page. Returns a boolean and prints status.

    If ``url`` is relative (does not already contain ``base``), ``base`` is
    prepended.  Spaces are percent-encoded.  Returns True when the page body
    contains ``text``, False when it does not or the request fails.
    """
    # if a relative path, tack on the base url
    if base not in url:
        url = base + url
    url = url.replace(" ", "%20")
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            print("[*] %s %s" %(response.getcode(), url))
            # read before closing; the original's close() sat after the
            # returns and was unreachable, leaking the connection
            found = text in response.read()
        finally:
            response.close()
        if found:
            print("\t[*] %s contains '%s'" %(url, text))
            return True
        return False
    except urllib2.HTTPError as error:
        print("\t[!] Failed to connect to %s" % url)
        print("\t[!] %s" % error)
        # explicit falsy result on failure (was an implicit None)
        return False
def get_links(url, tag, name, value):
    """Finds all the links on a given page (within an optional selector).
    Returns a list of urls.

    Exits the program with status 1 if the page cannot be fetched.
    """
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            # the original never closed the response, leaking the connection
            response.close()
    except urllib2.HTTPError as error:
        print("[!] Failed to connect to %s" % url)
        print("[!] %s" % error)
        sys.exit(1)
    # parsing cannot raise HTTPError; keep it outside the try block
    parser = SiteMapParser(tag, name, value)
    parser.feed(page)
    return parser.urls
def check_args():
    """Check that the program has the right number of arguments.

    Exits with status 1 and prints usage unless exactly five arguments
    follow the script name.  The original only caught too-few arguments;
    too many crashed later with a ValueError at the sys.argv unpack.
    """
    if len(sys.argv) != 6:
        print("[!] Wrong number of arguments")
        print("[?] Usage: python %s <base-url> <sitemap-path> <selector-tag> <selector-name> <selector-value>" % sys.argv[0])
        print("[?] Example: python %s http://foo.com/ sitemap.html ul id site-map-list" % sys.argv[0])
        sys.exit(1)
if __name__ == "__main__":
    # Bail out with a usage message unless the CLI arguments look right.
    check_args()
    base, sitemap, tag, name, value = sys.argv[1:]
    # Pull every link out of the sitemap page (scoped to the selector).
    urls = get_links(base + sitemap, tag, name, value)
    # Visit each linked page and report whether it mentions "database".
    for url in urls:
        check_url(url, "database", base)
    sys.exit(0)
@zstix
Copy link
Author

zstix commented Jul 24, 2019

Needs better parameter input, but it's a start.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment