Skip to content

Instantly share code, notes, and snippets.

@zstix
Last active July 30, 2019 16:16
Show Gist options
  • Save zstix/6c057637da65de0c10f0fd1ef3a28bfc to your computer and use it in GitHub Desktop.
Save zstix/6c057637da65de0c10f0fd1ef3a28bfc to your computer and use it in GitHub Desktop.
import sys
import urllib2
from HTMLParser import HTMLParser
# TODO: unique list
# TODO: kick out external urls?
# TODO: output file
# TODO: better CLI interface
# Extends the HTMLParser class to collect the url of every link on a page.
# Optionally, a container selector (tag type, attribute name, attribute
# value) restricts collection to links seen after that container opens.
class SiteMapParser(HTMLParser):
    """Parser that accumulates link hrefs in ``self.urls``."""

    def __init__(self, tag='', name='', value=''):
        HTMLParser.__init__(self)
        self.targetTag = tag
        self.targetName = name
        self.targetValue = value
        self.urls = []
        # BUG FIX: the original inverted this condition (match was True
        # when a selector WAS supplied), so with no selector nothing was
        # ever collected and with a selector every link was.  Intent per
        # the original comment: no selector -> get everything; selector
        # -> collect only once the selector element has been seen.
        if tag != '' and name != '' and value != '':
            self.match = False
        else:
            self.match = True

    def handle_starttag(self, tag, attrs):
        """First, look for the start of the selector. If set, and we're looking
        at a link, add the url for the link to the list"""
        if tag == self.targetTag:
            for name, value in attrs:
                if name == self.targetName and value == self.targetValue:
                    self.match = True
        if tag == "a" and self.match:
            for name, value in attrs:
                if name == "href":
                    self.urls.append(value)
def check_url(url, text, base = ""):
    """Check if text exists on a given page. Returns a boolean and prints status.

    If ``url`` is relative (does not already contain ``base``), ``base`` is
    prepended.  Spaces are percent-encoded.  Returns True when the page body
    contains ``text``, False when it does not or the request fails.
    """
    # if a relative path, tack on the base url
    if base not in url:
        url = base + url
    url = url.replace(" ", "%20")
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            print("[*] %s %s" %(response.getcode(), url))
            # read before closing; the original's close() sat after the
            # returns and was unreachable, leaking the connection
            found = text in response.read()
        finally:
            response.close()
        if found:
            print("\t[*] %s contains '%s'" %(url, text))
            return True
        return False
    except urllib2.HTTPError as error:
        print("\t[!] Failed to connect to %s" % url)
        print("\t[!] %s" % error)
        # explicit falsy result on failure (was an implicit None)
        return False
def get_links(url, tag, name, value):
    """Finds all the links on a given page (within an optional selector).
    Returns a list of urls.

    Exits the program with status 1 if the page cannot be fetched.
    """
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            # the original never closed the response, leaking the connection
            response.close()
    except urllib2.HTTPError as error:
        print("[!] Failed to connect to %s" % url)
        print("[!] %s" % error)
        sys.exit(1)
    # parsing cannot raise HTTPError; keep it outside the try block
    parser = SiteMapParser(tag, name, value)
    parser.feed(page)
    return parser.urls
def check_args():
    """Check that the program has the right number of arguments.

    Exits with status 1 and prints usage unless exactly five arguments
    follow the script name.  The original only caught too-few arguments;
    too many crashed later with a ValueError at the sys.argv unpack.
    """
    if len(sys.argv) != 6:
        print("[!] Wrong number of arguments")
        print("[?] Usage: python %s <base-url> <sitemap-path> <selector-tag> <selector-name> <selector-value>" % sys.argv[0])
        print("[?] Example: python %s http://foo.com/ sitemap.html ul id site-map-list" % sys.argv[0])
        sys.exit(1)
if __name__ == "__main__":
    # Bail out with a usage message unless the CLI arguments look right.
    check_args()
    base, sitemap, tag, name, value = sys.argv[1:]
    # Pull every link out of the sitemap page (scoped to the selector).
    urls = get_links(base + sitemap, tag, name, value)
    # Visit each linked page and report whether it mentions "database".
    for url in urls:
        check_url(url, "database", base)
    sys.exit(0)
@zstix
Copy link
Author

zstix commented Jul 24, 2019

Needs better parameter input, but it's a start.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment