Last active
July 30, 2019 16:16
-
-
Save zstix/6c057637da65de0c10f0fd1ef3a28bfc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import urllib2 | |
from HTMLParser import HTMLParser | |
# TODO: unique list | |
# TODO: kick out external urls? | |
# TODO: output file | |
# TODO: better CLI interface | |
# Extends the HTMLParser class to get all the link urls on a page. | |
# Optionally, can be supplied a tag type, key, and value for the element | |
# containing the links (rather than getting all links on the page). | |
class SiteMapParser(HTMLParser):
    """Collect the href of every link (<a> tag) on a page.

    With no selector arguments, every link on the page is collected. When
    tag/name/value are all supplied (e.g. "ul", "id", "site-map-list"),
    collection only starts once that selector element has been seen.

    NOTE(review): matching is never switched off again -- once the selector
    element has been seen, every later link on the page is collected, even
    links that appear after the selector element closes.
    """
    def __init__(self, tag = '', name = '', value = ''):
        HTMLParser.__init__(self)
        self.targetTag = tag      # selector element name; '' means no selector
        self.targetName = name    # selector attribute name
        self.targetValue = value  # selector attribute value
        self.urls = []            # hrefs collected so far, in document order
        # BUG FIX: the original condition was inverted. It enabled matching
        # when a selector WAS fully supplied (making the selector a no-op)
        # and disabled it when none was (collecting nothing at all). With no
        # selector we must match from the start; with one we wait for it.
        if tag != '' and name != '' and value != '':
            self.match = False
        else:
            self.match = True
    def handle_starttag(self, tag, attrs):
        """Turn matching on at the selector element; record hrefs of matched
        <a> tags into self.urls."""
        if tag == self.targetTag:
            for name, value in attrs:
                if name == self.targetName and value == self.targetValue:
                    self.match = True
        if tag == "a" and self.match:
            for name, value in attrs:
                if name == "href":
                    self.urls.append(value)
def check_url(url, text, base = ""):
    """Fetch *url* and report whether *text* occurs in the response body.

    url  -- absolute url, or a path relative to *base*
    text -- substring to search for in the raw response body
    base -- base url prepended when *url* does not already contain it

    Returns True when the page loads and contains *text*, False when it
    does not (or the request fails with an HTTP error). Prints a status
    line for every outcome.
    """
    # If this looks like a relative path, tack the base url on the front.
    # NOTE(review): this is a substring test, not startswith -- a url that
    # merely mentions the base somewhere is treated as absolute.
    if base not in url:
        url = base + url
    # Minimal escaping of spaces only; urllib.quote would be more thorough.
    url = url.replace(" ", "%20")
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            print("[*] %s %s" %(response.getcode(), url))
            found = text in response.read()
        finally:
            # BUG FIX: the original called close() after `return`, so the
            # response was never actually closed.
            response.close()
        if found:
            print("\t[*] %s contains '%s'" %(url, text))
        return found
    except urllib2.HTTPError as error:
        print("\t[!] Failed to connect to %s" % url)
        print("\t[!] %s" % error)
        # BUG FIX: the original fell through and returned None here; now
        # every path returns a real boolean, as the docstring promises.
        return False
def get_links(url, tag, name, value):
    """Find all the link urls on a page, optionally within a selector.

    url            -- page to fetch
    tag/name/value -- selector passed through to SiteMapParser; empty
                      strings mean "every link on the page"

    Returns a list of href values. Exits the program with status 1 on
    an HTTP error.
    """
    try:
        request = urllib2.Request(url)
        response = urllib2.urlopen(request)
        try:
            page = response.read()
        finally:
            # BUG FIX: the original never closed the response.
            response.close()
        parser = SiteMapParser(tag, name, value)
        parser.feed(page)
        return parser.urls
    except urllib2.HTTPError as error:
        print("[!] Failed to connect to %s" % url)
        print("[!] %s" % error)
        sys.exit(1)
def check_args():
    """Validate the command-line arguments; print usage and exit if wrong.

    The script needs exactly five arguments after the script name:
    base url, sitemap path, and the selector tag, name, and value.
    """
    # BUG FIX: the original only rejected too FEW arguments (<= 5); extra
    # arguments slipped through and crashed later when __main__ unpacks
    # sys.argv[1:] into exactly five names.
    if len(sys.argv) != 6:
        print("[!] Wrong number of arguments")
        print("[?] Usage: python %s <base-url> <sitemap-path> <selector-tag> <selector-name> <selector-value>" % sys.argv[0])
        print("[?] Example: python %s http://foo.com/ sitemap.html ul id site-map-list" % sys.argv[0])
        sys.exit(1)
if __name__ == "__main__":
    # Validate and unpack the CLI arguments.
    check_args()
    base_url, sitemap_path, sel_tag, sel_name, sel_value = sys.argv[1:]
    # Gather every link listed on the sitemap page.
    links = get_links(base_url + sitemap_path, sel_tag, sel_name, sel_value)
    # Check each linked page for the search term.
    for link in links:
        check_url(link, "database", base_url)
    sys.exit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Needs better parameter input, but it's a start.