Created
October 1, 2017 02:36
-
-
Save manasmbellani/d2ef11a854d698779ab7767f99d55f08 to your computer and use it in GitHub Desktop.
recon_info_netcraft.py - Script to download subdomains for a given domain from netcraft.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import subprocess | |
import shlex | |
import requests | |
from bs4 import BeautifulSoup | |
from sys import exit | |
from argparse import ArgumentParser, RawTextHelpFormatter | |
### Base URL of searchdns.netcraft.com, used to enumerate subdomains
NETCRAFT_DOMAIN_NAME = "http://searchdns.netcraft.com"
### Query path; the target domain is appended to the "host=" parameter
NETCRAFT_REQUEST_URL = "/?restriction=site+contains&host="

### Write local output from a command to this file ###
DEFAULT_OUTPUT_FILE = "/tmp/out10.txt"

### User Agent String for web requests (browser-like UA to avoid trivial blocking)
USER_AGENT_STRING = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) "
                     "AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/60.0.3112.113 Safari/537.36")

### Description shown in the argparse --help output.
### Fixed: the original text ("Runs crtsh to get the domains for the
### certificates") was a copy-paste leftover from a crt.sh template and
### did not describe what this script actually does.
DEFINITION = "Downloads subdomains for a given domain from searchdns.netcraft.com"
def write_output_to_file(outputfile, output):
    """Write *output* to *outputfile* as UTF-8 bytes, truncating any existing file."""
    encoded = output.encode("utf-8")
    with open(outputfile, "wb+") as out_fh:
        out_fh.write(encoded)
def make_get_request_via_browser(url, browser=None):
    """Fetch *url* through a Selenium browser.

    If *browser* is supplied, reuse it; otherwise launch a new Firefox
    instance. The (possibly newly created) browser is returned so the
    caller can reuse it for subsequent requests.

    NOTE(review): `webdriver` is never imported anywhere in this file
    (there is no `from selenium import webdriver`), so the
    `browser is None` path raises NameError as written. Confirm whether
    this helper is dead code or add the selenium import before using it.
    """
    if browser:
        browser.get(url)
    else:
        # Requires selenium's webdriver in scope -- currently undefined here.
        browser = webdriver.Firefox()
        browser.get(url)
    return browser
def make_get_request(url, timeout=30):
    """Issue an HTTP GET request to *url* with a browser-like User-Agent.

    Args:
        url: Fully-qualified URL to fetch.
        timeout: Seconds to wait before aborting the request. New,
            backward-compatible parameter -- the original call had no
            timeout, and `requests.get` never times out by default, so a
            stalled server would hang the script indefinitely.

    Returns:
        The `requests.Response` object for the completed request.
    """
    headers = {"User-Agent": USER_AGENT_STRING}
    resp = requests.get(url, headers=headers, timeout=timeout)
    return resp
def _extract_domains_from_page(bs):
    """Return the domain names listed in one searchdns.netcraft.com
    results page, or None when the page contains no results table.

    The domain is the anchor text in the second <td> of each row of the
    table with class "TBtable"; the first row is a header and is skipped.
    Exits the script with status 1 on a parse failure, matching the
    original error handling.

    Note: the original wrapped `bs.find` in try/except, but `find`
    returns None for a missing element rather than raising, so that
    error branch was unreachable and has been removed.
    """
    results_table = bs.find(attrs={"class": "TBtable"})
    if not results_table:
        return None
    print("[i] Parsing results table (excluding header row)")
    try:
        result_rows = results_table.findAll("tr")[1:]
    except Exception as e:
        print("[-] result_rows not found")
        print("[-] Error: {}".format(e))
        exit(1)
    print("[i] Parsing url from each table row")
    try:
        return [row.findAll("td")[1].find("a").text for row in result_rows]
    except Exception as e:
        print("[-] Cannot parse domain")
        print("[-] Error: {}".format(e))
        exit(1)


def _next_page_url(bs):
    """Return the href of the "Next page" pagination link, or None on
    the last results page."""
    next_links = [a for a in bs.findAll("a") if "Next page" in a.text]
    return next_links[0].get("href") if next_links else None


def main():
    """Enumerate subdomains for --domain via searchdns.netcraft.com,
    following pagination, and optionally write the results to
    --outputfile."""
    parser = ArgumentParser(description=DEFINITION)
    parser.add_argument("--domain", dest="domain", action="store", required=True)
    parser.add_argument("--outputfile", dest="outputfile", action="store", required=False)
    args = parser.parse_args()
    config = vars(args)

    domain = config["domain"]
    num_pages_parsed = 0
    all_pages_parsed = False
    all_domains = set()  # de-duplicates domains seen across pages
    url = NETCRAFT_DOMAIN_NAME + NETCRAFT_REQUEST_URL + "{}".format(domain)

    while not all_pages_parsed:
        print("[i] Making request to url {} to get all domains".format(url))
        resp = make_get_request(url)
        print("[i] Response length: {}".format(len(resp.text)))
        bs = BeautifulSoup(resp.text, "lxml")
        print("[i] Parsing page {}".format(num_pages_parsed + 1))
        print("[i] Getting results table")

        domains_on_this_page = _extract_domains_from_page(bs)
        if domains_on_this_page is not None:
            print("[+] Number of domains parsed from the page: {}".format(
                len(domains_on_this_page)))
            page_output = "\n".join(domains_on_this_page)
            print("[+] Domains discovered on this page:\n{}\n".format(page_output))
            all_domains.update(domains_on_this_page)

        # Follow pagination until there is no "Next page" link.
        next_href = _next_page_url(bs)
        if next_href:
            url = NETCRAFT_DOMAIN_NAME + next_href
            num_pages_parsed += 1
        else:
            all_pages_parsed = True

    output = "\n".join(all_domains)
    print("[+] Total number of domains discovered: {}".format(len(all_domains)))
    print("[+] Total domains discovered:\n{}\n".format(output))
    if config.get("outputfile"):
        write_output_to_file(config["outputfile"], output)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment