Fast recursive link discovery, a bit like the spider in previous versions of Burp Suite. Crawling is limited to content on the specified website. (Spawns a lot of processes and threads.)
#!/usr/bin/env python3
from multiprocessing import Process, Manager
from threading import Thread
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

baseurl = input("ONE domain to crawl [must include scheme]: ").strip()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}


def recursive_crawler(url, urlist):
    """Fetch a page, record new same-site paths in the shared list, and
    recurse into each newly discovered path in its own thread."""
    pro = []
    u = urlparse(url)
    # Resolve relative URLs against the base URL entered by the user.
    if not all([u.scheme, u.netloc, u.path]):
        url = baseurl + url
    try:
        page = requests.get(url, timeout=10, headers=headers)
        for link in [i for i in BeautifulSoup(page.text, 'html.parser').find_all("a") if i.has_attr("href")]:
            tmpurl = urlparse(link["href"])
            # Keep http(s) or scheme-less links that stay on the target host
            # and point to a path not seen before.
            if (tmpurl.scheme == "https" or tmpurl.scheme == "http" or tmpurl.scheme == "") \
                    and (tmpurl.netloc == "" or tmpurl.netloc == urlparse(baseurl).netloc) \
                    and tmpurl.path.strip() not in urlist and tmpurl.path.strip() != "":
                if tmpurl.path.strip()[0] == "/":
                    urlist.append(tmpurl.path.strip())
                    # Crawl the new path in a separate thread.
                    pr = Thread(target=recursive_crawler, args=(baseurl + tmpurl.path.strip(), urlist,))
                    pr.start()
                    pro.append(pr)
        for pr in pro:
            pr.join()
    except requests.exceptions.RequestException:
        pass


if __name__ == '__main__':
    # Manager().list() provides a list that can be shared across processes.
    url_list = Manager().list()
    writers = []
    try:
        page = requests.get(baseurl, timeout=10, headers=headers)
        # Start one process per link on the start page; each process spawns
        # threads as it recurses deeper.
        for link in [i for i in BeautifulSoup(page.text, 'html.parser').find_all("a") if i.has_attr("href")]:
            proc = Process(target=recursive_crawler, args=(link["href"], url_list,))
            proc.start()
            writers.append(proc)
        for p in writers:
            p.join()
        print("\n".join(url_list))
    except requests.exceptions.RequestException:
        pass
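
A minimal example run, assuming the script is saved as crawl.py (the gist does not give a filename) and the target site is reachable; the discovered paths shown below are purely illustrative:

$ python3 crawl.py
ONE domain to crawl [must include scheme]: https://example.com
/about
/contact
/blog/post-1

Output order is not deterministic, since paths are appended to the shared list by many processes and threads as they are discovered.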