@StoneSwine
Last active October 15, 2020 08:31
Fast recursive link discovery, a bit like the spider in previous versions of Burp. Limited to content on the specified website. (Spawns a lot of processes and threads.)
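The script below needs requests and beautifulsoup4 (pip install requests beautifulsoup4). Run it and enter the full base URL, scheme included (e.g. https://example.com), at the prompt; the discovered paths are printed once every worker has finished.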
#!/usr/bin/env python3
from multiprocessing import Manager, Process
from threading import Thread
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

baseurl = input("ONE domain to crawl [must include scheme]: ").strip()
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
}


def recursive_crawler(url, urlist):
    """Fetch one page and spawn a thread per new same-site path found on it."""
    pro = []
    u = urlparse(url)
    if not (u.scheme and u.netloc):  # relative link: resolve it against the base URL
        url = baseurl + url
    try:
        page = requests.get(url, timeout=10, headers=headers)
        for link in [i for i in BeautifulSoup(page.text, 'html.parser').find_all("a") if i.has_attr("href")]:
            tmpurl = urlparse(link["href"])
            # Follow only http(s)/scheme-relative links that stay on the target
            # site and point at a non-empty path not already recorded.
            if tmpurl.scheme in ("http", "https", "") \
                    and tmpurl.netloc in ("", urlparse(baseurl).netloc) \
                    and tmpurl.path.strip() not in urlist and tmpurl.path.strip() != "":
                if tmpurl.path.strip()[0] == "/":
                    urlist.append(tmpurl.path.strip())
                    pr = Thread(target=recursive_crawler, args=(baseurl + tmpurl.path.strip(), urlist))
                    pr.start()
                    pro.append(pr)
        for pr in pro:
            pr.join()
    except requests.exceptions.RequestException:
        pass  # unreachable page: skip it and keep going


if __name__ == '__main__':
    url_list = Manager().list()  # shared between the processes and their threads
    writers = []
    try:
        page = requests.get(baseurl, timeout=10, headers=headers)
        # One process per link on the start page; each process then recurses
        # with one thread per newly discovered path.
        for link in [i for i in BeautifulSoup(page.text, 'html.parser').find_all("a") if i.has_attr("href")]:
            proc = Process(target=recursive_crawler, args=(link["href"], url_list))
            proc.start()
            writers.append(proc)
        for p in writers:
            p.join()
        print("\n".join(url_list))
    except requests.exceptions.RequestException:
        pass
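
The process-per-link, thread-per-path fan-out is what makes the crawl fast, but on a large site it can exhaust file descriptors or trip rate limiting. Below is a minimal sketch of the same same-origin crawl with a bounded worker pool in place of the recursive fan-out; it is not part of the original gist, and names such as fetch_paths and crawl are illustrative.

#!/usr/bin/env python3
# Bounded-concurrency alternative (sketch): breadth-first crawl of one site
# with a fixed thread pool instead of a process/thread per discovered link.
from collections import deque
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


def fetch_paths(url, headers):
    """Return the same-origin paths linked from one page (empty list on error)."""
    try:
        page = requests.get(url, timeout=10, headers=headers)
    except requests.exceptions.RequestException:
        return []
    site = urlparse(url).netloc
    paths = []
    for a in BeautifulSoup(page.text, "html.parser").find_all("a", href=True):
        u = urlparse(urljoin(url, a["href"]))  # resolve relative hrefs
        if u.scheme in ("http", "https") and u.netloc == site and u.path:
            paths.append(u.path)
    return paths


def crawl(baseurl, headers, workers=10):
    seen, todo = set(), deque([baseurl])
    with ThreadPoolExecutor(max_workers=workers) as pool:
        while todo:
            # Fetch one breadth-first wave of pages concurrently.
            wave = [todo.popleft() for _ in range(len(todo))]
            for paths in pool.map(lambda u: fetch_paths(u, headers), wave):
                for p in paths:
                    if p not in seen:  # bookkeeping stays single-threaded: no race
                        seen.add(p)
                        todo.append(baseurl + p)
    return sorted(seen)


if __name__ == "__main__":
    base = input("ONE domain to crawl [must include scheme]: ").strip()
    for path in crawl(base, {"User-Agent": "Mozilla/5.0"}):
        print(path)

ThreadPoolExecutor caps the number of in-flight requests at max_workers, and the plain set plays the role of the gist's Manager().list() without any cross-process plumbing.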