Crawls a specified domain, inspects each page's response headers for missing security headers, and exports the results to a file.
import itertools
import os
import sys
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class LinkScraper:
    def __init__(self, domain):
        self.domain = domain
        self.robots = False
        # Request headers must be set before start(), which issues requests.
        self.headers = {"User-Agent": "Scraper"}
        self.start()
        self.export(self.links, "links.txt")

    def start(self):
        # Scrape the seed page, then every link found on it, and deduplicate.
        self.links = self.scrape_links(self.domain)
        self.linksets = [self.scrape_links(link) for link in self.links]
        self.links = list(set(itertools.chain.from_iterable(self.linksets)))
        print("Successfully scraped {} total links.".format(len(self.links)))

    def within_domain(self, url):
        # Keep only links whose host matches the crawled domain's host.
        return urlparse(url).netloc == urlparse(self.domain).netloc

    def scrape_links(self, link):
        resp = requests.request("GET", link, headers=self.headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
        links = set(urljoin(link, urlparse(href).path) for href in hrefs)
        return list(filter(self.within_domain, links))

    def export(self, lst, fname):
        # Write one link per line into <netloc>/<fname> next to the script.
        folder = os.path.join(sys.path[0], urlparse(self.domain).netloc)
        try:
            os.mkdir(folder)
        except FileExistsError:
            pass
        with open(os.path.join(folder, fname), "w+") as f:
            for item in lst:
                f.write(item + "\n")
        print("Links successfully exported.")
class HeadInspector:
    def __init__(self, domain):
        self.fname = "links.txt"
        self.domain = domain
        self.folder = urlparse(self.domain).netloc
        self.links = self.import_urls()
        self.headers = {"User-Agent": "Scraper"}
        self.header_list = self.get_headers()
        self.export_headers()

    def import_urls(self):
        # Read the links exported by LinkScraper; inspect at most the first 20.
        with open(os.path.join(sys.path[0], self.folder, self.fname)) as f:
            links = [line for line in f.read().split("\n") if line]
        return links[:20]

    def get_headers(self):
        # Headers worth reporting when present in a response.
        keys = [
            "Allow",
            "Location",
            "Refresh",
            "Server",
            "Set-Cookie",
            "X-Content-Security-Policy",
            "Content-Security-Policy",
            "X-Frame-Options",
            "X-Powered-By",
            "X-XSS-Protection",
        ]

        def is_present(header):
            return header in keys

        print("[Status]: Gathering headers...", flush=True, end="\r")
        # One OPTIONS request per link; keep (link, headers) pairs for 200
        # responses so each URL stays aligned with its own headers.
        responses = []
        for link in self.links:
            resp = requests.request("OPTIONS", link, headers=self.headers)
            if resp.status_code == 200:
                responses.append((link, resp.headers))

        # Headers the report flags as missing when absent from a response.
        required = [
            "X-Content-Security-Policy",
            "Content-Security-Policy",
            "X-Frame-Options",
            "X-Powered-By",
            "X-XSS-Protection",
        ]
        all_headers = []
        for link, resp_headers in responses:
            # Entry layout: [url, "Name: value", ..., [missing header names]]
            found = [
                "{}: {}".format(name, resp_headers[name])
                for name in resp_headers
                if is_present(name)
            ]
            missing = [key for key in required if key not in resp_headers]
            all_headers.append([link] + found + [missing])
        print("[Status]: Finished gathering headers!", flush=True, end="\r")
        return all_headers

    def export_headers(self):
        print("[Status]: Exporting headers...", flush=True, end="\r")
        with open(os.path.join(sys.path[0], self.folder, "headers.txt"), "w+") as f:
            for header in self.header_list:
                f.write(
                    "URL: {}\nHeaders Found: {}\nMissing Headers: {}\n".format(
                        header[0], header[1:-1], header[-1]
                    )
                )
        print("[Status]: Finished exporting headers!", flush=True, end="\r")
LinkScraper("http://example.com")
HeadInspector("http://example.com")