Crawls a specified domain and checks each discovered page for missing security headers, exporting the results to a file.
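The script needs Python 3 with the requests and beautifulsoup4 packages installed (pip install requests beautifulsoup4). Run it directly: LinkScraper crawls the domain and writes the discovered URLs to links.txt, then HeadInspector probes each URL with an OPTIONS request and records found and missing security headers in headers.txt, both inside a folder named after the domain's hostname.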
import itertools
import os
import sys
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

class LinkScraper:
    def __init__(self, domain):
        self.domain = domain
        self.robots = False
        # Define the request headers before start(), which needs them.
        self.headers = {"User-Agent": "Scraper"}
        self.start()
        self.export(self.links, "links.txt")
    def start(self):
        # Scrape the start page, follow each discovered link one level
        # deeper, then flatten and deduplicate the results.
        self.links = self.scrape_links(self.domain)
        self.linksets = [self.scrape_links(link) for link in self.links]
        self.links = list(set(itertools.chain.from_iterable(self.linksets)))
        print("Successfully scraped {} total links.".format(len(self.links)))
    def within_domain(self, url):
        # self.domain is a full URL, so compare hostname to hostname.
        return urlparse(url).netloc == urlparse(self.domain).netloc
    def scrape_links(self, link):
        resp = requests.get(link, headers=self.headers)
        soup = BeautifulSoup(resp.text, "html.parser")
        # Ignore <a> tags that have no href attribute.
        hrefs = [a.get("href") for a in soup.find_all("a") if a.get("href")]
        # Resolve each href's path against the current page and deduplicate.
        links = list(set(urljoin(link, urlparse(href).path) for href in hrefs))
        # filter() is lazy, so its result must be kept for it to take effect.
        return list(filter(self.within_domain, links))
    def export(self, lst, fname):
        folder = urlparse(self.domain).netloc
        # Create the folder next to the script, matching the path used below.
        os.makedirs(os.path.join(sys.path[0], folder), exist_ok=True)
        with open(os.path.join(sys.path[0], folder, fname), "w") as f:
            for item in lst:
                f.write(item + "\n")
        print("Links successfully exported.")

class HeadInspector:
    def __init__(self, domain):
        self.fname = "links.txt"
        self.domain = domain
        self.folder = urlparse(self.domain).netloc
        self.links = self.import_urls()
        self.headers = {"User-Agent": "Scraper"}
        self.header_list = self.get_headers()
        self.export_headers()
    def import_urls(self):
        with open(os.path.join(sys.path[0], self.folder, self.fname)) as f:
            # Drop the empty string a trailing newline leaves behind.
            links = [line for line in f.read().splitlines() if line]
        return links[:20]  # inspect at most the first 20 links
    def get_headers(self):
        def is_present(header):
            keys = [
                "Allow",
                "Location",
                "Refresh",
                "Server",
                "Set-Cookie",
                "X-Content-Security-Policy",
                "Content-Security-Policy",
                "X-Frame-Options",
                "X-Powered-By",
                "X-XSS-Protection",
            ]
            return header in keys

        print("[Status]: Gathering headers...", flush=True, end="\r")
        # Issue one OPTIONS request per link (the original issued two) and
        # keep each link paired with its headers, so that skipped non-200
        # responses cannot shift the pairing.
        responses = []
        for link in self.links:
            resp = requests.request("OPTIONS", link, headers=self.headers)
            if resp.status_code == 200:
                responses.append((link, resp.headers))
        all_headers = []
        for link, headers in responses:
            # Each entry: [url, "Key: value" pairs..., list of missing keys].
            header_values = [
                "{}: {}".format(key, headers[key])
                for key in headers
                if is_present(key)
            ]
            header_values.insert(0, link)
            header_values.append([])
            all_headers.append(header_values)
        keys = [
            "X-Content-Security-Policy",
            "Content-Security-Policy",
            "X-Frame-Options",
            "X-Powered-By",
            "X-XSS-Protection",
        ]
        for header in all_headers:
            for key in keys:
                if key not in " ".join(header[:-1]):
                    header[-1].append(key)
        print("[Status]: Finished gathering headers!", flush=True, end="\r")
        return all_headers
    def export_headers(self):
        print("[Status]: Exporting headers...", flush=True, end="\r")
        with open(os.path.join(sys.path[0], self.folder, "headers.txt"), "w") as f:
            for header in self.header_list:
                f.write(
                    "URL: {}\nHeaders Found: {}\nMissing Headers: {}\n".format(
                        header[0], header[1:-1], header[-1]
                    )
                )
        print("[Status]: Finished exporting headers!", flush=True, end="\r")

if __name__ == "__main__":
    LinkScraper("http://example.com")
    HeadInspector("http://example.com")
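Each record in headers.txt follows the format string in export_headers. A representative entry (URL and header values hypothetical) looks like:

URL: http://example.com/about
Headers Found: ['Server: nginx', 'X-Frame-Options: SAMEORIGIN']
Missing Headers: ['X-Content-Security-Policy', 'Content-Security-Policy', 'X-Powered-By', 'X-XSS-Protection']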