# PatrikHudak/second-order.py

## second-order.py
# coding=utf-8
# python3

from urllib.parse import urlparse

import requests
import urllib3

from bs4 import BeautifulSoup

# Disable SSL insecure warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Timeout (passed as `timeout=` to every requests call below)
GLOBAL_HTTP_TIMEOUT = 7

# Browser-like User-Agent header sent with every request ("OPSEC")
UA = {
	'User-Agent': (
		"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
		"(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
	),
}

def normalize_url(domain, src):
	'''
	(Try to) Normalize URL to its absolute form

	Surrounding whitespace and trailing slashes are removed from `src`
	first. Absolute http(s) URLs are returned as they are; every other
	form is resolved against `domain` using the `http` scheme.

	:param domain: host name the URL was found on (without scheme)
	:param src: raw URL, e.g. the value of a `src` / `href` attribute
	:return: absolute URL as a string
	'''

	src = src.strip().rstrip('/')

	# Absolute URL. URI schemes are case-insensitive, so a value such as
	# "HTTPS://cdn.example.org/x.js" must not be treated as a relative path
	# (that would hide the external host behind `domain`).
	if src.lower().startswith(('http://', 'https://')):
		return src

	# Protocol relative URL
	if src.startswith('//'):
		return 'http:{}'.format(src)

	# Relative URL with /
	if src.startswith('/'):
		return 'http://{}{}'.format(domain, src)

	# Relative URL with ./ (drop the dot, keep the slash)
	if src.startswith('./'):
		return 'http://{}{}'.format(domain, src[1:])

	# Relative URL with ? -- or anything else, which we hope is a relative URL
	return 'http://{}/{}'.format(domain, src)

def extract_javascript(domain, source_code):
	'''
	Collect the external scripts referenced by an HTML document

	Every <script> tag with a non-empty `src` attribute contributes one
	URL, normalized against `domain`. Duplicates are dropped and the
	order of the returned list is unspecified.
	'''

	unique_urls = set()
	for tag in BeautifulSoup(source_code, 'html.parser').find_all('script'):
		location = tag.get('src')
		if location:
			unique_urls.add(normalize_url(domain, location))
	return list(unique_urls)

def extract_links(domain, source_code):
	'''
	Collect the anchor targets found in an HTML document

	Every <a> tag with a non-empty `href` attribute contributes one URL,
	normalized against `domain`. Duplicates are dropped and the order of
	the returned list is unspecified.
	'''

	unique_urls = set()
	for anchor in BeautifulSoup(source_code, 'html.parser').find_all('a'):
		target = anchor.get('href')
		if target:
			unique_urls.add(normalize_url(domain, target))
	return list(unique_urls)

def extract_styles(domain, source_code):
	'''
	Collect the <link> targets (CSS and friends) found in an HTML document

	Every <link> tag with a non-empty `href` attribute contributes one
	URL, normalized against `domain`; the `rel` attribute is not
	inspected, so non-stylesheet links are included as well. Duplicates
	are dropped and the order of the returned list is unspecified.
	'''

	unique_urls = set()
	for link in BeautifulSoup(source_code, 'html.parser').find_all('link'):
		target = link.get('href')
		if target:
			unique_urls.add(normalize_url(domain, target))
	return list(unique_urls)

def extract_cors(headers):
	'''
	Extract the origins listed in the Access-Control-Allow-Origin header

	:param headers: mapping of response header names to values
	:return: list of origins with surrounding whitespace removed; an
		empty list when the header is absent, empty or a wildcard
	'''

	raw = headers.get('Access-Control-Allow-Origin')
	if not raw:
		# Header not sent at all -- nothing to report instead of a KeyError
		return []

	# Strip each entry so "a, b" and "a,b" give the same result and a
	# padded wildcard (" * ") is still recognised
	cors = [origin.strip() for origin in raw.split(',')]
	cors = [origin for origin in cors if origin]
	if '*' in cors:
		# Use your imagination here
		return []
	return cors

def extract_domain(url):
	'''Return the network location (netloc) component of the given URL'''

	parsed = urlparse(url)
	return parsed.netloc

if __name__ == '__main__':
	# This is sample of intended functionality
	# ----
	# Note that there is a missing functionality for showing
	# origin domain name where takeover was discovered (if any)
	# ----

	domains = [] # Database retrieval
	results = {}
	for d in domains:
		# Reset for every domain: without this a failed fetch either raises
		# NameError (first domain) or silently re-uses the response that
		# belongs to the previous domain.
		r = None
		for prefix in ['http://', 'https://']:
			# Trying both HTTP and HTTPS where HTTPS has higher priority
			# (Thus second in the list)
			try:
				r = requests.get('{}{}'.format(prefix, d), timeout=GLOBAL_HTTP_TIMEOUT, verify=False, headers=UA)
			except requests.exceptions.RequestException:
				# Connection refused / NXDOMAIN / timeout / ...
				# Keep whatever the other scheme returned (if anything).
				continue

		if r is None:
			# Neither scheme answered
			continue

		urls = extract_javascript(d, r.text)
		urls += extract_links(d, r.text)
		urls += extract_styles(d, r.text)
		urls += extract_cors(r.headers)

		# takeoverable = subdomain_takeover.check([extract_domain(u) for u in urls])
		# ...
	# coding=utf-8
	# python3

	from urllib.parse import urlparse

	import requests
	import urllib3

	from bs4 import BeautifulSoup

	# Disable SSL insecure warnings
	urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

	# Timeout (passed as `timeout=` to every requests call below)
	GLOBAL_HTTP_TIMEOUT = 7

	# Browser-like User-Agent header sent with every request ("OPSEC")
	UA = {
		'User-Agent': (
			"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
			"(KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36"
		),
	}

	def normalize_url(domain, src):
		'''
		(Try to) Normalize URL to its absolute form

		Surrounding whitespace and trailing slashes are removed from `src`
		first. Absolute http(s) URLs are returned as they are; every other
		form is resolved against `domain` using the `http` scheme.

		:param domain: host name the URL was found on (without scheme)
		:param src: raw URL, e.g. the value of a `src` / `href` attribute
		:return: absolute URL as a string
		'''

		src = src.strip().rstrip('/')

		# Absolute URL. URI schemes are case-insensitive, so a value such as
		# "HTTPS://cdn.example.org/x.js" must not be treated as a relative
		# path (that would hide the external host behind `domain`).
		if src.lower().startswith(('http://', 'https://')):
			return src

		# Protocol relative URL
		if src.startswith('//'):
			return 'http:{}'.format(src)

		# Relative URL with /
		if src.startswith('/'):
			return 'http://{}{}'.format(domain, src)

		# Relative URL with ./ (drop the dot, keep the slash)
		if src.startswith('./'):
			return 'http://{}{}'.format(domain, src[1:])

		# Relative URL with ? -- or anything else, which we hope is a relative URL
		return 'http://{}/{}'.format(domain, src)

	def extract_javascript(domain, source_code):
		'''
		Collect the external scripts referenced by an HTML document

		Every <script> tag with a non-empty `src` attribute contributes one
		URL, normalized against `domain`. Duplicates are dropped and the
		order of the returned list is unspecified.
		'''

		unique_urls = set()
		for tag in BeautifulSoup(source_code, 'html.parser').find_all('script'):
			location = tag.get('src')
			if location:
				unique_urls.add(normalize_url(domain, location))
		return list(unique_urls)

	def extract_links(domain, source_code):
		'''
		Collect the anchor targets found in an HTML document

		Every <a> tag with a non-empty `href` attribute contributes one URL,
		normalized against `domain`. Duplicates are dropped and the order of
		the returned list is unspecified.
		'''

		unique_urls = set()
		for anchor in BeautifulSoup(source_code, 'html.parser').find_all('a'):
			target = anchor.get('href')
			if target:
				unique_urls.add(normalize_url(domain, target))
		return list(unique_urls)

	def extract_styles(domain, source_code):
		'''
		Collect the <link> targets (CSS and friends) found in an HTML document

		Every <link> tag with a non-empty `href` attribute contributes one
		URL, normalized against `domain`; the `rel` attribute is not
		inspected, so non-stylesheet links are included as well. Duplicates
		are dropped and the order of the returned list is unspecified.
		'''

		unique_urls = set()
		for link in BeautifulSoup(source_code, 'html.parser').find_all('link'):
			target = link.get('href')
			if target:
				unique_urls.add(normalize_url(domain, target))
		return list(unique_urls)

	def extract_cors(headers):
		'''
		Extract the origins listed in the Access-Control-Allow-Origin header

		:param headers: mapping of response header names to values
		:return: list of origins with surrounding whitespace removed; an
			empty list when the header is absent, empty or a wildcard
		'''

		raw = headers.get('Access-Control-Allow-Origin')
		if not raw:
			# Header not sent at all -- nothing to report instead of a KeyError
			return []

		# Strip each entry so "a, b" and "a,b" give the same result and a
		# padded wildcard (" * ") is still recognised
		cors = [origin.strip() for origin in raw.split(',')]
		cors = [origin for origin in cors if origin]
		if '*' in cors:
			# Use your imagination here
			return []
		return cors

	def extract_domain(url):
		'''Return the network location (netloc) component of the given URL'''

		parsed = urlparse(url)
		return parsed.netloc

	if __name__ == '__main__':
		# This is sample of intended functionality
		# ----
		# Note that there is a missing functionality for showing
		# origin domain name where takeover was discovered (if any)
		# ----

		domains = [] # Database retrieval
		results = {}
		for d in domains:
			# Reset for every domain: without this a failed fetch either
			# raises NameError (first domain) or silently re-uses the
			# response that belongs to the previous domain.
			r = None
			for prefix in ['http://', 'https://']:
				# Trying both HTTP and HTTPS where HTTPS has higher priority
				# (Thus second in the list)
				try:
					r = requests.get('{}{}'.format(prefix, d), timeout=GLOBAL_HTTP_TIMEOUT, verify=False, headers=UA)
				except requests.exceptions.RequestException:
					# Connection refused / NXDOMAIN / timeout / ...
					# Keep whatever the other scheme returned (if anything).
					continue

			if r is None:
				# Neither scheme answered
				continue

			urls = extract_javascript(d, r.text)
			urls += extract_links(d, r.text)
			urls += extract_styles(d, r.text)
			urls += extract_cors(r.headers)

			# takeoverable = subdomain_takeover.check([extract_domain(u) for u in urls])
			# ...