Skip to content

Instantly share code, notes, and snippets.

@meatyite
Last active April 12, 2019 17:04
Show Gist options
  • Save meatyite/a9aa0fda272053c215b8ccb8383a8fa9 to your computer and use it in GitHub Desktop.
http_tree.py
#!/usr/bin/python3
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse, urljoin
import urllib.request as urllib
import sys
from os.path import basename
already_scanned_urls = []
def isDirnameURL(url):
    """Return True if *url* looks like a directory/page rather than a plain file.

    True when the path ends with a slash (empty basename), has a page
    extension (.html/.php/.aspx), or the basename is just a query string.
    """
    # Idiom fix: return the boolean expression directly instead of
    # if/else returning True/False; endswith accepts a tuple of suffixes.
    name = basename(url)
    return (name == ''
            or url.endswith(('.html', '.php', '.aspx'))
            or name.startswith('?'))
def search_for_url_type(soup, main_url, urlType):
    """Print (main_url, resolved_url) pairs for every *urlType* tag in *soup*.

    Tags with a ``src`` attribute are resolved and printed only; tags with an
    ``href`` attribute are resolved, printed, and crawled recursively via
    search_for_links(), with the global ``already_scanned_urls`` list
    preventing revisits.
    """
    for tag in soup.find_all(urlType):
        tag_src = tag.get('src')
        tag_href = tag.get('href')
        # Bug fixes vs original:
        #  - urljoin(base, url): the original called urljoin(tag_src, main_url)
        #    with the arguments reversed, producing wrong absolute URLs.
        #  - the original's third branch (src AND href both set) was
        #    unreachable, because the src-only branch already fired whenever
        #    tag_src was set; it has been removed.
        #  - 'if tag != None' removed: find_all() never yields None.
        if tag_src is not None:
            tag_src = urljoin(main_url, tag_src)
            print((main_url, tag_src))
        elif tag_href is not None:
            try:
                tag_href = urljoin(main_url, tag_href)
                if tag_href not in already_scanned_urls:
                    already_scanned_urls.append(tag_href)
                    search_for_links(tag_href)
            except Exception:
                # Best-effort crawl: a page that fails to fetch/parse is
                # skipped, but its URL is still printed below.
                pass
            print((main_url, tag_href))
def replace_url(url):
    """Rewrite any 'https://' occurrence in *url* to 'http://'.

    NOTE(review): the original comment claimed urllib cannot handle HTTPS;
    modern urllib can, but the plain-HTTP downgrade is preserved as-is here.
    """
    secure_scheme = 'https://'
    plain_scheme = 'http://'
    return url.replace(secure_scheme, plain_scheme)
def _search_for_links(url):
    """Fetch *url*, parse the HTML, and report link-bearing tags by category.

    For each tag category a heading is printed, then search_for_url_type()
    prints (and, for hrefs, recursively crawls) the URLs found in that tag.
    """
    print("URL:" + url)
    soup = bs(urllib.urlopen(url).read(), 'html.parser')
    # Heading/tag pairs replace the original's six copy-pasted print+call
    # blocks; order and output are identical.
    sections = (
        ("PAGE URLS:", 'a'),
        ("IMAGE URLS:", 'img'),
        ("VIDEO URLS:", 'video'),
        ("IFRAME URLS:", 'iframe'),
        ("DIV URLS:", 'div'),
        ("SCRIPT URLS:", 'script'),
    )
    for heading, tag_name in sections:
        print(heading)
        search_for_url_type(soup, url, tag_name)
def search_for_links(url):
    """Crawl *url* after downgrading its scheme to plain HTTP."""
    _search_for_links(replace_url(url))
if __name__ == "__main__":
    # Robustness fix: the original indexed sys.argv[1] unconditionally and
    # crashed with IndexError when run without arguments.
    if len(sys.argv) < 2:
        print("usage: http_tree.py [url]")
        sys.exit(1)
    page_url = sys.argv[1]
    if page_url == '-h' or page_url == '--help':
        print("usage: http_tree.py [url]")
        sys.exit(0)
    search_for_links(page_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment