Last active
April 12, 2019 17:04
-
-
Save meatyite/a9aa0fda272053c215b8ccb8383a8fa9 to your computer and use it in GitHub Desktop.
http_tree.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse, urljoin
import urllib.request as urllib
import sys
from os.path import basename

# Module-level registry of every URL already crawled.  Shared (and mutated)
# by search_for_url_type() to avoid revisiting pages and to stop infinite
# recursion between mutually-linked pages.
already_scanned_urls = []
def isDirnameURL(url):
    """Return True if *url* looks like a page/directory rather than a plain file.

    A URL qualifies when its path basename is empty (trailing slash), when it
    ends with a common HTML/dynamic-page extension, or when the basename is
    only a query string.
    """
    name = basename(url)
    # Idiom fix: return the boolean expression directly instead of the
    # if/else-return-True/False pattern; endswith accepts a tuple of suffixes.
    return name == '' or url.endswith(('.html', '.php', '.aspx')) or name.startswith('?')
def search_for_url_type(soup, main_url, urlType):
    """Print, and recursively crawl, URLs found in tags of type *urlType*.

    For each matching tag in *soup*:
      - a ``src`` attribute is resolved against *main_url* and printed;
      - otherwise an ``href`` attribute is resolved, crawled once via
        search_for_links() (best effort), and printed.

    Side effects: prints tuples to stdout and appends newly seen URLs to the
    module-level ``already_scanned_urls`` list.
    """
    for tag in soup.find_all(urlType):
        tag_src = tag.get('src')
        tag_href = tag.get('href')
        if tag_src is not None:
            # Bug fix: urljoin(base, url) — the original passed the arguments
            # reversed (urljoin(tag_src, main_url)), producing wrong URLs.
            tag_src = urljoin(main_url, tag_src)
            print((main_url, tag_src))
        elif tag_href is not None:
            try:
                tag_href = urljoin(main_url, tag_href)
                if tag_href not in already_scanned_urls:
                    already_scanned_urls.append(tag_href)
                    # Recurse into the linked page; failures are ignored so
                    # one dead link does not abort the whole crawl.
                    search_for_links(tag_href)
            except Exception:
                pass
            print((main_url, tag_href))
        # NOTE(review): the original had a third branch for tags carrying both
        # src and href, but it was unreachable (the first branch already
        # handles any tag with a src), so it is removed.  The original's
        # `if tag != None` guard was also always true for find_all results.
def replace_url(url):
    """Rewrite every ``https://`` scheme occurrence in *url* to ``http://``.

    NOTE(review): the original comment claimed urllib cannot handle https;
    the downgrade is kept as-is to preserve behavior, but that claim should
    be re-verified on modern Python.
    """
    downgraded = url.replace('https://', 'http://')
    return downgraded
def _search_for_links(url):
    """Fetch *url*, parse the HTML, and print the URLs found in it,
    grouped under one heading per tag type (pages, images, videos,
    iframes, divs, scripts)."""
    print("URL:" + url)
    soup = bs(urllib.urlopen(url).read(), 'html.parser')
    # Data-driven scan: same headings, same tag types, same order as before.
    sections = (
        ("PAGE URLS:", 'a'),
        ("IMAGE URLS:", 'img'),
        ("VIDEO URLS:", 'video'),
        ("IFRAME URLS:", 'iframe'),
        ("DIV URLS:", 'div'),
        ("SCRIPT URLS:", 'script'),
    )
    for heading, tag_name in sections:
        print(heading)
        search_for_url_type(soup, url, tag_name)
def search_for_links(url):
    """Crawl *url* after forcing its scheme down to plain http."""
    _search_for_links(replace_url(url))
if __name__ == "__main__":
    # Robustness fix: the original indexed sys.argv[1] unconditionally and
    # crashed with IndexError when run with no arguments; now a missing URL
    # prints the usage line, same as -h/--help.
    if len(sys.argv) < 2 or sys.argv[1] in ('-h', '--help'):
        print("usage: http_tree.py [url]")
        sys.exit(0)
    search_for_links(sys.argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment