Created
April 2, 2020 14:56
-
-
Save Oisann/e90e1478924fc5bed7b8dc0e04b3a359 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.parse import urldefrag, urljoin, urlparse

from requests_html import HTMLSession
# Crawl target. main() starts at f"{scheme}://{domain}" and findLinks() only
# fetches URLs it considers part of `domain`.
scheme = "https"
domain = "example.com"

# Shared crawl state, mutated in place by findLinks(). These stay plain lists
# because findLinks() records entries with .append().
checked = []  # URLs that answered with a 2xx status
failed = []  # URLs that were off-site, returned an error status, or raised
prettyFail = []  # [status_code, url] entries printed by main() at the end
def main():
    """Crawl the configured site from its front page and report failures.

    Builds the start URL from the module-level ``scheme`` and ``domain``,
    runs ``findLinks`` inside an ``HTMLSession``, then prints every
    ``[status_code, url]`` entry collected in ``prettyFail``.
    """
    frontpage = f"{scheme}://{domain}"
    with HTMLSession() as s:
        findLinks(s, frontpage)

    print("Found these failed URLs:")
    for status, url in prettyFail:
        print(f" - {status}: {url}")
def findLinks(s, url, max_retries=3):
    """Crawl ``url`` and every in-site page reachable from it.

    Results are recorded in the module-level lists:

    * ``checked``    - URLs that answered with a 2xx status,
    * ``failed``     - off-site URLs, URLs with an error status, and URLs
                       whose request or link extraction raised,
    * ``prettyFail`` - ``[status_code, url]`` for every error status.

    Args:
        s: Session-like object; ``s.get(url)`` must return a response that
            exposes ``status_code`` and ``html.links`` (main() passes an
            ``HTMLSession``).
        url: Absolute URL to start from.
        max_retries: How many times a URL answering 504 is requested again
            before it is recorded as failed.
    """
    site = domain.lower()
    # Explicit work list instead of recursion: a deep site would otherwise
    # exceed the interpreter's recursion limit.
    pending = [url]
    queued = {url}

    while pending:
        current = pending.pop()

        try:
            # .hostname is lower-cased and excludes port and credentials.
            host = urlparse(current).hostname or ""
        except ValueError:
            host = ""  # unparseable URL: cannot be fetched, handled as off-site

        # Exact host or a real subdomain only; a plain suffix match would
        # also accept e.g. "notexample.com" for "example.com".
        if host != site and not host.endswith("." + site):
            failed.append(current)
            print(f"Found external link: {current}")
            continue

        print(f"Checking {current}")
        try:
            r = s.get(current)
            retries_left = max_retries
            # 504 is treated as transient, but only for a bounded number of
            # attempts so a permanently broken gateway cannot loop forever.
            while r.status_code == 504 and retries_left > 0:
                print(f"Oops... {r.status_code} from {current}")
                retries_left -= 1
                r = s.get(current)
            ok = 200 <= r.status_code < 300
            links = r.html.links if ok else ()
        except Exception as exc:
            # Not a bare except: KeyboardInterrupt/SystemExit must still be
            # able to stop the crawl.
            failed.append(current)
            print(f"Request for {current} failed: {exc!r}")
            continue

        if not ok:
            failed.append(current)
            prettyFail.append([r.status_code, current])
            continue

        checked.append(current)
        for link in links:
            try:
                # urljoin leaves absolute links alone and resolves relative
                # ones; the fragment is dropped because it names the same page.
                target = urldefrag(urljoin(current, link)).url
            except ValueError:
                target = link  # malformed href: recorded as failed when popped
            if target in queued or target in checked or target in failed:
                continue
            queued.add(target)
            pending.append(target)
# Start the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment