Last active
March 25, 2019 06:15
-
-
Save wasi0013/639b097f8603510bbf2af148bed98a8a to your computer and use it in GitHub Desktop.
Find all the broken links on a website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests_html | |
def crawl(base_url):
    """Crawl *base_url* and return a list of broken links.

    A link counts as "broken" when the GET request raises a network
    error or returns a non-200 status code.  Only pages whose URL
    contains the (scheme-stripped) base URL are expanded further;
    external links are checked once but not crawled.

    Parameters
    ----------
    base_url : str
        Starting URL, e.g. ``"https://example.com"``.

    Returns
    -------
    list[str]
        URLs that failed to load (network error or non-200 status).
    """
    broken_links = []
    # Reuse a single session for every request (connection pooling);
    # the original created a new HTMLSession per URL inside the loop.
    session = requests_html.HTMLSession()
    try:
        r = session.get(base_url)
    # NOTE(review): `requests.RequestException` would be tighter, but the
    # file only imports requests_html; Exception still fixes the original
    # bare `except:` which also swallowed KeyboardInterrupt/SystemExit.
    except Exception:
        return [base_url]
    if r.status_code != 200:
        return [base_url]
    # BUG FIX: the original called base_url.replace(...) and discarded the
    # result (str is immutable), so the scheme was never stripped and the
    # same-site check below compared against the full "https://..." form.
    base_netloc = base_url.replace("http://", "").replace("https://", "")
    visit = set(r.html.absolute_links)
    visited = set()  # set membership is O(1); the original list was O(n)
    while visit:
        url = visit.pop()
        # Mark as visited *before* fetching so pages discovered while this
        # URL is in flight cannot re-queue it (original could re-fetch).
        visited.add(url)
        try:
            r = session.get(url)
        except Exception:
            broken_links.append(url)
            continue
        if r.status_code != 200:
            broken_links.append(url)
        # Only expand links found on pages belonging to the start site.
        if base_netloc in url:
            for link in r.html.absolute_links:
                # Skip links already seen or already queued.
                if link not in visited and link not in visit:
                    visit.add(link)
        print(url, r.status_code)
    return broken_links
# Guard the crawl behind the standard entry-point check so importing this
# module (e.g. to reuse crawl()) does not trigger a full site crawl.
if __name__ == "__main__":
    print(crawl("https://wasi0013.com"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment