Python script to check for broken links on a website: https://scripteverything.com/check-broken-links-on-website-without-using-scrapy/
# Inspiration from: https://www.scrapingbee.com/blog/crawling-python/
from typing import List, Tuple, Optional
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup, ResultSet
from requests import Response
from requests.exceptions import SSLError, ConnectionError, ReadTimeout

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Crawler:
    def __init__(self, seed_urls: Optional[List[str]] = None, accepted_domains: Optional[List[str]] = None,
                 denied_domains: Optional[List[str]] = None, crawl_depth_limit: int = 0,
                 output_file: str = 'report.csv', timeout_length_sec: int = 5):
        self.output_file: str = output_file
        self.checked_urls: List[Tuple[int, str]] = []
        # None defaults avoid sharing mutable default lists between instances
        self.seed_urls: List[str] = seed_urls or []
        self.accepted_domains: List[str] = accepted_domains or []
        self.denied_domains: List[str] = denied_domains or []
        self.crawl_depth_limit: int = crawl_depth_limit
        self.timeout_length_sec: int = timeout_length_sec

    def download_url(self, url: str) -> Tuple[int, str]:
        """
        Fetches the URL and returns the HTTP status code along with the HTML body,
        or a pseudo status code of 499 and the URL if the request failed.
        :param url: URL to fetch
        :return: status_code, HTML (or the URL on error)
        """
        try:
            res: Response = requests.get(url, timeout=self.timeout_length_sec)
        except SSLError:
            # Retry without certificate verification (InsecureRequestWarning is suppressed above)
            res: Response = requests.get(url, verify=False, timeout=self.timeout_length_sec)
        except (ReadTimeout, ConnectionError):
            # Record the failure so the URL is not requested again
            self.checked_urls.append((499, url))
            return 499, url
        status: int = res.status_code
        self.checked_urls.append((status, url))
        html: str = res.text
        return status, html

    @staticmethod
    def validate_links(links_list: List, url: str, attr: str) -> List[str]:
        """
        Loops through links_list and returns the URLs found in the given attribute,
        resolving root-relative paths against the page URL.
        :param links_list: tags found by BeautifulSoup
        :param url: URL of the page the tags were found on
        :param attr: attribute to read from each tag, e.g. 'href' or 'src'
        :return: list of URLs
        """
        result: List[str] = []
        if len(links_list) == 0:
            return result
        for link in links_list:
            path = link.get(attr)
            if path is None:
                continue
            elif path.startswith('/'):
                result.append(urljoin(url, path))
            else:
                result.append(path)
        return result

    def get_links(self, url: str, html: str) -> List[str]:
        """
        Parses the HTML and returns the anchor (href) and image (src) URLs found on the page.
        :param url: URL the HTML was fetched from
        :param html: HTML body to parse
        :return: list of URLs
        """
        result: List[str] = []
        soup: BeautifulSoup = BeautifulSoup(html, 'html.parser')
        # 1. Anchor tags, attribute = href
        anchor_list: ResultSet = soup.find_all('a')
        result += self.validate_links(anchor_list, url, 'href')
        # 2. Image tags, attribute = src
        img_list: ResultSet = soup.find_all('img')
        result += self.validate_links(img_list, url, 'src')
        return result

    def is_denied_url(self, url: str) -> bool:
        """
        Is the URL within one of the denied domains?
        :param url: URL to check
        :return: True if the URL matches a denied domain
        """
        for denied_url in self.denied_domains:
            if denied_url in url:
                return True
        return False

    def is_accepted_url(self, url: str) -> bool:
        """
        Is the URL within one of the accepted domains?
        :param url: URL to check
        :return: True if the URL matches an accepted domain
        """
        for accepted_url in self.accepted_domains:
            if accepted_url in url:
                return True
        return False

    def is_already_listed(self, url: str) -> bool:
        """
        Has the URL already been checked?
        :param url: URL to check
        :return: True if the URL has already been recorded in checked_urls
        """
        for checked in self.checked_urls:
            if checked[1] == url:
                return True
        return False

    def append_to_file(self, line: str):
        """
        Appends a result line to the report file.
        :param line: line to write to file
        :return: None
        """
        with open(self.output_file, 'a') as f:
            f.write(line)

    def crawl(self, source_url: str, from_url: Optional[str] = None, depth: int = 0, follow: bool = True):
        """
        Downloads source_url, reports broken or insecure links to the output file and,
        when follow is True, recursively crawls the links found on the page.
        :param source_url: URL to check
        :param from_url: URL the link was found on
        :param depth: current crawl depth
        :param follow: whether to crawl the links found on this page
        :return: None
        """
        status, html = self.download_url(source_url)
        if status in [400, 401, 402, 403, 404, 410, 499, 500, 501, 510]:
            output: str = f'{status},{from_url},{source_url}\n'
            self.append_to_file(output)
            return None
        # If the link uses HTTP instead of HTTPS then report it
        if source_url.startswith('http://'):
            output: str = f'{status} HTTP CALL,{from_url},{source_url}\n'
            self.append_to_file(output)
        # Simple log of progress
        if from_url is not None:
            print(f"Crawling: {status} {source_url} from {from_url}")
        if follow:
            links: List[str] = self.get_links(source_url, html)
            for url in links:
                # Blank link?
                if url is None:
                    print(f"Empty link on {source_url}")
                    continue
                # Resolve relative links against the current page
                full_url: str = urljoin(source_url, url)
                # If the URL is on the denied list, check it but do not follow it
                if self.is_denied_url(full_url):
                    self.crawl(full_url, source_url, follow=False)
                    continue
                # Have we already checked this URL?
                if self.is_already_listed(full_url):
                    continue
                # External links (outside the accepted domains) are checked but not followed
                if not self.is_accepted_url(full_url):
                    self.crawl(full_url, source_url, follow=False)
                # Follow internal links while the crawl depth limit (0 = unlimited) allows it
                elif self.crawl_depth_limit == 0 or depth < self.crawl_depth_limit:
                    self.crawl(full_url, source_url, depth + 1, follow=True)
                else:
                    self.crawl(full_url, source_url, follow=False)

    def run(self):
        """Crawls each of the seed URLs in turn."""
        for url in self.seed_urls:
            self.crawl(url)


if __name__ == '__main__':
    urls: List[str] = [
        'https://example.com/'
    ]
    follow_domains: List[str] = [
        'example.com'
    ]
    denied_domains: List[str] = []
    Crawler(urls, follow_domains, denied_domains).run()
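
For reference, a minimal sketch of how the remaining constructor parameters can be combined. This is not part of the original gist, and the seed URL, domains, depth limit, report path and timeout below are placeholder values to swap for your own:

Crawler(
    seed_urls=['https://example.com/'],
    accepted_domains=['example.com'],   # only links within these domains are followed recursively
    denied_domains=['facebook.com'],    # placeholder: these links are checked once but never followed
    crawl_depth_limit=2,                # 0 means no depth limit
    output_file='broken_links.csv',     # placeholder report path
    timeout_length_sec=10,
).run()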