Python script to check for broken links on a website: https://scripteverything.com/check-broken-links-on-website-without-using-scrapy/
# Inspiration from: https://www.scrapingbee.com/blog/crawling-python/
from typing import List, Tuple, Optional
from urllib.parse import urljoin

import requests
import urllib3
from bs4 import BeautifulSoup, ResultSet
from requests import Response
from requests.exceptions import SSLError, ConnectionError, ReadTimeout

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


class Crawler:
    def __init__(self, seed_urls: Optional[List[str]] = None, accepted_domains: Optional[List[str]] = None,
                 denied_domains: Optional[List[str]] = None, crawl_depth_limit: int = 0,
                 output_file: str = 'report.csv', timeout_length_sec: int = 5):
        self.output_file: str = output_file
        self.checked_urls: List[Tuple[int, str]] = []
        # None defaults avoid sharing mutable default lists between instances
        self.seed_urls: List[str] = seed_urls or []
        self.accepted_domains: List[str] = accepted_domains or []
        self.denied_domains: List[str] = denied_domains or []
        self.crawl_depth_limit: int = crawl_depth_limit
        self.timeout_length_sec: int = timeout_length_sec

    def download_url(self, url: str) -> Tuple[int, str]:
        """
        Fetches the URL and returns the HTTP status code along with the HTML body,
        or a pseudo status code of 499 and the URL if the request failed.
        :param url: URL to fetch
        :return: status_code, HTML (or the URL on error)
        """
        try:
            res: Response = requests.get(url, timeout=self.timeout_length_sec)
        except SSLError:
            # Retry without certificate verification (InsecureRequestWarning is suppressed above)
            res: Response = requests.get(url, verify=False, timeout=self.timeout_length_sec)
        except (ReadTimeout, ConnectionError):
            # Record the failure so the URL is not requested again
            self.checked_urls.append((499, url))
            return 499, url
        status: int = res.status_code
        self.checked_urls.append((status, url))
        html: str = res.text
        return status, html

    @staticmethod
    def validate_links(links_list: List, url: str, attr: str) -> List[str]:
        """
        Loops through links_list and returns the URLs found in the given attribute,
        resolving root-relative paths against the page URL.
        :param links_list: tags found by BeautifulSoup
        :param url: URL of the page the tags were found on
        :param attr: attribute to read from each tag, e.g. 'href' or 'src'
        :return: list of URLs
        """
        result: List[str] = []
        if len(links_list) == 0:
            return result
        for link in links_list:
            path = link.get(attr)
            if path is None:
                continue
            elif path.startswith('/'):
                result.append(urljoin(url, path))
            else:
                result.append(path)
        return result

    def get_links(self, url: str, html: str) -> List[str]:
        """
        Parses the HTML and returns the anchor (href) and image (src) URLs found on the page.
        :param url: URL the HTML was fetched from
        :param html: HTML body to parse
        :return: list of URLs
        """
        result: List[str] = []
        soup: BeautifulSoup = BeautifulSoup(html, 'html.parser')
        # 1. Anchor tags, attribute = href
        anchor_list: ResultSet = soup.find_all('a')
        result += self.validate_links(anchor_list, url, 'href')
        # 2. Image tags, attribute = src
        img_list: ResultSet = soup.find_all('img')
        result += self.validate_links(img_list, url, 'src')
        return result

    def is_denied_url(self, url: str) -> bool:
        """
        Is the URL within one of the denied domains?
        :param url: URL to check
        :return: True if the URL matches a denied domain
        """
        for denied_url in self.denied_domains:
            if denied_url in url:
                return True
        return False

    def is_accepted_url(self, url: str) -> bool:
        """
        Is the URL within one of the accepted domains?
        :param url: URL to check
        :return: True if the URL matches an accepted domain
        """
        for accepted_url in self.accepted_domains:
            if accepted_url in url:
                return True
        return False

    def is_already_listed(self, url: str) -> bool:
        """
        Has the URL already been checked?
        :param url: URL to check
        :return: True if the URL has already been recorded in checked_urls
        """
        for checked in self.checked_urls:
            if checked[1] == url:
                return True
        return False

    def append_to_file(self, line: str):
        """
        Appends a result line to the report file.
        :param line: line to write to file
        :return: None
        """
        with open(self.output_file, 'a') as f:
            f.write(line)

    def crawl(self, source_url: str, from_url: Optional[str] = None, depth: int = 0, follow: bool = True):
        """
        Downloads source_url, reports broken or insecure links to the output file and,
        when follow is True, recursively crawls the links found on the page.
        :param source_url: URL to check
        :param from_url: URL the link was found on
        :param depth: current crawl depth
        :param follow: whether to crawl the links found on this page
        :return: None
        """
        status, html = self.download_url(source_url)
        if status in [400, 401, 402, 403, 404, 410, 499, 500, 501, 510]:
            output: str = f'{status},{from_url},{source_url}\n'
            self.append_to_file(output)
            return None
        # If the link uses HTTP instead of HTTPS then report it
        if source_url.startswith('http://'):
            output: str = f'{status} HTTP CALL,{from_url},{source_url}\n'
            self.append_to_file(output)
        # Simple log of progress
        if from_url is not None:
            print(f"Crawling: {status} {source_url} from {from_url}")
        if follow:
            links: List[str] = self.get_links(source_url, html)
            for url in links:
                # Blank link?
                if url is None:
                    print(f"Empty link on {source_url}")
                    continue
                # Resolve relative links against the current page
                full_url: str = urljoin(source_url, url)
                # If the URL is on the denied list, check it but do not follow it
                if self.is_denied_url(full_url):
                    self.crawl(full_url, source_url, follow=False)
                    continue
                # Have we already checked this URL?
                if self.is_already_listed(full_url):
                    continue
                # External links (outside the accepted domains) are checked but not followed
                if not self.is_accepted_url(full_url):
                    self.crawl(full_url, source_url, follow=False)
                # Follow internal links while the crawl depth limit (0 = unlimited) allows it
                elif self.crawl_depth_limit == 0 or depth < self.crawl_depth_limit:
                    self.crawl(full_url, source_url, depth + 1, follow=True)
                else:
                    self.crawl(full_url, source_url, follow=False)

    def run(self):
        """Crawls each of the seed URLs in turn."""
        for url in self.seed_urls:
            self.crawl(url)


if __name__ == '__main__':
    urls: List[str] = [
        'https://example.com/'
    ]
    follow_domains: List[str] = [
        'example.com'
    ]
    denied_domains: List[str] = []
    Crawler(urls, follow_domains, denied_domains).run()
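
For reference, a minimal sketch of how the remaining constructor parameters can be combined. This is not part of the original gist, and the seed URL, domains, depth limit, report path and timeout below are placeholder values to swap for your own:

Crawler(
    seed_urls=['https://example.com/'],
    accepted_domains=['example.com'],   # only links within these domains are followed recursively
    denied_domains=['facebook.com'],    # placeholder: these links are checked once but never followed
    crawl_depth_limit=2,                # 0 means no depth limit
    output_file='broken_links.csv',     # placeholder report path
    timeout_length_sec=10,
).run()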