Search a website for broken links
The script below recursively crawls a site starting from initial_url, follows every link on each page it can fetch, and prints any URL that comes back with HTTP 404.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def find_dead_links(base_url, url, visited_urls, counter):
    # Convert the relative URL to an absolute URL
    absolute_url = urljoin(base_url, url)

    # Skip URLs that have already been visited
    if absolute_url in visited_urls:
        return counter

    # Add the URL to the set of visited URLs
    visited_urls.add(absolute_url)

    # Check if the URL has a supported scheme (http or https)
    parsed_url = urlparse(absolute_url)
    if parsed_url.scheme not in ('http', 'https'):
        # print(f'Skipping unsupported scheme for {absolute_url}')
        return counter

    # Send a GET request to the URL; the timeout keeps the crawl from
    # hanging, and network errors are reported instead of crashing the run
    try:
        response = requests.get(absolute_url, timeout=10)
    except requests.RequestException as exc:
        print(f'Could not reach {absolute_url}: {exc}')
        return counter

    # Increment the counter for each page searched
    counter += 1
    # print(f'Searching page {counter}: {absolute_url}')

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links on the page
        links = soup.find_all('a', href=True)

        # Recurse into each link and check whether it results in a 404 error
        for link in links:
            counter = find_dead_links(absolute_url, link['href'], visited_urls, counter)
    elif response.status_code == 404:
        print(f'Dead link found on page {counter}: {absolute_url}')
    else:
        # print(f'Error accessing {absolute_url}. Status code: {response.status_code}')
        pass

    return counter

# Replace 'https://www.example.com/' with the URL of the website you want to check
initial_url = 'https://www.example.com/'
visited_urls = set()
counter = find_dead_links(initial_url, initial_url, visited_urls, 0)
print(f'\nTotal pages searched: {counter}')
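As written, the crawler follows every link it finds, including links that leave the starting site, so the page count can balloon well beyond the site under test; each page also adds a level of recursion, so very large sites may hit Python's recursion limit. A minimal sketch of a same-domain guard, assuming the check is added inside find_dead_links right after absolute_url is computed; the helper name is_internal is illustrative, not part of the original script:

from urllib.parse import urlparse

# Host of the site being checked; must match initial_url above.
ROOT_NETLOC = urlparse('https://www.example.com/').netloc

def is_internal(url):
    # Treat a link as internal when it resolves to the same host as the
    # start URL (allowing subdomains would need a looser comparison).
    return urlparse(url).netloc == ROOT_NETLOC

# Inside find_dead_links, right after absolute_url is computed:
#     if not is_internal(absolute_url):
#         return counter

If you still want outbound links verified, issuing a single non-recursive requests.get (or requests.head) on external URLs would flag dead outbound links without crawling other sites.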