Search a website for broken links
The script below recursively crawls a site starting from initial_url, follows every link on each page it can fetch, and prints any URL that comes back with HTTP 404.
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def find_dead_links(base_url, url, visited_urls, counter):
    # Convert the relative URL to an absolute URL
    absolute_url = urljoin(base_url, url)

    # Skip URLs that have already been visited
    if absolute_url in visited_urls:
        return counter

    # Add the URL to the set of visited URLs
    visited_urls.add(absolute_url)

    # Check if the URL has a supported scheme (http or https)
    parsed_url = urlparse(absolute_url)
    if parsed_url.scheme not in ('http', 'https'):
        # print(f'Skipping unsupported scheme for {absolute_url}')
        return counter

    # Send a GET request to the URL; the timeout keeps the crawl from
    # hanging, and network errors are reported instead of crashing the run
    try:
        response = requests.get(absolute_url, timeout=10)
    except requests.RequestException as exc:
        print(f'Could not reach {absolute_url}: {exc}')
        return counter

    # Increment the counter for each page searched
    counter += 1
    # print(f'Searching page {counter}: {absolute_url}')

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the HTML content of the page
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all links on the page
        links = soup.find_all('a', href=True)

        # Recurse into each link and check whether it results in a 404 error
        for link in links:
            counter = find_dead_links(absolute_url, link['href'], visited_urls, counter)
    elif response.status_code == 404:
        print(f'Dead link found on page {counter}: {absolute_url}')
    else:
        # print(f'Error accessing {absolute_url}. Status code: {response.status_code}')
        pass

    return counter

# Replace 'https://www.example.com/' with the URL of the website you want to check
initial_url = 'https://www.example.com/'
visited_urls = set()
counter = find_dead_links(initial_url, initial_url, visited_urls, 0)
print(f'\nTotal pages searched: {counter}')
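As written, the crawler follows every link it finds, including links that leave the starting site, so the page count can balloon well beyond the site under test; each page also adds a level of recursion, so very large sites may hit Python's recursion limit. A minimal sketch of a same-domain guard, assuming the check is added inside find_dead_links right after absolute_url is computed; the helper name is_internal is illustrative, not part of the original script:

from urllib.parse import urlparse

# Host of the site being checked; must match initial_url above.
ROOT_NETLOC = urlparse('https://www.example.com/').netloc

def is_internal(url):
    # Treat a link as internal when it resolves to the same host as the
    # start URL (allowing subdomains would need a looser comparison).
    return urlparse(url).netloc == ROOT_NETLOC

# Inside find_dead_links, right after absolute_url is computed:
#     if not is_internal(absolute_url):
#         return counter

If you still want outbound links verified, issuing a single non-recursive requests.get (or requests.head) on external URLs would flag dead outbound links without crawling other sites.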