This script will scrape a website and save all the paths to a file.

Web scraper for validating links on a website

Description

The scraper starts from the given base URL, follows links within the same domain, and records every page it visits. Paths ending with any extension listed in the --exclude-extensions parameter are skipped, and at most --max_links_per_page links are followed on each page. The resulting list can be used to test a website for broken links. The script requires the requests, beautifulsoup4, and lxml packages.

Warning

Be careful not to overload the website with requests. You could get banned.
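The script fetches pages back to back with no delay. One way to keep the crawl polite is to pause between requests; below is a minimal sketch of a throttled fetch that could stand in for the bare requests.get call in visit_pages. The polite_get helper, the one-second delay, and the timeout are assumptions, not part of the gist.

import time
import requests

# Hypothetical helper (not part of the gist): fetch a URL, then pause so
# consecutive requests are spaced out. Delay and timeout values are assumed.
def polite_get(url, delay_seconds=1.0, timeout=10):
    response = requests.get(url, timeout=timeout)
    time.sleep(delay_seconds)
    return response

if __name__ == "__main__":
    print(polite_get("https://webscraper.io/test-sites").status_code)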

Usage

python ValidateLinksScraping.py https://webscraper.io/test-sites --max_links_per_page 100 --exclude-extensions pdf,jpg,png

Results will be saved in a file called paths.txt in the current directory.

https://webscraper.io/
https://webscraper.io/cloud-scraper
https://webscraper.io/documentation
https://webscraper.io/how-to-videos
https://webscraper.io/pricing
https://webscraper.io/test-sites
https://webscraper.io/test-sites/e-commerce/allinone
https://webscraper.io/test-sites/e-commerce/allinone-popup-links
https://webscraper.io/test-sites/e-commerce/static
https://webscraper.io/tutorials
...
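The crawl itself only prints a warning when a page returns a non-200 status code; a second pass over paths.txt can then report broken links explicitly. A minimal sketch, assuming paths.txt is in the current directory:

import requests

# Re-request every URL recorded in paths.txt and report the ones that do not
# come back with HTTP 200. This is a follow-up check, not part of the gist.
with open('paths.txt') as f:
    urls = [line.strip() for line in f if line.strip()]

for url in urls:
    try:
        status = requests.get(url, timeout=10).status_code
    except requests.RequestException as exc:
        print(f"ERROR {url}: {exc}")
        continue
    if status != 200:
        print(f"{status} {url}")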
# Description: Scrape a website and save all discovered paths to a file.
# Paths ending with an extension listed in --exclude-extensions are skipped,
# and at most --max_links_per_page links are followed on each page.
# The resulting list can be used to test a website for broken links.
# Be careful not to overload the website with requests. You could get banned.
# Example: python ValidateLinksScraping.py https://webscraper.io/test-sites --max_links_per_page 100 --exclude-extensions pdf,jpg,png
# Author: Kuba Andrysek
# Website: https://kubaandrysek.com
# Date: 2023-05-24
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import argparse


class WebScraper:
    def __init__(self, base_url, max_links_per_page=50, exclude_extensions=None):
        self.base_url = base_url
        self.domain_name = urlparse(base_url).netloc
        self.visited_pages = set()
        self.max_links_per_page = max_links_per_page
        self.pages_to_visit = [base_url]
        self.paths = []
        self.exclude_extensions = exclude_extensions

    def valid_url(self, url):
        # Only follow links on the same domain, without excluded extensions or fragments
        parsed = urlparse(url)
        return (parsed.netloc == self.domain_name and
                not self.has_exclude_extension(url) and
                '#' not in url)

    def has_exclude_extension(self, url):
        # True if the URL ends with one of the comma-separated excluded extensions
        if not self.exclude_extensions:
            return False
        extensions = self.exclude_extensions.split(',')
        return any(url.endswith(extension) for extension in extensions)

    def visit_pages(self):
        # Breadth-first crawl starting from the base URL
        try:
            while self.pages_to_visit:
                url = self.pages_to_visit.pop(0)
                if url in self.visited_pages or not self.valid_url(url):
                    continue
                response = requests.get(url)
                if response.status_code != 200:
                    print(f">>>>>>>>>>>Page {url} returned status code {response.status_code} <<<<<<<<<<<<")
                    # response.raise_for_status()  # Would raise an exception for 4xx or 5xx status codes
                self.visited_pages.add(url)
                soup = BeautifulSoup(response.content, 'lxml')
                # Add the full URL of the visited page to the paths list
                self.paths.append(url)
                links = soup.find_all('a')
                for link in links[:self.max_links_per_page]:
                    href = link.get('href')
                    if href:
                        full_url = urljoin(url, href)
                        if self.valid_url(full_url):
                            self.pages_to_visit.append(full_url)
                print(f"Currently exploring: {url}")
        except KeyboardInterrupt:
            print("\nInterrupted by user. Saving progress...")
        print(f"Done. Found {len(self.paths)} paths and saved to paths.txt")

    def save_to_file(self):
        # Sort paths and write them to paths.txt, one per line
        self.paths.sort()
        with open('paths.txt', 'w') as f:
            for path in self.paths:
                f.write(path + '\n')


def parse_arguments():
    parser = argparse.ArgumentParser(description='Web scraper parameters.')
    parser.add_argument('base_url', help='Base URL to start scraping')
    parser.add_argument('--max_links_per_page', type=int, default=50, help='Maximum number of links to visit per page')
    parser.add_argument('--exclude-extensions', help='Comma-separated list of file extensions to exclude')
    args = parser.parse_args()
    return args.base_url, args.max_links_per_page, args.exclude_extensions


if __name__ == "__main__":
    base_url, max_links_per_page, exclude_extensions = parse_arguments()
    scraper = WebScraper(base_url, max_links_per_page, exclude_extensions)
    scraper.visit_pages()
    scraper.save_to_file()
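The class can also be driven from another script instead of the command line. A minimal sketch, assuming the gist is saved as ValidateLinksScraping.py next to the caller:

# Hypothetical programmatic use of the gist's WebScraper class.
from ValidateLinksScraping import WebScraper

scraper = WebScraper(
    "https://webscraper.io/test-sites",
    max_links_per_page=100,
    exclude_extensions="pdf,jpg,png",
)
scraper.visit_pages()
scraper.save_to_file()  # writes paths.txt in the current directory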