|
# Description: This script crawls a website starting from a base URL and saves every discovered path to paths.txt.

# It skips any path that ends with one of the extensions given via --exclude-extensions.

# It also follows at most --max_links_per_page links from each page.

# It can be used to test a website for broken links.

# Be careful not to overload the website with requests; you could get banned.
|
|
|
# Example: python ValidateLinksScraping.py https://webscraper.io/test-sites --max_links_per_page 100 --exclude-extensions pdf,jpg,png |
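
# Requirements: pip install requests beautifulsoup4 lxml  (lxml provides the parser used by BeautifulSoup below)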
|
# Author: Kuba Andrysek |
|
# Website: https://kubaandrysek.com |
|
# Date: 2023-05-34 |
|
|
|
import requests |
|
from bs4 import BeautifulSoup |
|
from urllib.parse import urlparse, urljoin |
|
import argparse |
|
|
|
class WebScraper: |
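    # Simple breadth-first crawler: it follows same-domain links and records every page URL it visits.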
|
def __init__(self, base_url, max_links_per_page=50, exclude_extensions=None): |
|
self.base_url = base_url |
|
self.domain_name = urlparse(base_url).netloc |
|
self.visited_pages = set() |
|
self.max_links_per_page = max_links_per_page |
|
self.pages_to_visit = [base_url] |
|
self.paths = [] |
|
self.exclude_extensions = exclude_extensions |
|
|
|
def valid_url(self, url): |
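        # A link is followed only if it stays on the starting domain, has no excluded extension, and contains no '#' fragment.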
|
parsed = urlparse(url) |
|
return (parsed.netloc == self.domain_name and |
|
not self.has_exclude_extension(url) and |
|
                '#' not in url)
|
|
|
def has_exclude_extension(self, url): |
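        # Returns True if the URL ends with one of the excluded file extensions.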
|
if not self.exclude_extensions: |
|
return False |
|
|
|
        # Normalize the comma-separated list (strip whitespace and any leading dot),
        # then match on a real ".ext" suffix so a path like "/view-pdf" is not excluded by mistake.
        extensions = [ext.strip().lstrip('.').lower() for ext in self.exclude_extensions.split(',')]

        return any(url.lower().endswith('.' + ext) for ext in extensions)
|
|
|
def visit_pages(self): |
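        # Breadth-first crawl: pop the next URL from the queue, record it, and enqueue the links found on that page.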
|
try: |
|
while self.pages_to_visit: |
|
url = self.pages_to_visit.pop(0) |
|
if url in self.visited_pages or not self.valid_url(url): |
|
continue |
|
|
|
                response = requests.get(url, timeout=10)  # timeout (10 s, an assumed value) so one unresponsive page cannot hang the crawl
|
if response.status_code != 200: |
|
print(f">>>>>>>>>>>Page {url} returned status code {response.status_code} <<<<<<<<<<<<") |
|
# response.raise_for_status() # Will raise an exception for 4xx or 5xx status codes |
|
|
|
self.visited_pages.add(url) |
|
soup = BeautifulSoup(response.content, 'lxml') |
|
|
|
# Add full path to paths list |
|
self.paths.append(url) |
|
|
|
links = soup.find_all('a') |
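                # Only the first max_links_per_page anchors on each page are followed.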
|
for link in links[:self.max_links_per_page]: |
|
href = link.get('href') |
|
if href: |
|
full_url = urljoin(url, href) |
|
if self.valid_url(full_url): |
|
self.pages_to_visit.append(full_url) |
|
|
|
print(f"Currently exploring: {url}") |
|
except KeyboardInterrupt: |
|
print("\nInterrupted by user. Saving progress...") |
|
print(f"Done. Found {len(self.paths)} paths and saved to paths.txt") |
|
|
|
def save_to_file(self): |
|
# Sort paths and write to file |
|
self.paths.sort() |
|
with open('paths.txt', 'w') as f: |
|
for path in self.paths: |
|
f.write(path + '\n') |
|
|
|
def parse_arguments(): |
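    # Parse the base URL and the optional crawl limits from the command line.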
|
parser = argparse.ArgumentParser(description='Web scraper parameters.') |
|
parser.add_argument('base_url', help='Base URL to start scraping') |
|
parser.add_argument('--max_links_per_page', type=int, default=50, help='Maximum number of links to visit per page') |
|
parser.add_argument('--exclude-extensions', help='Comma-separated list of file extensions to exclude') |
|
args = parser.parse_args() |
|
|
|
return args.base_url, args.max_links_per_page, args.exclude_extensions |
|
|
|
|
|
if __name__ == "__main__": |
|
base_url, max_links_per_page, exclude_extensions = parse_arguments() |
|
scraper = WebScraper(base_url, max_links_per_page, exclude_extensions) |
|
scraper.visit_pages() |
|
scraper.save_to_file() |