Last active
September 17, 2023 10:11
-
-
Save NoWorries/12e2687880379317e9af6f1c7a18c343 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
from urllib.parse import urlparse, urljoin | |
# Read the sitemap CSV and collect the URL from the first column of each row.
urls = []
with open("sitemap_links.csv", "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Guard against blank rows, which would raise IndexError on row[0].
        if row:
            urls.append(row[0])
# CSS class selectors for page chrome whose links should not be scanned.
# Split into header/navigation and footer groups for readability; the
# scanner consumes the combined list.
_HEADER_SELECTORS = [
    ".header", ".site-header", ".navbar", ".topbar", ".main-header",
    ".logo", ".site-logo", ".menu", ".main-menu", ".navigation",
    ".nav", ".branding",
]
_FOOTER_SELECTORS = [
    ".footer", ".site-footer", ".bottombar", ".main-footer", ".copyright",
    ".footer-menu", ".contact-info", ".social-icons", ".copyright-text",
]
ignore_list = _HEADER_SELECTORS + _FOOTER_SELECTORS
# Scan every URL, collect its links/buttons, and write them to detected_links.csv.
with open("detected_links.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Page URL", "Page", "Type", "Text", "Href"])

    total_links = 0        # rows written so far
    pages_visited = set()  # unique URLs successfully fetched

    for page_url in urls:
        try:
            # Timeout prevents one unresponsive server from hanging the run;
            # raise_for_status skips error pages (404/500) instead of parsing them.
            response = requests.get(page_url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Best-effort crawl: skip unreachable pages and keep going.
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # BUG FIX: the original selector `a:not(.header, ...)` only excluded
        # anchors that *themselves* carried an ignored class. To actually skip
        # links *inside* header/footer containers (the stated intent), remove
        # those containers from the parse tree before collecting links.
        for selector in ignore_list:
            for element in soup.select(selector):
                element.decompose()

        for link in soup.select("a, button"):
            if not link.has_attr("href"):
                continue
            # Anchors styled as buttons are reported under a separate type.
            link_type = "Link"
            if link.has_attr("class") and any(
                cls in link["class"] for cls in ["btn", "button", "as-button", "as-btn"]
            ):
                link_type = "Link as Button"
            full_url = urljoin(page_url, link["href"])  # resolve relative hrefs
            writer.writerow([
                page_url,
                soup.title.string if soup.title else "No title",
                link_type,
                link.text.strip(),
                full_url,
            ])
            total_links += 1

        pages_visited.add(page_url)
        # end="\r" keeps the progress report on a single console line.
        print(f"Scanned {len(pages_visited)} out of {len(urls)} URLs, found {total_links} links", end="\r")

print("Scanning completed.")
print(f"Total unique pages visited: {len(pages_visited)}")
print(f"Total links found: {total_links}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Setting Up the Environment and Dependencies
Install Python:
If you don't already have Python installed, you can download and install it from the official website: Python Downloads. Choose the appropriate version for your operating system (Windows, macOS, or Linux).
Install Dependencies:
Open a terminal or command prompt and navigate to the directory where your script will be located. Then, run the following commands to install the required Python packages:
These commands will install the
requests
and beautifulsoup4
packages, which are necessary for the script to work.

Running the Script
Prepare the CSV File:
Ensure that you have a CSV file named
sitemap_links.csv
in the same directory as your script. This CSV file should contain a list of URLs you want to scan. Each URL should be in a separate row in the first column.

Edit the Ignore List (Optional):
If you want to customize the elements that the script ignores (such as specific header or footer classes or element names), you can edit the ignore_list in the script. Open the script using a text editor, and you'll find the ignore_list variable. Add or remove class names or element names as needed.
For example, if you want to ignore links within elements with the class
my-header
and my-footer
, you can modify the ignore_list like this:

Save the script after making your changes.
Run the Script:
Open a terminal or command prompt, navigate to the directory where your script is located, and run the script by entering the following command:
`python scan_pages.py`
Replace "scan_pages.py" with the actual name of your script file. The script will start scanning the URLs in the "sitemap_links.csv" file, collecting links, and saving the results in a new CSV file named "detected_links.csv."
View the Results:
Once the script has finished scanning, you can open the "detected_links.csv" file to view the collected data. This file will contain information about the links found on each page, including their type, text, and href attributes.