import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to find the sitemap URL on a website (checking with and without "www.")
def find_sitemap(base_url):
    # Construct potential sitemap URLs based on common locations
    common_sitemap_locations = [
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/sitemap/sitemap.xml",
        "/sitemap/sitemap_index.xml",
    ]

    # Ensure "https://" protocol prefix
    if not base_url.startswith("https://"):
        base_url = "https://" + base_url

    # Remove "www." prefix if present
    base_url_without_www = base_url.replace("www.", "")

    # Combine base URLs with common sitemap locations for both variations
    sitemap_urls = [urljoin(base_url, location) for location in common_sitemap_locations]
    sitemap_urls += [urljoin(base_url_without_www, location) for location in common_sitemap_locations]

    # Check each potential sitemap URL and return the first one that responds with 200
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    for sitemap_url in sitemap_urls:
        try:
            response = requests.head(sitemap_url, headers=headers, allow_redirects=True, timeout=10)
        except requests.exceptions.RequestException:
            continue
        if response.status_code == 200:
            return sitemap_url

    return None
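
# Example usage (hypothetical domain; the result depends on which of the common
# paths above the site actually serves):
#   find_sitemap("example.com")      -> "https://example.com/sitemap.xml" or None
#   find_sitemap("www.example.com")  -> checks both the "www." and bare variants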

# Function to parse the sitemap XML and extract links
def parse_sitemap(sitemap_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    response = requests.get(sitemap_url, headers=headers, timeout=10)
    # The "xml" parser requires the lxml package to be installed
    soup = BeautifulSoup(response.content, "xml")
    locs = soup.find_all("loc")
    urls = [loc.text.strip() for loc in locs]
    return urls
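
# Note: parse_sitemap() returns the <loc> entries as-is. If the URL points to a
# sitemap *index*, those entries are child sitemaps rather than pages. A minimal
# sketch for flattening one level of nesting (hypothetical helper, not used below):
#
# def expand_sitemap(sitemap_url):
#     urls = []
#     for entry in parse_sitemap(sitemap_url):
#         urls.extend(parse_sitemap(entry) if entry.endswith(".xml") else [entry])
#     return urls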

# Step 1: Input the website URL you want to scan
website_url = input("Enter the website URL: ").strip()

# Step 2: If no protocol was given, try both variations (with and without "www.")
if not website_url.startswith("https://"):
    website_url_with_www = "https://www." + website_url.replace("www.", "")
    website_url_without_www = "https://" + website_url.replace("www.", "")
    sitemap_url = find_sitemap(website_url_with_www) or find_sitemap(website_url_without_www)
else:
    # Step 3: Find the sitemap URL on the website
    sitemap_url = find_sitemap(website_url)

if sitemap_url:
    print(f"Found sitemap at: {sitemap_url}")

    # Step 4: Parse the sitemap and extract links
    sitemap_links = parse_sitemap(sitemap_url)

    # Step 5: Export the list of links to a CSV file
    with open("sitemap_links.csv", "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["URL"])
        writer.writerows([[url] for url in sitemap_links])

    print("Sitemap links have been saved to 'sitemap_links.csv'")
else:
    print("No sitemap found on the provided website.")

# Second Script
# read the CSV file and extract all URLs (skipping the header row)
urls = []
with open("sitemap_links.csv", "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the "URL" header written by the first script
    for row in reader:
        if row:
            urls.append(row[0])

# Common class names and element names for headers and footers to ignore
ignore_list = [
    ".header",
    ".site-header",
    ".navbar",
    ".topbar",
    ".main-header",
    ".logo",
    ".site-logo",
    ".menu",
    ".main-menu",
    ".navigation",
    ".nav",
    ".branding",
    ".footer",
    ".site-footer",
    ".bottombar",
    ".main-footer",
    ".copyright",
    ".footer-menu",
    ".contact-info",
    ".social-icons",
    ".copyright-text",
]
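
# Note: each entry is a CSS class selector. In the loop below, elements matching
# any of these selectors are removed from the parsed page (together with their
# contents) before links and buttons are collected.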

# create a CSV file and write the data to it
with open("detected_links.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Page URL", "Page", "Type", "Text", "Href"])

    # iterate over all URLs
    total_links = 0
    pages_visited = set()
    for page_url in urls:
        try:
            response = requests.get(page_url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        html = response.content
        soup = BeautifulSoup(html, "html.parser")

        # Ignore links and buttons within header and footer elements by removing
        # those containers before collecting <a> and <button> elements
        for element in soup.select(", ".join(ignore_list)):
            element.decompose()
        links = soup.select("a, button")

        for link in links:
            if link.has_attr("href"):
                link_type = "Link"
                if link.has_attr("class") and any(cls in link["class"] for cls in ["btn", "button", "as-button", "as-btn"]):
                    link_type = "Link as Button"
                full_url = urljoin(page_url, link["href"])
                writer.writerow([page_url, soup.title.string if soup.title else "No title", link_type, link.text.strip(), full_url])
                total_links += 1
        pages_visited.add(page_url)
        print(f"Scanned {len(pages_visited)} out of {len(urls)} URLs, found {total_links} links", end="\r")

print()
print("Scanning completed.")
print(f"Total unique pages visited: {len(pages_visited)}")
print(f"Total links found: {total_links}")