import csv
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Function to find the sitemap URL on a website (checking with and without "www.")
def find_sitemap(base_url):
    # Construct potential sitemap URLs based on common locations
    common_sitemap_locations = [
        "/sitemap.xml",
        "/sitemap_index.xml",
        "/sitemap/sitemap.xml",
        "/sitemap/sitemap_index.xml",
    ]

    # Ensure "https://" protocol prefix
    if not base_url.startswith("https://"):
        base_url = "https://" + base_url

    # Remove "www." prefix if present
    base_url_without_www = base_url.replace("www.", "")

    # Combine base URLs with common sitemap locations for both variations
    sitemap_urls = [urljoin(base_url, location) for location in common_sitemap_locations]
    sitemap_urls += [urljoin(base_url_without_www, location) for location in common_sitemap_locations]

    # Check each potential sitemap URL and return the first one that responds with 200
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    for sitemap_url in sitemap_urls:
        try:
            response = requests.head(sitemap_url, headers=headers, allow_redirects=True, timeout=10)
        except requests.exceptions.RequestException:
            continue
        if response.status_code == 200:
            return sitemap_url

    return None
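
# Example usage (hypothetical domain; the result depends on which of the common
# paths above the site actually serves):
#   find_sitemap("example.com")      -> "https://example.com/sitemap.xml" or None
#   find_sitemap("www.example.com")  -> checks both the "www." and bare variants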

# Function to parse the sitemap XML and extract links
def parse_sitemap(sitemap_url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    response = requests.get(sitemap_url, headers=headers, timeout=10)
    # The "xml" parser requires the lxml package to be installed
    soup = BeautifulSoup(response.content, "xml")
    locs = soup.find_all("loc")
    urls = [loc.text.strip() for loc in locs]
    return urls
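
# Note: parse_sitemap() returns the <loc> entries as-is. If the URL points to a
# sitemap *index*, those entries are child sitemaps rather than pages. A minimal
# sketch for flattening one level of nesting (hypothetical helper, not used below):
#
# def expand_sitemap(sitemap_url):
#     urls = []
#     for entry in parse_sitemap(sitemap_url):
#         urls.extend(parse_sitemap(entry) if entry.endswith(".xml") else [entry])
#     return urls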

# Step 1: Input the website URL you want to scan
website_url = input("Enter the website URL: ").strip()

# Step 2: If no protocol was given, try both variations (with and without "www.")
if not website_url.startswith("https://"):
    website_url_with_www = "https://www." + website_url.replace("www.", "")
    website_url_without_www = "https://" + website_url.replace("www.", "")
    sitemap_url = find_sitemap(website_url_with_www) or find_sitemap(website_url_without_www)
else:
    # Step 3: Find the sitemap URL on the website
    sitemap_url = find_sitemap(website_url)

if sitemap_url:
    print(f"Found sitemap at: {sitemap_url}")

    # Step 4: Parse the sitemap and extract links
    sitemap_links = parse_sitemap(sitemap_url)

    # Step 5: Export the list of links to a CSV file
    with open("sitemap_links.csv", "w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["URL"])
        writer.writerows([[url] for url in sitemap_links])

    print("Sitemap links have been saved to 'sitemap_links.csv'")
else:
    print("No sitemap found on the provided website.")

# Second Script
# read the CSV file and extract all URLs (skipping the header row)
urls = []
with open("sitemap_links.csv", "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # skip the "URL" header written by the first script
    for row in reader:
        if row:
            urls.append(row[0])

# Common class names and element names for headers and footers to ignore
ignore_list = [
    ".header",
    ".site-header",
    ".navbar",
    ".topbar",
    ".main-header",
    ".logo",
    ".site-logo",
    ".menu",
    ".main-menu",
    ".navigation",
    ".nav",
    ".branding",
    ".footer",
    ".site-footer",
    ".bottombar",
    ".main-footer",
    ".copyright",
    ".footer-menu",
    ".contact-info",
    ".social-icons",
    ".copyright-text",
]
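
# Note: each entry is a CSS class selector. In the loop below, elements matching
# any of these selectors are removed from the parsed page (together with their
# contents) before links and buttons are collected.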

# create a CSV file and write the data to it
with open("detected_links.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Page URL", "Page", "Type", "Text", "Href"])

    # iterate over all URLs
    total_links = 0
    pages_visited = set()
    for page_url in urls:
        try:
            response = requests.get(page_url, timeout=10)
        except requests.exceptions.RequestException:
            continue
        html = response.content
        soup = BeautifulSoup(html, "html.parser")

        # Ignore links and buttons within header and footer elements by removing
        # those containers before collecting <a> and <button> elements
        for element in soup.select(", ".join(ignore_list)):
            element.decompose()
        links = soup.select("a, button")

        for link in links:
            if link.has_attr("href"):
                link_type = "Link"
                if link.has_attr("class") and any(cls in link["class"] for cls in ["btn", "button", "as-button", "as-btn"]):
                    link_type = "Link as Button"
                full_url = urljoin(page_url, link["href"])
                writer.writerow([page_url, soup.title.string if soup.title else "No title", link_type, link.text.strip(), full_url])
                total_links += 1
        pages_visited.add(page_url)
        print(f"Scanned {len(pages_visited)} out of {len(urls)} URLs, found {total_links} links", end="\r")

print()
print("Scanning completed.")
print(f"Total unique pages visited: {len(pages_visited)}")
print(f"Total links found: {total_links}")