Last active
September 17, 2023 10:11
-
-
Save NoWorries/12e2687880379317e9af6f1c7a18c343 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import csv | |
from urllib.parse import urlparse, urljoin | |
# Read the sitemap CSV and collect the URL from the first column of each row.
urls = []
with open("sitemap_links.csv", "r", newline="") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # Guard against blank rows, which would raise IndexError on row[0].
        if row:
            urls.append(row[0])
# CSS class selectors for page chrome whose links should not be scanned.
# Split into header/navigation and footer groups for readability; the
# scanner consumes the combined list.
_HEADER_SELECTORS = [
    ".header", ".site-header", ".navbar", ".topbar", ".main-header",
    ".logo", ".site-logo", ".menu", ".main-menu", ".navigation",
    ".nav", ".branding",
]
_FOOTER_SELECTORS = [
    ".footer", ".site-footer", ".bottombar", ".main-footer", ".copyright",
    ".footer-menu", ".contact-info", ".social-icons", ".copyright-text",
]
ignore_list = _HEADER_SELECTORS + _FOOTER_SELECTORS
# Scan every URL, collect its links/buttons, and write them to detected_links.csv.
with open("detected_links.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Page URL", "Page", "Type", "Text", "Href"])

    total_links = 0        # rows written so far
    pages_visited = set()  # unique URLs successfully fetched

    for page_url in urls:
        try:
            # Timeout prevents one unresponsive server from hanging the run;
            # raise_for_status skips error pages (404/500) instead of parsing them.
            response = requests.get(page_url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException:
            # Best-effort crawl: skip unreachable pages and keep going.
            continue

        soup = BeautifulSoup(response.content, "html.parser")

        # BUG FIX: the original selector `a:not(.header, ...)` only excluded
        # anchors that *themselves* carried an ignored class. To actually skip
        # links *inside* header/footer containers (the stated intent), remove
        # those containers from the parse tree before collecting links.
        for selector in ignore_list:
            for element in soup.select(selector):
                element.decompose()

        for link in soup.select("a, button"):
            if not link.has_attr("href"):
                continue
            # Anchors styled as buttons are reported under a separate type.
            link_type = "Link"
            if link.has_attr("class") and any(
                cls in link["class"] for cls in ["btn", "button", "as-button", "as-btn"]
            ):
                link_type = "Link as Button"
            full_url = urljoin(page_url, link["href"])  # resolve relative hrefs
            writer.writerow([
                page_url,
                soup.title.string if soup.title else "No title",
                link_type,
                link.text.strip(),
                full_url,
            ])
            total_links += 1

        pages_visited.add(page_url)
        # end="\r" keeps the progress report on a single console line.
        print(f"Scanned {len(pages_visited)} out of {len(urls)} URLs, found {total_links} links", end="\r")

print("Scanning completed.")
print(f"Total unique pages visited: {len(pages_visited)}")
print(f"Total links found: {total_links}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Setting Up the Environment and Dependencies
Install Python:
If you don't already have Python installed, you can download and install it from the official website: Python Downloads. Choose the appropriate version for your operating system (Windows, macOS, or Linux).
Install Dependencies:
Open a terminal or command prompt and navigate to the directory where your script will be located. Then, run the following commands to install the required Python packages:
These commands will install the
requests
and beautifulsoup4
packages, which are necessary for the script to work.

Running the Script
Prepare the CSV File:
Ensure that you have a CSV file named
sitemap_links.csv
in the same directory as your script. This CSV file should contain a list of URLs you want to scan. Each URL should be in a separate row in the first column.

Edit the Ignore List (Optional):
If you want to customize the elements that the script ignores (such as specific header or footer classes or element names), you can edit the ignore_list in the script. Open the script using a text editor, and you'll find the ignore_list variable. Add or remove class names or element names as needed.
For example, if you want to ignore links within elements with the class
my-header
and my-footer
, you can modify the ignore_list like this:

Save the script after making your changes.
Run the Script:
Open a terminal or command prompt, navigate to the directory where your script is located, and run the script by entering the following command:
`python scan_pages.py`
Replace "scan_pages.py" with the actual name of your script file. The script will start scanning the URLs in the "sitemap_links.csv" file, collecting links, and saving the results in a new CSV file named "detected_links.csv."
View the Results:
Once the script has finished scanning, you can open the "detected_links.csv" file to view the collected data. This file will contain information about the links found on each page, including their type, text, and href attributes.