Created May 25, 2023 18:14
This Python script scrapes a provided sitemap URL, extracts the page URLs, and saves them to a CSV file. It supports both regular sitemaps and sitemap index files, handling nested sitemaps recursively. The script uses the requests library for HTTP requests and the BeautifulSoup library (with the lxml XML parser) for parsing.
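The script takes no command-line arguments; edit the sitemap_urls list at the bottom, then run the file. Assuming Python 3 and pip, the dependencies install with:

pip install requests beautifulsoup4 lxml

(The lxml package provides the "lxml-xml" parser that the script passes to BeautifulSoup.)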
import csv
from os.path import exists

import requests
from bs4 import BeautifulSoup


def scrape_sitemap(url):
    """
    Scrapes the provided sitemap URL and returns a list of extracted data.
    Handles both regular sitemaps and sitemap index files.
    """
    # Send a GET request to the sitemap URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the XML sitemap using BeautifulSoup
        soup = BeautifulSoup(response.content, "lxml-xml")
        # Check if the sitemap is a sitemap index file
        if soup.sitemapindex:
            # Extract all the <sitemap> elements
            sitemaps = soup.find_all("sitemap")
            # Initialize a list to store the extracted data
            data = []
            for sitemap in sitemaps:
                # Extract the URL of each nested sitemap
                sitemap_url = sitemap.find("loc").text.strip()
                # Scrape the nested sitemap and append its data to the list
                data += scrape_sitemap(sitemap_url)
            return data
        else:
            # Find all <url> elements in the sitemap
            urls = soup.find_all("url")
            # Initialize a list to store the extracted data
            data = []
            for url_element in urls:
                # Extract the page URL from each <url> element
                loc = url_element.find("loc").text.strip()
                # Split the URL to get the site name and page file name
                split_url = loc.split("/")
                site_name = split_url[2]
                # A trailing slash leaves an empty last segment
                if split_url[-1] == "":
                    file_name = split_url[-2]
                else:
                    file_name = split_url[-1]
                # Append the extracted data to the list
                data.append([site_name, file_name, loc])
            return data
    else:
        # If the sitemap URL is invalid or returns an error, return an empty list
        print("Valid sitemap not found for " + url)
        return []


def save_to_csv(data, filename):
    """
    Writes the extracted data to a CSV file with the specified filename.
    """
    # First, read the previous data if the file exists
    previous_data = []
    if exists(filename):
        with open(filename, "r", newline="") as file:
            reader = csv.reader(file)
            previous_data = list(reader)

    # Write the previous and newly extracted data to the CSV file
    with open(filename, "w", newline="") as file:
        writer = csv.writer(file)
        if not previous_data:
            writer.writerow(["Site Name", "File Name", "Page URL"])
        else:
            writer.writerows(previous_data)
        writer.writerows(data)
    print(f"Data saved to {filename}")


# Add all the sites you want to scrape to this list
sitemap_urls = [
    "https://www.multiverseapp.com/sitemap.xml",
]

number_of_sites_scraped = 0
for sitemap_url in sitemap_urls:
    # Scrape the sitemap and get the extracted data
    try:
        extracted_data = scrape_sitemap(sitemap_url)
        number_of_sites_scraped += 1
    except Exception as error:
        print("Failed to scrape " + sitemap_url + ": " + str(error))
        continue

    if extracted_data:
        # Save the extracted data to a CSV file
        save_to_csv(extracted_data, "sitemap_data.csv")
    else:
        print("Failed to scrape the sitemap. Please check the URL.")

print("Completed scraping " + str(number_of_sites_scraped) + " sitemap(s)")