This Python script scrapes a provided sitemap URL, extracts the page URLs, and saves them to a CSV file. It supports both regular sitemaps and sitemap index files, recursing into nested sitemaps as needed. The script uses the requests library for HTTP requests and BeautifulSoup (with the lxml XML parser) for parsing the sitemap XML.
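Dependencies: requests, beautifulsoup4, and lxml (for example, pip install requests beautifulsoup4 lxml). The lxml package is required because BeautifulSoup is asked for the "lxml-xml" parser.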
import requests
import csv
from bs4 import BeautifulSoup
from os.path import exists

def scrape_sitemap(url):
    """
    Scrapes the provided sitemap URL and returns a list of extracted data.
    Handles both regular sitemaps and sitemap index files.
    """
    # Send a GET request to the sitemap URL (with a timeout so a dead host can't hang the script)
    response = requests.get(url, timeout=30)
    if response.status_code == 200:
        # Parse the XML sitemap using BeautifulSoup
        soup = BeautifulSoup(response.content, "lxml-xml")
        # Check if the sitemap is a sitemap index file
        if soup.sitemapindex:
            # Extract all the <sitemap> elements
            sitemaps = soup.find_all("sitemap")
            # Initialize a list to store the extracted data
            data = []
            for sitemap in sitemaps:
                # Extract the URL of each sitemap
                sitemap_url = sitemap.find("loc").text.strip()
                # Scrape the nested sitemap and append the extracted data to the list
                data += scrape_sitemap(sitemap_url)
            return data
        else:
            # Find all <url> elements in the sitemap
            urls = soup.find_all("url")
            # Initialize a list to store the extracted data
            data = []
            for url in urls:
                # Extract the required information from each <url> element
                loc = url.find("loc").text.strip()
                # Split the URL to get the site name and page file name
                split_url = loc.split("/")
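                # For example (hypothetical URL), "https://www.example.com/blog/post/".split("/")
                # yields ['https:', '', 'www.example.com', 'blog', 'post', ''], so index 2 is
                # the host name and the last non-empty segment is the page file name.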
                site_name = split_url[2]
                if split_url[-1] == '':
                    file_name = split_url[-2]
                else:
                    file_name = split_url[-1]
                # Append the extracted data to the list
                data.append([site_name, file_name, loc])
            return data
    else:
        # If the sitemap URL is invalid or returns an error, report it and return an empty list
        print("Valid sitemap not found for " + url)
        return []

def save_to_csv(data, filename):
    """
    Writes the extracted data to a CSV file with the specified filename.
    """
    # First, read the previous data if the file exists
    previous_data = []
    if exists(filename):
        with open(filename, "r", newline="") as file:
            reader = csv.reader(file)
            previous_data = list(reader)
    # Write the extracted data to a CSV file
    with open(filename, "w", newline="") as file:
        writer = csv.writer(file)
        if not previous_data:
            writer.writerow(["Site Name", "File Name", "Page URL"])
        else:
            writer.writerows(previous_data)
        writer.writerows(data)
    print(f"Data saved to {filename}")

# Add all the sites you want to scrape to this list
sitemap_urls = [
    "https://www.multiverseapp.com/sitemap.xml",
]

number_of_sites_scraped = 0
for sitemap_url in sitemap_urls:
    # Scrape the sitemap and get the extracted data
    try:
        extracted_data = scrape_sitemap(sitemap_url)
        number_of_sites_scraped += 1
    except Exception as error:
        print("Failed to scrape " + sitemap_url + ": " + str(error))
        continue
    if extracted_data:
        # Save the extracted data to a CSV file
        save_to_csv(extracted_data, "sitemap_data.csv")
    else:
        print("Failed to scrape the sitemap. Please check the URL.")
print("Completed scraping " + str(number_of_sites_scraped) + " sitemap(s)")