Created May 25, 2023 18:14
This Python script scrapes a provided sitemap URL, extracts the page URLs, and saves them to a CSV file. It supports both regular sitemaps and sitemap index files, handling nested sitemaps recursively. The script uses the requests library for HTTP requests and the BeautifulSoup library (with the lxml XML parser) for parsing.
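The script takes no command-line arguments; edit the sitemap_urls list at the bottom, then run the file. Assuming Python 3 and pip, the dependencies install with:

pip install requests beautifulsoup4 lxml

(The lxml package provides the "lxml-xml" parser that the script passes to BeautifulSoup.)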
import csv
from os.path import exists

import requests
from bs4 import BeautifulSoup


def scrape_sitemap(url):
    """
    Scrapes the provided sitemap URL and returns a list of extracted data.
    Handles both regular sitemaps and sitemap index files.
    """
    # Send a GET request to the sitemap URL
    response = requests.get(url)
    if response.status_code == 200:
        # Parse the XML sitemap using BeautifulSoup
        soup = BeautifulSoup(response.content, "lxml-xml")
        # Check if the sitemap is a sitemap index file
        if soup.sitemapindex:
            # Extract all the <sitemap> elements
            sitemaps = soup.find_all("sitemap")
            # Initialize a list to store the extracted data
            data = []
            for sitemap in sitemaps:
                # Extract the URL of each nested sitemap
                sitemap_url = sitemap.find("loc").text.strip()
                # Scrape the nested sitemap and append its data to the list
                data += scrape_sitemap(sitemap_url)
            return data
        else:
            # Find all <url> elements in the sitemap
            urls = soup.find_all("url")
            # Initialize a list to store the extracted data
            data = []
            for url_element in urls:
                # Extract the page URL from each <url> element
                loc = url_element.find("loc").text.strip()
                # Split the URL to get the site name and page file name
                split_url = loc.split("/")
                site_name = split_url[2]
                # A trailing slash leaves an empty last segment
                if split_url[-1] == "":
                    file_name = split_url[-2]
                else:
                    file_name = split_url[-1]
                # Append the extracted data to the list
                data.append([site_name, file_name, loc])
            return data
    else:
        # If the sitemap URL is invalid or returns an error, return an empty list
        print("Valid sitemap not found for " + url)
        return []


def save_to_csv(data, filename):
    """
    Writes the extracted data to a CSV file with the specified filename.
    """
    # First, read the previous data if the file exists
    previous_data = []
    if exists(filename):
        with open(filename, "r", newline="") as file:
            reader = csv.reader(file)
            previous_data = list(reader)

    # Write the previous and newly extracted data to the CSV file
    with open(filename, "w", newline="") as file:
        writer = csv.writer(file)
        if not previous_data:
            writer.writerow(["Site Name", "File Name", "Page URL"])
        else:
            writer.writerows(previous_data)
        writer.writerows(data)
    print(f"Data saved to {filename}")


# Add all the sites you want to scrape to this list
sitemap_urls = [
    "https://www.multiverseapp.com/sitemap.xml",
]

number_of_sites_scraped = 0
for sitemap_url in sitemap_urls:
    # Scrape the sitemap and get the extracted data
    try:
        extracted_data = scrape_sitemap(sitemap_url)
        number_of_sites_scraped += 1
    except Exception as error:
        print("Failed to scrape " + sitemap_url + ": " + str(error))
        continue

    if extracted_data:
        # Save the extracted data to a CSV file
        save_to_csv(extracted_data, "sitemap_data.csv")
    else:
        print("Failed to scrape the sitemap. Please check the URL.")

print("Completed scraping " + str(number_of_sites_scraped) + " sitemap(s)")