Skip to content

Instantly share code, notes, and snippets.

@martingaido
Created May 25, 2023 20:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martingaido/7d50c38c0c344874ad756f18cd1bd5f8 to your computer and use it in GitHub Desktop.
Save martingaido/7d50c38c0c344874ad756f18cd1bd5f8 to your computer and use it in GitHub Desktop.
Python script to create sitemap.xml
from collections import deque
from urllib.parse import urlparse, urljoin
from xml.sax.saxutils import escape

import requests
from bs4 import BeautifulSoup
def _is_same_site(candidate, site):
    """Return True when two parsed URLs share the same scheme and host."""
    # Comparing parsed components (instead of a string prefix) keeps
    # look-alike hosts such as "https://example.com.evil.com" out.
    return (candidate.scheme, candidate.netloc.lower()) == (
        site.scheme,
        site.netloc.lower(),
    )


def _discover_urls(start_url, timeout):
    """Crawl breadth-first from *start_url* and return the set of same-site URLs found.

    Pages that fail to download are reported on stdout and skipped; the
    URL itself stays in the returned set because it was discovered.
    """
    site = urlparse(start_url)
    start_url = start_url.split("#")[0]

    # Seeding the start URL keeps it in the sitemap and prevents a second
    # download when another page links back to it.
    discovered = {start_url}
    # An explicit queue replaces recursion so large sites cannot exhaust
    # the interpreter's recursion limit.
    pending = deque([start_url])

    while pending:
        page_url = pending.popleft()
        print("Crawling:", page_url)
        try:
            # Without a timeout a single stalled server blocks forever.
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print("Error:", e)
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if not href:
                # Anchors without a target carry no link to follow.
                continue
            try:
                # Relative links are resolved against the page they were
                # found on (after redirects), not the site root.
                absolute_url = urljoin(response.url, href).split("#")[0]
                candidate = urlparse(absolute_url)
            except ValueError:
                # Malformed hrefs (e.g. broken IPv6 literals) are ignored.
                continue
            if absolute_url in discovered or not _is_same_site(candidate, site):
                continue
            discovered.add(absolute_url)
            pending.append(absolute_url)

    return discovered


def _render_sitemap(urls):
    """Return the sitemap XML document listing *urls* in sorted order."""
    parts = [
        "<?xml version='1.0' encoding='UTF-8'?>\n",
        "<urlset xmlns='http://www.sitemaps.org/schemas/sitemap/0.9'>\n",
    ]
    # URLs may contain "&", "<" or ">", which must be entity-escaped to
    # keep the document well-formed; sorting makes the output reproducible.
    parts.extend(f" <url><loc>{escape(u)}</loc></url>\n" for u in sorted(urls))
    parts.append("</urlset>")
    return "".join(parts)


def create_sitemap(url, *, output_path="sitemap.xml", timeout=10):
    """Crawl the site that *url* belongs to and write a sitemap file.

    Starting at *url*, every ``<a href>`` pointing at the same scheme and
    host is followed. All URLs discovered this way (fragments removed)
    are written as ``<url><loc>`` entries to *output_path*.

    Args:
        url: Absolute URL at which crawling starts.
        output_path: File the sitemap XML is written to.
        timeout: Seconds to wait for each HTTP request.

    Progress and request errors are printed to stdout; request errors do
    not abort the crawl.
    """
    urls = _discover_urls(url, timeout)

    # The XML declaration promises UTF-8, so the file is written as UTF-8
    # regardless of the platform's default encoding.
    with open(output_path, "w", encoding="utf-8") as file:
        file.write(_render_sitemap(urls))

    print(f"Sitemap created: {output_path}")
# Usage example: only crawl when the file is executed as a script, so that
# importing this module does not trigger network requests or file writes.
if __name__ == "__main__":
    create_sitemap("https://www.example.com")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment