Skip to content

Instantly share code, notes, and snippets.

@pkdavies
Created May 31, 2023 20:57
Show Gist options
  • Save pkdavies/40836b343db5de3f69469f016f055abb to your computer and use it in GitHub Desktop.
Save pkdavies/40836b343db5de3f69469f016f055abb to your computer and use it in GitHub Desktop.
Extract URLs from a sitemap.xml
import requests
from xml.etree import ElementTree as ET
import random
def extract_urls(sitemap_url, urls=None, *, timeout=30, _visited=None):
    """Collect every URL listed in a sitemap, following sitemap indexes.

    The document at ``sitemap_url`` is downloaded and parsed as XML. Each
    ``<sitemap>`` entry (as found in a sitemap index) is fetched recursively;
    the text of every other ``<loc>`` element is appended to the result.

    Args:
        sitemap_url: Address of a sitemap or sitemap index document.
        urls: Optional list to append to. A new list is created when omitted.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block the crawl indefinitely.
        _visited: Internal set of sitemap addresses already fetched during
            this crawl; callers should not pass it.

    Returns:
        The list of extracted URLs (the same object as ``urls`` when given).

    Raises:
        requests.HTTPError: If a sitemap responds with an error status.
        requests.RequestException: On connection failures or timeouts.
        xml.etree.ElementTree.ParseError: If a response is not valid XML.
    """
    if urls is None:
        urls = []
    if _visited is None:
        _visited = set()

    # Sitemap indexes that reference each other would otherwise recurse
    # without bound, so each address is fetched at most once per crawl.
    if sitemap_url in _visited:
        return urls
    _visited.add(sitemap_url)

    well_known_user_agents = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    ]
    headers = {
        'User-Agent': random.choice(well_known_user_agents)
    }

    # Fetch the sitemap content
    response = requests.get(sitemap_url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # Parse the sitemap XML
    root = ET.fromstring(response.content)

    # <loc> elements that belong to a <sitemap> entry name another sitemap,
    # not a page. iter() yields a parent before its children, so they are
    # recorded here and skipped when the walk reaches them.
    index_locs = set()
    for elem in root.iter():
        # Tags look like "{namespace}name"; compare the bare local name.
        local_name = elem.tag.rsplit("}", 1)[-1]
        if local_name == "sitemap":
            loc = elem.find("{*}loc")
            if loc is not None:
                index_locs.add(loc)
                child_sitemap = (loc.text or "").strip()
                if child_sitemap:
                    extract_urls(
                        child_sitemap,
                        urls,
                        timeout=timeout,
                        _visited=_visited,
                    )
        elif local_name == "loc" and elem not in index_locs:
            page_url = (elem.text or "").strip()
            if page_url:
                urls.append(page_url)
    return urls
# Replace with your sitemap URL
sitemap_url = "https://example.com/sitemap.xml"

# Only crawl when executed as a script, so importing this module for
# extract_urls does not trigger network requests.
if __name__ == "__main__":
    urls = extract_urls(sitemap_url)
    print("Extracted URLs:")
    for url in urls:
        print(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment