Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save HQJaTu/cd66cf659b8ee633685b43c5e7e92f05 to your computer and use it in GitHub Desktop.
Save HQJaTu/cd66cf659b8ee633685b43c5e7e92f05 to your computer and use it in GitHub Desktop.
Parse and dump a sitemap (using Python)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# Inspired by Craig Addyman
# Enhanced by Viktor Petersson / @vpetersson
# Enhanced by Jari Turkia / @HQJaTu
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse
def get_sitemap(url, timeout=30):
    """Download a sitemap and return its body as text.

    Args:
        url: Address of the sitemap to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The response body as a string when the server answers with
        HTTP 200; otherwise ``None``, after printing a message.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Treat transport-level failures (bad URL, DNS, timeout, ...) the
        # same way as a non-200 answer: report it and return None.
        response = None

    if response is not None and response.status_code == 200:
        return response.text

    print('Unable to fetch sitemap: %s.' % url)
    return None
def process_sitemap(s):
    """Extract the text of every ``<loc>`` element from sitemap markup.

    Args:
        s: Sitemap document as a string. ``None`` or an empty string is
            accepted and yields no entries.

    Returns:
        A list with the whitespace-stripped text of each ``<loc>``
        element, in document order.
    """
    # A failed download is passed along as None; there is nothing to parse.
    if not s:
        return []

    soup = BeautifulSoup(s, 'lxml')
    # Pretty-printed sitemaps may wrap the URL in whitespace/newlines.
    return [loc.text.strip() for loc in soup.find_all('loc')]
def is_sub_sitemap(url):
    """Tell whether a URL looks like a nested sitemap file.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` when the path component of the URL ends with ``.xml``
        and contains the word ``sitemap``; ``False`` otherwise.
    """
    # Only the path is examined, so query strings and fragments do not
    # affect the decision.
    path = urlparse(url).path
    return path.endswith('.xml') and 'sitemap' in path
def parse_sitemap(s):
    """Collect all page URLs from a sitemap, following nested sitemaps.

    Args:
        s: Markup of the top-level sitemap, as a string.

    Returns:
        A list of the URLs that are not themselves sub-sitemaps. Entries
        are taken from the end of the pending list, so the order is not
        document order.
    """
    pending = process_sitemap(s)
    result = []
    fetched = set()  # sub-sitemaps already downloaded

    while pending:
        candidate = pending.pop()

        if not is_sub_sitemap(candidate):
            result.append(candidate)
            continue

        # Sitemap indexes that reference each other would otherwise be
        # downloaded over and over without ever terminating.
        if candidate in fetched:
            continue
        fetched.add(candidate)

        sub_sitemap = get_sitemap(candidate)
        if not sub_sitemap:
            # Download failed (already reported by get_sitemap); skip it.
            continue
        pending.extend(process_sitemap(sub_sitemap))

    return result
def main():
    """Fetch the sitemap named on the command line and print its URLs.

    The sitemap URL is read from the first command-line argument. Each
    URL found is printed to standard output on its own line, prefixed
    with a running number. Nothing is written to a file.
    """
    import sys  # local import: only this entry point needs it

    if len(sys.argv) < 2:
        print('Usage: %s <sitemap-url>' % sys.argv[0], file=sys.stderr)
        return

    sitemap = get_sitemap(sys.argv[1])
    if sitemap is None:
        # get_sitemap has already reported the failure.
        return

    for url_count, url in enumerate(parse_sitemap(sitemap), start=1):
        print("%5d) %s" % (url_count, url))
# Run the dump only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Copy link

Sorry for the question, I'm new to Python.
Where does the code dump the sitemap?
Do I need to add any code to write it to a file?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment