HQJaTu/gist:cd66cf659b8ee633685b43c5e7e92f05

## gistfile1.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/)
Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson
Enhanced by Jari Turkia (https://blog.hqcodeshop.fi/) / @HQJaTu
"""

from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse


def get_sitemap(url, timeout=10):
    """Download a sitemap and return its body as text.

    Args:
        url: Address of the sitemap to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests.get`` can block indefinitely on a
            stalled connection.

    Returns:
        The response body as a string when the server answers with HTTP
        200; otherwise ``None``, after printing a diagnostic message.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network-level failures (DNS, refused connection, timeout)
        # the same way as a non-200 answer instead of aborting the crawl.
        response = None

    if response is not None and response.status_code == 200:
        return response.text

    print('Unable to fetch sitemap: %s.' % url)
    return None


def process_sitemap(s):
    """Extract the value of every ``<loc>`` element from sitemap markup.

    Args:
        s: Sitemap document as text. ``None`` or an empty string is
            accepted and produces an empty result.

    Returns:
        A list holding the text of each ``<loc>`` element, with
        surrounding whitespace removed.
    """
    if not s:
        # get_sitemap() returns None when a download fails; there is
        # nothing to parse in that case.
        return []

    soup = BeautifulSoup(s, 'lxml')
    # find_all() is the current spelling; findAll() is a deprecated alias.
    # Pretty-printed sitemaps pad the URL with whitespace, which would
    # defeat later suffix checks on the URL, hence the strip().
    return [loc.text.strip() for loc in soup.find_all('loc')]


def is_sub_sitemap(url):
    """Tell whether *url* looks like a nested sitemap rather than a page.

    Only the path component of the URL is inspected: it has to end in
    ``.xml`` and contain the word ``sitemap``. The host name and query
    string play no part in the decision.
    """
    path = urlparse(url).path
    return 'sitemap' in path and path.endswith('.xml')


def parse_sitemap(s):
    """Collect all page URLs from a sitemap, following nested sitemaps.

    Args:
        s: Text of the top-level sitemap. ``None`` or an empty string
            yields an empty result.

    Returns:
        A list of the URLs that are not themselves sitemaps. Entries are
        taken from the end of the pending list, so the result is not in
        document order.
    """
    if not s:
        return []

    pending = process_sitemap(s)
    visited = set()
    result = []

    while pending:
        candidate = pending.pop()

        if not is_sub_sitemap(candidate):
            result.append(candidate)
            continue

        # A sitemap that lists itself (or a cycle of sitemaps) would
        # otherwise keep this loop running forever.
        if candidate in visited:
            continue
        visited.add(candidate)

        sub_sitemap = get_sitemap(candidate)
        # get_sitemap() returns None on failure; skip that sub-sitemap
        # instead of handing None to the parser.
        if sub_sitemap:
            pending.extend(process_sitemap(sub_sitemap))

    return result


def main(url='https://www.cloudsigma.com/sitemap.xml'):
    """Print a numbered list of every page URL found via a sitemap.

    Args:
        url: Address of the top-level sitemap to crawl.
    """
    sitemap = get_sitemap(url)
    if sitemap is None:
        # get_sitemap() has already printed why the download failed.
        return

    for url_count, page_url in enumerate(parse_sitemap(sitemap), start=1):
        print("%5d) %s" % (url_count, page_url))
    print("-end-of-list-")


# Run the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	Inspired by Craig Addyman (http://www.craigaddyman.com/parse-an-xml-sitemap-with-python/)
	Enhanced by Viktor Petersson (http://viktorpetersson.com) / @vpetersson
	Enhanced by Jari Turkia (https://blog.hqcodeshop.fi/) / @HQJaTu
	"""

	from bs4 import BeautifulSoup
	import requests
	from urllib.parse import urlparse


	def get_sitemap(url, timeout=10):
	    """Download a sitemap and return its body as text.

	    Args:
	        url: Address of the sitemap to download.
	        timeout: Seconds to wait for the server before giving up.
	            Without a timeout, ``requests.get`` can block indefinitely
	            on a stalled connection.

	    Returns:
	        The response body as a string when the server answers with
	        HTTP 200; otherwise ``None``, after printing a diagnostic
	        message.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        # Report network-level failures the same way as a non-200
	        # answer instead of aborting the crawl.
	        response = None

	    if response is not None and response.status_code == 200:
	        return response.text

	    print('Unable to fetch sitemap: %s.' % url)
	    return None


	def process_sitemap(s):
	    """Extract the value of every ``<loc>`` element from sitemap markup.

	    Args:
	        s: Sitemap document as text. ``None`` or an empty string is
	            accepted and produces an empty result.

	    Returns:
	        A list holding the text of each ``<loc>`` element, with
	        surrounding whitespace removed.
	    """
	    if not s:
	        # get_sitemap() returns None when a download fails; there is
	        # nothing to parse in that case.
	        return []

	    soup = BeautifulSoup(s, 'lxml')
	    # find_all() is the current spelling; findAll() is a deprecated
	    # alias. strip() removes the padding pretty-printed sitemaps put
	    # around the URL.
	    return [loc.text.strip() for loc in soup.find_all('loc')]


	def is_sub_sitemap(url):
	    """Tell whether *url* looks like a nested sitemap rather than a page.

	    Only the path component of the URL is inspected: it has to end in
	    ``.xml`` and contain the word ``sitemap``. The host name and query
	    string play no part in the decision.
	    """
	    path = urlparse(url).path
	    return path.endswith('.xml') and 'sitemap' in path


	def parse_sitemap(s):
	    """Collect all page URLs from a sitemap, following nested sitemaps.

	    Args:
	        s: Text of the top-level sitemap. ``None`` or an empty string
	            yields an empty result.

	    Returns:
	        A list of the URLs that are not themselves sitemaps. Entries
	        are taken from the end of the pending list, so the result is
	        not in document order.
	    """
	    if not s:
	        return []

	    pending = process_sitemap(s)
	    visited = set()
	    result = []

	    while pending:
	        candidate = pending.pop()

	        if not is_sub_sitemap(candidate):
	            result.append(candidate)
	            continue

	        # A sitemap that lists itself (or a cycle of sitemaps) would
	        # otherwise keep this loop running forever.
	        if candidate in visited:
	            continue
	        visited.add(candidate)

	        sub_sitemap = get_sitemap(candidate)
	        # get_sitemap() returns None on failure; skip that sub-sitemap
	        # instead of handing None to the parser.
	        if sub_sitemap:
	            pending.extend(process_sitemap(sub_sitemap))

	    return result


	def main(url='https://www.cloudsigma.com/sitemap.xml'):
	    """Print a numbered list of every page URL found via a sitemap.

	    Args:
	        url: Address of the top-level sitemap to crawl.
	    """
	    sitemap = get_sitemap(url)
	    if sitemap is None:
	        # get_sitemap() has already printed why the download failed.
	        return

	    for url_count, page_url in enumerate(parse_sitemap(sitemap), start=1):
	        print("%5d) %s" % (url_count, page_url))
	    print("-end-of-list-")


	# Run the crawl only when this file is executed as a script, not when
	# it is imported as a module.
	if __name__ == '__main__':
	    main()