Skip to content

Instantly share code, notes, and snippets.

@dagrz
Created August 29, 2023 05:51
Show Gist options
  • Save dagrz/66faa8d6c5a552ca060fc0ed637d5f63 to your computer and use it in GitHub Desktop.
Save dagrz/66faa8d6c5a552ca060fc0ed637d5f63 to your computer and use it in GitHub Desktop.
List AWS documentation URLs from sitemaps
#!/usr/bin/env python3
import requests, argparse
import xml.etree.ElementTree as ET
SITEMAP_URI = 'https://docs.aws.amazon.com/sitemap_index.xml'
def main():
    """Start the sitemap crawl at the AWS documentation sitemap index."""
    root_sitemap = SITEMAP_URI
    get_sitemap_and_parse(root_sitemap)
def get_sitemap_and_parse(sitemap_uri, timeout=30):
    """Fetch one sitemap document and pass its XML to parse_sitemap_xml.

    parse_sitemap_xml calls back into this function for every nested sitemap
    it finds, so one call recursively walks everything below sitemap_uri.

    The crawl is best-effort: a sitemap that cannot be fetched or parsed is
    reported on stderr and skipped, so the remaining sitemaps are still
    processed.

    Args:
        sitemap_uri: URL of the sitemap or sitemap index to download.
        timeout: Seconds to wait on the server before giving up on this
            sitemap. Without a timeout a stalled connection would block
            the whole crawl indefinitely.
    """
    import sys  # local import: only needed for the error report below

    try:
        # Redirects are intentionally not followed, as in the original request.
        res = requests.get(sitemap_uri, allow_redirects=False, timeout=timeout)
        # Turn 4xx/5xx answers into an exception instead of parsing an error page.
        res.raise_for_status()
        parse_sitemap_xml(res.text)
    except (requests.RequestException, ET.ParseError) as e:
        # Report and continue rather than silently hiding the failure.
        print(f'Skipping {sitemap_uri}: {e}', file=sys.stderr)
def parse_sitemap_xml(sitemap_data):
    """Parse a sitemap document, recursing into sub-sitemaps and printing page URLs.

    Every ``sitemap/loc`` entry directly under the root is passed to
    get_sitemap_and_parse; afterwards every ``url/loc`` entry directly under
    the root is printed to stdout, one URL per line.

    Args:
        sitemap_data: The sitemap XML (anything xml.etree.ElementTree.fromstring
            accepts).

    Raises:
        xml.etree.ElementTree.ParseError: If sitemap_data is not well-formed XML.
    """
    root = ET.fromstring(sitemap_data)

    # ElementTree spells a namespaced tag as '{uri}local'. A root without a
    # namespace has no '{...}' part, so the prefix must then be empty;
    # otherwise the search paths below would never match anything.
    if root.tag.startswith('{'):
        ns_prefix = root.tag.partition('}')[0] + '}'
    else:
        ns_prefix = ''

    for loc in root.findall(f'{ns_prefix}sitemap/{ns_prefix}loc'):
        # <loc> text may be missing or padded with whitespace/newlines.
        uri = (loc.text or '').strip()
        if uri:
            get_sitemap_and_parse(uri)

    for loc in root.findall(f'{ns_prefix}url/{ns_prefix}loc'):
        uri = (loc.text or '').strip()
        if uri:
            print(uri)
# Run the crawl only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment