Skip to content

Instantly share code, notes, and snippets.

@martyglaubitz
Created May 6, 2020 11:41
Show Gist options
  • Save martyglaubitz/4b5c5563aa8dabe2baa96f79c42d6e2f to your computer and use it in GitHub Desktop.
Save martyglaubitz/4b5c5563aa8dabe2baa96f79c42d6e2f to your computer and use it in GitHub Desktop.
Check urls in sitemap
import csv
import datetime
import io
import sys
import urllib.request
import xml.dom.minidom
def get_sitemap_urls(sitemap_url):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        sitemap_url: Location of the sitemap XML document, in any form
            accepted by ``urllib.request.urlopen``.

    Returns:
        A list of URL strings in document order. Whitespace around each URL
        is stripped and ``<loc>`` elements without text are skipped.

    Raises:
        urllib.error.URLError: If the sitemap cannot be fetched.
        xml.parsers.expat.ExpatError: If the body is not well-formed XML.
    """
    # Release the connection as soon as the body has been read.
    with urllib.request.urlopen(sitemap_url) as response:
        sitemap_bytes = response.read()

    # Pass the raw bytes to the parser so it can honour the encoding declared
    # in the XML prolog (or a BOM) instead of assuming UTF-8.
    document = xml.dom.minidom.parseString(sitemap_bytes)

    urls = []
    for loc in document.getElementsByTagName('loc'):
        # Join every text/CDATA child: an empty <loc/> has no children at all,
        # and CDATA-wrapped URLs are not plain text nodes.
        text = ''.join(
            child.data
            for child in loc.childNodes
            if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
        ).strip()
        if text:
            urls.append(text)
    return urls
def check_urls(urls):
    """Request every URL and report its status and latency as CSV text.

    Args:
        urls: Iterable of URL strings to request, one request per URL.

    Returns:
        A string of pipe-delimited CSV with the header
        ``url|response_status|response_time_millis`` followed by one row per
        URL. ``response_status`` is the value of ``response.getcode()``, or
        the literal ``error`` when the request raised. ``response_time_millis``
        is the duration of the attempt rounded to whole milliseconds.
    """
    string_writer = io.StringIO()
    csv_writer = csv.DictWriter(
        string_writer,
        fieldnames=['url', 'response_status', 'response_time_millis'],
        delimiter='|',
        quotechar=' ',
    )
    csv_writer.writeheader()

    for url in urls:
        time_before = datetime.datetime.now()
        try:
            # The context manager closes the connection once the status is read.
            with urllib.request.urlopen(url) as response:
                response_status = response.getcode()
        except Exception:
            # Best effort: any failed request (including HTTP error statuses,
            # which urlopen raises as exceptions) is reported as 'error'.
            # Exception, not a bare except, so Ctrl-C still aborts the run.
            response_status = 'error'
        elapsed = datetime.datetime.now() - time_before

        csv_writer.writerow({
            'url': url,
            'response_status': str(response_status),
            # total_seconds() covers the whole duration; .microseconds is only
            # the sub-second part and wraps around for requests >= 1 second.
            'response_time_millis': str(round(elapsed.total_seconds() * 1000)),
        })
    return string_writer.getvalue()
if __name__ == "__main__":
    # usage: python check_sitemap.py https://your.domain/sitemap.xml > output.csv
    if len(sys.argv) < 2:
        # sys.exit with a string writes it to stderr, so the message never
        # ends up inside a redirected output.csv.
        sys.exit('usage: python check_sitemap.py https://your.domain/sitemap.xml > output.csv')
    sitemap_url = sys.argv[1]
    urls = get_sitemap_urls(sitemap_url)
    check_result_csv = check_urls(urls)
    # The CSV text already ends with a row terminator; suppress print's own
    # newline so the output has no blank trailing line.
    print(check_result_csv, end='')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment