# This script checks the links on a sitemap page to make sure none of them
# point to missing pages and to see whether 301 redirects are working
# correctly. (The link to the accompanying explanation is missing from this
# copy.)
import requests
from bs4 import BeautifulSoup
# URL of the sitemap page to check. This is a placeholder and must be set
# before the script is run.
sitemap = ''

# Fetch the sitemap page and collect the href of every <a> tag, keeping only
# the non-empty ones that start with 'http'.
r = requests.get(sitemap)
html = r.content
soup = BeautifulSoup(html, 'html.parser')
links = soup.find_all('a')
hrefs = filter(None, (link.get('href') for link in links))
urls = (href for href in hrefs if href.startswith('http'))

# HEAD requests do not follow redirects unless asked to; without
# allow_redirects=True every response's history would be empty and the
# redirect report below could never print anything.
reqs = (requests.head(url, allow_redirects=True) for url in urls)

# Sort by status code first, then by the number of redirect hops.
results = sorted(reqs, key=lambda r: (r.status_code, len(r.history)))

# 1) Links that resolved successfully without any redirect.
for r in results:
    if r.ok and not r.history:
        print(' - '.join([str(r.status_code), r.reason, r.url]))

# 2) Links that resolved successfully after one or more redirects.
for r in results:
    if r.ok and r.history:
        print('{} redirected'.format(r.url))
        # r.history holds the intermediate redirect responses, oldest first.
        for response in r.history:
            print('>> Redirect to {}'.format(response.url))

# 3) Links whose final response was an error (status code >= 400).
for r in results:
    if not r.ok:
        print(' - '.join([str(r.status_code), r.reason, r.url]))
