Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
# Starting point of the crawl: the site's sitemap index, which is fetched below
# and searched for <sitemap> entries.
sitemap_index_url="https://www.practicalecommerce.com/sitemapindex.xml"
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
# Fetch the sitemap index and map each child sitemap URL to its last-modified
# timestamp (None when an entry carries no <lastmod>).
sitemap_index = {}
response = requests.get(sitemap_index_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as a sitemap.
response.raise_for_status()
# Name the parser explicitly: without it BeautifulSoup guesses, warns, and may
# pick a different parser on another machine. "html.parser" ships with Python.
# Passing the raw bytes lets BeautifulSoup detect the encoding from the
# document itself rather than from the HTTP headers.
soup = BeautifulSoup(response.content, "html.parser")
sitemap_tags = soup.find_all("sitemap")
print("The number of sitemaps are {0}".format(len(sitemap_tags)))
for sitemap in sitemap_tags:
    # find() only looks inside this <sitemap>; findNext() keeps walking into
    # the following entries and could pair this URL with a sibling's lastmod.
    loc = sitemap.find("loc")
    if loc is None:
        continue  # malformed entry without a URL: nothing to record
    lastmod = sitemap.find("lastmod")
    sitemap_index[loc.get_text(strip=True)] = (
        lastmod.get_text(strip=True) if lastmod is not None else None
    )
print(sitemap_index)
#The number of sitemaps are 11
#{'https://www.practicalecommerce.com/site.xml': '2019-07-21T13:42:53Z', 'https://www.practicalecommerce.com/post_part1.xml': '2019-07-21T16:39:21Z', 'https://www.practicalecommerce.com/post_part2.xml': '2019-07-21T09:11:55Z', 'https://www.practicalecommerce.com/post_part3.xml': '2019-07-21T13:43:53Z', 'https://www.practicalecommerce.com/post_seminars.xml': '2019-07-21T13:41:18Z', 'https://www.practicalecommerce.com/post_wpbdp_listing.xml': '2019-07-21T13:46:39Z', 'https://www.practicalecommerce.com/post_google_news.xml': '2019-07-21T17:42:10Z', 'https://www.practicalecommerce.com/page.xml': '2019-07-21T13:46:23Z', 'https://www.practicalecommerce.com/taxonomy_category.xml': '2019-07-21T13:41:48Z', 'https://www.practicalecommerce.com/taxonomy_post_tag.xml': '2019-07-21T13:42:39Z', 'https://www.practicalecommerce.com/author.xml': '2019-07-21T13:41:34Z'}
# Download every sitemap whose URL contains "post" and collect, for each page
# entry that declares its own <lastmod>, the URL path and that timestamp.
# `paths` and `lastmods` stay index-aligned so they can serve as table columns.
sitemaps = {}
paths = []
lastmods = []
for sitemap_url in sitemap_index:
    # Plain substring match: any sitemap with "post" anywhere in its URL is
    # included, not only those whose file name starts with "post".
    if "post" not in sitemap_url:
        continue
    print(sitemap_url)
    response = requests.get(sitemap_url, timeout=30)
    # Stop on an HTTP error rather than silently recording zero URLs.
    response.raise_for_status()
    # Explicit parser avoids BeautifulSoup's guess-and-warn behaviour; raw
    # bytes let it detect the encoding from the document itself.
    soup = BeautifulSoup(response.content, "html.parser")
    url_tags = soup.find_all("url")
    print("The number of URLs are {0}".format(len(url_tags)))
    for url_tag in url_tags:
        # find() is scoped to this <url>; findNext() would also match tags
        # belonging to later entries and attribute their lastmod to this URL.
        loc = url_tag.find("loc")
        lastmod = url_tag.find("lastmod")
        if loc is None or lastmod is None:
            continue  # only entries with both a URL and a timestamp are kept
        paths.append(urlparse(loc.get_text(strip=True)).path)
        lastmods.append(lastmod.get_text(strip=True))
sitemaps["path"] = paths
sitemaps["lastmod"] = lastmods
#https://www.practicalecommerce.com/post_part1.xml
#The number of URLs are 2500
#https://www.practicalecommerce.com/post_part2.xml
#The number of URLs are 2500
#https://www.practicalecommerce.com/post_part3.xml
#The number of URLs are 1301
#https://www.practicalecommerce.com/post_seminars.xml
#The number of URLs are 10
#https://www.practicalecommerce.com/post_wpbdp_listing.xml
#The number of URLs are 1162
#https://www.practicalecommerce.com/post_google_news.xml
#The number of URLs are 1
#https://www.practicalecommerce.com/taxonomy_post_tag.xml
#The number of URLs are 4
import pandas as pd
# Build a table from the collected columns: the dict keys ("path", "lastmod")
# become the column names and the aligned lists become the rows.
df = pd.DataFrame(sitemaps)
# Preview the first ten rows (shown only when evaluated interactively, e.g. as
# the last expression of a notebook cell).
df.head(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment