Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Sitemap Crawler Python
###################
# You can find the article about this gist here:
# https://primates.dev/find-all-urls-of-a-website-in-a-few-seconds-python/
####################
import requests
from bs4 import BeautifulSoup as Soup
import pandas as pd
import hashlib
# Pass the headers you want to retrieve from the xml such as ["loc", "lastmod"]
def parse_sitemap(url, headers, _ancestors=frozenset()):
    """Download a sitemap and return its entries as a pandas DataFrame.

    The document at *url* is fetched and parsed as XML. Every ``<url>``
    element becomes one row. Every ``<sitemap>`` element (sitemap index) is
    followed recursively through its ``<loc>`` child and the rows of the
    nested sitemap are appended after the rows of this document.

    Args:
        url: Address of the sitemap (or sitemap index) to fetch.
        headers: Iterable of tag names to extract from each ``<url>``
            element, e.g. ``["loc", "lastmod"]``. Each name becomes a column.
        _ancestors: Internal. URLs of the sitemaps on the current recursion
            path; used only to avoid following a sitemap that references
            itself (directly or through its children). Callers should not
            pass it.

    Returns:
        A DataFrame with the columns ``["Source"] + headers``, where
        ``Source`` is the MD5 hex digest of the URL of the sitemap the row
        was found in. A tag missing from an entry is stored as the string
        ``"None"``. Returns ``False`` when the response status is not 200 or
        the document contains neither ``<url>`` nor ``<sitemap>`` elements.
    """
    # Materialise once so any iterable (tuple, generator, ...) can be both
    # turned into column names and iterated for every <url> entry.
    headers = list(headers)

    resp = requests.get(url)
    # We didn't get a valid response, bail.
    if resp.status_code != 200:
        return False

    # BeautifulSoup parses the document as XML.
    soup = Soup(resp.content, "xml")
    urls = soup.find_all("url")
    sitemaps = soup.find_all("sitemap")
    if not urls and not sitemaps:
        return False

    columns = ["Source"] + headers
    lineage = _ancestors | {url}

    # Recursive call to the function if the sitemap contains sitemaps.
    nested_frames = []
    for entry in sitemaps:
        loc = entry.find("loc")
        if loc is None or not loc.string:
            # A <sitemap> entry without a usable <loc> cannot be followed.
            continue
        child_url = str(loc.string).strip()
        if not child_url or child_url in lineage:
            # Skip empty locations and sitemaps already on the current
            # recursion path, which would otherwise recurse forever.
            continue
        child_frame = parse_sitemap(child_url, headers, lineage)
        # A failed child returns False; passing that to pd.concat raises
        # TypeError, so only real, non-empty frames are kept.
        if child_frame is False or child_frame.empty:
            continue
        nested_frames.append(child_frame)

    # Hash of this sitemap's URL, stored with every row it contributes.
    source_hash = hashlib.md5(str(url).encode("utf-8")).hexdigest()

    # Extract the requested tags from every <url> entry.
    rows = []
    for entry in urls:
        row = [source_hash]
        for head in headers:
            tag = entry.find(head)
            row.append("None" if tag is None else tag.string)
        rows.append(row)

    own_frame = pd.DataFrame(rows, columns=columns)
    if not nested_frames:
        return own_frame

    # Own rows first, then the nested sitemaps in document order; a single
    # concat avoids re-copying the accumulated frame on every iteration.
    frames = nested_frames if own_frame.empty else [own_frame] + nested_frames
    return pd.concat(frames, ignore_index=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment