Created
March 2, 2020 21:33
-
-
Save StanGirard/6ee7999a98b65a67afd531106998e526 to your computer and use it in GitHub Desktop.
Sitemap Crawler Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################
# You can find the article about this gist here:
# https://primates.dev/find-all-urls-of-a-website-in-a-few-seconds-python/
####################
import hashlib

import pandas as pd
import requests
from bs4 import BeautifulSoup as Soup
# Pass the headers you want to retrieve from the xml such as ["loc", "lastmod"]
def parse_sitemap(url, headers):
    """Recursively parse an XML sitemap (or sitemap index) into a DataFrame.

    Parameters
    ----------
    url : str
        URL of the sitemap (or sitemap index) to fetch.
    headers : list[str]
        Child tag names to extract from each <url> entry,
        e.g. ["loc", "lastmod"].

    Returns
    -------
    pandas.DataFrame or bool
        DataFrame with a "Source" column (MD5 hex digest of the parent
        sitemap URL) plus one column per requested header; ``False`` when
        the request fails or the document contains neither <url> nor
        <sitemap> tags.
    """
    try:
        # Timeout so a dead host cannot hang the whole crawl; a network
        # error is treated the same as a bad HTTP status (return False)
        # instead of propagating an exception to the caller.
        resp = requests.get(url, timeout=30)
    except requests.RequestException:
        return False
    # We didn't get a valid response, bail.
    if resp.status_code != 200:
        return False

    # Sitemaps are XML documents, so parse with the XML parser, not HTML.
    soup = Soup(resp.content, "xml")

    # <url> = leaf entries; <sitemap> = nested sitemaps (sitemap index).
    urls = soup.find_all("url")
    sitemaps = soup.find_all("sitemap")

    columns = ["Source"] + headers
    panda_out_total = pd.DataFrame([], columns=columns)

    if not urls and not sitemaps:
        return False

    # Sitemap index: recurse into each child sitemap and accumulate rows.
    if sitemaps:
        for entry in sitemaps:
            loc = entry.find("loc")
            # Skip malformed <sitemap> entries with no usable <loc>.
            if loc is None or loc.string is None:
                continue
            child = parse_sitemap(loc.string, headers)
            # Bug fix: the original concatenated the recursive result even
            # when it was False (failed fetch / empty sitemap), which makes
            # pd.concat raise TypeError and aborts the entire crawl.
            if isinstance(child, pd.DataFrame):
                panda_out_total = pd.concat(
                    [panda_out_total, child], ignore_index=True
                )

    # Hash of the parent sitemap URL identifies where each row came from.
    hash_sitemap = hashlib.md5(str(url).encode("utf-8")).hexdigest()

    # Extract the requested tags from every <url> entry.
    out = []
    for u in urls:
        values = [hash_sitemap]
        for head in headers:
            tag = u.find(head)
            # The literal string "None" (not the None object) is kept for
            # backward compatibility with the original output format.
            values.append(tag.string if tag else "None")
        out.append(values)

    panda_out = pd.DataFrame(out, columns=columns)
    # Merge in rows collected from nested sitemaps, if any.
    if not panda_out_total.empty:
        panda_out = pd.concat([panda_out, panda_out_total], ignore_index=True)
    return panda_out
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment