Skip to content

Instantly share code, notes, and snippets.

@kyletaylored
Last active April 17, 2018 20:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kyletaylored/2520b81dff7e6e45346a49131f490056 to your computer and use it in GitHub Desktop.
Python script to extract content types from a Drupal 7 sitemap.xml.
tqdm==4.19.5
requests==2.18.4
beautifulsoup4==4.6.0
PyYAML==3.12
import sys, os, requests, yaml
from bs4 import BeautifulSoup
from tqdm import tqdm
import xml.etree.ElementTree as ET
# Parse for node type
def parse_url_for_node_type(body):
    """Return the Drupal node type advertised by a page's <body> classes.

    Drupal 7 themes emit a ``node-type-<type>`` class on the body tag;
    this returns the ``<type>`` suffix, or None when no such class exists.

    :param body: a BeautifulSoup Tag for the <body> element.
    """
    if body.has_attr('class'):
        for elem in body.attrs['class']:
            # Prefix match (not substring) so a class that merely contains
            # 'node-type-' can't yield a garbled slice; 10 == len('node-type-').
            if elem.startswith('node-type-'):
                return elem[10:]
    return None
# Parse for page type
def parse_url_for_page_type(body):
    """Return the Drupal page type advertised by a page's <body> classes.

    Looks for a ``page-<type>`` class on the body tag and returns the
    ``<type>`` suffix, or None when no such class exists.

    :param body: a BeautifulSoup Tag for the <body> element.
    """
    if body.has_attr('class'):
        for elem in body.attrs['class']:
            # Prefix match (not substring) so e.g. 'front-page-x' can't
            # yield a garbled slice; 5 == len('page-').
            if elem.startswith('page-'):
                return elem[5:]
    return None
def main():
    """Crawl every URL in a Drupal 7 sitemap and tally node/page types.

    Prompts for a sitemap location (URL or local path), fetches each
    <loc> entry, inspects each page's <body> classes via the parse_*
    helpers, and dumps the aggregated counts and URL lists per node
    type to ./sitemap.yml.

    Returns 0 on success, 1 on any error (the error is printed).
    """
    try:
        # Ask where the sitemap lives; default to a local file.
        print("Enter path to sitemap.xml (i.e. http://example.com/sitemap.xml)")
        sitemap = input("(press enter to use ./sitemap.xml): \n\n")
        if not sitemap:
            sitemap = 'sitemap.xml'
        # Remote URL vs. local file.
        if sitemap[:4] == 'http':
            file = requests.get(sitemap).text
        elif os.path.isfile(sitemap):
            with open(sitemap, 'r') as content_file:
                file = content_file.read()
        else:
            raise Exception("No file exists.")
        # Parse the sitemap XML with BeautifulSoup and collect <loc> values.
        s = BeautifulSoup(file, 'html.parser')
        xurls = [url.find('loc').string for url in s.find_all('url')]
        if not xurls:
            raise Exception("No URLs to cycle.")
        types = {}
        for u in tqdm(xurls):
            # Strip stray trailing newlines (they happen in hand-edited files).
            u = u.rstrip()
            # Fetch the page and locate its <body>.
            page = requests.get(u)
            soup = BeautifulSoup(page.text, 'html.parser')
            body = soup.find('body')
            if not body:
                continue
            t = parse_url_for_node_type(body)
            p = parse_url_for_page_type(body)
            # Tally this node type and record the URL under it.
            entry = types.setdefault(t, {'count': 0, 'urls': []})
            entry['count'] += 1
            entry['urls'].append(u)
            # Store page type alongside the node type ("null" when absent).
            entry['page'] = "null" if not p else p
        # Persist the results.
        if types:
            with open('sitemap.yml', 'w') as outfile:
                yaml.dump(types, outfile, default_flow_style=False)
        return 0
    except Exception as e:
        print(e)
        return 1


# Run main
if __name__ == '__main__':
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment