#!/usr/bin/env python3
import textwrap
import os
import math
from os import listdir, path
from html.parser import HTMLParser
# Globals
# When True, print_debug() writes progress messages to stdout.
debug = True
# Directory that is scanned for .html files and that receives the sitemap.
rootDir = '/var/www/html' # Make sure there is no forward slash at the end
# File name of the generated sitemap, written inside rootDir.
outHTML = 'sitemap.html'
# Whitelist: only directories whose path contains one of these names are
# listed, e.g. ["siteone", "sitetwo", "sitethree"].
dirtoinclude = ["siteone", "sitethree"]

# Opening part of the generated page: document head, embedded stylesheet and
# the page heading. The link and heading lines are appended after this text.
htmlintro = textwrap.dedent("""
    <!DOCTYPE html>
    <html>
    <head>
    <title>Site Index</title>
    <meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />
    <style>
    body {
        font-family: "Fira Code", "Consolas", "Lucida Console", monospace;
        font-size: 12px;
        margin-left: 4px;
        background-color: rgb(26, 26, 26);
        color: rgb(200, 200, 200);
    }
    h1 {
        font-size: 20px;
        color: rgb(220, 220, 220);
    }
    h2 {
        font-size: 16px;
        color: rgb(220, 220, 220);
    }
    a:link {
        color: rgb(0, 128, 128);
    }
    a:visited {
        color: rgb(128, 0, 64);
    }
    </style>
    </head>
    <body>
    <h2>Website Index</h2>
    """)

# Closing part of the generated page; balances the tags opened in htmlintro.
htmloutro = textwrap.dedent("""
    </body>
    </html>
    """)
# Debug print helper
def print_debug(inText=''):
    """Print ``inText`` as a yellow ``DEBUG:`` line when ``debug`` is truthy.

    ``inText`` may be any object; it is converted with ``str()``. Nothing is
    printed when the module-level ``debug`` flag is falsy.
    """
    if not debug:
        return
    message = "DEBUG: " + str(inText)
    # \033[93m switches the terminal to bright yellow, \033[0m resets it
    print("\033[93m" + message + "\033[0m")
# HTMLParser object to grab the 'title' of the html page, in this case scan for h1, h2 tags etc
class TitleParser(HTMLParser):
    """Collect the text found inside heading elements (``<h1>`` to ``<h6>``).

    After ``feed()``, ``self.data`` holds the whitespace-normalised text
    chunks of every heading in document order; it is empty when the document
    has no headings.
    """

    # Matching on the exact names keeps other two-letter "h" tags such as
    # <hr> from being mistaken for a heading.
    _HEADING_TAGS = frozenset(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))

    def __init__(self):
        # The base class must be initialised before feed() can be used
        super().__init__()
        self.recording = False  # True while inside a heading element
        self.data = []          # heading texts collected so far

    def handle_starttag(self, tag, attributes):
        """Start recording text when a heading element opens."""
        # Only headings switch recording on; other start tags leave the state
        # alone so that <h1><a>Title</a></h1> still captures "Title".
        if tag.lower() in self._HEADING_TAGS:
            self.recording = True

    def handle_endtag(self, tag):
        """Stop recording text when the heading element closes."""
        if self.recording and tag.lower() in self._HEADING_TAGS:
            self.recording = False

    def handle_data(self, data):
        """Store text that appears while a heading is open."""
        if not self.recording:
            return
        data = remove_white_space(data)
        if not data:
            # Whitespace-only chunks (e.g. the newline before a nested tag)
            # must not become an empty title
            return
        print_debug("Heading tag data: " + data)
        self.data.append(data)
# Remove whitespace, useful for funky headings
def remove_white_space(inText):
    """Return ``inText`` with whitespace trimmed and collapsed.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines, ...) becomes a single space, so words
    separated only by a newline are not joined together.
    """
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so this always terminates, unlike a replace-until-gone loop
    # whose search string and replacement are both a single space.
    return ' '.join(inText.split())
# Open file, send it to html parser
def get_page_title(inDir, inFileName):
    """Return the link title to use for the HTML file ``inFileName`` in ``inDir``.

    The file is read and fed to a ``TitleParser``. The first heading text it
    collected becomes the title; when there is none the file name is used.
    ``index.html`` is always titled ``'Index'``.
    """
    # os.path.join inserts the separator that is right for the platform
    file_path = os.path.join(inDir, inFileName)
    print_debug("Opening: " + file_path)
    # errors='replace' stops one badly encoded page from aborting the run
    with open(file_path, 'r', errors='replace') as handle:
        markup = handle.read()
    # print_debug(markup) # this is 10/10 noisy

    parser = TitleParser()  # Create html parser instance
    parser.feed(markup)     # Feed the parser the html
    parser.close()          # Flush any text still buffered by the parser

    # NOTE(review): assumes TitleParser exposes the captured heading texts as
    # a list attribute named `data` - confirm against the class.
    headings = parser.data
    # Use the first heading if there is one, otherwise fall back to the name
    out = headings[0] if headings else inFileName

    # This is a hack, default name for index.html
    # NOTE(review): applied even when a heading was found - confirm this is
    # intended rather than a fallback for heading-less index pages.
    if inFileName == 'index.html':
        out = 'Index'
    print_debug("Using link title: " + out)
    return out
def get_depth(inPathDepth):
    """Return the left-margin percentage used to indent an entry.

    The indent grows logarithmically with ``inPathDepth`` so that deeper
    levels are indented by progressively smaller steps. The result is rounded
    to two decimals and is never negative; ``0`` is returned whenever the
    formula would go below zero or is undefined.
    """
    scaled = (inPathDepth - 0.3) * 0.5
    # math.log raises ValueError for arguments <= 0 (depths of 0.3 or less);
    # such depths get no indent.
    if scaled <= 0:
        return 0
    # Using logarithms for the indent, diminishing indentation
    result = math.log(scaled, 1.15) - 1.4
    # Just in case, helpful when playing with the constants
    if result < 0:
        return 0
    return round(result, 2)
# Get the page title, Create paragraph with a href ;)
def process_file(inDirName, inFileName, inPathDepth):
    """Return one ``<p>`` line of HTML linking to ``inFileName``.

    ``inDirName`` is the directory holding the file; its leading ``rootDir``
    is removed so the link is relative to the web root. ``inPathDepth`` is
    passed to ``get_depth`` to compute the left margin. The link text is the
    title from ``get_page_title``, or the file name if that is empty.
    """
    # Local import: at file level only html.parser.HTMLParser is imported
    from html import escape

    pagetitle = get_page_title(inDirName, inFileName) or inFileName
    # Strip out the root directory, the remainder is valid relative to the
    # webroot. Only the leading occurrence is removed so a deeper folder that
    # happens to repeat the root path is left intact.
    if inDirName.startswith(rootDir):
        inDirName = inDirName[len(rootDir):]
    # URLs always use forward slashes, whatever the local path separator is
    href = inDirName.replace(os.sep, '/') + "/" + inFileName
    # Escape the text placed into the markup so characters such as & < > "
    # in titles or file names cannot break the generated page
    line = '\t<p style=" margin-left:+{indent}%"><a href="{href}">{title}</a></p>\n'.format(
        indent=get_depth(inPathDepth),
        href=escape(href, quote=True),
        title=escape(pagetitle),
    )
    print_debug('Adding HTML line: ' + remove_white_space(line))
    return line
# Add an entry for folder name
def processFolder(inDirName, inPathDepth):
    """Return one heading line of HTML for the folder ``inDirName``.

    The heading level corresponds to the depth of the directory
    (``inPathDepth + 2``), limited to the range ``h1``-``h6``. The same
    shifted depth is passed to ``get_depth`` to compute the left margin.
    """
    # Local import: at file level only html.parser.HTMLParser is imported
    from html import escape

    inPathDepth += 2
    # HTML defines no heading beyond <h6>; deeper folders reuse <h6> and are
    # told apart by their indentation only
    level = min(max(inPathDepth, 1), 6)
    print_debug("Adding directory: " + inDirName)
    line = '\t<h{level} style=" margin-left:+{indent}%">{name}</h{level}>\n'.format(
        level=level,
        indent=get_depth(inPathDepth),
        name=escape(inDirName),  # folder names may contain & or <
    )
    print_debug('Adding HTML line: ' + remove_white_space(line))
    return line
# Main, would you believe
def main():
    """Build the sitemap page and write it to ``rootDir/outHTML``.

    The page is ``htmlintro``, then one link per ``.html`` file directly in
    ``rootDir``, then a heading plus links for every directory under
    ``rootDir`` whose path contains an entry of ``dirtoinclude``, then
    ``htmloutro``.
    """
    parts = [htmlintro]  # Start the html sandwich

    # Do a scan of html files in the root directory and add them to the html
    for fileName in listdir(rootDir):
        if fileName.endswith(".html"):
            parts.append(process_file(rootDir, fileName, 3))

    # Do a recursive scan of html files for paths that have entries in the whitelist
    rootParts = len(rootDir.split(os.sep))
    for dirName, _, fileList in os.walk(rootDir):
        # Number of path components below rootDir
        pathDepth = len(dirName.split(os.sep)) - rootParts
        for wanted in dirtoinclude:
            print_debug("Checking whitelist: " + wanted + " > " + dirName)
            if wanted not in dirName:
                print_debug("Nope, directory not on whitelist")
                continue
            print_debug("Found result: " + wanted + " in " + dirName)
            # Grab the name of the deepest folder in the path,
            # capitalise its name and send it to be added to the html
            parts.append(processFolder(os.path.basename(dirName).upper(), pathDepth))
            # For every file in the folder, check if its a html file, send it for processing
            for fileName in fileList:
                if fileName.endswith(".html"):
                    parts.append(process_file(dirName, fileName, pathDepth + 3))
            # Stop at the first match so a path containing several whitelist
            # entries is listed only once
            break

    parts.append(htmloutro)  # Finish the html sandwich

    # Write the sitemap html file
    outPath = rootDir + '/' + outHTML
    print_debug('Writing file: ' + outPath)
    with open(outPath, 'w') as file:
        file.write(''.join(parts))
# Generate the sitemap only when run as a script, not when imported
if __name__ == "__main__":
    main()
