kism/webindexer.py

## webindexer.py
#!/usr/bin/env python3
import textwrap
import os
import math
from os import listdir, path
from html.parser import HTMLParser

# Globals
# When True, print_debug() echoes progress messages to stdout.
debug = True
# Directory that is scanned; process_file() strips it from paths to build links.
rootDir = '/var/www/html'  # Make sure there is no forward slash at the end
# File name of the generated index page; main() writes it inside rootDir.
outHTML = 'sitemap.html'
# Whitelist used by main(): a directory is indexed when one of these strings
# occurs in its path. The commented-out line is an alternative whitelist.
#dirtoinclude = ["siteone", "sitetwo", "sitethree"]
dirtoinclude = ["siteone", "sitethree"]

# Opening half of the generated page: doctype, <head>, inline stylesheet and the
# "Website Index" heading. main() appends the index entries after this text.
# The literal starts with a newline and its first content line begins in
# column 0, so textwrap.dedent() finds no common indentation to strip.
htmlintro = textwrap.dedent("""
<!DOCTYPE html>
<html>
  <head>
    <title>Site Index</title>
    <meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />

    <link
      rel="stylesheet"
      href="https://fonts.googleapis.com/css?family=Fira+Code"
    />
  </head>

  <style>
    body {
      font-family: "Fira Code", "Consolas", "Lucida Console", monospace;
      font-size: 12px;
      margin-left: 4px;
      background-color: rgb(26, 26, 26);
      color: rgb(200, 200, 200);
    }

    h1 {
      font-size: 20px;
      color: rgb(220, 220, 220);
    }

    h2 {
      font-size: 16px;
      color: rgb(220, 220, 220);
    }

    a:link {
      color: rgb(0, 128, 128);
    }

    a:visited {
      color: rgb(128, 0, 64);
    }
  </style>

<body>
    <h2>Website Index</h2>
""")

# Closing half of the generated page; main() appends it after the last entry.
htmloutro = textwrap.dedent("""
</body>

</html>
""")


# Debug print function
def print_debug(inText=''):
    """Print inText as a coloured ``DEBUG:`` line when the global ``debug`` flag is set.

    inText is passed through str(), so any object is accepted. Called without
    an argument it prints a bare ``DEBUG: `` line.
    """
    if not debug:
        return
    message = "\033[93m" + "DEBUG: " + str(inText) + "\033[0m"
    print(message)


# HTMLParser object to grab the 'title' of the html page, in this case scan for h1, h2 tags etc
class TitleParser(HTMLParser):
    """Collect the text found inside heading elements (h1-h6) of an HTML document.

    After feed(), ``data`` holds the whitespace-normalised text chunks that
    appeared inside headings, in document order. ``recording`` is True while
    the parser is inside a heading.
    """

    # Only real heading elements count. A "two characters starting with h"
    # check would also match <hr> and start recording ordinary body text.
    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

    def __init__(self):
        super().__init__()
        self.recording = False  # True while inside a heading element
        self.data = []          # heading text chunks collected so far

    def handle_starttag(self, tag, attributes):
        """Start recording when a heading opens.

        Other start tags are ignored so that inline markup inside a heading
        (e.g. <h1><a href="...">Title</a></h1>) does not cut the title off.
        """
        if tag.lower() in self._HEADING_TAGS:
            self.recording = True

    def handle_endtag(self, tag):
        """Stop recording when a heading closes."""
        if tag.lower() in self._HEADING_TAGS:
            self.recording = False

    def handle_data(self, data):
        """Store text found inside a heading.

        Returns the stored (normalised) text, or None when nothing was stored.
        """
        if not self.recording:
            return None
        data = remove_white_space(data)
        if not data:
            # Whitespace-only runs (such as the newline between <h1> and a
            # nested tag) would otherwise be stored as an empty title.
            return None
        print_debug("Heading tag data: " + data)
        self.data.append(data)
        return data


# Remove whitespace, useful for funky headings
def remove_white_space(inText):
    """Return inText trimmed, with newlines, tabs and runs of spaces turned into single spaces.

    Newlines and tabs are replaced by a space rather than removed, so words on
    neighbouring lines do not get glued together.
    """
    collapsed = inText.strip()  # leading and trailing whitespace only
    for marker in ('\n', '\t'):
        collapsed = collapsed.replace(marker, ' ')
    # Squeeze double spaces last: the replacements above may have created new ones
    while '  ' in collapsed:
        collapsed = collapsed.replace('  ', ' ')
    return collapsed


# Open file, send it to html parser
def get_page_title(inDir, inFileName):
    """Return the link title for the HTML file inFileName located in inDir.

    The title is the first non-empty heading text collected by TitleParser.
    When the page has no heading, 'Index' is used for index.html and the file
    name itself for every other file.

    Raises OSError if the file cannot be opened.
    """
    # os.path.join picks the separator of the host operating system
    file_path = os.path.join(inDir, inFileName)

    print_debug("Opening: " + file_path)
    # Undecodable bytes are replaced instead of raising, so one badly encoded
    # page cannot abort the whole indexing run. The file is closed by `with`.
    with open(file_path, 'r', errors='replace') as handle:
        html = handle.read()

    parser = TitleParser()  # Create html parser instance
    parser.feed(html)       # Feed the parser the html

    # parser.data lists heading texts in document order; use the first usable one
    out = next((heading for heading in parser.data if heading), None)
    if out is None:
        # This is a hack, default name for index.html
        out = 'Index' if inFileName == 'index.html' else inFileName

    print_debug("Using link title: " + out)
    return out

def get_depth(inPathDepth):
    """Return the left-margin percentage used to indent an entry at inPathDepth.

    The indent grows logarithmically, so each deeper level is indented by a
    smaller step than the previous one. The result is rounded to two decimals
    and is never negative.
    """
    scaled = (inPathDepth - 0.3) * 0.5
    if scaled <= 0:
        # math.log() raises ValueError for non-positive input. Such depths get
        # no indent, like every other depth whose result falls below zero.
        return 0

    # Using logarithms for the indent, diminishing indentation
    result = math.log(scaled, 1.15) - 1.4

    # Just in case, helpful when playing with the constants
    return round(max(result, 0), 2)


# Get the page title, Create paragraph with a href ;)
def process_file(inDirName, inFileName, inPathDepth):
    """Build the ``<p><a href=...>`` line that links to one HTML file.

    inDirName   -- directory containing the file (rootDir or a path below it)
    inFileName  -- name of the HTML file
    inPathDepth -- nesting depth, passed to get_depth() for the left margin
    Returns the HTML line, terminated by a newline.
    """
    from html import escape
    from urllib.parse import quote

    pagetitle = get_page_title(inDirName, inFileName)

    # Strip the root directory so the link is relative to the web root. Only a
    # leading rootDir is removed; str.replace() would also delete the same text
    # if it happened to occur again deeper in the path.
    if inDirName.startswith(rootDir):
        inDirName = inDirName[len(rootDir):]

    # URLs always use forward slashes, whatever the host path separator is
    url_path = (inDirName + "/" + inFileName).replace(os.sep, "/")

    # The title comes from page content (already entity-decoded by the parser)
    # and the path from the file system: escape both so neither can break the
    # generated markup.
    href = escape(quote(url_path), quote=True)
    line = ('\t<p style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
            + '><a href="' + href + '">' + escape(pagetitle) + '</a></p>\n')  # Create the line of html
    print_debug('Adding HTML line: ' + remove_white_space(line))
    print_debug()
    return line


# Add an entry for folder name
def processFolder(inDirName, inPathDepth):
    """Build the heading line that introduces a directory in the index.

    inDirName   -- display name of the directory
    inPathDepth -- depth of the directory; a depth of 1 yields an <h3>
    Returns the HTML line, terminated by a newline.
    """
    from html import escape

    inPathDepth += 2
    # HTML only defines h1-h6, so deeper directories reuse <h6> instead of
    # producing invalid tags such as <h7>. The margin still uses the real depth.
    level = str(min(max(inPathDepth, 1), 6))
    print_debug("Adding directory: " + inDirName)
    # Create the line of html, the heading level corresponds to the depth of the directory
    line = ('\t<h' + level + ' style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
            + '>' + escape(inDirName) + '</h' + level + '>\n')
    print_debug('Adding HTML line: ' + remove_white_space(line))
    print_debug()
    return line


# Main, would you believe
def main():
    """Generate the site index and write it to outHTML inside rootDir.

    HTML files directly in rootDir are listed first. They are followed by
    every directory found by os.walk(rootDir) whose path contains one of the
    dirtoinclude entries, each with a heading and links to its HTML files.
    """
    print_debug("Main")
    print_debug()

    parts = [htmlintro]  # Start the html sandwich; the pieces are joined once at the end

    # Do a scan of html files in the root directory and add them to the html.
    # The index page itself is skipped so a re-run does not list its own
    # previous output, and names are sorted because listdir() order is arbitrary.
    for fileName in sorted(listdir(rootDir)):
        if fileName.endswith(".html") and fileName != outHTML:
            parts.append(process_file(rootDir, fileName, 3))

    # Do a recursive scan of html files for paths that have entries in the whitelist
    for dirName, subDirs, fileList in os.walk(rootDir):
        subDirs.sort()  # os.walk honours in-place edits; gives a stable visiting order
        pathDepth = len(dirName.split(os.sep)) - len(rootDir.split(os.sep))

        # Stop at the first matching entry so a directory that matches several
        # whitelist entries is still added only once.
        matched = False
        for entry in dirtoinclude:
            print_debug("Checking whitelist: " + entry + " > " + dirName)
            if entry in dirName:
                print_debug("Found result: " + entry + " in " + dirName)
                matched = True
                break
            print_debug("Nope, directory not on whitelist")
            print_debug()
        if not matched:
            continue

        # Grab the name of the deepest folder in the path,
        # capitalise its name and send it to be added to the html
        parts.append(processFolder(os.path.basename(dirName).upper(), pathDepth))
        # For every file in the folder, check if it is a html file, send it for processing
        for fileName in sorted(fileList):
            if fileName.endswith(".html"):
                parts.append(process_file(dirName, fileName, pathDepth + 3))
        print_debug()

    parts.append(htmloutro)  # Finish the html sandwich

    # Write the sitemap html file; the with-block closes it
    outPath = rootDir + '/' + outHTML
    print_debug('Writing file: ' + outPath)
    with open(outPath, 'w') as handle:
        handle.write(''.join(parts))


# Build the index only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	import textwrap
	import os
	import math
	from os import listdir, path
	from html.parser import HTMLParser

	# Globals
	# When True, print_debug() echoes progress messages to stdout.
	debug = True
	# Directory that is scanned; process_file() strips it from paths to build links.
	rootDir = '/var/www/html' # Make sure there is no forward slash at the end
	# File name of the generated index page; main() writes it inside rootDir.
	outHTML = 'sitemap.html'
	# Whitelist used by main(): a directory is indexed when one of these strings
	# occurs in its path. The commented-out line is an alternative whitelist.
	#dirtoinclude = ["siteone", "sitetwo", "sitethree"]
	dirtoinclude = ["siteone", "sitethree"]

	# Opening half of the generated page: doctype, <head>, inline stylesheet and the
	# "Website Index" heading. main() appends the index entries after this text.
	# textwrap.dedent() removes the leading whitespace shared by the literal's lines.
	htmlintro = textwrap.dedent("""
	<!DOCTYPE html>
	<html>
	<head>
	<title>Site Index</title>
	<meta http-equiv="X-Clacks-Overhead" content="GNU Terry Pratchett" />

	<link
	rel="stylesheet"
	href="https://fonts.googleapis.com/css?family=Fira+Code"
	/>
	</head>

	<style>
	body {
	font-family: "Fira Code", "Consolas", "Lucida Console", monospace;
	font-size: 12px;
	margin-left: 4px;
	background-color: rgb(26, 26, 26);
	color: rgb(200, 200, 200);
	}

	h1 {
	font-size: 20px;
	color: rgb(220, 220, 220);
	}

	h2 {
	font-size: 16px;
	color: rgb(220, 220, 220);
	}

	a:link {
	color: rgb(0, 128, 128);
	}

	a:visited {
	color: rgb(128, 0, 64);
	}
	</style>

	<body>
	<h2>Website Index</h2>
	""")

	# Closing half of the generated page; main() appends it after the last entry.
	htmloutro = textwrap.dedent("""
	</body>

	</html>
	""")


	# Debug print function
	def print_debug(inText=''):
	    """Print inText as a coloured ``DEBUG:`` line when the global ``debug`` flag is set.

	    inText is passed through str(), so any object is accepted. Called without
	    an argument it prints a bare ``DEBUG: `` line.
	    """
	    if not debug:
	        return
	    message = "\033[93m" + "DEBUG: " + str(inText) + "\033[0m"
	    print(message)


	# HTMLParser object to grab the 'title' of the html page, in this case scan for h1, h2 tags etc
	class TitleParser(HTMLParser):
	    """Collect the text found inside heading elements (h1-h6) of an HTML document.

	    After feed(), ``data`` holds the whitespace-normalised text chunks that
	    appeared inside headings, in document order. ``recording`` is True while
	    the parser is inside a heading.
	    """

	    # Only real heading elements count. A "two characters starting with h"
	    # check would also match <hr> and start recording ordinary body text.
	    _HEADING_TAGS = frozenset({'h1', 'h2', 'h3', 'h4', 'h5', 'h6'})

	    def __init__(self):
	        super().__init__()
	        self.recording = False  # True while inside a heading element
	        self.data = []          # heading text chunks collected so far

	    def handle_starttag(self, tag, attributes):
	        """Start recording when a heading opens.

	        Other start tags are ignored so that inline markup inside a heading
	        (e.g. <h1><a href="...">Title</a></h1>) does not cut the title off.
	        """
	        if tag.lower() in self._HEADING_TAGS:
	            self.recording = True

	    def handle_endtag(self, tag):
	        """Stop recording when a heading closes."""
	        if tag.lower() in self._HEADING_TAGS:
	            self.recording = False

	    def handle_data(self, data):
	        """Store text found inside a heading.

	        Returns the stored (normalised) text, or None when nothing was stored.
	        """
	        if not self.recording:
	            return None
	        data = remove_white_space(data)
	        if not data:
	            # Whitespace-only runs (such as the newline between <h1> and a
	            # nested tag) would otherwise be stored as an empty title.
	            return None
	        print_debug("Heading tag data: " + data)
	        self.data.append(data)
	        return data


	# Remove whitespace, useful for funky headings
	def remove_white_space(inText):
	    """Return inText trimmed, with newlines, tabs and runs of spaces turned into single spaces.

	    Newlines and tabs are replaced by a space rather than removed, so words on
	    neighbouring lines do not get glued together.
	    """
	    collapsed = inText.strip()  # leading and trailing whitespace only
	    for marker in ('\n', '\t'):
	        collapsed = collapsed.replace(marker, ' ')
	    # The pattern squeezed here must be a DOUBLE space: looping on a single
	    # space replaced by a single space would never terminate.
	    while '  ' in collapsed:
	        collapsed = collapsed.replace('  ', ' ')
	    return collapsed


	# Open file, send it to html parser
	def get_page_title(inDir, inFileName):
	    """Return the link title for the HTML file inFileName located in inDir.

	    The title is the first non-empty heading text collected by TitleParser.
	    When the page has no heading, 'Index' is used for index.html and the file
	    name itself for every other file.

	    Raises OSError if the file cannot be opened.
	    """
	    # os.path.join picks the separator of the host operating system
	    file_path = os.path.join(inDir, inFileName)

	    print_debug("Opening: " + file_path)
	    # Undecodable bytes are replaced instead of raising, so one badly encoded
	    # page cannot abort the whole indexing run. The file is closed by `with`.
	    with open(file_path, 'r', errors='replace') as handle:
	        html = handle.read()

	    parser = TitleParser()  # Create html parser instance
	    parser.feed(html)       # Feed the parser the html

	    # parser.data lists heading texts in document order; use the first usable one
	    out = next((heading for heading in parser.data if heading), None)
	    if out is None:
	        # This is a hack, default name for index.html
	        out = 'Index' if inFileName == 'index.html' else inFileName

	    print_debug("Using link title: " + out)
	    return out

	def get_depth(inPathDepth):
	    """Return the left-margin percentage used to indent an entry at inPathDepth.

	    The indent grows logarithmically, so each deeper level is indented by a
	    smaller step than the previous one. The result is rounded to two decimals
	    and is never negative.
	    """
	    scaled = (inPathDepth - 0.3) * 0.5
	    if scaled <= 0:
	        # math.log() raises ValueError for non-positive input. Such depths get
	        # no indent, like every other depth whose result falls below zero.
	        return 0

	    # Using logarithms for the indent, diminishing indentation
	    result = math.log(scaled, 1.15) - 1.4

	    # Just in case, helpful when playing with the constants
	    return round(max(result, 0), 2)


	# Get the page title, Create paragraph with a href ;)
	def process_file(inDirName, inFileName, inPathDepth):
	    """Build the ``<p><a href=...>`` line that links to one HTML file.

	    inDirName   -- directory containing the file (rootDir or a path below it)
	    inFileName  -- name of the HTML file
	    inPathDepth -- nesting depth, passed to get_depth() for the left margin
	    Returns the HTML line, terminated by a newline.
	    """
	    from html import escape
	    from urllib.parse import quote

	    pagetitle = get_page_title(inDirName, inFileName)

	    # Strip the root directory so the link is relative to the web root. Only a
	    # leading rootDir is removed; str.replace() would also delete the same text
	    # if it happened to occur again deeper in the path.
	    if inDirName.startswith(rootDir):
	        inDirName = inDirName[len(rootDir):]

	    # URLs always use forward slashes, whatever the host path separator is
	    url_path = (inDirName + "/" + inFileName).replace(os.sep, "/")

	    # The title comes from page content (already entity-decoded by the parser)
	    # and the path from the file system: escape both so neither can break the
	    # generated markup.
	    href = escape(quote(url_path), quote=True)
	    line = ('\t<p style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
	            + '><a href="' + href + '">' + escape(pagetitle) + '</a></p>\n')  # Create the line of html
	    print_debug('Adding HTML line: ' + remove_white_space(line))
	    print_debug()
	    return line


	# Add an entry for folder name
	def processFolder(inDirName, inPathDepth):
	    """Build the heading line that introduces a directory in the index.

	    inDirName   -- display name of the directory
	    inPathDepth -- depth of the directory; a depth of 1 yields an <h3>
	    Returns the HTML line, terminated by a newline.
	    """
	    from html import escape

	    inPathDepth += 2
	    # HTML only defines h1-h6, so deeper directories reuse <h6> instead of
	    # producing invalid tags such as <h7>. The margin still uses the real depth.
	    level = str(min(max(inPathDepth, 1), 6))
	    print_debug("Adding directory: " + inDirName)
	    # Create the line of html, the heading level corresponds to the depth of the directory
	    line = ('\t<h' + level + ' style=" margin-left:+' + str(get_depth(inPathDepth)) + '%"'
	            + '>' + escape(inDirName) + '</h' + level + '>\n')
	    print_debug('Adding HTML line: ' + remove_white_space(line))
	    print_debug()
	    return line


	# Main, would you believe
	def main():
	    """Generate the site index and write it to outHTML inside rootDir.

	    HTML files directly in rootDir are listed first. They are followed by
	    every directory found by os.walk(rootDir) whose path contains one of the
	    dirtoinclude entries, each with a heading and links to its HTML files.
	    """
	    print_debug("Main")
	    print_debug()

	    parts = [htmlintro]  # Start the html sandwich; the pieces are joined once at the end

	    # Do a scan of html files in the root directory and add them to the html.
	    # The index page itself is skipped so a re-run does not list its own
	    # previous output, and names are sorted because listdir() order is arbitrary.
	    for fileName in sorted(listdir(rootDir)):
	        if fileName.endswith(".html") and fileName != outHTML:
	            parts.append(process_file(rootDir, fileName, 3))

	    # Do a recursive scan of html files for paths that have entries in the whitelist
	    for dirName, subDirs, fileList in os.walk(rootDir):
	        subDirs.sort()  # os.walk honours in-place edits; gives a stable visiting order
	        pathDepth = len(dirName.split(os.sep)) - len(rootDir.split(os.sep))

	        # Stop at the first matching entry so a directory that matches several
	        # whitelist entries is still added only once.
	        matched = False
	        for entry in dirtoinclude:
	            print_debug("Checking whitelist: " + entry + " > " + dirName)
	            if entry in dirName:
	                print_debug("Found result: " + entry + " in " + dirName)
	                matched = True
	                break
	            print_debug("Nope, directory not on whitelist")
	            print_debug()
	        if not matched:
	            continue

	        # Grab the name of the deepest folder in the path,
	        # capitalise its name and send it to be added to the html
	        parts.append(processFolder(os.path.basename(dirName).upper(), pathDepth))
	        # For every file in the folder, check if it is a html file, send it for processing
	        for fileName in sorted(fileList):
	            if fileName.endswith(".html"):
	                parts.append(process_file(dirName, fileName, pathDepth + 3))
	        print_debug()

	    parts.append(htmloutro)  # Finish the html sandwich

	    # Write the sitemap html file; the with-block closes it
	    outPath = rootDir + '/' + outHTML
	    print_debug('Writing file: ' + outPath)
	    with open(outPath, 'w') as handle:
	        handle.write(''.join(parts))


	# Build the index only when this file is run as a script, not when it is imported.
	if __name__ == "__main__":
	    main()