Skip to content

Instantly share code, notes, and snippets.

@adamnew123456
Created January 5, 2015 17:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adamnew123456/1d14699b865bb8b35556 to your computer and use it in GitHub Desktop.
Save adamnew123456/1d14699b865bb8b35556 to your computer and use it in GitHub Desktop.
WikiWord - A Static Wiki-Like Site Generator
"""
Converts a tree of wiki pages into a tree of HTML pages.
"""
from collections import defaultdict
import configparser
import glob
import itertools
import os, os.path
import re
import shutil
import subprocess
import sys
# Separates a namespace from the page name inside a WikiWord.
# For example, SomeNamespace:OtherPage
NAMESPACE_CHAR = ':'
# A WikiWord is two or more "humps", each an uppercase letter followed by one
# or more lowercase letters, digits, colons or underscores.
# The underscore is included to avoid atrocities like CeeLanguage, replacing
# it with C_Language
# Groups 1 and 3 capture the single non-word character (or the empty match at
# the very start/end of the text) on either side of the WikiWord.
WIKIWORD = re.compile(r'(\W|^)([A-Z][a-z0-9:_]+){2,}(\W|$)')
# The file extension used for wiki pages
WIKI_EXT = '.md'
# The main page to use, by copying it into index.html - None means that a
# 'master file' will be produced which includes links to all other pages
MAIN_PAGE = None
# The command line to use when converting wiki files into their HTML output.
# Note that no input file is given, because stdin is used; similarly, no output
# file is given, as stdout is used to collect output.
# The string is passed through str.format(output=...) before being run, so an
# {output} placeholder may be used and literal braces must be doubled.
CONVERT_CMDLINE = 'hsmarkdown'
# The link format to use when converting WikiWords to links. {title} is
# replaced by the WikiWord and {link} by the path of its HTML page.
LINK_FORMAT = '<span class="wikiword"> [{title}]({link}) </span>'
# The path of the CSS style to use (placed in each page's stylesheet href)
CSS_FILE = '/style.css'
# The HTML output to use as a template. {title} is the page's WikiWord,
# {css} is CSS_FILE and {body} is the converter's output for the page.
OUTPUT_HTML = '''
<html>
<head>
<title> {title} </title>
<link rel="stylesheet" type="text/css" href="{css}" />
</head>
<body>
<h1 class="title"> {title} </h1>
{body}
</body>
</html>
'''
def get_path_elems(path):
    """
    Breaks a path into the list of its components, outermost first.

    The path is normalized first, so redundant separators and '.'/'..'
    segments are collapsed before splitting.

    >>> get_path_elems('/foo/bar/baz')
    ['foo', 'bar', 'baz']
    """
    remaining = os.path.normpath(path)
    innermost_first = []

    # Peel components off the right-hand side until nothing is left to peel.
    remaining, component = os.path.split(remaining)
    while component:
        innermost_first.append(component)
        remaining, component = os.path.split(remaining)

    return innermost_first[::-1]
def create_directory_tree(path):
    """
    Creates a directory tree for all elements of the given path, if the
    path doesn't already point to a directory.

    Every missing component is created, including the first one, and a
    relative path is created relative to the current working directory.
    Calling this on a directory that already exists is a no-op.

    Raises FileExistsError if some component of the path exists but is not
    a directory.
    """
    # os.makedirs handles absolute and relative paths alike, so there is no
    # need to rebuild the path by hand (which used to force everything under
    # '/' and failed outright for a path with no components).
    os.makedirs(os.path.normpath(path), exist_ok=True)
def wikiword_to_path(wikiword, suffix='.html', leading_slash=True):
    """
    Turns a (possibly namespaced) WikiWord into the path of its page. Each
    namespace becomes a directory, and the suffix is appended to the final
    element.

    >>> wikiword_to_path('WikiWord')
    '/WikiWord.html'
    >>> wikiword_to_path('NameSpace:WikiWord')
    '/NameSpace/WikiWord.html'
    >>> wikiword_to_path('WikiWord', suffix='.foo')
    '/WikiWord.foo'
    >>> wikiword_to_path('WikiWord', leading_slash=False)
    'WikiWord.html'
    """
    if leading_slash:
        root = '/'
    else:
        root = ''
    namespace_parts = wikiword.split(NAMESPACE_CHAR)
    joined = os.sep.join(namespace_parts)
    return root + joined + suffix
def path_to_wikiword(path):
    """
    Turns a path (relative to either the input directory or the output
    directory) into a namespaced WikiWord. Directories become namespaces and
    the file extension, if any, is discarded.

    >>> path_to_wikiword('/WikiWord.html')
    'WikiWord'
    >>> path_to_wikiword('/NameSpace/WikiWord.html')
    'NameSpace:WikiWord'
    >>> path_to_wikiword('/WikiWord.foo')
    'WikiWord'
    >>> path_to_wikiword('WikiWord.html')
    'WikiWord'
    """
    parts = os.path.normpath(path).split(os.sep)

    # A leading separator produces an empty first element; it carries no
    # namespace information, so discard it
    if parts[0] == '':
        del parts[0]

    # The extension isn't part of a valid WikiWord
    stem = os.path.splitext(parts[-1])[0]
    return ':'.join(parts[:-1] + [stem])
def count_backslashes_at_end(text):
    """
    Returns how many consecutive backslashes terminate the given string. This
    decides whether a following WikiWord is escaped, and how many backslashes
    survive in the output.
    """
    trailing = 0
    for char in reversed(text):
        if char != '\\':
            break
        trailing += 1
    return trailing
def find_wikiwords(in_stream):
    """
    Reads all the text from a stream and yields it back as a sequence of
    tuples, in document order:

    - (True, 'WikiWord') indicates text that is a WikiWord
    - (False, '...') indicates text that is not a WikiWord

    This is a generator; the stream is read in full when the first tuple is
    requested. A WikiWord preceded by an odd number of backslashes is escaped
    and is yielded as plain text; in both cases each pair of backslashes in
    front of it collapses into a single backslash.
    """
    text = in_stream.read()
    while text:
        match = WIKIWORD.search(text)
        if not match:
            yield (False, text)
            break

        # Groups 1 and 3 hold the boundary character (or nothing, at the
        # start/end of the text) on either side of the WikiWord. Slice by
        # their offsets rather than stripping them off the match: str.lstrip
        # and str.rstrip treat their argument as a *set* of characters, so a
        # ':' boundary would also eat colons that belong to the WikiWord.
        word_begin = match.end(1)
        word_end = match.start(3)
        wikiword = text[word_begin:word_end]
        pre_wikiword_text = text[:word_begin]
        # The trailing boundary character stays in the remaining text so that
        # it can serve as the leading boundary of an adjacent WikiWord.
        post_wikiword_text = text[word_end:]

        # Escapes stack up before WikiWords. For example:
        #
        #   WikiWord   --> `WikiWord`
        #   \WikiWord  --> WikiWord
        #   \\WikiWord --> \`WikiWord`
        #   ...
        bs_count = count_backslashes_at_end(pre_wikiword_text)

        # If there isn't any preceding text, there is nothing to emit before
        # the WikiWord (and no escapes to process)
        if pre_wikiword_text:
            sans_backslashes = pre_wikiword_text[:len(pre_wikiword_text) - bs_count]
            # Every pair of backslashes becomes one literal backslash; an odd
            # leftover backslash is consumed as the escape itself
            yield (False, sans_backslashes + '\\' * (bs_count // 2))

        # Even backslashes produce a WikiWord, odd backslashes escape it
        yield (bs_count % 2 == 0, wikiword)
        text = post_wikiword_text
def wikiword_to_link(stream_tuple, fmt):
    """
    Renders a WikiWord tuple as link text using the given format, and marks
    the result as plain text. Tuples which are already plain text are handed
    back untouched.

    >>> wikiword_to_link((True, 'WikiWord'), '[{title}]({link})')
    (False, '[WikiWord](/WikiWord.html)')
    >>> wikiword_to_link((True, 'NameSpace:WikiWord'), '[{title}]({link})')
    (False, '[NameSpace:WikiWord](/NameSpace/WikiWord.html)')
    >>> wikiword_to_link((False, 'foo'), '[{title}]({link})')
    (False, 'foo')
    """
    needs_link, text = stream_tuple
    if needs_link:
        target = wikiword_to_path(text)
        rendered = fmt.format(title=text, link=target)
        return (False, rendered)
    return stream_tuple
def category_to_html(categories, category):
    """
    Converts a category to HTML which links all the elements of that category.

    categories maps each category's WikiWord to the collection of page
    WikiWords belonging to it; category is the key to render. Returns the
    complete HTML document as a string.

    Pages are listed in sorted order so that the generated file is the same
    from one run to the next, matching how the master index is written.
    """
    header = '''
<html>
<head>
<title> Pages In {title} </title>
<link rel="stylesheet" type="text/css" href="{css}" />
</head>
<body>
<h1 class="title"> Pages In {title} </h1>
<ul>
'''.format(title=category, css=CSS_FILE)

    items = ''.join(
        '<li> <a href="{path}"> {page} </a> </li>'.format(
            path=wikiword_to_path(page),
            page=page)
        for page in sorted(categories[category]))

    footer = '''
</ul>
</body>
</html>
'''
    return header + items + footer
def convert_file(input_dir, output_dir, path, category_store, all_pages):
    """
    Converts a file residing in the input directory, to an HTML file in the
    output directory.

    path is the wiki file's path relative to input_dir. The page's WikiWord
    is added to all_pages, and for every category the page links to, the page
    is added to that category's entry in category_store.

    The directory which will hold the output file must already exist.
    """
    title = path_to_wikiword(path)
    out_path = wikiword_to_path(title, leading_slash=False)
    all_pages.add(title)

    # The text is needed twice (once for conversion, once for category
    # detection), so collect the pieces while the file is still open. The
    # converter is fed UTF-8 below, so read and write the files as UTF-8 too
    # instead of depending on the locale's encoding.
    with open(os.path.join(input_dir, path), encoding='utf-8') as in_file:
        pieces = list(find_wikiwords(in_file))

    # First, run the page through the converter, and get the output
    with_links_converted = (wikiword_to_link(elem, LINK_FORMAT) for elem in pieces)
    preformatted_text = ''.join(text for (_, text) in with_links_converted)

    # NOTE: the command comes from the wiki's own configuration and is run
    # through the shell on purpose, so it must only come from a trusted source.
    subproc = subprocess.Popen(
        CONVERT_CMDLINE.format(output=os.path.join(output_dir, path)),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=True)
    formatted_text, _ = subproc.communicate(preformatted_text.encode('utf-8'))
    if subproc.returncode != 0:
        # Still write whatever the converter produced, but don't hide the
        # fact that something went wrong
        print('Warning: Converter exited with status', subproc.returncode,
              'while converting', path, file=sys.stderr)

    with open(os.path.join(output_dir, out_path), 'w', encoding='utf-8') as out_file:
        out_file.write(OUTPUT_HTML.format(
            title=title,
            css=CSS_FILE,
            body=formatted_text.decode('utf-8')))

    # Add the page to all the categories it links to
    for is_wikiword, wikiword in pieces:
        # Links to Category:FooPages are not categories, they are lists of
        # links belonging to Category:Foo - thus, we ignore them
        if (is_wikiword and wikiword.startswith('Category:') and
                not wikiword.endswith('Pages')):
            category_store[wikiword].add(title)
def write_categories(output_path, categories):
    """
    Writes out HTML files for all the categories.

    categories maps each category's WikiWord to the pages belonging to it.
    The listing for Category:Foo is written to the page Category:FooPages
    underneath output_path, and the directory for it is created if needed.
    """
    # Iterate over a snapshot of the keys, in case rendering touches the mapping
    for category in list(categories):
        # You can have descriptions of a category at Category:Foo, and then
        # a list of pages at Category:FooPages
        proper_category = category + 'Pages'
        html = category_to_html(categories, category)
        target = os.path.join(
            output_path,
            wikiword_to_path(proper_category, leading_slash=False))

        # Only directories that exist in the input tree get created during
        # conversion, so the category's namespace directory may be missing
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

        with open(target, 'w') as cat_out:
            cat_out.write(html)
def write_master_file(output_path, all_pages):
    """
    Produces index.html inside the output directory, listing a link to every
    page of the wiki in sorted order.
    """
    header = '''
<html>
<head>
<title> Master Index </title>
<link rel="stylesheet" type="text/css" href="{css}" />
</head>
<body>
<h1 class="title"> Wiki Master Index </h1>
<ul>
'''.format(css=CSS_FILE)

    entry_template = '<li> <a href="{path}"> {page} </a> </li>'
    entries = [
        entry_template.format(path=wikiword_to_path(name), page=name)
        for name in sorted(all_pages)
    ]

    footer = '''
</ul>
</body>
</html>
'''
    index_path = os.path.join(output_path, 'index.html')
    with open(index_path, 'w') as master_file:
        master_file.write(header + ''.join(entries) + footer)
def convert_tree(input_dir, output_dir):
    """
    Walks the input directory and converts every wiki file found there into
    the output directory, then writes the category listings and the index
    page. Note that this changes the working directory to the input directory.
    """
    all_pages = set()
    categories = defaultdict(set)

    os.chdir(input_dir)
    for raw_dir, _, filenames in os.walk('.'):
        directory = os.path.normpath(raw_dir)

        # Anything underneath a hidden directory is left out; '.' itself is
        # just the top of the tree, not a hidden directory
        inside_hidden = any(
            part.startswith('.') and part != '.'
            for part in get_path_elems(directory))
        if inside_hidden:
            continue

        for filename in filenames:
            if os.path.splitext(filename)[1] != WIKI_EXT:
                continue

            # The converter needs somewhere to put the file it generates
            create_directory_tree(os.path.join(output_dir, directory))
            convert_file(input_dir, output_dir,
                         os.path.join(directory, filename),
                         categories, all_pages)

    write_categories(output_dir, categories)

    if MAIN_PAGE is not None:
        # The configured main page doubles as the index
        index_page = os.path.join(output_dir, 'index.html')
        source_page = os.path.join(
            output_dir,
            wikiword_to_path(MAIN_PAGE, leading_slash=False))
        shutil.copy(source_page, index_page)
    else:
        write_master_file(output_dir, all_pages)
if __name__ == '__main__':
    # Command line entry point: wikiword <INPUT-DIR> <OUTPUT-DIR>
    USAGE = sys.argv[0] + ' <INPUT-DIR> <OUTPUT-DIR>'
    try:
        INPUT_DIR = os.path.abspath(sys.argv[1])
        OUTPUT_DIR = os.path.abspath(sys.argv[2])
    except IndexError:
        print(USAGE, file=sys.stderr)
        sys.exit(1)

    # Fail with a readable message rather than a traceback from os.chdir
    if not os.path.isdir(INPUT_DIR):
        print('Error: Input directory', INPUT_DIR, 'does not exist',
              file=sys.stderr)
        print(USAGE, file=sys.stderr)
        sys.exit(1)

    config_file_path = os.path.join(INPUT_DIR, 'wiki.conf')
    if not os.path.exists(config_file_path):
        print('Warning: No configuration file at', config_file_path, '- using defaults',
              file=sys.stderr)

    # ConfigParser.read silently skips files that don't exist, so the defaults
    # simply stay in place when there is no configuration file
    config = configparser.ConfigParser()
    config.read(config_file_path)

    # Each option in the [wikiword] section overrides one module-level default
    if 'wikiword' in config:
        opts = config['wikiword']
        if 'extension' in opts:
            WIKI_EXT = opts['extension']
        if 'converter' in opts:
            CONVERT_CMDLINE = opts['converter']
        if 'link_format' in opts:
            LINK_FORMAT = opts['link_format']
        if 'stylesheet' in opts:
            CSS_FILE = opts['stylesheet']
        if 'mainpage' in opts:
            MAIN_PAGE = opts['mainpage']

    if not os.path.exists(OUTPUT_DIR):
        print('Warning: Creating output directory - move your stylesheet, etc. there',
              file=sys.stderr)
        # makedirs also copes with an output directory whose parents are missing
        os.makedirs(OUTPUT_DIR)

    convert_tree(INPUT_DIR, OUTPUT_DIR)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment