adamnew123456/wikiword.py

## wikiword.py
"""
Converts a tree of wiki pages into a tree of HTML pages.
"""
from collections import defaultdict
import configparser
import glob
import itertools
import os, os.path
import re
import shutil
import subprocess
import sys

# Separates a namespace from a page name, for example SomeNamespace:OtherPage
NAMESPACE_CHAR = ':'

# Matches a WikiWord: two or more "humps", each one an uppercase letter
# followed by one or more lowercase letters, digits, colons or underscores.
# The first and last groups capture the delimiter on either side (a single
# non-word character, or nothing at the start/end of the text).
# The underscore is included to avoid atrocities like CeeLanguage, replacing
# it with C_Language
WIKIWORD = re.compile(r'(\W|^)([A-Z][a-z0-9:_]+){2,}(\W|$)')

# The file extension used for wiki pages
WIKI_EXT = '.md'

# The main page to use, by copying it into index.html - None means that a
# 'master file' will be produced which includes all other pages
MAIN_PAGE = None

# The command line to use when converting wiki files into their HTML output.
# Note that no input file is given, because stdin is used; similarly, no output
# file is given, as stdout is used to collect output
CONVERT_CMDLINE = 'hsmarkdown'

# The link format to use when converting WikiWords to links. It is filled in
# with str.format, using the fields {title} and {link}.
LINK_FORMAT = '<span class="wikiword"> [{title}]({link}) </span>'

# The path of the CSS style to use
CSS_FILE = '/style.css'

# The HTML output to use as a template. It is filled in with str.format, using
# the fields {title}, {css} and {body}.
OUTPUT_HTML = '''
<html>
    <head>
        <title> {title} </title>
        <link rel="stylesheet" type="text/css" href="{css}" />
    </head>
    <body>
        <h1 class="title"> {title} </h1>
        {body}
    </body>
</html>
'''

def get_path_elems(path):
    """
    Breaks a path into the list of its components, after normalizing it.
    The root of an absolute path is not included in the result.

    >>> get_path_elems('/foo/bar/baz')
    ['foo', 'bar', 'baz']
    """
    parts = []
    head, tail = os.path.split(os.path.normpath(path))

    # Peel components off the right-hand side until nothing is left to peel;
    # each one goes to the front so the result ends up in path order
    while tail:
        parts.insert(0, tail)
        head, tail = os.path.split(head)

    return parts


def create_directory_tree(path):
    """
    Creates the directory named by the given path, along with any missing
    parent directories. Directories which already exist are left alone.

    Relative paths are resolved against the current working directory.
    Raises OSError (for example FileExistsError) if a component of the path
    exists but is not a directory.
    """
    # Normalize first, so that inputs such as 'out/.' or 'out/a/../b' name the
    # directory they actually refer to before anything is created
    os.makedirs(os.path.normpath(path), exist_ok=True)

def wikiword_to_path(wikiword, suffix='.html', leading_slash=True):
    """
    Builds the path of the page named by a WikiWord. Each namespace of the
    WikiWord becomes a directory, and the suffix is appended to the page name.

    >>> wikiword_to_path('WikiWord')
    '/WikiWord.html'
    >>> wikiword_to_path('NameSpace:WikiWord')
    '/NameSpace/WikiWord.html'
    >>> wikiword_to_path('WikiWord', suffix='.foo')
    '/WikiWord.foo'
    >>> wikiword_to_path('WikiWord', leading_slash=False)
    'WikiWord.html'
    """
    namespace_parts = wikiword.split(NAMESPACE_CHAR)
    relative = os.sep.join(namespace_parts) + suffix

    if leading_slash:
        return '/' + relative
    return relative

def path_to_wikiword(path):
    """
    Turns a path (relative to either the input directory or the output
    directory) into a namespaced WikiWord. Directories become namespaces, and
    the extension of the final component is dropped.

    >>> path_to_wikiword('/WikiWord.html')
    'WikiWord'
    >>> path_to_wikiword('/NameSpace/WikiWord.html')
    'NameSpace:WikiWord'
    >>> path_to_wikiword('/WikiWord.foo')
    'WikiWord'
    >>> path_to_wikiword('WikiWord.html')
    'WikiWord'
    """
    parts = os.path.normpath(path).split(os.sep)

    # A leading separator leaves an empty first component behind
    if parts[0] == '':
        del parts[0]

    # The extension isn't part of a valid WikiWord, so only the stem is kept
    stem = os.path.splitext(parts[-1])[0]
    return ':'.join(parts[:-1] + [stem])

def count_backslashes_at_end(text):
    """
    Returns how many consecutive backslashes the string ends with. This is
    what decides whether a WikiWord is escaped, and how many backslashes are
    kept in front of it.
    """
    count = 0
    for char in reversed(text):
        if char != '\\':
            break
        count += 1
    return count

def find_wikiwords(in_stream):
    """
    Splits the contents of a stream into WikiWord and non-WikiWord chunks.

    This is a generator. It reads the whole stream up front, and then yields
    two types of tuples, in the order the text appears:

     - (True, 'WikiWord') indicates text that is a WikiWord
     - (False, '...') indicates text that is not a WikiWord

    Backslashes directly before a WikiWord act as escapes: each pair of them
    collapses into a single backslash, and a leftover (odd) backslash causes
    the WikiWord to be yielded as plain text instead.
    """
    text = in_stream.read()
    while text:
        match = WIKIWORD.search(text)
        if not match:
            yield (False, text)
            break

        # The first and last groups of the regex capture the delimiters around
        # the WikiWord, which are not a part of it. Slice on the group offsets
        # rather than using str.lstrip/str.rstrip: those treat their argument
        # as a set of characters, and so would eat every trailing ':' of a
        # word like 'FooBa:' when the delimiter is also ':', losing text.
        word_begin = match.end(1)
        word_end = match.start(3)
        pre_wikiword_text = text[:word_begin]
        wikiword = text[word_begin:word_end]

        # The trailing delimiter stays in the remaining text, since it may
        # also be the leading delimiter of the next WikiWord
        post_wikiword_text = text[word_end:]

        # Escapes stack up before WikiWords. For example:
        #
        #    WikiWord --> `WikiWord`
        #    \WikiWord --> WikiWord
        #    \\WikiWord --> \`WikiWord`
        # ...
        bs_count = count_backslashes_at_end(pre_wikiword_text)

        # If there isn't any preceding text, there is nothing to emit before
        # the WikiWord (and no escapes to process either)
        if pre_wikiword_text:
            kept_bs = '\\' * (bs_count // 2)
            yield (False, pre_wikiword_text.rstrip('\\') + kept_bs)

        # Even backslashes produce a WikiWord, odd backslashes escape it
        yield (bs_count % 2 == 0, wikiword)

        text = post_wikiword_text

def wikiword_to_link(stream_tuple, fmt):
    """
    Replaces a WikiWord tuple with a plain text tuple containing a link to
    the page it names. Tuples which are already plain text pass through
    unchanged.

    >>> wikiword_to_link((True, 'WikiWord'), '[{title}]({link})')
    (False, '[WikiWord](/WikiWord.html)')
    >>> wikiword_to_link((True, 'NameSpace:WikiWord'), '[{title}]({link})')
    (False, '[NameSpace:WikiWord](/NameSpace/WikiWord.html)')
    >>> wikiword_to_link((False, 'foo'), '[{title}]({link})')
    (False, 'foo')
    """
    needs_link, text = stream_tuple
    if needs_link:
        target = wikiword_to_path(text)
        return (False, fmt.format(title=text, link=target))

    return stream_tuple

def category_to_html(categories, category):
    """
    Converts a category to HTML which links all the elements of that category.

    categories maps each category name to the collection of page titles in
    it, and category is the name of the one to render. The pages are listed
    in sorted order, so that the output does not change from run to run.

    Returns the complete HTML document as a string.
    """
    header = '''
<html>
    <head>
        <title> Pages In {title} </title>
        <link rel="stylesheet" type="text/css" href="{css}" />
    </head>
    <body>
        <h1 class="title"> Pages In {title} </h1>
        <ul>
'''.format(title=category, css=CSS_FILE)

    # The pages are typically stored in a set, which has no stable order
    items = ''.join(
        '<li> <a href="{path}"> {page} </a> </li>'.format(
            path=wikiword_to_path(page),
            page=page)
        for page in sorted(categories[category]))

    footer = '''
        </ul>
    </body>
</html>
'''
    return header + items + footer

def convert_file(input_dir, output_dir, path, category_store, all_pages):
    """
    Converts a file residing in the input directory, to an HTML file in the
    output directory.

    path is the location of the wiki file relative to input_dir; the page
    title and the output location are both derived from it. The title is
    added to all_pages, and to the entry in category_store of each category
    that the page links to.

    Wiki files are read, and HTML files written, as UTF-8 - the same encoding
    used to talk to the converter. A converter which exits with a non-zero
    status is reported on stderr, but whatever it printed is still used.
    """
    title = path_to_wikiword(path)
    out_path = wikiword_to_path(title, leading_slash=False)

    all_pages.add(title)

    # The chunks are needed twice (for the link text and for the categories),
    # so they are collected into a list
    with open(os.path.join(input_dir, path), encoding='utf-8') as in_file:
        chunks = list(find_wikiwords(in_file))

    # First, run the page through the converter, and get the output
    with_links_converted = (wikiword_to_link(chunk, LINK_FORMAT) for chunk in chunks)
    preformatted_text = ''.join(text for (_, text) in with_links_converted)

    # NOTE: the command line comes from the wiki's configuration and is run
    # by the shell as-is, so the configuration must be trusted
    subproc = subprocess.Popen(
        CONVERT_CMDLINE.format(output=os.path.join(output_dir, path)),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        shell=True)
    formatted_text, _ = subproc.communicate(preformatted_text.encode('utf-8'))

    if subproc.returncode != 0:
        # A broken or missing converter would otherwise silently produce
        # empty pages
        print('Warning: Converter exited with status', subproc.returncode,
            'while converting', path, file=sys.stderr)

    with open(os.path.join(output_dir, out_path), 'w', encoding='utf-8') as out_file:
        out_file.write(OUTPUT_HTML.format(
            title=title,
            css=CSS_FILE,
            body=formatted_text.decode('utf-8')))

    # Add the page to all the categories it links to
    for is_wikiword, wikiword in chunks:
        # Links to Category:FooPages are not categories, they are lists of
        # links belonging to Category:Foo - thus, we ignore them
        if (is_wikiword and wikiword.startswith('Category:') and
                not wikiword.endswith('Pages')):
            category_store[wikiword].add(title)

def write_categories(output_path, categories):
    """
    Writes one HTML listing into the output directory for each category.
    """
    for category in tuple(categories):
        # You can have descriptions of a category at Category:Foo, and then
        # a list of pages at Category:FooPages
        listing_name = category + 'Pages'
        listing_file = os.path.join(
            output_path,
            wikiword_to_path(listing_name, leading_slash=False))

        markup = category_to_html(categories, category)
        with open(listing_file, 'w') as listing_out:
            listing_out.write(markup)

def write_master_file(output_path, all_pages):
    """
    Writes index.html into the output directory, as a sorted list of links
    to every page in the wiki.
    """
    header = '''
<html>
    <head>
        <title> Master Index </title>
        <link rel="stylesheet" type="text/css" href="{css}" />
    </head>
    <body>
        <h1 class="title"> Wiki Master Index </h1>
        <ul>
'''.format(css=CSS_FILE)

    entries = [
        '<li> <a href="{path}"> {page} </a> </li>'.format(
            path=wikiword_to_path(name),
            page=name)
        for name in sorted(all_pages)
    ]

    footer = '''
        </ul>
    </body>
</html>
'''

    index_path = os.path.join(output_path, 'index.html')
    with open(index_path, 'w') as master_file:
        master_file.write(header + ''.join(entries) + footer)

def convert_tree(input_dir, output_dir):
    """
    Converts every wiki file under the input directory into an HTML page
    under the output directory, then writes the category listings and
    index.html.

    Note that this changes the working directory to the input directory.
    """
    pages_seen = set()
    category_index = defaultdict(set)

    os.chdir(input_dir)
    for dirpath, _, filenames in os.walk('.'):
        dirpath = os.path.normpath(dirpath)

        # Anything inside of a hidden directory is left out of the wiki; the
        # top of the tree itself ('.') doesn't count as hidden
        if any(piece != '.' and piece.startswith('.')
               for piece in get_path_elems(dirpath)):
            continue

        for filename in filenames:
            if os.path.splitext(filename)[1] == WIKI_EXT:
                # The converter routine needs a place to actually put the
                # files it generates
                create_directory_tree(os.path.join(output_dir, dirpath))
                convert_file(
                    input_dir,
                    output_dir,
                    os.path.join(dirpath, filename),
                    category_index,
                    pages_seen)

    write_categories(output_dir, category_index)

    if MAIN_PAGE is not None:
        main_page_path = wikiword_to_path(MAIN_PAGE, leading_slash=False)
        shutil.copy(
            os.path.join(output_dir, main_page_path),
            os.path.join(output_dir, 'index.html'))
    else:
        write_master_file(output_dir, pages_seen)

if __name__ == '__main__':
    # Script entry point: reads the input and output directories from the
    # command line, applies any overrides found in <INPUT-DIR>/wiki.conf, and
    # then converts the whole tree.
    USAGE = sys.argv[0] + ' <INPUT-DIR> <OUTPUT-DIR>'

    try:
        INPUT_DIR = os.path.abspath(sys.argv[1])
        OUTPUT_DIR = os.path.abspath(sys.argv[2])
    except IndexError:
        print(USAGE, file=sys.stderr)
        sys.exit(1)

    config_file_path = os.path.join(INPUT_DIR, 'wiki.conf')
    if not os.path.exists(config_file_path):
        print('Warning: No configuration file at', config_file_path, '- using defaults',
            file=sys.stderr)

    # ConfigParser.read skips files which don't exist, so a missing
    # configuration file just leaves the defaults in place
    config = configparser.ConfigParser()
    config.read(config_file_path)

    if 'wikiword' in config:
        opts = config['wikiword']

        # Each option replaces the matching module-level default, and the
        # default is kept when the option is not given
        WIKI_EXT = opts.get('extension', fallback=WIKI_EXT)
        CONVERT_CMDLINE = opts.get('converter', fallback=CONVERT_CMDLINE)
        LINK_FORMAT = opts.get('link_format', fallback=LINK_FORMAT)
        CSS_FILE = opts.get('stylesheet', fallback=CSS_FILE)
        MAIN_PAGE = opts.get('mainpage', fallback=MAIN_PAGE)

    if not os.path.exists(OUTPUT_DIR):
        print('Warning: Creating output directory - move your stylesheet, etc. there',
            file=sys.stderr)
        # makedirs, so that an output directory nested inside of directories
        # which don't exist yet can still be created
        os.makedirs(OUTPUT_DIR)

    convert_tree(INPUT_DIR, OUTPUT_DIR)
	"""
	Converts a tree of wiki pages into a tree of HTML pages.
	"""
	from collections import defaultdict
	import configparser
	import glob
	import itertools
	import os, os.path
	import re
	import shutil
	import subprocess
	import sys

	# For example, SomeNamespae:OtherPage
	NAMESPACE_CHAR = ':'

	# The underscore is included to avoid atrocities like CeeLanguage, replacing
	# it with C_Language
	WIKIWORD = re.compile(r'(\W\|^)([A-Z][a-z0-9:_]+){2,}(\W\|$)')

	# The file extension used for wiki pages
	WIKI_EXT = '.md'

	# The main page to use, by copying it into index.html - None means that a
	# 'master file' will be produced while includes all other pages
	MAIN_PAGE = None

	# The command line to use when converting wiki files into their HTML output.
	# Note that no input file is given, because stdin is used; similarly, no output
	# file is given, as stdout is used to collect output
	CONVERT_CMDLINE = 'hsmarkdown'

	# The link format to use when converting WikiWords to links.
	LINK_FORMAT = '<span class="wikiword"> [{title}]({link}) </span>'

	# The path of the CSS style to use
	CSS_FILE = '/style.css'

	# The HTML output to use as a template
	OUTPUT_HTML = '''
	<html>
	<head>
	<title> {title} </title>
	<link rel="stylesheet" type="text/css" href="{css}" />
	</head>
	<body>
	<h1 class="title"> {title} </h1>
	{body}
	</body>
	</html>
	'''

	def get_path_elems(path):
	"""
	Gets all parts of a path.

	>>> get_path_elems('/foo/bar/baz')
	['foo', 'bar', 'baz']
	"""
	path = os.path.normpath(path)
	elems = []
	while True:
	head, tail = os.path.split(path)
	if not tail:
	elems.reverse()
	return elems
	else:
	elems.append(tail)
	path = head


	def create_directory_tree(path):
	"""
	Creates a directory tree for all elements of the given path, if the
	path doesn't already point to a directory.
	"""
	elems = get_path_elems(path)
	prefix = '/' + elems.pop(0)
	for elem in elems:
	full_path = os.path.join(prefix, elem)
	if not os.path.isdir(full_path):
	os.mkdir(full_path)

	prefix = os.path.join(prefix, elem)

	def wikiword_to_path(wikiword, suffix='.html', leading_slash=True):
	"""
	Converts a WikiWord to a link. Note that WikiWords may be namespaced,
	and so converts namespaces into directories.

	>>> wikiword_to_path('WikiWord')
	'/WikiWord.html'
	>>> wikiword_to_path('NameSpace:WikiWord')
	'/NameSpace/WikiWord.html'
	>>> wikiword_to_path('WikiWord', suffix='.foo')
	'/WikiWord.foo'
	>>> wikiword_to_path('WikiWord', leading_slash=False)
	'WikiWord.html'
	"""
	elements = wikiword.split(NAMESPACE_CHAR)
	return (('/' if leading_slash else '') +
	os.sep.join(elements) + suffix)

	def path_to_wikiword(path):
	"""
	Converts a path (relative to either the input directory or the output
	directory) to a namespaced WikiWord, ignoring any suffixes.

	>>> path_to_wikiword('/WikiWord.html')
	'WikiWord'
	>>> path_to_wikiword('/NameSpace/WikiWord.html')
	'NameSpace:WikiWord'
	>>> path_to_wikiword('/WikiWord.foo')
	'WikiWord'
	>>> path_to_wikiword('WikiWord.html')
	'WikiWord'
	"""
	path = os.path.normpath(path)
	elements = path.split(os.sep)

	# The path may start with /, so drop the header if it is empty
	if not elements[0]:
	elements.pop(0)

	# Purge any extension, since it isn't part of a valid WikiWord
	real_name, _ = os.path.splitext(elements[-1])
	elements[-1] = real_name

	return ':'.join(elements)

	def count_backslashes_at_end(text):
	"""
	Counts the number of backslahes at the end of a string. Used to figure out
	whether or not to escape a WikiWord, and how many backslashes to add when
	escaping one.
	"""
	sans_backslashes = text.rstrip('\\')
	return len(text) - len(sans_backslashes)

	def find_wikiwords(in_stream):
	"""
	Takes a stream, and generates from it a list, which contains two types of
	tuples:

	- (True, 'WikiWord') indicates text that is a WikiWord
	- (False, '...') indicates text that is not a WikiWord
	"""
	text = in_stream.read()
	while text:
	match = WIKIWORD.search(text)
	if not match:
	yield (False, text)
	break

	match_begin, match_end = match.span()
	wikiword = match.group()
	pre_wikiword_text = text[:match_begin]
	post_wikiword_text = text[match_end:]

	# Remove any leading characters which are a part of the match generated
	# by the regex, but which are not really a part of the WikiWord.
	leading_text, *_, trailing_text = match.groups()
	pre_wikiword_text += leading_text
	post_wikiword_text = trailing_text + post_wikiword_text
	wikiword = wikiword.lstrip(leading_text).rstrip(trailing_text)

	# If there isn't any preceding text, then we don't care bout escape
	# processing, since there cannot be any to process
	if not pre_wikiword_text:
	yield (True, wikiword)
	text = post_wikiword_text
	continue

	# Escapes stack up before WikiWords. For example:
	#
	# WikiWord --> `WikiWord`
	# \WikiWord --> WikiWord
	# \\WikiWord --> \`WikiWord`
	# ...
	bs_count = count_backslashes_at_end(pre_wikiword_text)
	if bs_count % 2 == 0:
	# Even backslashes means that we're going to be producing a WikiWord
	extra_bs = '\\' * (bs_count // 2)
	pre_wikiword_text = pre_wikiword_text.rstrip('\\') + extra_bs

	yield (False, pre_wikiword_text)
	yield (True, wikiword)
	else:
	# Odd backslashes means we're escaping the WikiWord
	extra_bs = '\\' * ((bs_count - 1) // 2)
	pre_wikiword_text = pre_wikiword_text.rstrip('\\') + extra_bs

	yield (False, pre_wikiword_text)
	yield (False, wikiword)

	text = post_wikiword_text

	def wikiword_to_link(stream_tuple, fmt):
	"""
	Converts WikiWords back into links, which are considered plain text.

	>>> wikiword_to_link((True, 'WikiWord'), '[{title}]({link})')
	(False, '[WikiWord](/WikiWord.html)')
	>>> wikiword_to_link((True, 'NameSpace:WikiWord'), '[{title}]({link})')
	(False, '[NameSpace:WikiWord](/NameSpace/WikiWord.html)')
	>>> wikiword_to_link((False, 'foo'), '[{title}]({link})')
	(False, 'foo')
	"""
	is_wikiword, wikiword = stream_tuple
	if not is_wikiword:
	return stream_tuple

	return (False, fmt.format(title=wikiword, link=wikiword_to_path(wikiword)))

	def category_to_html(categories, category):
	"""
	Converts a category to HTML which links all the elements of that category.
	"""
	result = '''
	<html>
	<head>
	<title> Pages In {title} </title>
	<link rel="stylesheet" type="text/css" href="{css}" />
	</head>
	<body>
	<h1 class="title"> Pages In {title} </h1>
	<ul>
	'''.format(title=category, css=CSS_FILE)
	for page in categories[category]:
	result += '<li> <a href="{path}"> {page} </a> </li>'.format(
	path=wikiword_to_path(page),
	page=page)

	result += '''
	</ul>
	</body>
	</html>
	'''
	return result

	def convert_file(input_dir, output_dir, path, category_store, all_pages):
	"""
	Converts a file residing in the input directory, to an HTML file in the
	output directory.
	"""
	title = path_to_wikiword(path)
	out_path = wikiword_to_path(title, leading_slash=False)

	all_pages.add(title)

	with open(os.path.join(input_dir, path)) as in_file:
	ww_1, ww_2 = itertools.tee(find_wikiwords(in_file))

	# First, run the page through the converter, and get the output
	with_links_converted = (wikiword_to_link(elem, LINK_FORMAT) for elem in ww_1)
	preformatted_text = ''.join(text for (_, text) in with_links_converted)

	subproc = subprocess.Popen(
	CONVERT_CMDLINE.format(output=os.path.join(output_dir, path)),
	stdin=subprocess.PIPE,
	stdout=subprocess.PIPE,
	shell=True)
	formatted_text, _ = subproc.communicate(bytes(preformatted_text, 'utf-8'))

	with open(os.path.join(output_dir, out_path), 'w') as out_file:
	out_file.write(OUTPUT_HTML.format(
	title=title,
	css=CSS_FILE,
	body=str(formatted_text, 'utf-8')))

	# Add the text to all the categories it links too
	for is_wikiword, wikiword in ww_2:
	# Links to Category:FooPages are not categories, they are lists of
	# links belonging to Category:Foo - thus, we ignore them
	if (is_wikiword and wikiword.startswith('Category:') and
	not wikiword.endswith('Pages')):
	category_store[wikiword].add(title)

	def write_categories(output_path, categories):
	"""
	Writes out HTML files for all the categories.
	"""
	for category in list(categories):
	# You can have descriptions of a category at Category:Foo, and then
	# a list of pages at Category:FooPages
	proper_category = category + 'Pages'
	html = category_to_html(categories, category)

	path = wikiword_to_path(proper_category, leading_slash=False)
	with open(os.path.join(output_path, path), 'w') as cat_out:
	cat_out.write(html)

	def write_master_file(output_path, all_pages):
	"""
	Writes out an index of every page in the wiki.
	"""
	result = '''
	<html>
	<head>
	<title> Master Index </title>
	<link rel="stylesheet" type="text/css" href="{css}" />
	</head>
	<body>
	<h1 class="title"> Wiki Master Index </h1>
	<ul>
	'''.format(css=CSS_FILE)
	for page in sorted(all_pages):
	result += '<li> <a href="{path}"> {page} </a> </li>'.format(
	path=wikiword_to_path(page),
	page=page)

	result += '''
	</ul>
	</body>
	</html>
	'''

	with open(os.path.join(output_path, 'index.html'), 'w') as master_file:
	master_file.write(result)

	def convert_tree(input_dir, output_dir):
	"""
	Converts all the wiki files from the input directory to the output directory.
	"""
	all_pages = set()
	categories = defaultdict(set)
	os.chdir(input_dir)
	for path, _, files in os.walk('.'):
	path = os.path.normpath(path)

	# Excuse hidden files, since they shouldn't be included
	is_hidden = False
	parts = get_path_elems(path)
	for part in parts:
	if part.startswith('.') and part != '.':
	is_hidden = True
	break

	if is_hidden:
	continue

	for file in files:
	name, ext = os.path.splitext(file)
	if ext != WIKI_EXT:
	continue

	# Ensure that the converter routine has a place to actually put the
	# files it generates
	create_directory_tree(os.path.join(output_dir, path))
	convert_file(input_dir, output_dir, os.path.join(path, file), categories, all_pages)

	write_categories(output_dir, categories)

	if MAIN_PAGE is None:
	write_master_file(output_dir, all_pages)
	else:
	index_page = os.path.join(output_dir, 'index.html')
	source_page = os.path.join(output_dir,
	wikiword_to_path(MAIN_PAGE, leading_slash=False))
	shutil.copy(source_page, index_page)

	if __name__ == '__main__':
	USAGE = sys.argv[0] + '<INPUT-DIR> <OUTPUT-DIR>'

	try:
	INPUT_DIR = os.path.abspath(sys.argv[1])
	OUTPUT_DIR = os.path.abspath(sys.argv[2])
	except IndexError:
	print(USAGE, file=sys.stderr)
	sys.exit(1)

	config_file_path = os.path.join(INPUT_DIR, 'wiki.conf')
	if not os.path.exists(config_file_path):
	print('Warning: No configuration file at', config_file_path, '- using defaults',
	file=sys.stderr)

	config = configparser.ConfigParser()
	config.read(config_file_path)

	if 'wikiword' in config:
	opts = config['wikiword']

	if 'extension' in opts:
	WIKI_EXT = opts['extension']

	if 'converter' in opts:
	CONVERT_CMDLINE = opts['converter']

	if 'link_format' in opts:
	LINK_FORMAT = opts['link_format']

	if 'stylesheet' in opts:
	CSS_FILE = opts['stylesheet']

	if 'mainpage' in opts:
	MAIN_PAGE = opts['mainpage']

	if not os.path.exists(OUTPUT_DIR):
	print('Warning: Creating output directory - move your stylesheet, etc. there',
	file=sys.stderr)
	os.mkdir(OUTPUT_DIR)

	convert_tree(INPUT_DIR, OUTPUT_DIR)