# Parse Markdown to HTML via Python.
# -*- coding: utf-8 -*-
import re
from collections import OrderedDict
from string import Template
# Default page skeleton used by ``translate``.  The placeholders are $TITLE,
# $TYPE, $AUTHOR, $ENCODE, $FONT and $html; the adjacent literals below are
# concatenated by the compiler into one single-line template.
BASE_HTML = Template(
    '<!DOCTYPE html>'
    '<!-- $TITLE, $TYPE written by $AUTHOR -->'
    '<head>'
    '<title>$TITLE</title>'
    '<meta charset="$ENCODE"/>'
    '<style>'
    'body{font-family:$FONT;}'
    '.underlined{text-decoration: underline;}'
    '.obfuscated{text-decoration:line-through;}'
    '.centered{text-align:center;}'
    'table{border-collapse:collapse;}'
    'th{border:1px solid black;padding:5px;background-color:#DDDDDD;}'
    'td{border:1px solid black;padding:5px;}'
    '</style>'
    '</head>'
    '<body>$html</body>'
)
class Token:
    """Record holding the regular-expression text of one markup token."""

    def __init__(self, regex):
        """Store ``regex`` unchanged on the instance."""
        self.regex = regex
# Registry of Token objects keyed by parse-function name.  It is filled by the
# ``token`` decorator; insertion order is the order in which the patterns are
# later joined into ``markup_regex``.
tokens = OrderedDict()
class token:
    """Decorator that registers a parse function under its own name.

    ``@token(regex)`` creates a ``Token`` holding ``regex``, attaches the
    decorated function as its ``parse`` attribute and stores it in the
    module-level ``tokens`` mapping under the function's ``__name__``.  The
    name at the decoration site is rebound to a wrapper that can pass the
    function's result through the module-level ``subparse`` function.

    Args:
        regex: Pattern text stored on the registered ``Token``.
        subparse: When true (the default), the wrapper passes the function's
            result to ``subparse`` before returning it.
    """

    def __init__(self, regex, subparse=True):
        self.regex = regex
        self.subparse = subparse

    def __call__(self, fn):
        """Register ``fn`` in ``tokens`` and return the wrapper replacing it."""
        from functools import wraps

        new_token = Token(self.regex)
        # The registry keeps the undecorated function, not the wrapper.
        new_token.parse = fn
        tokens[fn.__name__] = new_token

        @wraps(fn)  # keep fn's __name__ / __doc__ visible on the wrapper
        def wrapper(match):
            result = fn(match)
            if self.subparse:
                # Inside this function ``subparse`` is the module-level
                # function; the flag lives on ``self``.
                return subparse(result)
            return result

        return wrapper
@token(r"""(?P<spec>\$_(?P<name>.+)\((?P<value>.*)\))\n""", subparse=False)
def spec(match):
    """Return an empty string: a ``$_name(value)`` line emits no markup."""
    return ''
def em(match):
    """Wrap the ``em`` entry of the mapping ``match`` in an ``<em>`` element."""
    emphasised = match['em']
    return '<em>{0}</em>'.format(emphasised)
def strong(match):
    """Wrap the ``strong`` entry of ``match`` in a ``<strong>`` element."""
    emphasised = match['strong']
    return '<strong>{0}</strong>'.format(emphasised)
def underlined(match):
    """Wrap the ``underlined`` entry of ``match`` in an underlined span."""
    text = match['underlined']
    return '<span class="underlined">{0}</span>'.format(text)
def obfuscated(match):
    """Wrap the ``obfuscated`` entry of ``match`` in an ``obfuscated`` span."""
    text = match['obfuscated']
    return '<span class="obfuscated">{0}</span>'.format(text)
def code(match):
    """Wrap the ``code`` entry of ``match`` in a ``<code>`` element."""
    snippet = match['code']
    return '<code>{0}</code>'.format(snippet)
def h1(match):
    """Wrap the ``h1`` entry of ``match`` in an ``<h1>`` element."""
    heading = match['h1']
    return '<h1>{0}</h1>'.format(heading)
def h2(match):
    """Wrap the ``h2`` entry of ``match`` in an ``<h2>`` element."""
    heading = match['h2']
    return '<h2>{0}</h2>'.format(heading)
def h3(match):
    """Wrap the ``h3`` entry of ``match`` in an ``<h3>`` element."""
    heading = match['h3']
    return '<h3>{0}</h3>'.format(heading)
def h4(match):
    """Wrap the ``h4`` entry of ``match`` in an ``<h4>`` element."""
    heading = match['h4']
    return '<h4>{0}</h4>'.format(heading)
def h5(match):
    """Wrap the ``h5`` entry of ``match`` in an ``<h5>`` element."""
    heading = match['h5']
    return '<h5>{0}</h5>'.format(heading)
# The link text is matched lazily (``.+?``).  A greedy ``.+`` runs to the last
# ``](`` on the line, so two links on one line collapsed into a single match
# whose text swallowed the first link's target.
# NOTE(review): ``href`` is still matched greedily up to the last ``)`` before
# a space or newline, so back-to-back links with no space between them are
# still merged.
@token(r"""(?P<a>\[(?P<disp>.+?)\] *\((?P<href>[^ \n]+)\))""")
def a(match):
    """Render a ``[text](href)`` link as an ``<a>`` element.

    Args:
        match: Mapping that provides the ``disp`` (link text) and ``href``
            (link target) entries.

    Returns:
        The ``<a href="...">...</a>`` markup as a string.
    """
    return '<a href="{href}">{disp}</a>'.format(**match)
def anchor(match):
    """Render an empty ``<div>`` whose ``id`` is the ``anchor`` entry.

    Args:
        match: Mapping that provides the ``anchor`` entry used as the id.

    Returns:
        The markup ``<div id="..."></div>``.
    """
    # The element is closed with a real ``</div>``; the former ``</>`` is not
    # a valid HTML end tag.
    return '<div id="{anchor}"></div>'.format(**match)
def centered(match):
    """Wrap the ``centered`` entry of ``match`` in a ``centered`` div."""
    content = match['centered']
    return '<div class="centered">{0}</div>'.format(content)
def sub(match):
    """Wrap the ``sub`` entry of ``match`` in a ``<sub>`` element."""
    lowered = match['sub']
    return '<sub>{0}</sub>'.format(lowered)
def sup(match):
    """Wrap the ``sup`` entry of ``match`` in a ``<sup>`` element."""
    raised = match['sup']
    return '<sup>{0}</sup>'.format(raised)
# Builds the opening of a ``<pre>`` element.  The ``class`` attribute text is
# repeated ``bool(len(match['class']))`` times, i.e. it appears once when
# match['class'] is non-empty and not at all when it is empty.
# NOTE(review): the return expression ends in a dangling line continuation --
# the remainder of the statement is missing from this copy of the file and has
# to be restored from the original source before this function can run.
def pre(match):
return '<pre' +\
' class="{class}"' * bool(len(match['class'])) +\
def table(match):
    """Wrap the rows built by ``parse_table`` in a ``<table>`` element.

    The ``table`` entry of ``match`` is handed to ``parse_table`` and the
    result becomes the element's content.
    """
    rows_html = parse_table(match['table'])
    return '<table>{table}</table>'.format(table=rows_html)
def br(match):
    """Return a self-closing line break; ``match`` is not inspected."""
    line_break = '<br/>'
    return line_break
def mismatch(match):
    """Return the ``mismatch`` entry of ``match`` as text, with no markup."""
    plain = match['mismatch']
    return '{0}'.format(plain)
#--------- Parsing ----------
# One alternation of every registered token pattern, in registration order.
# Both ``translate`` and ``subparse`` scan their input with it.
reg_list = [registered.regex for registered in tokens.values()]
markup_regex = re.compile("|".join(reg_list))
def translate(raw_md, base=BASE_HTML, **default):
    """Convert markup text to HTML and fill it into ``base``.

    The text is scanned with ``markup_regex``; each match is handed, as its
    ``groupdict()``, to the ``parse`` function registered in ``tokens`` under
    the name of the matching group, and the results are concatenated.

    Args:
        raw_md: Markup source text.
        base: Page skeleton.  A ``string.Template`` is filled with
            ``substitute``; anything else is filled with ``format``.
        **default: Initial placeholder values.  Every ``spec`` match in the
            text stores its ``value`` group under its ``name`` group here,
            and ``html`` is always set to the generated markup.

    Returns:
        ``base`` with its placeholders filled in.

    Raises:
        KeyError: If ``base`` uses a placeholder that neither ``default`` nor
            a spec line supplies (raised by ``substitute`` / ``format``).
    """
    fragments = []
    # The loop variable is deliberately not called ``token``: that would
    # shadow the ``token`` decorator class inside this function.
    for found in re.finditer(markup_regex, raw_md):
        kind = found.lastgroup
        if kind == 'spec':
            # Spec lines carry template values rather than content.
            name, value = found.group('name', 'value')
            default[name] = value
        fragments.append(tokens[kind].parse(found.groupdict()))
    default['html'] = ''.join(fragments)
    if isinstance(base, Template):
        return base.substitute(**default)
    return base.format(**default)
def subparse(match):
    """Re-tokenise generated text until only plain text is left.

    Args:
        match: Text to scan.  The wrapper built by the ``token`` decorator
            passes in the string returned by a parse function.

    Returns:
        The text after every recognised token has been replaced by the output
        of its registered ``parse`` function, repeated until a pass finds
        nothing but ``mismatch`` tokens.
    """
    pieces = []
    all_mismatch = True
    # Scan the argument itself.  The earlier code scanned ``raw_md``, which is
    # not defined in this function.
    for found in re.finditer(markup_regex, match):
        kind = found.lastgroup
        # Compare by value, and clear the flag when a real token is seen.
        # The earlier identity test on 'mismatch' was inverted: plain text
        # recursed forever while nested markup was never re-scanned.
        if kind != 'mismatch':
            all_mismatch = False
        pieces.append(tokens[kind].parse(found.groupdict()))
    subhtml = ''.join(pieces)
    if all_mismatch:
        return subhtml
    return subparse(subhtml)
def parse_table(table):
    """Convert the text of a character-drawn table into ``<tr>`` rows.

    A line becomes a row when the line after it starts with ``+`` or ``-``
    characters.  Inside a row, ``|:text:|`` yields a ``<th>`` cell and
    ``|text|`` yields a ``<td>`` cell.

    Args:
        table: Table source text, with rows separated by ruler lines.

    Returns:
        The concatenated ``<tr>...</tr>`` markup, or an empty string when no
        row is found.
    """
    cell_regexes = OrderedDict([
        ('td', r"""(?P<td>(?<=\|)(?!:).*?(?<!:)(?=\|))"""),
        ('th', r"""(?P<th>(?<=\|:).*?(?=:\|))"""),
    ])
    cell_markup = {
        'td': '<td>{td}</td>',
        'th': '<th>{th}</th>',
    }
    # Compiled once per call rather than once per row.
    cell_re = re.compile('|'.join(
        '(?:{})'.format(reg) for reg in cell_regexes.values()))
    row_re = re.compile(r"""(?P<tr>.+(?=\n[+-]+))""")

    def tr_parse(tr):
        """Render every cell found in one row line."""
        return ''.join(
            # ``lastgroup`` names the alternative that matched (td or th).
            cell_markup[cell.lastgroup].format(**cell.groupdict())
            for cell in cell_re.finditer(tr)
        )

    return ''.join(
        '<tr>{}</tr>'.format(tr_parse(tr)) for tr in row_re.findall(table)
    )
# Command-line entry point: ``<script> SOURCE_PATH DEST_PATH``.
if __name__ == '__main__':
    import sys

    # Exactly two arguments are expected; any other count raises ValueError
    # from the unpacking below.
    file_path, dest_path = sys.argv[1:]
    with open(file_path) as source:
        raw_md = source.read()
    # Translate before opening the destination so that a failed conversion
    # does not leave a truncated output file behind.
    html = translate(raw_md)
    with open(dest_path, 'w') as dest:
        dest.write(html)
