# macleginn/rusgram_postprocessor.py

## rusgram_postprocessor.py
import subprocess
import re

# Post-process a pandoc docx -> html conversion: tidy classes and links,
# renumber header ids, build a nested table of contents, mark up bullet
# lists and numbered examples, and wrap the result with the style header.

# Input document (hard-coded; the path is relative to the working directory).
fn = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx'

# Header tag carrying an id, directly followed by a section number,
# e.g. <h2 id="some-id" class="capt">1.2 ...
pattern = re.compile(r'\s*<h\d id="([^"]+)"( class="capt")?>((\d\.?)+)')
# Section number directly after a closing '>' (applied to TOC lines).
section_p = re.compile(r'>((\d+\.?)+)')
# Paragraph that starts with a numbered-example marker such as "(12)".
example_p = re.compile(r'<p>\(\d+\)')


def p2li(x):
    """Turn a paragraph line into an *open* list entry (no closing tag)."""
    return x.replace('<p>', '<li>').replace('</p>', '')


def docx_to_html(path):
    """Convert the docx file at *path* to HTML with pandoc and return a str.

    Raises:
        subprocess.CalledProcessError: pandoc exited with a non-zero status.
    """
    raw = subprocess.check_output(['pandoc',
                                   '-f', 'docx',
                                   '-t', 'html',
                                   path])
    return raw.decode('utf8')


def clean_markup(txt):
    """Rename the 'ListParagraph' class to 'capt' and drop 'file:///'."""
    return txt.replace('ListParagraph', 'capt').replace('file:///', '')


def fix_header_ids(txt):
    """Give every numbered header an id made of its section digits.

    A header like ``<h2 id="foo">1.2 ...`` gets the id ``12`` and every
    ``href="#foo"`` is retargeted to ``#12``.  Only the ``id="..."`` and
    ``href="#..."`` attributes are rewritten; a plain substring replace
    would also clobber running text that happens to contain the old id.
    """
    for line in reversed(txt.splitlines()):
        m = pattern.search(line)
        if not m:
            continue
        old_id = m.group(1)
        new_id = m.group(3).replace('.', '')
        txt = txt.replace('id="{}"'.format(old_id), 'id="{}"'.format(new_id))
        txt = txt.replace('href="#{}"'.format(old_id),
                          'href="#{}"'.format(new_id))
    return txt


def close_toc_levels(lines_out, current_level, target_level):
    """Close open TOC entries until *current_level* equals *target_level*.

    *lines_out* is modified in place; the resulting level is returned.
    """
    while target_level < current_level:
        if lines_out[-1].endswith('<ul>'):
            # The last entry has no children: drop its empty sub-list and
            # close the entry on its own line.
            lines_out.pop()
            lines_out[-1] += '</li>'
        else:
            lines_out.append('\t' * ((current_level - 1) * 2) + '</ul>')
            lines_out.append('\t' * ((current_level - 1) * 2 - 1) + '</li>')
        current_level -= 1
    return current_level


def build_toc(lines):
    """Rewrite the table-of-contents paragraphs as a nested <ul class="toc">.

    The TOC is taken to be the first run of consecutive lines containing
    ``href="#``.  Nesting depth comes from the section number that follows a
    ``>`` on each line ("1.2" is level 2); a trailing dot is ignored and an
    entry without a number is treated as top level.  The very first entry is
    always treated as top level.  Lines before and after the TOC are copied
    unchanged.  Returns a new list; *lines* is not modified.
    """
    lines_out = []
    toc_start = None
    for idx, line in enumerate(lines):
        if 'href="#' in line:
            toc_start = idx
            break
        lines_out.append(line)
    if toc_start is None:
        return lines_out

    lines_out.append('<ul class="toc">')
    lines_out.append('\t' + p2li(lines[toc_start]))
    lines_out.append('\t\t<ul>')

    # current_level is the depth at which a child of the last entry would sit.
    current_level = 2
    rest = toc_start + 1
    for line in lines[toc_start + 1:]:
        if 'href="#' not in line:
            break
        m = section_p.search(line)
        if m:
            section_level = len(m.group(1).rstrip('.').split('.'))
        else:
            section_level = 1
        current_level = close_toc_levels(lines_out, current_level,
                                         section_level)
        lines_out.append('\t' * (current_level * 2 - 1) + p2li(line))
        lines_out.append('\t' * (current_level * 2) + '<ul>')
        current_level += 1
        rest += 1

    # Unwind every level that is still open (also when the TOC runs to the
    # end of the input), then close the outer list.
    close_toc_levels(lines_out, current_level, 1)
    lines_out.append('</ul>')
    lines_out.extend(lines[rest:])
    return lines_out


def mark_lists_and_examples(lines):
    """Wrap runs of '• ' paragraphs in <ul> and tag numbered examples.

    Consecutive lines starting with ``<p>• `` become ``<li>`` items of one
    ``<ul>``; any other line (including an example) ends the list, and a
    list still open at the end of the input is closed too.  Lines matching
    ``example_p`` get ``class="example"``.
    'class="biblio"' still has to be added manually afterwards.
    """
    lines_out = []
    inside = False
    for line in lines:
        if line.startswith('<p>• '):
            if not inside:
                lines_out.append('<ul>')
                inside = True
            lines_out.append(
                line.replace('<p>• ', '<li>').replace('</p>', '</li>'))
            continue
        if inside:
            lines_out.append('</ul>')
            inside = False
        if example_p.match(line):
            lines_out.append(line.replace('<p>', '<p class="example">'))
        else:
            lines_out.append(line)
    if inside:
        lines_out.append('</ul>')
    return lines_out


def main():
    """Convert ``fn`` and write the styled result to 'out.html'."""
    txt = fix_header_ids(clean_markup(docx_to_html(fn)))
    lines = mark_lists_and_examples(build_toc(txt.splitlines()))
    txt = '\n'.join(lines)

    # Add <style>
    with open('style_header.html', 'r', encoding='utf-8') as inp:
        style_header = inp.read()

    chunks = [style_header,
              '<div id="payload">',
              txt,
              '</div>']

    with open('out.html', 'w', encoding='utf-8') as out:
        out.write('\n'.join(chunks))


# Run the conversion only when executed as a script, so the helpers above can
# be imported without invoking pandoc.
if __name__ == '__main__':
    main()
	# Second copy of the post-processing script, with its indentation restored.
import subprocess
import re

# Post-process a pandoc docx -> html conversion: tidy classes and links,
# renumber header ids, build a nested table of contents, mark up bullet
# lists and numbered examples, and wrap the result with the style header.

# Input document (hard-coded; the path is relative to the working directory).
fn = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx'

# Header tag carrying an id, directly followed by a section number,
# e.g. <h2 id="some-id" class="capt">1.2 ...
pattern = re.compile(r'\s*<h\d id="([^"]+)"( class="capt")?>((\d\.?)+)')
# Section number directly after a closing '>' (applied to TOC lines).
section_p = re.compile(r'>((\d+\.?)+)')
# Paragraph that starts with a numbered-example marker such as "(12)".
example_p = re.compile(r'<p>\(\d+\)')


def p2li(x):
    """Turn a paragraph line into an *open* list entry (no closing tag)."""
    return x.replace('<p>', '<li>').replace('</p>', '')


def docx_to_html(path):
    """Convert the docx file at *path* to HTML with pandoc and return a str.

    Raises:
        subprocess.CalledProcessError: pandoc exited with a non-zero status.
    """
    raw = subprocess.check_output(['pandoc',
                                   '-f', 'docx',
                                   '-t', 'html',
                                   path])
    return raw.decode('utf8')


def clean_markup(txt):
    """Rename the 'ListParagraph' class to 'capt' and drop 'file:///'."""
    return txt.replace('ListParagraph', 'capt').replace('file:///', '')


def fix_header_ids(txt):
    """Give every numbered header an id made of its section digits.

    A header like ``<h2 id="foo">1.2 ...`` gets the id ``12`` and every
    ``href="#foo"`` is retargeted to ``#12``.  Only the ``id="..."`` and
    ``href="#..."`` attributes are rewritten; a plain substring replace
    would also clobber running text that happens to contain the old id.
    """
    for line in reversed(txt.splitlines()):
        m = pattern.search(line)
        if not m:
            continue
        old_id = m.group(1)
        new_id = m.group(3).replace('.', '')
        txt = txt.replace('id="{}"'.format(old_id), 'id="{}"'.format(new_id))
        txt = txt.replace('href="#{}"'.format(old_id),
                          'href="#{}"'.format(new_id))
    return txt


def close_toc_levels(lines_out, current_level, target_level):
    """Close open TOC entries until *current_level* equals *target_level*.

    *lines_out* is modified in place; the resulting level is returned.
    """
    while target_level < current_level:
        if lines_out[-1].endswith('<ul>'):
            # The last entry has no children: drop its empty sub-list and
            # close the entry on its own line.
            lines_out.pop()
            lines_out[-1] += '</li>'
        else:
            lines_out.append('\t' * ((current_level - 1) * 2) + '</ul>')
            lines_out.append('\t' * ((current_level - 1) * 2 - 1) + '</li>')
        current_level -= 1
    return current_level


def build_toc(lines):
    """Rewrite the table-of-contents paragraphs as a nested <ul class="toc">.

    The TOC is taken to be the first run of consecutive lines containing
    ``href="#``.  Nesting depth comes from the section number that follows a
    ``>`` on each line ("1.2" is level 2); a trailing dot is ignored and an
    entry without a number is treated as top level.  The very first entry is
    always treated as top level.  Lines before and after the TOC are copied
    unchanged.  Returns a new list; *lines* is not modified.
    """
    lines_out = []
    toc_start = None
    for idx, line in enumerate(lines):
        if 'href="#' in line:
            toc_start = idx
            break
        lines_out.append(line)
    if toc_start is None:
        return lines_out

    lines_out.append('<ul class="toc">')
    lines_out.append('\t' + p2li(lines[toc_start]))
    lines_out.append('\t\t<ul>')

    # current_level is the depth at which a child of the last entry would sit.
    current_level = 2
    rest = toc_start + 1
    for line in lines[toc_start + 1:]:
        if 'href="#' not in line:
            break
        m = section_p.search(line)
        if m:
            section_level = len(m.group(1).rstrip('.').split('.'))
        else:
            section_level = 1
        current_level = close_toc_levels(lines_out, current_level,
                                         section_level)
        lines_out.append('\t' * (current_level * 2 - 1) + p2li(line))
        lines_out.append('\t' * (current_level * 2) + '<ul>')
        current_level += 1
        rest += 1

    # Unwind every level that is still open (also when the TOC runs to the
    # end of the input), then close the outer list.
    close_toc_levels(lines_out, current_level, 1)
    lines_out.append('</ul>')
    lines_out.extend(lines[rest:])
    return lines_out


def mark_lists_and_examples(lines):
    """Wrap runs of '• ' paragraphs in <ul> and tag numbered examples.

    Consecutive lines starting with ``<p>• `` become ``<li>`` items of one
    ``<ul>``; any other line (including an example) ends the list, and a
    list still open at the end of the input is closed too.  Lines matching
    ``example_p`` get ``class="example"``.
    'class="biblio"' still has to be added manually afterwards.
    """
    lines_out = []
    inside = False
    for line in lines:
        if line.startswith('<p>• '):
            if not inside:
                lines_out.append('<ul>')
                inside = True
            lines_out.append(
                line.replace('<p>• ', '<li>').replace('</p>', '</li>'))
            continue
        if inside:
            lines_out.append('</ul>')
            inside = False
        if example_p.match(line):
            lines_out.append(line.replace('<p>', '<p class="example">'))
        else:
            lines_out.append(line)
    if inside:
        lines_out.append('</ul>')
    return lines_out


def main():
    """Convert ``fn`` and write the styled result to 'out.html'."""
    txt = fix_header_ids(clean_markup(docx_to_html(fn)))
    lines = mark_lists_and_examples(build_toc(txt.splitlines()))
    txt = '\n'.join(lines)

    # Add <style>
    with open('style_header.html', 'r', encoding='utf-8') as inp:
        style_header = inp.read()

    chunks = [style_header,
              '<div id="payload">',
              txt,
              '</div>']

    with open('out.html', 'w', encoding='utf-8') as out:
        out.write('\n'.join(chunks))


# Run the conversion only when executed as a script, so the helpers above can
# be imported without invoking pandoc.
if __name__ == '__main__':
    main()