Skip to content

Instantly share code, notes, and snippets.

@macleginn
Created January 12, 2017 18:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save macleginn/21f01c9d8bee5000dbc562662ea6d829 to your computer and use it in GitHub Desktop.
Save macleginn/21f01c9d8bee5000dbc562662ea6d829 to your computer and use it in GitHub Desktop.
Converter and postprocessor for .docx files prepared for rusgram.ru
import subprocess
import re
# Convert to html using pandoc and capture output
fn = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx'
txt = subprocess.check_output(['pandoc',
'-f', 'docx',
'-t', 'html',
fn]).decode('utf8')
# Add capt class to section headers
txt = txt.replace('ListParagraph', 'capt')
# Fix links
txt = txt.replace('file:///', '')
# Fix ids for hyperlinks
lines_rev = list(reversed(txt.splitlines()))
pattern = re.compile(r'\s*<h\d id="([^"]+)"( class="capt")?>((\d\.?)+)')
for l in lines_rev:
m = pattern.search(l)
if m:
# print(m.groups())
txt = txt.replace(m.group(1), m.group(3).replace('.', ''))
lines_in = txt.splitlines()
# Paragraph to an open list entry
p2li = lambda x: x.replace('<p>', '<li>').replace('</p>', '')
# Fix toc
lines_out = []
section_p = re.compile(r'>((\d+\.?)+)')
i = 0
for l in lines_in:
if 'href="#' in l:
lines_out.append('<ul class="toc">')
lines_out.append('\t' + p2li(l))
lines_out.append('\t\t<ul>')
i += 1
break
lines_out.append(l)
i += 1
# Make nested ul’s
current_level = 2
for l in lines_in[i:]:
if 'href="#' not in l:
lines_out[-1] += '</ul>'
if lines_out[-1].endswith('<ul></ul>'):
lines_out = lines_out[:-1]
lines_out[-1] += '</li>'
else:
lines_out.append('\t</li>')
lines_out.append('</ul>')
break
section_level = len(section_p.search(l).group(1).split('.'))
while section_level < current_level:
if lines_out[-1].endswith('<ul>'):
lines_out = lines_out[:-1]
lines_out[-1] += '</li>'
else:
lines_out.append('\t' * (current_level-1)*2 + '</ul>')
lines_out.append('\t' * ((current_level-1)*2-1) + '</il>')
current_level -= 1
lines_out.append('\t' * (current_level*2 - 1) + p2li(l))
lines_out.append('\t' * (current_level*2) + '<ul>')
current_level += 1
i += 1
for l in lines_in[i:]:
lines_out.append(l)
lines_in = lines_out
# Fix bibliographical lists and add 'example' class to examples.
# Still will have to add 'class="biblio"' manually later!
lines_out = []
inside = False
example_p = re.compile(r'<p>\(\d+\)')
for l in lines_in:
if l.startswith('<p>• '):
if not inside:
lines_out.append('<ul>')
inside = True
lines_out.append(l.replace('<p>• ', '<li>').replace('</p>', '</li>'))
elif example_p.match(l):
lines_out.append(l.replace('<p>', '<p class="example">'))
else:
if inside:
lines_out.append('</ul>')
lines_out.append(l)
inside = False
txt = '\n'.join(lines_out)
# Add <style>
with open('style_header.html', 'r', encoding = 'utf-8') as inp:
style_header = inp.read()
chunks = [style_header,
'<div id="payload">',
txt,
'</div>']
with open('out.html', 'w', encoding = 'utf-8') as out:
out.write('\n'.join(chunks))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment