Created
January 12, 2017 18:33
-
-
Save macleginn/21f01c9d8bee5000dbc562662ea6d829 to your computer and use it in GitHub Desktop.
Converter and postprocessor for .docx files prepared for rusgram.ru
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import subprocess | |
import re | |
# Convert a .docx article to HTML with pandoc and post-process the markup:
# section-header classes and ids, a nested table of contents, bullet lists,
# example paragraphs, and a style header wrapped around the result.

SOURCE_DOCX = 'sources/re_docx/reflexive_letuchiy_20141102_nst_site.docx'
STYLE_HEADER = 'style_header.html'
OUTPUT_HTML = 'out.html'

# A header line whose visible text starts with a section number.
HEADER_ID_P = re.compile(r'\s*<h\d id="([^"]+)"( class="capt")?>((\d\.?)+)')
# The section number that opens the text of a toc entry.
SECTION_P = re.compile(r'>((\d+\.?)+)')
# A paragraph that starts with a numbered example marker such as "(12)".
EXAMPLE_P = re.compile(r'<p>\(\d+\)')
# Substring that identifies a table-of-contents line.
TOC_MARK = 'href="#'


def convert_docx(path):
    """Run pandoc on the .docx file at *path* and return its HTML as str.

    Raises subprocess.CalledProcessError if pandoc exits with an error.
    """
    return subprocess.check_output(['pandoc',
                                    '-f', 'docx',
                                    '-t', 'html',
                                    path]).decode('utf8')


def fix_header_ids(txt):
    """Return *txt* with header classes, links and header ids fixed.

    * 'ListParagraph' becomes 'capt' (the class used for section headers).
    * The 'file:///' prefix is stripped from links.
    * The id of every numbered header is replaced with its section number
      with the dots removed ('1.2.' -> '12'), both in the ``id="..."``
      attribute and in ``#...`` link targets.
    """
    # Add capt class to section headers
    txt = txt.replace('ListParagraph', 'capt')
    # Fix links
    txt = txt.replace('file:///', '')
    # Fix ids for hyperlinks.  The lines are a snapshot taken before any id
    # is rewritten and are walked bottom-up, as the script always did.
    for line in reversed(txt.splitlines()):
        m = HEADER_ID_P.search(line)
        if m:
            old_id = m.group(1)
            new_id = m.group(3).replace('.', '')
            # Only touch attribute contexts: a bare str.replace(old_id, ...)
            # would also rewrite body text that happens to contain the id.
            txt = txt.replace('id="' + old_id + '"', 'id="' + new_id + '"')
            txt = txt.replace('#' + old_id + '"', '#' + new_id + '"')
    return txt


def p2li(x):
    """Turn a paragraph into an open list entry (no closing tag)."""
    return x.replace('<p>', '<li>').replace('</p>', '')


def _close_levels(lines_out, current_level, target_level):
    """Close open toc levels in *lines_out* down to *target_level*.

    Mutates *lines_out* in place and returns the new current level.
    """
    while target_level < current_level:
        if lines_out[-1].endswith('<ul>'):
            # The previous entry has no children: drop the empty sub-list
            # opener and close the entry on its own line.
            lines_out.pop()
            lines_out[-1] += '</li>'
        else:
            lines_out.append('\t' * (current_level - 1) * 2 + '</ul>')
            lines_out.append('\t' * ((current_level - 1) * 2 - 1) + '</li>')
        current_level -= 1
    return current_level


def build_toc(lines_in):
    """Return a copy of *lines_in* with the toc turned into nested lists.

    The toc is the first run of consecutive lines containing 'href="#'.
    The first entry opens ``<ul class="toc">``; the nesting level of each
    following entry is the number of dot-separated parts of its section
    number.  Lines before and after the toc are copied unchanged.  Every
    level still open at the end of the toc is closed.

    Raises AttributeError if a toc line after the first carries no section
    number.
    """
    lines_out = []
    total = len(lines_in)
    i = 0
    # Copy everything that precedes the toc.
    while i < total and TOC_MARK not in lines_in[i]:
        lines_out.append(lines_in[i])
        i += 1
    if i == total:
        return lines_out

    lines_out.append('<ul class="toc">')
    lines_out.append('\t' + p2li(lines_in[i]))
    lines_out.append('\t\t<ul>')
    i += 1

    # Make nested ul's
    current_level = 2
    while i < total and TOC_MARK in lines_in[i]:
        line = lines_in[i]
        section_level = len(SECTION_P.search(line).group(1).split('.'))
        current_level = _close_levels(lines_out, current_level, section_level)
        lines_out.append('\t' * (current_level * 2 - 1) + p2li(line))
        lines_out.append('\t' * (current_level * 2) + '<ul>')
        current_level += 1
        i += 1

    # Close whatever is still open, however deep the last entry was.
    _close_levels(lines_out, current_level, 1)
    lines_out.append('</ul>')

    lines_out.extend(lines_in[i:])
    return lines_out


def mark_lists_and_examples(lines_in):
    """Return *lines_in* with bullet paragraphs and examples marked up.

    Consecutive paragraphs starting with '<p>• ' become the items of one
    ``<ul>``; paragraphs starting with '(N)' get ``class="example"``.
    'class="biblio"' still has to be added manually later!
    """
    lines_out = []
    inside = False
    for line in lines_in:
        if line.startswith('<p>• '):
            if not inside:
                lines_out.append('<ul>')
                inside = True
            lines_out.append(
                line.replace('<p>• ', '<li>').replace('</p>', '</li>'))
            continue
        # Any non-bullet line ends the current list, examples included.
        if inside:
            lines_out.append('</ul>')
            inside = False
        if EXAMPLE_P.match(line):
            lines_out.append(line.replace('<p>', '<p class="example">'))
        else:
            lines_out.append(line)
    # The document may end while a list is still open.
    if inside:
        lines_out.append('</ul>')
    return lines_out


def wrap_with_style(txt, style_header):
    """Return *txt* inside the payload div, preceded by *style_header*."""
    chunks = [style_header,
              '<div id="payload">',
              txt,
              '</div>']
    return '\n'.join(chunks)


def main():
    """Convert SOURCE_DOCX and write the post-processed page to OUTPUT_HTML."""
    txt = fix_header_ids(convert_docx(SOURCE_DOCX))
    lines = build_toc(txt.splitlines())
    lines = mark_lists_and_examples(lines)
    txt = '\n'.join(lines)
    # Add <style>
    with open(STYLE_HEADER, 'r', encoding='utf-8') as inp:
        style_header = inp.read()
    with open(OUTPUT_HTML, 'w', encoding='utf-8') as out:
        out.write(wrap_with_style(txt, style_header))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment