Last active
August 29, 2015 14:06
-
-
Save johnbeard/1574e0f4e451c9869e5e to your computer and use it in GitHub Desktop.
ODT to MD processor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import hashlib
import io
import md5  # Python 2 only; hashlib is the maintained replacement
import os
import re
import subprocess
import sys
import zipfile
def get_image_data_from_odt(odt, zipdir="/tmp/ktmp"):
    """
    Unpack an ODT file and index its embedded pictures by hash.

    The ODT (a zip archive) is extracted into ``zipdir``.  Every regular
    file in the extracted ``Pictures`` directory is base64-encoded, and the
    MD5 hex digest of that base64 text becomes its key, so the same digest
    can be recomputed later from a base64 blob found in converted text.

    odt    -- path to the ODT file to unpack
    zipdir -- directory to extract into (default "/tmp/ktmp")

    Returns a dict mapping MD5 hex digest -> path of the extracted image.
    An archive without a ``Pictures`` directory yields an empty dict.
    """
    # Open the file that was passed in rather than a module-level global,
    # and let the with-block release the archive handle.
    with zipfile.ZipFile(odt) as zf:
        zf.extractall(zipdir)
    print("Unzipped %s" % odt)

    image_dir = os.path.join(zipdir, "Pictures")
    print("Collecting base64 of images in: %s" % image_dir)

    # A document with no embedded pictures has no Pictures directory.
    names = sorted(os.listdir(image_dir)) if os.path.isdir(image_dir) else []

    images = {}
    for image in names:
        path = os.path.join(image_dir, image)
        if not os.path.isfile(path):
            continue

        print("\t%s" % image)

        # Images are binary: text mode could corrupt or fail to decode them.
        with open(path, "rb") as image_file:
            b64 = base64.b64encode(image_file.read())

        hsh = hashlib.md5(b64).hexdigest()
        print("\t\t%s" % hsh)
        images[hsh] = path

    print("Base64 hash -> image path pairs collected")
    return images
def generate_md_from_odt(odt):
    """
    Convert an ODT file to markdown through an intermediate HTML file.

    Headless LibreOffice converts the ODT to HTML in /tmp, pandoc turns
    that HTML into markdown, and the HTML intermediate is then removed.

    Takes path to ODT file
    Returns path to markdown file (the ODT path with its extension
    replaced by ".md")

    Raises subprocess.CalledProcessError if LibreOffice or pandoc exits
    with a non-zero status.
    """
    print("Converting to HTML")

    root, _ext = os.path.splitext(odt)
    print(root)

    tmpdir = "/tmp"

    # The conversion runs with its own user-installation directory.
    # NOTE(review): presumably so it does not clash with a LibreOffice
    # instance that is already running -- confirm.
    conversion_profile = os.path.join(tmpdir, "LibO_Conversion")

    cmd = ["libreoffice",
           "-env:UserInstallation=file://%s" % conversion_profile,
           "--headless",
           "--convert-to", "html",
           "--outdir", tmpdir,
           odt]

    print(cmd)
    # Fail here on a conversion error instead of later on a missing file.
    subprocess.check_call(cmd)

    # The HTML is written into --outdir under the ODT's base name, which is
    # not necessarily the directory the ODT itself lives in.
    html = os.path.join(tmpdir, os.path.basename(root) + ".html")
    md = root + ".md"

    print("Converting from HTML to markdown")
    cmd = ["pandoc", html, "-t", "markdown", "-o", md]
    subprocess.check_call(cmd)

    print("Removing HTML intermediate")
    os.remove(html)

    return md
def sub_images(md, images):
    """
    Replace base64-encoded inline PNG images with links to real images.

    The markdown file is rewritten in place.  Every
    ``(data:image/png;base64,...)`` target whose base64 text hashes (MD5)
    to a key of ``images`` is replaced by ``(<image path>)``; blobs with no
    matching key are left exactly as they were.

    md     -- path to the markdown file to rewrite (read/written as UTF-8)
    images -- dict mapping MD5 hex digest of base64 text -> image path

    Warning: may contain ugly hacks
    """

    def b64_replace(match):
        """
        Replace a base64 blob with a matching image if found
        """
        b64 = match.group(1)
        # Base64 text is pure ASCII; hash its bytes, not the text object.
        hsh = hashlib.md5(b64.encode("ascii")).hexdigest()

        if hsh in images:
            print("\tImage %s -> %s" % (hsh, images[hsh]))
            return "(%s)" % images[hsh]

        # re.sub requires a string from the callback, so hand back the
        # original text unchanged rather than the match object.
        return match.group(0)

    print("Replacing images in %s" % md)

    with io.open(md, "r", encoding="utf-8") as inf:
        original = inf.read()

    mdtext = re.sub(r"\(data:image/png;base64,([A-Za-z0-9+/]+=*)\)",
                    b64_replace, original)

    with io.open(md, "w", encoding="utf-8") as outf:
        outf.write(mdtext)
if __name__ == "__main__":
    # ODT to convert: the first command-line argument when one is given,
    # otherwise the original default document.  INFILE stays a module-level
    # name so that code referring to it keeps working.
    INFILE = sys.argv[1] if len(sys.argv) > 1 else "/tmp/kicad_EN.odt"

    # 1. unpack the ODT and hash its embedded pictures
    odtimg = get_image_data_from_odt(INFILE)
    # 2. convert ODT -> HTML -> markdown
    md_path = generate_md_from_odt(INFILE)
    # 3. swap inline base64 blobs in the markdown for the unpacked images
    sub_images(md_path, odtimg)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment