# johnbeard/odt_md.py

## odt_md.py

import base64
import hashlib
import md5
import os
import re
import subprocess
import sys
import zipfile

def get_image_data_from_odt(odt, zipdir="/tmp/ktmp"):
    """
    Unpack an ODT file and index the pictures it contains.

    Takes path to ODT file and, optionally, the directory to unpack it
    into (defaults to "/tmp/ktmp")

    Returns dict mapping the MD5 hex digest of each picture's base64
    encoding to the path of the unpacked picture file. The dict is empty
    when the unpacked document has no "Pictures" directory.
    """

    # Open the file we were handed, not a module-level global, so the
    # function works for any caller and any document
    with zipfile.ZipFile(odt) as zf:
        zf.extractall(zipdir)

    print("Unzipped %s" % odt)

    image_dir = os.path.join(zipdir, "Pictures")

    print("Collecting base64 of images in: %s" % image_dir)

    images = {}

    # A document without pictures has no "Pictures" directory to list
    names = sorted(os.listdir(image_dir)) if os.path.isdir(image_dir) else []

    for image in names:

        path = os.path.join(image_dir, image)

        # Only regular files can be read and hashed
        if not os.path.isfile(path):
            continue

        print("\t%s" % image)

        # Pictures are binary data: read them as bytes, and close the
        # handle as soon as the content is encoded
        with open(path, 'rb') as image_file:
            b64 = base64.b64encode(image_file.read())

        hsh = hashlib.md5(b64).hexdigest()

        print("\t\t%s" % hsh)

        images[hsh] = path

    print("Base64 hash -> image path pairs collected")

    return images

def generate_md_from_odt(odt):
    """
    Convert an ODT file to markdown: LibreOffice turns the ODT into HTML,
    then pandoc turns the HTML into markdown.

    Takes path to ODT file

    Returns path to markdown file (the ODT path with a ".md" extension)

    Raises subprocess.CalledProcessError if either converter exits with a
    non-zero status.
    """

    print("Converting to HTML")

    root = os.path.splitext(odt)[0]

    print(root)

    tmpdir = "/tmp"

    # The conversion runs with its own user profile directory under tmpdir
    conversion_profile = os.path.join(tmpdir, "LibO_Conversion")

    cmd = ["libreoffice",
            "-env:UserInstallation=file://%s" % conversion_profile,
            "--headless",
            "--convert-to", "html",
            "--outdir", tmpdir,
            odt]

    print(cmd)

    # check_call: stop here if the conversion failed instead of carrying
    # on with a missing HTML file
    subprocess.check_call(cmd)

    # The HTML is written into --outdir under the ODT's base name, which
    # is not necessarily the directory the ODT itself lives in
    html = os.path.join(tmpdir, os.path.basename(root) + ".html")
    md = root + ".md"

    print("Converting from HTML to markdown")

    cmd = ["pandoc", html, "-t", "markdown", "-o", md]

    subprocess.check_call(cmd)

    print("Removing HTML intermediate")

    os.remove(html)

    return md

def sub_images(md, images):
    """
    Replace base64-encoded inline images with links to the real image
    files.

    Takes path to markdown file, which is rewritten in place, and a dict
    mapping the MD5 hex digest of a base64 blob to the image path that
    should replace it.

    Blobs whose digest is not in the dict are left untouched. Any
    "data:image/<type>;base64," link is considered, not only PNG.

    Warning: may contain ugly hacks
    """

    def b64_replace(match):
        """
        Replace a base64 blob with a matching image if found
        """

        b64 = match.group(1)

        # The blob only holds base64 characters, so ASCII always works
        hsh = hashlib.md5(b64.encode("ascii")).hexdigest()

        if hsh in images:
            print("\tImage %s -> %s" % (hsh, images[hsh]))

            return "(%s)" % images[hsh]

        # re.sub requires a string back: return the matched text as-is
        return match.group(0)

    print("Replacing images in %s" % md)

    with open(md, 'r') as inf:
        mdtext = inf.read()

    mdtext = re.sub(
        r"\(data:image/[A-Za-z0-9.+-]+;base64,([A-Za-z0-9+/]+=*)\)",
        b64_replace, mdtext)

    with open(md, 'w') as outf:
        outf.write(mdtext)


if __name__ == "__main__":

    # ODT file to convert: the first command-line argument when one is
    # given, otherwise the default document. INFILE is deliberately kept
    # as a module-level name.
    INFILE = sys.argv[1] if len(sys.argv) > 1 else "/tmp/kicad_EN.odt"

    # 1. unpack the document and hash its pictures
    odtimg = get_image_data_from_odt(INFILE)

    # 2. convert the document to markdown
    md_path = generate_md_from_odt(INFILE)

    # 3. swap the inline base64 images for links to the unpacked pictures
    sub_images(md_path, odtimg)

	import os
	import re
	import zipfile
	import subprocess
	import base64
	import md5

	def get_image_data_from_odt(odt, zipdir="/tmp/ktmp"):
	    """
	    Unpack an ODT file and index the pictures it contains.

	    Takes path to ODT file and, optionally, the directory to unpack it
	    into (defaults to "/tmp/ktmp")

	    Returns dict mapping the MD5 hex digest of each picture's base64
	    encoding to the path of the unpacked picture file. The dict is empty
	    when the unpacked document has no "Pictures" directory.
	    """

	    # Open the file we were handed, not a module-level global, so the
	    # function works for any caller and any document
	    with zipfile.ZipFile(odt) as zf:
	        zf.extractall(zipdir)

	    print("Unzipped %s" % odt)

	    image_dir = os.path.join(zipdir, "Pictures")

	    print("Collecting base64 of images in: %s" % image_dir)

	    images = {}

	    # A document without pictures has no "Pictures" directory to list
	    names = sorted(os.listdir(image_dir)) if os.path.isdir(image_dir) else []

	    for image in names:

	        path = os.path.join(image_dir, image)

	        # Only regular files can be read and hashed
	        if not os.path.isfile(path):
	            continue

	        print("\t%s" % image)

	        # Pictures are binary data: read them as bytes, and close the
	        # handle as soon as the content is encoded
	        with open(path, 'rb') as image_file:
	            b64 = base64.b64encode(image_file.read())

	        hsh = hashlib.md5(b64).hexdigest()

	        print("\t\t%s" % hsh)

	        images[hsh] = path

	    print("Base64 hash -> image path pairs collected")

	    return images

	def generate_md_from_odt(odt):
	    """
	    Convert an ODT file to markdown: LibreOffice turns the ODT into HTML,
	    then pandoc turns the HTML into markdown.

	    Takes path to ODT file

	    Returns path to markdown file (the ODT path with a ".md" extension)

	    Raises subprocess.CalledProcessError if either converter exits with a
	    non-zero status.
	    """

	    print("Converting to HTML")

	    root = os.path.splitext(odt)[0]

	    print(root)

	    tmpdir = "/tmp"

	    # The conversion runs with its own user profile directory under tmpdir
	    conversion_profile = os.path.join(tmpdir, "LibO_Conversion")

	    cmd = ["libreoffice",
	            "-env:UserInstallation=file://%s" % conversion_profile,
	            "--headless",
	            "--convert-to", "html",
	            "--outdir", tmpdir,
	            odt]

	    print(cmd)

	    # check_call: stop here if the conversion failed instead of carrying
	    # on with a missing HTML file
	    subprocess.check_call(cmd)

	    # The HTML is written into --outdir under the ODT's base name, which
	    # is not necessarily the directory the ODT itself lives in
	    html = os.path.join(tmpdir, os.path.basename(root) + ".html")
	    md = root + ".md"

	    print("Converting from HTML to markdown")

	    cmd = ["pandoc", html, "-t", "markdown", "-o", md]

	    subprocess.check_call(cmd)

	    print("Removing HTML intermediate")

	    os.remove(html)

	    return md

	def sub_images(md, images):
	    """
	    Replace base64-encoded inline images with links to the real image
	    files.

	    Takes path to markdown file, which is rewritten in place, and a dict
	    mapping the MD5 hex digest of a base64 blob to the image path that
	    should replace it.

	    Blobs whose digest is not in the dict are left untouched. Any
	    "data:image/<type>;base64," link is considered, not only PNG.

	    Warning: may contain ugly hacks
	    """

	    def b64_replace(match):
	        """
	        Replace a base64 blob with a matching image if found
	        """

	        b64 = match.group(1)

	        # The blob only holds base64 characters, so ASCII always works
	        hsh = hashlib.md5(b64.encode("ascii")).hexdigest()

	        if hsh in images:
	            print("\tImage %s -> %s" % (hsh, images[hsh]))

	            return "(%s)" % images[hsh]

	        # re.sub requires a string back: return the matched text as-is
	        return match.group(0)

	    print("Replacing images in %s" % md)

	    with open(md, 'r') as inf:
	        mdtext = inf.read()

	    mdtext = re.sub(
	        r"\(data:image/[A-Za-z0-9.+-]+;base64,([A-Za-z0-9+/]+=*)\)",
	        b64_replace, mdtext)

	    with open(md, 'w') as outf:
	        outf.write(mdtext)


	if __name__ == "__main__":

	    # ODT file to convert: the first command-line argument when one is
	    # given, otherwise the default document. INFILE is deliberately kept
	    # as a module-level name.
	    INFILE = sys.argv[1] if len(sys.argv) > 1 else "/tmp/kicad_EN.odt"

	    # 1. unpack the document and hash its pictures
	    odtimg = get_image_data_from_odt(INFILE)

	    # 2. convert the document to markdown
	    md_path = generate_md_from_odt(INFILE)

	    # 3. swap the inline base64 images for links to the unpacked pictures
	    sub_images(md_path, odtimg)