dsanson/odt2pandoc.py

## odt2pandoc.py
#!/usr/bin/env python

"""
ODT2PANDOC
==========

ODT2PANDOC is a slight variant of ODT2TXT. Right now, the only differences are that it generates ATX style headers and uses *asterisks* for italics, neither of which is in any way specific to pandoc's extended markdown.

ODT2TXT
=======

ODT2TXT converts files in Open Document Text format (ODT) into
Markdown-formatted plain text.

Written by [Yuri Takhteyev](http://www.freewisdom.org).

Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
Contact: yuri [at] freewisdom.org

License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD

Version: 0.1 (April 7, 2006)

"""


import sys, zipfile, xml.dom.minidom

IGNORED_TAGS = ["office:annotation"]

FOOTNOTE_STYLES = ["Footnote"]


class TextProps:
    """Formatting flags gathered for one text style.

    Public fields:
      italic -- switched on by setItalic("italic"); starts False.
      bold   -- switched on by setBold("bold"); starts False.
      fixed  -- whatever was last given to setFixed(); starts False.
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """Switch the italic flag on when *value* equals "italic".

        Any other value is ignored, so this never clears the flag.
        """
        if value == "italic":
            self.italic = True

    def setBold(self, value):
        """Switch the bold flag on when *value* equals "bold".

        Any other value is ignored, so this never clears the flag.
        """
        if value == "bold":
            self.bold = True

    def setFixed(self, value):
        """Store *value* as the fixed flag."""
        self.fixed = value

    def __str__(self):
        """Return a debugging summary, e.g. "[i=False, b=False, fixed=False]"."""
        # The bold flag is labelled "b"; the previous "h=i%s" label was a typo.
        return "[i=%s, b=%s, fixed=%s]" % (self.italic, self.bold, self.fixed)

class ParagraphProps:
    """Rendering flags recorded for one paragraph style."""

    def __init__(self):
        # Every flag starts in its "off" state.
        self.headingLevel = 0
        self.title = False
        self.code = False
        self.blockquote = False
        self.indented = 0

    def setHeading(self, level):
        """Store *level* as the heading level."""
        self.headingLevel = level

    def setTitle(self, value):
        """Store *value* as the title flag."""
        self.title = value

    def setCode(self, value):
        """Store *value* as the code flag."""
        self.code = value

    def setIndented(self, value):
        """Store *value* as the indented flag."""
        self.indented = value

    def __str__(self):
        """Return a debugging summary of blockquote, heading level and code."""
        summary = (self.blockquote, self.headingLevel, self.code)
        return "[bq=%s, h=%d, code=%s]" % summary


class ListProperties:
    """Properties recorded for one list style."""

    def __init__(self):
        # Lists are treated as bulleted until told otherwise.
        self.ordered = False

    def setOrdered(self, value):
        """Store *value* as the ordered flag."""
        self.ordered = value


class OpenDocumentTextFile :


    def __init__(self, filepath):
        """Create empty conversion state and then load *filepath*.

        *filepath* is handed unchanged to load().
        """
        # Style tables keyed by style name; "Standard" starts with defaults.
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.textStyles = {"Standard": TextProps()}
        self.listStyles = {}

        # Names of the fonts declared with a fixed pitch.
        self.fixedFonts = []

        # (citation, text) pairs collected while rendering.
        self.footnotes = []
        self.footnoteCounter = 0

        # Becomes 1 once a title paragraph has been rendered.
        self.hasTitle = 0

        self.load(filepath)


    def processFontDeclarations (self, fontDecl) :
        """ Extracts necessary font information from a font-declaration
            element.
            """
        for fontFace in fontDecl.getElementsByTagName("style:font-face") :
            if fontFace.getAttribute("style:font-pitch") == "fixed" :
                self.fixedFonts.append(fontFace.getAttribute("style:name"))


    def extractTextProperties(self, style, parent=None):
        """Build the TextProps described by a style element.

        style  -- DOM element; its first ``style:text-properties``
                  descendant (if any) supplies the font attributes.
        parent -- name of the parent text style, or None/"" for none.

        Returns a new TextProps. When *parent* names a style already in
        self.textStyles, the result starts from a copy of that style's
        flags; attributes present on this style then override them.
        """
        textProps = TextProps()

        # Inherit from the parent by copying its flags. The flags are copied
        # rather than the object shared, so the parent entry is never mutated.
        # (This used to assign the parent to a misspelled local and was a no-op.)
        if parent:
            parentProps = self.textStyles.get(parent)
            if parentProps is not None:
                textProps.italic = parentProps.italic
                textProps.bold = parentProps.bold
                textProps.fixed = parentProps.fixed

        textPropEls = style.getElementsByTagName("style:text-properties")
        if not textPropEls:
            return textProps
        textPropEl = textPropEls[0]

        # An explicit attribute wins over the inherited value; a missing
        # attribute (empty string) leaves the inherited value alone.
        italic = textPropEl.getAttribute("fo:font-style")
        if italic:
            textProps.italic = (italic == "italic")

        bold = textPropEl.getAttribute("fo:font-weight")
        if bold:
            textProps.bold = (bold == "bold")

        fontName = textPropEl.getAttribute("style:font-name")
        if fontName in self.fixedFonts:
            textProps.setFixed(True)
        elif fontName:
            # A named, non-fixed font overrides an inherited fixed flag.
            textProps.setFixed(False)

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """Build the ParagraphProps described by a style element.

        style  -- DOM element for a paragraph style.
        parent -- accepted for symmetry with extractTextProperties.
                  NOTE(review): it is currently unused, so paragraph styles
                  do not inherit anything from their parent style.

        Returns a new ParagraphProps with:
          * headingLevel set when the style is named "Heading_20_<number>";
          * title set when the style is named "Title";
          * indented set when ``fo:margin-left`` is larger than 0.01;
          * code set when the style's own font is a fixed-pitch font.
        """
        paraProps = ParagraphProps()

        name = style.getAttribute("style:name")

        headingPrefix = "Heading_20_"
        if name.startswith(headingPrefix):
            try:
                paraProps.setHeading(int(name[len(headingPrefix):]))
            except ValueError:
                # Suffix is not a number: leave it as a non-heading style.
                pass

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEls = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEls:
            leftMargin = paraPropEls[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the last two characters before parsing
                    # (presumably a two-letter unit such as "in" or "cm").
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    # Unparseable margin: treat the style as not indented.
                    pass

        if self.extractTextProperties(style).fixed:
            paraProps.setCode(True)

        return paraProps


    def processStyles(self, styleElements) :
        """ Runs through "style" elements extracting necessary information.
            """

        for style in styleElements :

            name = style.getAttribute("style:name")

            if name == "Standard" : continue

            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")

            if family == "text" :
                self.textStyles[name] = self.extractTextProperties(style,
                                                                   parent)

            elif family == "paragraph":
                self.paragraphStyles[name] = (
                                 self.extractParagraphProperties(style,
                                                                 parent))
    def processListStyles(self, listStyleElements):
        """Register a ListProperties for every element in *listStyleElements*.

        A list style is marked ordered when its first child *element* is a
        ``text:list-level-style-number``. Entries are stored in
        self.listStyles under the element's ``style:name``.
        """
        for style in listStyleElements:
            name = style.getAttribute("style:name")
            prop = ListProperties()

            # Skip non-element children (e.g. whitespace text nodes), which
            # have no tagName, and decide on the first real element only.
            for child in style.childNodes:
                if child.nodeType == child.ELEMENT_NODE:
                    if child.tagName == "text:list-level-style-number":
                        prop.setOrdered(True)
                    break

            self.listStyles[name] = prop


    def load(self, filepath) :
        """ Loads an ODT file. """

        zip = zipfile.ZipFile(filepath)

        styles_doc = xml.dom.minidom.parseString(zip.read("styles.xml"))
        self.processFontDeclarations(styles_doc.getElementsByTagName(
            "office:font-face-decls")[0])
        self.processStyles(styles_doc.getElementsByTagName("style:style"))
        self.processListStyles(styles_doc.getElementsByTagName(
            "text:list-style"))

        self.content = xml.dom.minidom.parseString(zip.read("content.xml"))
        self.processFontDeclarations(self.content.getElementsByTagName(
            "office:font-face-decls")[0])
        self.processStyles(self.content.getElementsByTagName("style:style"))
        self.processListStyles(self.content.getElementsByTagName(
            "text:list-style"))

    def compressCodeBlocks(self, text) :
        """ Removes extra blank lines from code blocks. """

        lines = text.split("\n")
        buffer = ""
        numLines = len(lines)
        for i in range(numLines) :

            if (lines[i].strip() or i == numLines-1  or i == 0 or
                not ( lines[i-1].startswith("    ")
                      and lines[i+1].startswith("    ") ) ):
                buffer += "\n" + lines[i]

        return buffer


    def listToString(self, listElement):
        """Render a ``text:list`` element as a Markdown list.

        The list style named by ``text:style-name`` decides between
        numbered ("1. ") and bulleted ("* ") markers; unknown styles are
        treated as bulleted. Only the first element inside each item is
        rendered (via paragraphToString). Each item is followed by a blank
        line.

        Returns the rendered list as a string.
        """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName, ListProperties())

        rendered = []
        position = 0
        for item in listElement.childNodes:
            # First element inside the item; text nodes and empty items
            # have none and are skipped instead of raising.
            firstChild = None
            for child in item.childNodes:
                if child.nodeType == child.ELEMENT_NODE:
                    firstChild = child
                    break
            if firstChild is None:
                continue

            position += 1
            if props.ordered:
                # Always keep a space after the dot, including for items
                # numbered 10 and above.
                marker = "%d. " % position
            else:
                marker = "* "

            # Continuation lines are indented to line up under the text.
            body = self.paragraphToString(firstChild, indent=len(marker))
            rendered.append(marker + body + "\n\n")

        return "".join(rendered)

    def toString (self) :
        """ Converts the document to a string. """
        body = self.content.getElementsByTagName("office:body")[0]
        text = self.content.getElementsByTagName("office:text")[0]

        buffer = u""


        paragraphs = [el for el in text.childNodes
                      if el.tagName in ["text:p", "text:h",
                                        "text:list"]]

        for paragraph in paragraphs :
            if paragraph.tagName == "text:list" :
                text = self.listToString(paragraph)
            else :
                text = self.paragraphToString(paragraph)
            if text :
                buffer += text + "\n\n"

        if self.footnotes :

            buffer += "--------\n\n"
            for cite, body in self.footnotes :
                buffer += "[^%s]: %s\n\n" % (cite, body)


        return self.compressCodeBlocks(buffer)


    def textToString(self, element) :

        buffer = u""

        for node in element.childNodes :

            if node.nodeType == xml.dom.Node.TEXT_NODE :
                buffer += node.nodeValue

            elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
                tag = node.tagName

                if tag == "text:span" :

                    text = self.textToString(node)

                    if not text.strip() :
                        return ""  # don't apply styles to white space

                    styleName = node.getAttribute("text:style-name")
                    style = self.textStyles.get(styleName, None)

                    #print styleName, str(style)

                    if style.fixed :
                        buffer += "`" + text + "`"
                        continue

                    if style :
                        if style.italic and style.bold :
                            mark = "***"
                        elif style.italic :
                            mark = "*"
                        elif style.bold :
                            mark = "**"
                        else :
                            mark = ""
                    else :
                        mark = "<" + styleName + ">"

                    buffer += "%s%s%s" % (mark, text, mark)

                elif tag == "text:note" :
                    cite = (node.getElementsByTagName("text:note-citation")[0]
                                .childNodes[0].nodeValue)

                    body = (node.getElementsByTagName("text:note-body")[0]
                                .childNodes[0])

                    self.footnotes.append((cite, self.textToString(body)))

                    buffer += "[^%s]" % cite

                elif tag in IGNORED_TAGS :
                    pass

                elif tag == "text:s" :
                    try :
                        num = int(node.getAttribute("text:c"))
                        buffer += " "*num
                    except :
                        buffer += " "

                elif tag == "text:tab" :
                    buffer += "    "


                elif tag == "text:a" :

                    text = self.textToString(node)
                    link = node.getAttribute("xlink:href")
                    buffer += "[%s](%s)" % (text, link)

        return buffer

    def paragraphToString(self, paragraph, indent = 0) :


        style_name = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(style_name) #, None)
        text = self.textToString(paragraph)

        #print style_name

        if paraProps and not paraProps.code :
            text = text.strip()

        if paraProps.title :
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))

        if paraProps.headingLevel :

            level = paraProps.headingLevel
            if self.hasTitle : level += 1

            return "#" * level + " " + text

        elif paraProps.code :
            lines = ["    %s" % line for line in text.split("\n")]
            return "\n".join(lines)

        if paraProps.indented :
            return self.wrapParagraph(text, indent = indent, blockquote=True)

        else :
            return self.wrapParagraph(text, indent = indent)


    def wrapParagraph(self, text, indent = 0, blockquote=False) :

        counter = 0
        buffer = ""
        LIMIT = 50

        if blockquote :
            buffer += "> "

        for token in text.split() :

            if counter > LIMIT - indent :
                buffer += "\n" + " "*indent
                if blockquote :
                    buffer += "> "
                counter = 0

            buffer += token + " "
            counter += len(token)

        return buffer


if __name__ == "__main__":

    # Usage: odt2pandoc.py FILE.odt
    # Converts FILE.odt and writes the UTF-8 encoded Markdown to stdout.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: %s FILE.odt\n" % sys.argv[0])
        sys.exit(2)

    odt = OpenDocumentTextFile(sys.argv[1])
    out_utf8 = odt.toString().encode("utf-8")

    # Text-mode stdout on Python 3 rejects bytes; write to the underlying
    # binary buffer when there is one, and to stdout itself otherwise.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(out_utf8)
	#!/usr/bin/env python

	"""
	ODT2PANDOC
	==========

	ODT2PANDOC is a slight variant of ODT2TXT. Right now, the only differences are that it generates ATX style headers and uses asterisks for italics, neither of which is in any way specific to pandoc's extended markdown.

	ODT2TXT
	=======

	ODT2TXT converts files in Open Document Text format (ODT) into
	Markdown-formatted plain text.

	Written by [Yuri Takhteyev](http://www.freewisdom.org).

	Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
	Contact: yuri [at] freewisdom.org

	License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD

	Version: 0.1 (April 7, 2006)

	"""



	import sys, zipfile, xml.dom.minidom

	IGNORED_TAGS = ["office:annotation"]

	FOOTNOTE_STYLES = ["Footnote"]


	class TextProps:
	    """Formatting flags gathered for one text style.

	    Public fields:
	      italic -- switched on by setItalic("italic"); starts False.
	      bold   -- switched on by setBold("bold"); starts False.
	      fixed  -- whatever was last given to setFixed(); starts False.
	    """

	    def __init__(self):
	        self.italic = False
	        self.bold = False
	        self.fixed = False

	    def setItalic(self, value):
	        """Switch the italic flag on when *value* equals "italic".

	        Any other value is ignored, so this never clears the flag.
	        """
	        if value == "italic":
	            self.italic = True

	    def setBold(self, value):
	        """Switch the bold flag on when *value* equals "bold".

	        Any other value is ignored, so this never clears the flag.
	        """
	        if value == "bold":
	            self.bold = True

	    def setFixed(self, value):
	        """Store *value* as the fixed flag."""
	        self.fixed = value

	    def __str__(self):
	        """Return a debugging summary, e.g. "[i=False, b=False, fixed=False]"."""
	        # The bold flag is labelled "b"; the previous "h=i%s" label was a typo.
	        return "[i=%s, b=%s, fixed=%s]" % (self.italic, self.bold, self.fixed)

	class ParagraphProps:
	    """Rendering flags recorded for one paragraph style."""

	    def __init__(self):
	        # Every flag starts in its "off" state.
	        self.blockquote = False
	        self.headingLevel = 0
	        self.code = False
	        self.title = False
	        self.indented = 0

	    def setIndented(self, value):
	        """Store *value* as the indented flag."""
	        self.indented = value

	    def setHeading(self, level):
	        """Store *level* as the heading level."""
	        self.headingLevel = level

	    def setTitle(self, value):
	        """Store *value* as the title flag."""
	        self.title = value

	    def setCode(self, value):
	        """Store *value* as the code flag."""
	        self.code = value

	    def __str__(self):
	        """Return a debugging summary of blockquote, heading level and code."""
	        return "[bq=%s, h=%d, code=%s]" % (self.blockquote,
	                                           self.headingLevel,
	                                           self.code)


	class ListProperties:
	    """Properties recorded for one list style."""

	    def __init__(self):
	        # Lists are treated as bulleted until told otherwise.
	        self.ordered = False

	    def setOrdered(self, value):
	        """Store *value* as the ordered flag."""
	        self.ordered = value



	class OpenDocumentTextFile:
	    """An ODT document that can be rendered as Markdown via toString()."""

	    def __init__(self, filepath):
	        """Create empty conversion state and then load *filepath*."""
	        # (citation, text) pairs collected while rendering.
	        self.footnotes = []
	        self.footnoteCounter = 0
	        # Style tables keyed by style name; "Standard" starts with defaults.
	        self.textStyles = {"Standard": TextProps()}
	        self.paragraphStyles = {"Standard": ParagraphProps()}
	        self.listStyles = {}
	        # Names of the fonts declared with a fixed pitch.
	        self.fixedFonts = []
	        # Becomes 1 once a title paragraph has been rendered.
	        self.hasTitle = 0

	        self.load(filepath)

	    def processFontDeclarations(self, fontDecl):
	        """Remember the fixed-pitch fonts declared under *fontDecl*.

	        The ``style:name`` of every ``style:font-face`` descendant whose
	        ``style:font-pitch`` is "fixed" is appended to self.fixedFonts.
	        """
	        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
	            if fontFace.getAttribute("style:font-pitch") == "fixed":
	                self.fixedFonts.append(fontFace.getAttribute("style:name"))

	    def extractTextProperties(self, style, parent=None):
	        """Build the TextProps described by a style element.

	        style  -- DOM element; its first ``style:text-properties``
	                  descendant (if any) supplies the font attributes.
	        parent -- name of the parent text style, or None/"" for none.

	        Returns a new TextProps. When *parent* names a style already in
	        self.textStyles, the result starts from a copy of that style's
	        flags; attributes present on this style then override them.
	        """
	        textProps = TextProps()

	        # Inherit by copying the parent's flags, so the parent entry is
	        # never mutated. (This used to assign to a misspelled local.)
	        if parent:
	            parentProps = self.textStyles.get(parent)
	            if parentProps is not None:
	                textProps.italic = parentProps.italic
	                textProps.bold = parentProps.bold
	                textProps.fixed = parentProps.fixed

	        textPropEls = style.getElementsByTagName("style:text-properties")
	        if not textPropEls:
	            return textProps
	        textPropEl = textPropEls[0]

	        # An explicit attribute wins over the inherited value; a missing
	        # attribute (empty string) leaves the inherited value alone.
	        italic = textPropEl.getAttribute("fo:font-style")
	        if italic:
	            textProps.italic = (italic == "italic")

	        bold = textPropEl.getAttribute("fo:font-weight")
	        if bold:
	            textProps.bold = (bold == "bold")

	        fontName = textPropEl.getAttribute("style:font-name")
	        if fontName in self.fixedFonts:
	            textProps.setFixed(True)
	        elif fontName:
	            textProps.setFixed(False)

	        return textProps

	    def extractParagraphProperties(self, style, parent=None):
	        """Build the ParagraphProps described by a style element.

	        NOTE(review): *parent* is accepted but unused, so paragraph
	        styles do not inherit anything from their parent style.

	        The result has headingLevel set for styles named
	        "Heading_20_<number>", title set for "Title", indented set when
	        ``fo:margin-left`` exceeds 0.01, and code set when the style's
	        own font is a fixed-pitch font.
	        """
	        paraProps = ParagraphProps()

	        name = style.getAttribute("style:name")

	        headingPrefix = "Heading_20_"
	        if name.startswith(headingPrefix):
	            try:
	                paraProps.setHeading(int(name[len(headingPrefix):]))
	            except ValueError:
	                # Suffix is not a number: not a numbered heading.
	                pass

	        if name == "Title":
	            paraProps.setTitle(True)

	        paraPropEls = style.getElementsByTagName("style:paragraph-properties")
	        if paraPropEls:
	            leftMargin = paraPropEls[0].getAttribute("fo:margin-left")
	            if leftMargin:
	                try:
	                    # Drop the last two characters before parsing
	                    # (presumably a two-letter unit such as "in").
	                    if float(leftMargin[:-2]) > 0.01:
	                        paraProps.setIndented(True)
	                except ValueError:
	                    pass

	        if self.extractTextProperties(style).fixed:
	            paraProps.setCode(True)

	        return paraProps

	    def processStyles(self, styleElements):
	        """Register every text and paragraph style in *styleElements*.

	        Styles named "Standard" are skipped; other families are ignored.
	        """
	        for style in styleElements:

	            name = style.getAttribute("style:name")

	            if name == "Standard":
	                continue

	            family = style.getAttribute("style:family")
	            parent = style.getAttribute("style:parent-style-name")

	            if family == "text":
	                self.textStyles[name] = self.extractTextProperties(
	                    style, parent)

	            elif family == "paragraph":
	                self.paragraphStyles[name] = self.extractParagraphProperties(
	                    style, parent)

	    def processListStyles(self, listStyleElements):
	        """Register a ListProperties for every list style element.

	        A style is marked ordered when its first child *element* is a
	        ``text:list-level-style-number``.
	        """
	        for style in listStyleElements:
	            name = style.getAttribute("style:name")
	            prop = ListProperties()

	            # Skip non-element children, which have no tagName.
	            for child in style.childNodes:
	                if child.nodeType == child.ELEMENT_NODE:
	                    if child.tagName == "text:list-level-style-number":
	                        prop.setOrdered(True)
	                    break

	            self.listStyles[name] = prop

	    def load(self, filepath):
	        """Load an ODT file and collect its font and style information.

	        Reads "styles.xml" and "content.xml" from the archive at
	        *filepath*, stores the parsed content document in self.content,
	        and processes fonts, styles and list styles of both documents.
	        """
	        archive = zipfile.ZipFile(filepath)
	        try:
	            stylesXml = archive.read("styles.xml")
	            contentXml = archive.read("content.xml")
	        finally:
	            # Release the archive even if a member is missing.
	            archive.close()

	        stylesDoc = xml.dom.minidom.parseString(stylesXml)
	        self.content = xml.dom.minidom.parseString(contentXml)

	        for doc in (stylesDoc, self.content):
	            # Iterating tolerates a missing office:font-face-decls.
	            for fontDecl in doc.getElementsByTagName("office:font-face-decls"):
	                self.processFontDeclarations(fontDecl)
	            self.processStyles(doc.getElementsByTagName("style:style"))
	            self.processListStyles(doc.getElementsByTagName("text:list-style"))

	    def compressCodeBlocks(self, text):
	        """Remove blank lines that sit between two code-block lines.

	        A line is dropped only when it is blank, is neither first nor
	        last, and both neighbours start with four spaces. Every kept
	        line is emitted with a newline in front of it.
	        """
	        lines = text.split("\n")
	        lastIndex = len(lines) - 1
	        kept = []

	        for i, line in enumerate(lines):
	            squeezable = (not line.strip()
	                          and 0 < i < lastIndex
	                          and lines[i - 1].startswith("    ")
	                          and lines[i + 1].startswith("    "))
	            if not squeezable:
	                kept.append("\n" + line)

	        return "".join(kept)

	    def listToString(self, listElement):
	        """Render a ``text:list`` element as a Markdown list.

	        The list style decides between "1. " and "* " markers; unknown
	        styles are bulleted. Only the first element inside each item is
	        rendered. Each item is followed by a blank line.
	        """
	        styleName = listElement.getAttribute("text:style-name")
	        props = self.listStyles.get(styleName, ListProperties())

	        rendered = []
	        position = 0
	        for item in listElement.childNodes:
	            # Text nodes and empty items have no element child: skip.
	            firstChild = None
	            for child in item.childNodes:
	                if child.nodeType == child.ELEMENT_NODE:
	                    firstChild = child
	                    break
	            if firstChild is None:
	                continue

	            position += 1
	            if props.ordered:
	                # Always keep a space after the dot (also for 10+).
	                marker = "%d. " % position
	            else:
	                marker = "* "

	            body = self.paragraphToString(firstChild, indent=len(marker))
	            rendered.append(marker + body + "\n\n")

	        return "".join(rendered)

	    def toString(self):
	        """Convert the loaded document to a Markdown string.

	        Renders the ``text:p``, ``text:h`` and ``text:list`` children of
	        the first ``office:text`` element, appends collected footnotes
	        after a "--------" rule, and compresses code blocks.
	        """
	        root = self.content.getElementsByTagName("office:text")[0]
	        blockTags = ("text:p", "text:h", "text:list")

	        pieces = []
	        for node in root.childNodes:
	            # Text nodes have no tagName, so check the node type first.
	            if node.nodeType != node.ELEMENT_NODE:
	                continue
	            if node.tagName not in blockTags:
	                continue

	            if node.tagName == "text:list":
	                rendered = self.listToString(node)
	            else:
	                rendered = self.paragraphToString(node)
	            if rendered:
	                pieces.append(rendered + "\n\n")

	        # Footnotes are collected as a side effect of the loop above.
	        if self.footnotes:
	            pieces.append("--------\n\n")
	            for cite, noteText in self.footnotes:
	                pieces.append("[^%s]: %s\n\n" % (cite, noteText))

	        return self.compressCodeBlocks(u"".join(pieces))

	    def textToString(self, element):
	        """Flatten the inline content of *element* into Markdown text.

	        Text nodes are copied; ``text:span`` gets backticks or
	        ``*``/``**``/``***`` according to its style (none for white
	        space or unknown styles); ``text:note`` is recorded in
	        self.footnotes and replaced by ``[^citation]``; ``text:s`` and
	        ``text:tab`` become spaces; ``text:a`` becomes ``[text](href)``;
	        tags in IGNORED_TAGS and other elements add nothing.
	        """
	        buffer = u""

	        for node in element.childNodes:

	            if node.nodeType == xml.dom.Node.TEXT_NODE:
	                buffer += node.nodeValue
	                continue

	            if node.nodeType != xml.dom.Node.ELEMENT_NODE:
	                continue

	            tag = node.tagName

	            if tag == "text:span":
	                text = self.textToString(node)
	                styleName = node.getAttribute("text:style-name")
	                style = self.textStyles.get(styleName)

	                if style is None or not text.strip():
	                    # Keep the text but add no markup; returning early
	                    # here used to discard the rest of the element.
	                    buffer += text
	                elif style.fixed:
	                    buffer += "`" + text + "`"
	                else:
	                    if style.italic and style.bold:
	                        mark = "***"
	                    elif style.italic:
	                        mark = "*"
	                    elif style.bold:
	                        mark = "**"
	                    else:
	                        mark = ""
	                    buffer += "%s%s%s" % (mark, text, mark)

	            elif tag == "text:note":
	                cite = (node.getElementsByTagName("text:note-citation")[0]
	                            .childNodes[0].nodeValue)

	                # Only the first child of the note body is rendered.
	                body = (node.getElementsByTagName("text:note-body")[0]
	                            .childNodes[0])

	                self.footnotes.append((cite, self.textToString(body)))
	                buffer += "[^%s]" % cite

	            elif tag in IGNORED_TAGS:
	                pass

	            elif tag == "text:s":
	                try:
	                    buffer += " " * int(node.getAttribute("text:c"))
	                except ValueError:
	                    # Missing or non-numeric count: a single space.
	                    buffer += " "

	            elif tag == "text:tab":
	                buffer += "    "

	            elif tag == "text:a":
	                text = self.textToString(node)
	                link = node.getAttribute("xlink:href")
	                buffer += "[%s](%s)" % (text, link)

	        return buffer

	    def paragraphToString(self, paragraph, indent=0):
	        """Render one paragraph or heading element as Markdown.

	        The paragraph's ``text:style-name`` selects the flags: title
	        (underlined with "=", sets self.hasTitle), heading ("#" per
	        level, one deeper once a title was seen), code (four-space
	        indent), indented ("> " quote) or plain wrapped text. Unknown
	        styles are rendered as plain wrapped text.
	        """
	        styleName = paragraph.getAttribute("text:style-name")
	        paraProps = self.paragraphStyles.get(styleName)
	        text = self.textToString(paragraph)

	        if paraProps is None:
	            # No flags to consult: fall back to an ordinary paragraph.
	            return self.wrapParagraph(text, indent=indent)

	        if not paraProps.code:
	            text = text.strip()

	        if paraProps.title:
	            self.hasTitle = 1
	            return text + "\n" + ("=" * len(text))

	        if paraProps.headingLevel:
	            level = paraProps.headingLevel
	            if self.hasTitle:
	                level += 1
	            return "#" * level + " " + text

	        if paraProps.code:
	            return "\n".join(["    %s" % line for line in text.split("\n")])

	        return self.wrapParagraph(text, indent=indent,
	                                  blockquote=bool(paraProps.indented))

	    def wrapParagraph(self, text, indent=0, blockquote=False, limit=50):
	        """Re-flow *text* into lines of roughly *limit* characters.

	        A break is inserted before a word once the letters already on
	        the line (spaces not counted) exceed ``limit - indent``. Lines
	        after the first start with *indent* spaces; with *blockquote*
	        every line starts with "> ". Each word is followed by a space.
	        """
	        if blockquote:
	            quote = "> "
	        else:
	            quote = ""

	        pieces = [quote]
	        counter = 0

	        for token in text.split():
	            if counter > limit - indent:
	                pieces.append("\n" + " " * indent + quote)
	                counter = 0
	            pieces.append(token + " ")
	            counter += len(token)

	        return "".join(pieces)



	if __name__ == "__main__":

	    # Usage: odt2pandoc.py FILE.odt
	    # Converts FILE.odt and writes the UTF-8 encoded Markdown to stdout.
	    if len(sys.argv) < 2:
	        sys.stderr.write("usage: %s FILE.odt\n" % sys.argv[0])
	        sys.exit(2)

	    odt = OpenDocumentTextFile(sys.argv[1])
	    out_utf8 = odt.toString().encode("utf-8")

	    # Text-mode stdout on Python 3 rejects bytes; write to the underlying
	    # binary buffer when there is one, and to stdout itself otherwise.
	    stream = getattr(sys.stdout, "buffer", sys.stdout)
	    stream.write(out_utf8)