Skip to content

Instantly share code, notes, and snippets.

@btbytes
Forked from dsanson/odt2pandoc.py
Created July 17, 2014 19:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save btbytes/9ae3de9994ef745b3fc2 to your computer and use it in GitHub Desktop.
Save btbytes/9ae3de9994ef745b3fc2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
ODT2PANDOC
==========
ODT2PANDOC is a slight variant of ODT2TXT. Right now, the only differences are that it generates ATX style headers and uses *asterisks* for italics, neither of which is in any way specific to pandoc's extended markdown.
ODT2TXT
=======
ODT2TXT converts files in Open Document Text format (ODT) into
Markdown-formatted plain text.
Written by [Yuri Takhteyev](http://www.freewisdom.org).
Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
Contact: yuri [at] freewisdom.org
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
Version: 0.1 (April 7, 2006)
"""
import sys, zipfile, xml.dom.minidom
# Element tag names that textToString skips: nothing is emitted for them.
IGNORED_TAGS = ["office:annotation"]
# Presumably the style names that mark footnote paragraphs; this list is not
# referenced anywhere in the visible code -- TODO confirm it is still needed.
FOOTNOTE_STYLES = ["Footnote"]
class TextProps:
    """ Holds properties for a text style.

    Attributes:
        italic -- True when the style renders text in italics.
        bold   -- True when the style renders text in bold.
        fixed  -- value last given to setFixed (False by default).
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """ Update the italic flag from a font-style value.

        "italic" turns the flag on and "normal" turns it off.  Any other
        value (including the empty string) leaves the flag untouched.
        """
        if value == "italic":
            self.italic = True
        elif value == "normal":
            # An explicit "normal" must be able to undo an earlier "italic".
            self.italic = False

    def setBold(self, value):
        """ Update the bold flag from a font-weight value.

        "bold" turns the flag on and "normal" turns it off.  Any other
        value (including the empty string) leaves the flag untouched.
        """
        if value == "bold":
            self.bold = True
        elif value == "normal":
            # An explicit "normal" must be able to undo an earlier "bold".
            self.bold = False

    def setFixed(self, value):
        """ Store ``value`` as the fixed-width flag. """
        self.fixed = value

    def __str__(self):
        """ Return a short debugging summary of the three flags. """
        # The bold flag used to be labelled "h=i", which was a typo.
        return "[i=%s, b=%s, fixed=%s]" % (str(self.italic),
                                           str(self.bold),
                                           str(self.fixed))
class ParagraphProps:
    """ Container for the flags that describe one paragraph style. """

    def __init__(self):
        # Every flag starts in its "plain paragraph" state.
        self.title = False
        self.code = False
        self.blockquote = False
        self.indented = 0
        self.headingLevel = 0

    def setIndented(self, value):
        """ Record whether the paragraph is indented. """
        self.indented = value

    def setHeading(self, level):
        """ Record the heading level (0 means "not a heading"). """
        self.headingLevel = level

    def setTitle(self, value):
        """ Record whether the paragraph is the document title. """
        self.title = value

    def setCode(self, value):
        """ Record whether the paragraph holds code. """
        self.code = value

    def __str__(self):
        """ Return a short debugging summary of the main flags. """
        summary = (str(self.blockquote), self.headingLevel, str(self.code))
        return "[bq=%s, h=%d, code=%s]" % summary
class ListProperties:
    """ Describes one list style; it only tracks ordered vs. unordered. """

    def __init__(self):
        # Lists are treated as unordered until told otherwise.
        self.setOrdered(False)

    def setOrdered(self, value):
        """ Store whether the list is numbered. """
        self.ordered = value
class OpenDocumentTextFile:
    """ Loads an ODT archive and renders its text as Markdown.

    The constructor reads ``styles.xml`` and ``content.xml`` from the
    archive and indexes the text, paragraph and list styles they declare.
    ``toString`` then walks the document body and returns the Markdown.
    """

    # Markdown only recognises a line as code when it is indented by at
    # least four spaces; a shorter indent is rendered as ordinary text.
    CODE_INDENT = "    "

    # Number of non-space characters allowed on a line before wrapParagraph
    # starts a new one.
    WRAP_LIMIT = 50

    def __init__(self, filepath):
        """ Create the converter and immediately load ``filepath``. """
        self.footnotes = []
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0
        self.load(filepath)

    @staticmethod
    def _firstElementChild(node):
        """ Return the first element child of ``node``, or None. """
        # Pretty-printed XML puts whitespace text nodes between elements,
        # so childNodes[0] is not guaranteed to be an element.
        for child in node.childNodes:
            if child.nodeType == xml.dom.Node.ELEMENT_NODE:
                return child
        return None

    def processFontDeclarations(self, fontDecl):
        """ Extracts necessary font information from a font-declaration
        element.

        The name of every ``style:font-face`` whose pitch is "fixed" is
        appended to ``self.fixedFonts``.
        """
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """ Extracts text properties from a style element.

        ``parent`` is the name of the parent text style, if any.  When
        that style is already known, its flags are used as the starting
        point, and attributes set on ``style`` itself override them.
        Returns a new TextProps instance.
        """
        textProps = TextProps()

        parentProps = self.textStyles.get(parent) if parent else None
        if parentProps is not None:
            # Copy the flags rather than reusing the object, so changes for
            # the child style never leak back into the parent.
            textProps.italic = parentProps.italic
            textProps.bold = parentProps.bold
            textProps.fixed = parentProps.fixed

        textPropEls = style.getElementsByTagName("style:text-properties")
        if not textPropEls:
            return textProps
        textPropEl = textPropEls[0]

        fontStyle = textPropEl.getAttribute("fo:font-style")
        if fontStyle:
            # An explicit value replaces whatever was inherited.
            textProps.italic = False
            textProps.setItalic(fontStyle)

        fontWeight = textPropEl.getAttribute("fo:font-weight")
        if fontWeight:
            textProps.bold = False
            textProps.setBold(fontWeight)

        fontName = textPropEl.getAttribute("style:font-name")
        if fontName:
            textProps.setFixed(fontName in self.fixedFonts)

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """ Extracts paragraph properties from a style element.

        Heading level and title status are derived from the style name, the
        indentation flag from a positive ``fo:margin-left``, and the code
        flag from a fixed-width font.  ``parent`` is accepted for symmetry
        with extractTextProperties but is not used.
        Returns a new ParagraphProps instance.
        """
        paraProps = ParagraphProps()
        name = style.getAttribute("style:name")

        headingPrefix = "Heading_20_"
        if name.startswith(headingPrefix):
            try:
                paraProps.setHeading(int(name[len(headingPrefix):]))
            except ValueError:
                # Non-numeric suffix: leave the style as a plain paragraph.
                pass

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEls = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEls:
            leftMargin = paraPropEls[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the two-character unit suffix before parsing.
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    # Unparseable margin: treat the paragraph as unindented.
                    pass

        if self.extractTextProperties(style).fixed:
            paraProps.setCode(True)

        return paraProps

    def processStyles(self, styleElements):
        """ Runs through "style" elements extracting necessary information.

        Text styles are stored in ``self.textStyles`` and paragraph styles
        in ``self.paragraphStyles``, keyed by style name.  The "Standard"
        style is skipped so the defaults set in ``__init__`` are kept.
        """
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue
            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")
            if family == "text":
                self.textStyles[name] = self.extractTextProperties(
                    style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = self.extractParagraphProperties(
                    style, parent)

    def processListStyles(self, listStyleElements):
        """ Records, per list style name, whether the list is ordered.

        A list style counts as ordered when its first element child is a
        ``text:list-level-style-number``.
        """
        for style in listStyleElements:
            prop = ListProperties()
            firstLevel = self._firstElementChild(style)
            if (firstLevel is not None
                    and firstLevel.tagName == "text:list-level-style-number"):
                prop.setOrdered(True)
            self.listStyles[style.getAttribute("style:name")] = prop

    def _processDocumentStyles(self, document):
        """ Index the fonts, styles and list styles of one parsed document. """
        # Iterate instead of indexing [0] so a document without a font
        # declaration block does not raise IndexError.
        for fontDecl in document.getElementsByTagName(
                "office:font-face-decls"):
            self.processFontDeclarations(fontDecl)
        self.processStyles(document.getElementsByTagName("style:style"))
        self.processListStyles(document.getElementsByTagName(
            "text:list-style"))

    def load(self, filepath):
        """ Loads an ODT file.

        Reads ``styles.xml`` and ``content.xml`` from the archive, indexes
        the styles of both, and keeps the parsed content document in
        ``self.content``.
        """
        archive = zipfile.ZipFile(filepath)
        try:
            stylesXml = archive.read("styles.xml")
            contentXml = archive.read("content.xml")
        finally:
            # Always release the file handle, even when a member is missing.
            archive.close()

        self._processDocumentStyles(xml.dom.minidom.parseString(stylesXml))
        self.content = xml.dom.minidom.parseString(contentXml)
        self._processDocumentStyles(self.content)

    def compressCodeBlocks(self, text):
        """ Removes extra blank lines from code blocks.

        A blank line is dropped when both neighbouring lines are code
        lines (they start with CODE_INDENT).  The first and last lines are
        always kept.  The result always starts with a newline.
        """
        lines = text.split("\n")
        lastIndex = len(lines) - 1
        kept = []
        for i, line in enumerate(lines):
            removable = (not line.strip()
                         and 0 < i < lastIndex
                         and lines[i - 1].startswith(self.CODE_INDENT)
                         and lines[i + 1].startswith(self.CODE_INDENT))
            if not removable:
                kept.append(line)
        return "\n" + "\n".join(kept)

    def listToString(self, listElement):
        """ Renders a ``text:list`` element as a Markdown list.

        Only the first element inside each item is rendered.  Ordered
        lists are numbered from 1; unordered lists use "* " bullets.  Each
        item is followed by a blank line.
        """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName)
        ordered = props is not None and props.ordered

        parts = []
        number = 0
        for item in listElement.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue  # whitespace between items
            content = self._firstElementChild(item)
            if content is None:
                continue  # empty item: nothing to render
            number += 1
            if ordered:
                # Always keep a space after the period; without it items
                # 10 and above would read "10.text".
                marker = "%d. " % number
                body = self.paragraphToString(content, indent=3)
            else:
                marker = "* "
                body = self.paragraphToString(content, indent=2)
            parts.append(marker + body + "\n\n")
        return u"".join(parts)

    def toString(self):
        """ Converts the document to a string.

        Renders every top-level paragraph, heading and list of the
        ``office:text`` element, appends collected footnotes after a
        "--------" separator, and compresses blank lines in code blocks.
        """
        # Reset per-render state so that calling toString() again does not
        # duplicate footnotes or shift heading levels.
        self.footnotes = []
        self.hasTitle = 0

        root = self.content.getElementsByTagName("office:text")[0]
        parts = []
        for el in root.childNodes:
            if el.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue  # text nodes have no tagName
            if el.tagName == "text:list":
                rendered = self.listToString(el)
            elif el.tagName in ("text:p", "text:h"):
                rendered = self.paragraphToString(el)
            else:
                continue
            if rendered:
                parts.append(rendered + "\n\n")

        if self.footnotes:
            parts.append("--------\n\n")
            for cite, body in self.footnotes:
                parts.append("[^%s]: %s\n\n" % (cite, body))

        return self.compressCodeBlocks(u"".join(parts))

    def _spanToString(self, span):
        """ Render one ``text:span`` with the markup of its text style. """
        text = self.textToString(span)
        if not text.strip():
            # Don't apply styles to white space, but keep the white space
            # itself so the surrounding words stay separated.
            return text

        styleName = span.getAttribute("text:style-name")
        style = self.textStyles.get(styleName)
        if not style:
            # Unknown style: keep the style name visible around the text.
            mark = "<" + styleName + ">"
        elif style.fixed:
            return "`" + text + "`"
        elif style.italic and style.bold:
            mark = "***"
        elif style.italic:
            mark = "*"
        elif style.bold:
            mark = "**"
        else:
            mark = ""
        return "%s%s%s" % (mark, text, mark)

    def textToString(self, element):
        """ Renders the inline content of ``element`` as Markdown text.

        Handles text nodes, styled spans, footnotes (collected in
        ``self.footnotes`` and replaced by a ``[^cite]`` reference),
        repeated spaces, tabs and hyperlinks.  Tags listed in IGNORED_TAGS
        and any other unknown tags produce no output.
        """
        parts = []
        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                parts.append(node.nodeValue)
                continue
            if node.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            tag = node.tagName
            if tag == "text:span":
                parts.append(self._spanToString(node))
            elif tag == "text:note":
                cite = (node.getElementsByTagName("text:note-citation")[0]
                        .childNodes[0].nodeValue)
                noteBody = node.getElementsByTagName("text:note-body")[0]
                # Only the first element of the note body is rendered.
                bodyEl = self._firstElementChild(noteBody)
                bodyText = self.textToString(bodyEl) if bodyEl is not None else u""
                self.footnotes.append((cite, bodyText))
                parts.append("[^%s]" % cite)
            elif tag in IGNORED_TAGS:
                pass
            elif tag == "text:s":
                try:
                    count = int(node.getAttribute("text:c"))
                except ValueError:
                    # Missing or malformed count means a single space.
                    count = 1
                parts.append(" " * count)
            elif tag == "text:tab":
                parts.append(" ")
            elif tag == "text:a":
                text = self.textToString(node)
                link = node.getAttribute("xlink:href")
                parts.append("[%s](%s)" % (text, link))
        return u"".join(parts)

    def paragraphToString(self, paragraph, indent=0):
        """ Renders one paragraph or heading element as Markdown.

        The paragraph style decides the output: the title gets an "="
        underline, headings get ATX "#" markers (one level deeper once a
        title has been seen), code paragraphs are indented by CODE_INDENT,
        indented paragraphs become block quotes, and everything else is
        wrapped as plain text.  ``indent`` is the continuation-line indent
        passed on to wrapParagraph.
        """
        style_name = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(style_name)
        if paraProps is None:
            # Unknown style: fall back to plain-paragraph defaults rather
            # than failing on attribute access.
            paraProps = ParagraphProps()

        text = self.textToString(paragraph)
        if not paraProps.code:
            text = text.strip()

        if paraProps.title:
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))

        if paraProps.headingLevel:
            level = paraProps.headingLevel
            if self.hasTitle:
                level += 1
            return "#" * level + " " + text

        if paraProps.code:
            return "\n".join(self.CODE_INDENT + line
                             for line in text.split("\n"))

        return self.wrapParagraph(text, indent=indent,
                                  blockquote=bool(paraProps.indented))

    def wrapParagraph(self, text, indent=0, blockquote=False):
        """ Wraps ``text`` into lines, optionally as a block quote.

        A new line is started once the non-space characters on the current
        line exceed ``WRAP_LIMIT - indent``.  Continuation lines are
        prefixed with ``indent`` spaces, and every line starts with "> "
        when ``blockquote`` is true.  Each token is followed by one space,
        so lines (including the last) end with a trailing space.
        """
        quote = "> " if blockquote else ""
        parts = [quote]
        lineLength = 0
        for token in text.split():
            if lineLength > self.WRAP_LIMIT - indent:
                parts.append("\n" + " " * indent + quote)
                lineLength = 0
            parts.append(token + " ")
            lineLength += len(token)
        return "".join(parts)
if __name__ == "__main__":
    # Command-line entry point: convert the ODT file named by the first
    # argument and write the Markdown to standard output as UTF-8.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s FILE.odt\n" % sys.argv[0])
        sys.exit(1)

    odt = OpenDocumentTextFile(sys.argv[1])
    out_utf8 = odt.toString().encode("utf-8")

    # On Python 3, sys.stdout is a text stream and rejects bytes; its
    # underlying binary stream is exposed as ``buffer``.  Python 2's
    # sys.stdout has no such attribute and accepts the bytes directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(out_utf8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment