Skip to content

Embed URL

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
a slight tweak of odt2txt.py
#!/usr/bin/env python
"""
ODT2PANDOC
==========
ODT2PANDOC is a slight variant of ODT2TXT. Right now, the only differences are that it generates ATX style headers and uses *asterisks* for italics, neither of which is in any way specific to pandoc's extended markdown.
ODT2TXT
=======
ODT2TXT converts files in Open Document Text format (ODT) into
Markdown-formatted plain text.
Written by [Yuri Takhteyev](http://www.freewisdom.org).
Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
Contact: yuri [at] freewisdom.org
License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
Version: 0.1 (April 7, 2006)
"""
import sys, zipfile, xml.dom.minidom
# Tag names of inline elements that textToString skips, dropping their content.
IGNORED_TAGS = ["office:annotation"]
# Style names associated with footnotes; not referenced by the code visible
# in this file.
FOOTNOTE_STYLES = ["Footnote"]
class TextProps:
    """ Holds properties for a text style.

    Attributes:
        italic: True once setItalic() has been called with "italic".
        bold: True once setBold() has been called with "bold".
        fixed: whatever was last passed to setFixed() (False initially).
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """ Marks the style italic when `value` is the string "italic".

        Any other value leaves the current setting untouched.
        """
        if value == "italic":
            self.italic = True

    def setBold(self, value):
        """ Marks the style bold when `value` is the string "bold".

        Any other value leaves the current setting untouched.
        """
        if value == "bold":
            self.bold = True

    def setFixed(self, value):
        """ Stores `value` as the fixed-pitch flag. """
        self.fixed = value

    def __str__(self):
        """ Returns a short debugging summary of the three flags. """
        # The second field reports the bold flag; it used to be labelled
        # with the garbled prefix "h=i".
        return "[i=%s, b=%s, fixed=%s]" % (str(self.italic),
                                           str(self.bold),
                                           str(self.fixed))
class ParagraphProps:
    """ Holds properties of a paragraph style.

    All flags start out false/zero and are changed through the setters;
    `blockquote` has no setter and is only reported by __str__.
    """

    def __init__(self):
        self.title = False
        self.code = False
        self.blockquote = False
        self.indented = 0
        self.headingLevel = 0

    def setTitle(self, value):
        """ Stores `value` as the title flag. """
        self.title = value

    def setCode(self, value):
        """ Stores `value` as the code-block flag. """
        self.code = value

    def setHeading(self, level):
        """ Stores `level` as the heading level (0 means no heading). """
        self.headingLevel = level

    def setIndented(self, value):
        """ Stores `value` as the indentation flag. """
        self.indented = value

    def __str__(self):
        """ Returns a short debugging summary of the style. """
        summary = (str(self.blockquote), self.headingLevel, str(self.code))
        return "[bq=%s, h=%d, code=%s]" % summary
class ListProperties:
    """ Holds properties for a list style.

    The only property tracked is `ordered`, which starts out False.
    """

    def __init__(self):
        self.ordered = False

    def setOrdered(self, value):
        """ Stores `value` as the ordered-list flag. """
        self.ordered = value
class OpenDocumentTextFile:
    """ Loads an ODT file and renders its text as Markdown.

    Construction reads "styles.xml" and "content.xml" from the archive and
    records the text, paragraph and list styles they declare; toString()
    then walks the document body and produces the Markdown text.
    """

    # Markdown only recognises an indented code block when every line is
    # indented by at least four spaces.
    CODE_INDENT = "    "

    # Budget used by wrapParagraph: a line is broken once the summed length
    # of its tokens exceeds WRAP_LIMIT minus the indent.
    WRAP_LIMIT = 50

    def __init__(self, filepath):
        """ Creates the converter and immediately loads `filepath`. """
        self.footnotes = []
        # Not used by any method of this class; kept for compatibility.
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0
        self.load(filepath)

    @staticmethod
    def _firstElementChild(node):
        """ Returns the first element child of `node`, or None. """
        for child in node.childNodes:
            if child.nodeType == xml.dom.Node.ELEMENT_NODE:
                return child
        return None

    def processFontDeclarations(self, fontDecl):
        """ Extracts necessary font information from a font-declaration
        element.

        The name of every "style:font-face" whose pitch is "fixed" is
        appended to self.fixedFonts.
        """
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """ Extracts text properties from a style element.

        When `parent` names a text style that is already known, the new
        properties start out as a copy of the parent's and are then
        overridden by whatever the style element itself declares.
        Returns a new TextProps instance.
        """
        textProps = TextProps()
        parentProps = self.textStyles.get(parent) if parent else None
        if parentProps:
            # Copy field by field so the parent's own object is never
            # shared with (and later mutated through) the child style.
            textProps.italic = parentProps.italic
            textProps.bold = parentProps.bold
            textProps.fixed = parentProps.fixed

        elements = style.getElementsByTagName("style:text-properties")
        if not elements:
            return textProps
        element = elements[0]

        # getAttribute returns "" for a missing attribute; only an explicit
        # value may override what was inherited from the parent.
        italic = element.getAttribute("fo:font-style")
        if italic:
            textProps.italic = (italic == "italic")
        bold = element.getAttribute("fo:font-weight")
        if bold:
            textProps.bold = (bold == "bold")
        fontName = element.getAttribute("style:font-name")
        if fontName:
            textProps.fixed = fontName in self.fixedFonts
        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """ Extracts paragraph properties from a style element.

        `parent` is accepted for symmetry with extractTextProperties but is
        not consulted.  Returns a new ParagraphProps instance.
        """
        paraProps = ParagraphProps()
        name = style.getAttribute("style:name")

        headingPrefix = "Heading_20_"
        if name.startswith(headingPrefix):
            try:
                paraProps.setHeading(int(name[len(headingPrefix):]))
            except ValueError:
                # Suffix is not a number: not treated as a heading.
                pass
        if name == "Title":
            paraProps.setTitle(True)

        elements = style.getElementsByTagName("style:paragraph-properties")
        if elements:
            leftMargin = elements[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the two-character unit suffix before parsing.
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    # Unparseable margin: leave the paragraph unindented.
                    pass

        # A paragraph style using a fixed-pitch font marks a code block.
        if self.extractTextProperties(style).fixed:
            paraProps.setCode(True)
        return paraProps

    def processStyles(self, styleElements):
        """ Runs through "style" elements extracting necessary information.

        Styles of family "text" go into self.textStyles and styles of
        family "paragraph" into self.paragraphStyles; the style named
        "Standard" is skipped so the built-in defaults are kept.
        """
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue
            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")
            if family == "text":
                self.textStyles[name] = self.extractTextProperties(
                    style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = self.extractParagraphProperties(
                    style, parent)

    def processListStyles(self, listStyleElements):
        """ Records, for each list style, whether the list is ordered.

        A list style counts as ordered when its first child element is a
        "text:list-level-style-number".
        """
        for style in listStyleElements:
            name = style.getAttribute("style:name")
            prop = ListProperties()
            # Skip text nodes, which have no tagName.
            firstLevel = self._firstElementChild(style)
            if (firstLevel is not None
                    and firstLevel.tagName == "text:list-level-style-number"):
                prop.setOrdered(True)
            self.listStyles[name] = prop

    def _processStyleDocument(self, document):
        """ Collects fonts, styles and list styles from a parsed document. """
        # Iterating (instead of indexing [0]) tolerates a document that
        # has no font declarations at all.
        for fontDecls in document.getElementsByTagName(
                "office:font-face-decls"):
            self.processFontDeclarations(fontDecls)
        self.processStyles(document.getElementsByTagName("style:style"))
        self.processListStyles(document.getElementsByTagName(
            "text:list-style"))

    def load(self, filepath):
        """ Loads an ODT file.

        Reads "styles.xml" and "content.xml" from the zip archive at
        `filepath`, processes the styles of both and keeps the parsed
        content document in self.content.
        """
        # The context manager closes the archive even if reading fails.
        with zipfile.ZipFile(filepath) as archive:
            stylesXml = archive.read("styles.xml")
            contentXml = archive.read("content.xml")

        self._processStyleDocument(xml.dom.minidom.parseString(stylesXml))
        self.content = xml.dom.minidom.parseString(contentXml)
        self._processStyleDocument(self.content)

    def compressCodeBlocks(self, text):
        """ Removes extra blank lines from code blocks.

        A blank line is dropped when both of its neighbours are code lines
        (start with CODE_INDENT).  The first and last lines are always
        kept.  Every kept line is prefixed with a newline, so the result
        starts with "\\n".
        """
        lines = text.split("\n")
        last = len(lines) - 1
        kept = []
        for i, line in enumerate(lines):
            insideCode = (0 < i < last
                          and lines[i - 1].startswith(self.CODE_INDENT)
                          and lines[i + 1].startswith(self.CODE_INDENT))
            if insideCode and not line.strip():
                continue
            kept.append(line)
        return "".join("\n" + line for line in kept)

    def listToString(self, listElement):
        """ Converts a "text:list" element to a Markdown list.

        Only the first child element of each list item is rendered.
        Ordered lists are numbered from 1, other lists use "* " bullets;
        every item is followed by a blank line.
        """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName, ListProperties())
        pieces = []
        number = 0
        for item in listElement.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            firstChild = self._firstElementChild(item)
            if firstChild is None:
                continue
            number += 1
            if props.ordered:
                # Always keep one space after the period, so two-digit
                # numbers do not run into the item text.
                marker = "%d. " % number
            else:
                marker = "* "
            # Continuation lines are indented to line up under the text.
            pieces.append(marker + self.paragraphToString(
                firstChild, indent=len(marker)))
            pieces.append("\n\n")
        return "".join(pieces)

    def toString(self):
        """ Converts the document to a string.

        Renders every top-level paragraph, heading and list of the
        "office:text" element, followed by the collected footnotes, and
        returns the result after compressCodeBlocks().
        """
        # Start from a clean slate so calling toString() more than once
        # does not duplicate footnotes or shift heading levels.
        self.footnotes = []
        self.hasTitle = 0

        textRoot = self.content.getElementsByTagName("office:text")[0]
        blockTags = ("text:p", "text:h", "text:list")
        pieces = []
        for element in textRoot.childNodes:
            # Text nodes have no tagName; only elements can be blocks.
            if element.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if element.tagName not in blockTags:
                continue
            if element.tagName == "text:list":
                rendered = self.listToString(element)
            else:
                rendered = self.paragraphToString(element)
            if rendered:
                pieces.append(rendered + "\n\n")

        if self.footnotes:
            pieces.append("--------\n\n")
            for cite, body in self.footnotes:
                pieces.append("[^%s]: %s\n\n" % (cite, body))
        return self.compressCodeBlocks(u"".join(pieces))

    def _spanToString(self, node):
        """ Renders a "text:span" element with its style's markup. """
        text = self.textToString(node)
        if not text.strip():
            # Don't apply styles to white space, but keep the white space
            # itself and everything collected before it.
            return text
        styleName = node.getAttribute("text:style-name")
        style = self.textStyles.get(styleName)
        if style is None:
            # Unknown style: make it visible in the output.
            mark = "<" + styleName + ">"
        elif style.fixed:
            return "`" + text + "`"
        elif style.italic and style.bold:
            mark = "***"
        elif style.italic:
            mark = "*"
        elif style.bold:
            mark = "**"
        else:
            mark = ""
        return "%s%s%s" % (mark, text, mark)

    def _inlineElementToString(self, node):
        """ Renders one inline child element; unknown tags yield "". """
        tag = node.tagName
        if tag == "text:span":
            return self._spanToString(node)
        if tag == "text:note":
            cite = (node.getElementsByTagName("text:note-citation")[0]
                    .childNodes[0].nodeValue)
            body = (node.getElementsByTagName("text:note-body")[0]
                    .childNodes[0])
            # The footnote text is emitted later by toString().
            self.footnotes.append((cite, self.textToString(body)))
            return "[^%s]" % cite
        if tag in IGNORED_TAGS:
            return u""
        if tag == "text:s":
            # "text:c" holds the number of spaces; default to one when it
            # is missing or not a number.
            try:
                count = int(node.getAttribute("text:c"))
            except ValueError:
                count = 1
            return " " * count
        if tag == "text:tab":
            return " "
        if tag == "text:a":
            return "[%s](%s)" % (self.textToString(node),
                                 node.getAttribute("xlink:href"))
        return u""

    def textToString(self, element):
        """ Converts the inline content of `element` to Markdown text.

        Text nodes are copied verbatim; child elements are rendered
        according to their tag.  Footnotes met on the way are appended to
        self.footnotes.
        """
        pieces = []
        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                pieces.append(node.nodeValue)
            elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
                pieces.append(self._inlineElementToString(node))
        return u"".join(pieces)

    def paragraphToString(self, paragraph, indent=0):
        """ Converts a paragraph or heading element to Markdown.

        The paragraph's style decides the form: title (underlined with
        "="), heading ("#" prefix, one level deeper once a title has been
        seen), code block, blockquote or plain wrapped text.  `indent` is
        passed on to wrapParagraph().
        """
        styleName = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(styleName)
        if paraProps is None:
            # Unknown style: fall back to a plain paragraph.
            paraProps = ParagraphProps()

        text = self.textToString(paragraph)
        if not paraProps.code:
            text = text.strip()

        if paraProps.title:
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))
        if paraProps.headingLevel:
            level = paraProps.headingLevel
            if self.hasTitle:
                level += 1
            return "#" * level + " " + text
        if paraProps.code:
            return "\n".join(self.CODE_INDENT + line
                             for line in text.split("\n"))
        return self.wrapParagraph(text, indent=indent,
                                  blockquote=bool(paraProps.indented))

    def wrapParagraph(self, text, indent=0, blockquote=False):
        """ Re-flows `text` into short lines.

        Tokens are separated by whitespace and each is followed by one
        space.  A new line, indented by `indent` spaces, is started before
        a token once the summed token length of the current line exceeds
        WRAP_LIMIT - indent.  With `blockquote` every line starts with
        "> ".
        """
        quote = "> " if blockquote else ""
        pieces = [quote]
        width = 0
        for token in text.split():
            if width > self.WRAP_LIMIT - indent:
                pieces.append("\n" + " " * indent + quote)
                width = 0
            pieces.append(token + " ")
            width += len(token)
        return "".join(pieces)
if __name__ == "__main__":
    # Usage: pass the path of the ODT file as the only argument; the
    # Markdown text is written to standard output encoded as UTF-8.
    if len(sys.argv) != 2:
        sys.exit("usage: %s FILE.odt" % sys.argv[0])
    odt = OpenDocumentTextFile(sys.argv[1])
    out_utf8 = odt.toString().encode("utf-8")
    # On Python 3 sys.stdout is a text stream that rejects bytes; its
    # underlying binary stream is sys.stdout.buffer.  Python 2's stdout
    # has no such attribute and accepts the encoded bytes directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(out_utf8)
@AcarBurak

Thanks for the gift, but on Manjaro I get this error:

File "/home/burak/bin/odt2pandoc.py", line 462, in <module>
    sys.stdout.write(out_utf8)
TypeError: must be str, not bytes
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Something went wrong with that request. Please try again.