Skip to content

Instantly share code, notes, and snippets.

@spacedman
Created August 27, 2012 07:18
Show Gist options
  • Save spacedman/3486449 to your computer and use it in GitHub Desktop.
Save spacedman/3486449 to your computer and use it in GitHub Desktop.
Extract the text of notes from an OpenOffice presentation
#!/usr/bin/python
from lxml import etree
import zipfile
# XML namespace URIs used by this script, plus the "{uri}" prefixes that are
# prepended to local names when looking up namespace-qualified attributes.
DRAW = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
PRES = "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"
DRAWC = "{" + DRAW + "}"
PRESC = "{" + PRES + "}"
# Prefix -> URI mapping passed as ``namespaces=`` to the xpath() calls.
NS = dict(presentation=PRES, draw=DRAW)
def getTree(path):
    """Parse XML read from *path* and return the result of ``etree.XML``.

    Despite the parameter name, *path* is not a filename: it must be an
    object with a ``read()`` method (the script passes the ``content.xml``
    member opened from the presentation's zip archive).
    """
    xml_data = path.read()
    return etree.XML(xml_data)
def printNotes(slides):
    """Print the notes text of every slide to standard output.

    For each slide a ``---- <name> ----`` header is printed, using the
    slide's ``draw:name`` attribute (``None`` if the attribute is absent),
    followed by the text found inside each ``presentation:notes`` child,
    one text fragment per line, and finally a line holding a single space.

    slides -- iterable of slide elements supporting ``get()`` and
              ``xpath()`` (the script passes the result of getSlides).
    """
    for slide in slides:
        title = slide.get(DRAWC + "name")
        # A parenthesised single-argument print is valid on both Python 2
        # and Python 3; the bare print statement is a SyntaxError on 3.
        print("---- %s ----" % title)
        for note in slide.xpath("presentation:notes", namespaces=NS):
            # itertext() walks the subtree in document order and includes
            # tail text (text following an inline child element), which
            # reading only ``.text`` of each descendant silently dropped.
            fragments = [text for text in note.itertext() if text]
            print("\n".join(fragments))
        # NOTE(review): the original indentation was lost; the separator is
        # assumed to be printed once per slide -- confirm.
        print(" ")
def getSlides(et):
    """Return the ``draw:page`` elements matched anywhere in *et*.

    et -- parsed XML element supporting ``xpath()`` (the script passes the
          value returned by getTree).
    """
    page_query = "//draw:page"
    return et.xpath(page_query, namespaces=NS)
# Command-line entry point: print the notes of the presentation file named
# by the first argument.
if __name__ == "__main__":
    from sys import argv

    if len(argv) < 2:
        # Exit with a usage message on stderr instead of a bare IndexError.
        raise SystemExit("usage: %s PRESENTATION_FILE" % argv[0])
    filepath = argv[1]
    # The presentation is opened as a zip archive; the ``with`` blocks close
    # the archive and the opened member even when parsing raises.
    with zipfile.ZipFile(filepath) as z:
        with z.open("content.xml") as content:
            et = getTree(content)
    slides = getSlides(et)
    printNotes(slides)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment