Created August 27, 2012 07:18
-
-
Save spacedman/3486449 to your computer and use it in GitHub Desktop.
Extract the text of notes from an OpenOffice presentation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
import sys
import zipfile

from lxml import etree
# XML namespace URIs referenced by the queries in this script.
DRAW = "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0"
# "{uri}" prefix form, concatenated with a local name to look up a
# namespace-qualified attribute (e.g. DRAWC + "name").
DRAWC = "{%s}" % DRAW
PRES = "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0"
PRESC = "{%s}" % PRES
# Prefix -> URI mapping passed as ``namespaces=`` to the XPath calls.
NS = {
    "presentation": PRES,
    "draw": DRAW,
}
def getTree(path):
    """Parse XML from an open file-like object and return its root element.

    Despite the parameter name, ``path`` is not a filesystem path: it must
    provide a ``read()`` method returning the complete XML document. The
    object is not closed here; that is left to the caller.
    """
    return etree.XML(path.read())
def printNotes(slides):
    """Print the notes text of each slide to standard output.

    For every element in ``slides`` this prints a header line containing
    the slide's ``draw:name`` attribute (``None`` if the attribute is
    absent), then the text found under each ``presentation:notes`` child,
    one text fragment per line, then a separator line.

    ``print`` is called with a single parenthesised argument so the
    function behaves the same on Python 2 and Python 3.
    """
    for slide in slides:
        title = slide.get(DRAWC + "name")
        print("---- %s ----" % title)
        notes = slide.xpath("presentation:notes", namespaces=NS)
        for note in notes:
            # NOTE(review): only each descendant's ``.text`` is collected;
            # tail text following a nested element is not included.
            bits = [c.text for c in note.iterdescendants()]
            # Drop None/empty fragments so they do not produce blank lines.
            print("\n".join(bit for bit in bits if bit))
        # Separator line after each slide.
        print(" ")
def getSlides(et):
    """Return all ``draw:page`` elements found anywhere in the tree ``et``.

    ``et`` must support ``xpath()`` with a ``namespaces`` mapping; the
    result is whatever that XPath query yields (a list of elements).
    """
    return et.xpath("//draw:page", namespaces=NS)
def main(argv):
    """Command-line entry point: print the notes of a presentation file.

    ``argv`` is the full argument vector (program name first). The first
    real argument is the path to a zip archive containing ``content.xml``.

    Returns 0 on success, or 2 after writing a usage message to stderr
    when no file argument was supplied.
    """
    if len(argv) < 2:
        prog = argv[0] if argv else "notes"
        sys.stderr.write("usage: %s PRESENTATION_FILE\n" % prog)
        return 2

    filepath = argv[1]
    # Context managers make sure the archive and the member stream are
    # closed even if parsing fails.
    with zipfile.ZipFile(filepath) as z:
        with z.open("content.xml") as content:
            et = getTree(content)

    slides = getSlides(et)
    printNotes(slides)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.