Skip to content

Instantly share code, notes, and snippets.

@SoulAuctioneer
Last active December 16, 2015 04:29
Show Gist options
  • Save SoulAuctioneer/5377469 to your computer and use it in GitHub Desktop.
Save SoulAuctioneer/5377469 to your computer and use it in GitHub Desktop.
Extract slide notes from PowerPoint
import zipfile
import cStringIO
from xml.dom import minidom
class ExtractPptNotes(object):
    """Extract speaker-notes text from the raw bytes of a .pptx file.

    A .pptx file is a zip archive. On construction every archive member is
    read into an in-memory store laid out as
    ``chunks[directory][file_name] = member_bytes``; members that live at
    the archive root are stored under the directory key ``'/'``.

    Notes are read from the XML parts found in ``ppt/notesSlides``.
    """

    # Archive directory that holds one XML part per notes slide.
    _NOTES_DIR = u'ppt/notesSlides'

    _chunks = None

    def __init__(self, file_data):
        """Unpack the archive.

        Args:
            file_data: The complete contents of the .pptx file as a byte
                string.

        Raises:
            zipfile.BadZipfile: If ``file_data`` is not a valid zip archive.
        """
        self._chunks = self.__inflate_file_data(file_data)

    def get_all_slide_notes(self):
        """Return the notes of every notes slide in the presentation.

        Returns:
            A dict mapping each notes-slide file name (for example
            ``'notesSlide1.xml'``) to the list returned by
            ``get_slide_notes``. The dict is empty when the archive has no
            ``ppt/notesSlides`` directory.
        """
        # A deck without speaker notes has no notes directory at all; treat
        # that as "no notes" rather than failing with KeyError.
        slide_names = self._chunks.get(self._NOTES_DIR, {})
        return dict(
            (slide_name, self.get_slide_notes(slide_name))
            for slide_name in slide_names
        )

    def get_slide_notes(self, slide_name):
        """Return the text runs found in a single notes slide.

        Args:
            slide_name: File name of the notes slide inside
                ``ppt/notesSlides``, e.g. ``'notesSlide1.xml'``.

        Returns:
            A list with one string per text node found directly inside an
            ``a:t`` element, in document order.

        Raises:
            KeyError: If no notes slide with that name exists.
        """
        slide_xml_string = self._chunks.get(self._NOTES_DIR, {})[slide_name]
        # NOTE(review): minidom offers no protection against maliciously
        # constructed XML (e.g. entity-expansion attacks). If presentations
        # can come from untrusted sources, consider a hardened parser.
        slide_xml = minidom.parseString(slide_xml_string)
        return [
            child_node.nodeValue
            for node in slide_xml.getElementsByTagName('a:t')
            for child_node in node.childNodes
            if child_node.nodeType == child_node.TEXT_NODE
        ]

    def __inflate_file_data(self, file_data):
        """Read every member of the zip archive into a nested dict.

        Args:
            file_data: The archive contents as a byte string.

        Returns:
            A dict where ``chunks[chunk_path][chunk_name] = chunk_data``.
            ``chunk_path`` is the member's directory (``'/'`` for members at
            the archive root) and ``chunk_name`` is its file name.
        """
        # Imported here so the method does not rely on the Python 2-only
        # cStringIO module; io.BytesIO behaves the same for this purpose.
        import io

        chunks = {}
        # The with-block closes both the stream and the archive even when
        # reading a member fails part-way through.
        with io.BytesIO(file_data) as file_stream, \
                zipfile.ZipFile(file_stream, 'r') as openxml_zip:
            for chunk_file_path in openxml_zip.namelist():
                # Explicit directory entries end with '/' and carry no data;
                # storing them would create an empty-named chunk that later
                # fails to parse as XML.
                if chunk_file_path.endswith('/'):
                    continue
                chunk_path, separator, chunk_name = (
                    chunk_file_path.rpartition('/'))
                if not separator:
                    # No directory component: file sits at the archive root.
                    chunk_path = '/'
                chunks.setdefault(chunk_path, {})[chunk_name] = (
                    openxml_zip.read(chunk_file_path))
        return chunks
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment