Skip to content

Instantly share code, notes, and snippets.

@zxteloiv
Created November 27, 2015 11:55
Show Gist options
  • Save zxteloiv/2371c78fc0a5c8935a7a to your computer and use it in GitHub Desktop.
Save zxteloiv/2371c78fc0a5c8935a7a to your computer and use it in GitHub Desktop.
Extract text from .pptx files using the python-pptx library
#!/usr/bin/env python2
# coding: utf-8
from pptx import Presentation
import chardet
import sys
def main(filename):
    """Print the text of every paragraph in a presentation as UTF-8 lines.

    Walks each slide's shapes, skips shapes that have no text frame, and
    writes one line per paragraph, formed by joining the text of that
    paragraph's runs. A paragraph without runs produces an empty line.

    Args:
        filename: Passed unchanged to ``Presentation``.
    """
    # Emit encoded bytes through the binary stream when stdout has one
    # (Python 3); on Python 2 sys.stdout itself accepts byte strings.
    out = getattr(sys.stdout, 'buffer', sys.stdout)
    prs = Presentation(filename)
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                text = u''.join(run.text for run in paragraph.runs)
                out.write(text.encode('utf-8') + b'\n')
if __name__ == "__main__":
    # Fail with a readable usage message instead of an IndexError traceback
    # when the presentation path is missing; extra arguments are ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE.pptx" % sys.argv[0])
    main(sys.argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment