Skip to content

Instantly share code, notes, and snippets.

@swharden
Created January 31, 2023 17:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save swharden/82d6f36e2cd3ef7e4bd5fa97d883f506 to your computer and use it in GitHub Desktop.
Save swharden/82d6f36e2cd3ef7e4bd5fa97d883f506 to your computer and use it in GitHub Desktop.
Extract text from a folder of PPTX files and save the output in an HTML report
"""
This script finds text in a folder of PPTX files and saves what is found
in an HTML report that can be easily searched. It separates long phrases
from stray words to make important content easier to spot.
"""
import datetime
import collections
import collections.abc
import pptx # pip install python-pptx
import pathlib
def getLines(pptxFile: pathlib.Path, minCharCount: int = 3):
    """Return the text runs found in the text frames of one presentation.

    Every slide is visited in order. Shapes that report no text frame are
    ignored; for the rest, each run of each paragraph is stripped of
    surrounding whitespace and kept only when at least ``minCharCount``
    characters remain.

    Args:
        pptxFile: Path of the presentation file to read.
        minCharCount: Minimum stripped length a run needs to be kept.

    Returns:
        A list of stripped run texts in slide/shape/paragraph/run order.
    """
    collected = []
    with open(pptxFile, 'rb') as stream:
        presentation = pptx.Presentation(stream)
        for slide in presentation.slides:
            # Lazily filter so shapes are checked in the same order as they
            # are consumed.
            textShapes = (s for s in slide.shapes if s.has_text_frame)
            for shape in textShapes:
                for paragraph in shape.text_frame.paragraphs:
                    stripped = (run.text.strip() for run in paragraph.runs)
                    collected.extend(
                        t for t in stripped if len(t) >= minCharCount
                    )
    return collected
def getLinesByFile(folder: pathlib.Path):
    """Extract text lines from every ``*.pptx`` file directly inside a folder.

    Files are processed in sorted path order so the resulting dictionary
    (and any report built from it) has a stable, reproducible order;
    ``Path.glob`` itself makes no ordering promise.

    Files whose name starts with ``~$`` are skipped: PowerPoint creates such
    owner/lock files next to a presentation while it is open, they match
    ``*.pptx`` but are not presentations, and trying to parse one would
    abort the whole run.

    Args:
        folder: Directory to scan (not searched recursively).

    Returns:
        A dict mapping ``str(path)`` of each presentation to the list
        returned by ``getLines`` for it. Empty when nothing matches.
    """
    linesByFile = {}
    for pptxFile in sorted(folder.glob("*.pptx")):
        if pptxFile.name.startswith("~$"):
            # Office lock file, not a real presentation.
            continue
        print(f"Analyzing {pptxFile.name}")
        linesByFile[str(pptxFile)] = getLines(pptxFile)
    return linesByFile
def makeReport(linesByFile: dict, saveAs: pathlib.Path):
    """Write an HTML report listing the text found in each presentation.

    For every entry of ``linesByFile`` the report contains a heading with
    the file's base name followed by a bullet list: one bullet per "phrase"
    (a line that splits on single spaces into at least three pieces) and a
    final ``Words:`` bullet holding all shorter lines joined by ``", "``.

    All slide text and file names are HTML-escaped so characters such as
    ``<`` and ``&`` show up literally instead of corrupting the markup.
    The file is written as UTF-8 (and declares it via ``<meta charset>``)
    so non-ASCII slide text does not depend on the platform's default
    encoding.

    Args:
        linesByFile: Mapping of presentation path (as a string) to the list
            of text lines extracted from it.
        saveAs: Destination path of the HTML file; overwritten if present.
    """
    # Imported locally so this function brings its own dependency with it.
    from html import escape

    minWordCount = 3

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<html><head><meta charset='utf-8'></head><body>",
        "<div style='text-align: center; margin: 3em 0;'>",
        "<h1>Report</h1>",
        "<div><i>This file facilitates searching for text across powerpoint files</i></div>",
        f"<div style='margin: 1em 0;'><code>generated {datetime.datetime.now()}</code></div>",
        "</div>",
    ]

    for key, lines in linesByFile.items():
        filename = pathlib.Path(key).name
        parts.append(f"<h3>{escape(filename)}</h3>")
        parts.append("<ul>")

        phrases = []
        words = []
        for line in lines:
            isPhrase = len(line.split(" ")) >= minWordCount
            (phrases if isPhrase else words).append(line)

        parts.extend(f"<li>{escape(line)}</li>" for line in phrases)
        # The Words bullet is always emitted, even when it is empty.
        parts.append(f"<li>Words: {escape(', '.join(words))}</li>")
        parts.append("</ul>")

    parts.append("</body></html>")

    with open(saveAs, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"Saved: {saveAs}")
if __name__ == "__main__":
    # Scan the folder, then drop the report next to the presentations.
    sourceFolder = pathlib.Path(R"C:\path\to\folder")
    extracted = getLinesByFile(sourceFolder)
    makeReport(extracted, sourceFolder / "report.html")
    print("DONE")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment