Skip to content

Instantly share code, notes, and snippets.

@kaczmarj
Last active October 14, 2019 13:37
Show Gist options
  • Save kaczmarj/71e316792a4bc8e596ba8cc4875597a4 to your computer and use it in GitHub Desktop.
Save kaczmarj/71e316792a4bc8e596ba8cc4875597a4 to your computer and use it in GitHub Desktop.
"""Filter slides with red titles from powerpoints, and save the filtered slides as one PDF file.
This script can only be run on Windows because it requires some Windows-specific libraries.
How to use
----------
- Install comtypes, PyPDF2, and python-pptx using pip. PowerPoint must also be installed.
- Put all of the powerpoints to be filtered in one directory.
- Navigate to that directory in a terminal.
- Run this script. After some time, a file will be created named "output.pdf" with the filtered slides.
"""
from pathlib import Path
import sys
import tempfile
import comtypes.client
from PyPDF2 import PdfFileMerger
import pptx
def slide_has_red_title(slide, title_color=(255, 0, 0), max_title_top=50000):
    """Return True if *slide* has a shape filled with the title color near its top.

    A shape counts as a title when its fill foreground color equals
    ``title_color`` and its ``top`` offset is strictly less than
    ``max_title_top``.

    Parameters
    ----------
    slide
        Object exposing an iterable ``shapes`` attribute (e.g. a python-pptx
        slide).
    title_color : tuple
        RGB triple that ``shape.fill.fore_color.rgb`` must equal. Defaults to
        pure red, ``(255, 0, 0)``.
    max_title_top : int
        Exclusive upper bound for ``shape.top``. It is in whatever unit
        ``shape.top`` uses (presumably EMU when the shapes come from
        python-pptx -- confirm against the library documentation).

    Returns
    -------
    bool
        True as soon as one matching shape is found, otherwise False.
    """
    for shape in slide.shapes:
        try:
            is_red = shape.fill.fore_color.rgb == title_color
            is_title = shape.top < max_title_top
        except (AttributeError, TypeError):
            # Best effort: a shape that lacks these attributes, or whose
            # values cannot be compared (e.g. ``top`` is None), is simply
            # not a title candidate.
            continue
        if is_red and is_title:
            return True
    return False
def delete_slide(presentation, slide):
    """Remove *slide* from *presentation* in place.

    The entry in ``presentation.slides._sldIdLst`` whose ``id`` equals
    ``slide.slide_id`` is located, its relationship is dropped through
    ``presentation.part.drop_rel`` and the entry is then deleted from the
    slide-id list.

    Parameters
    ----------
    presentation
        Presentation object exposing ``slides._sldIdLst`` and
        ``part.drop_rel``.
    slide
        Slide object exposing ``slide_id``.

    Raises
    ------
    KeyError
        If no slide-id entry matches ``slide.slide_id``. Nothing is modified
        in that case.
    """
    # NOTE: ``_sldIdLst`` is an underscore-prefixed (non-public) attribute of
    # the slides collection, so this may need revisiting if the library changes.
    slide_id = slide.slide_id
    slide_id_list = presentation.slides._sldIdLst
    for index, entry in enumerate(slide_id_list):
        if entry.id == slide_id:
            break
    else:
        raise KeyError(slide_id)
    presentation.part.drop_rel(entry.rId)
    del slide_id_list[index]
def filter_red_title_slides(presentation, keep_first_slide=False):
    """Delete every slide of *presentation* that has no red title, in place.

    Parameters
    ----------
    presentation
        Presentation whose ``slides`` collection is filtered.
    keep_first_slide : bool
        When True, the first slide is kept even if it has no red title.
    """
    # Iterate over a snapshot: delete_slide() removes entries from the very
    # collection being walked, and deleting during live iteration makes the
    # loop skip the slide that follows each deleted one.
    for index, slide in enumerate(list(presentation.slides)):
        if keep_first_slide and index == 0:
            continue
        if not slide_has_red_title(slide):
            delete_slide(presentation, slide)
def batch_convert_ppt_to_pdf(infiles, format_type=32):
    """Convert each presentation in *infiles* to PDF using PowerPoint over COM.

    Every output file is written next to its input, with the suffix replaced
    by ``.pdf``. Progress is printed on a single, carriage-return-terminated
    line.

    Parameters
    ----------
    infiles : sequence of pathlib.Path
        Presentations to convert. ``len()`` and ``Path.with_suffix`` are used,
        so a sized collection of ``Path`` objects is required.
    format_type : int
        Value passed as the file-format argument of ``SaveAs``. The default 32
        is presumably PowerPoint's ``ppSaveAsPDF`` constant -- confirm against
        the PpSaveAsFileType documentation.
    """
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    try:
        powerpoint.Visible = 1
        total = len(infiles)
        for count, infile in enumerate(infiles, start=1):
            deck = powerpoint.Presentations.Open(str(infile))
            try:
                deck.SaveAs(str(infile.with_suffix('.pdf')), format_type)
            finally:
                # Close the deck even when the export fails so the file is
                # not left open in PowerPoint.
                deck.Close()
            print(" {} / {}".format(count, total), end='\r')
    finally:
        # Always shut the application down; otherwise an error would leave a
        # PowerPoint instance running.
        powerpoint.Quit()
def merge_pdfs(infiles, outfile):
    """Concatenate the PDFs in *infiles*, in order, and write them to *outfile*.

    Progress is printed on a single, carriage-return-terminated line.

    Parameters
    ----------
    infiles : sequence
        Paths of the PDF files to merge; each is converted with ``str()``.
    outfile
        Destination path; converted with ``str()`` before writing.
    """
    merger = PdfFileMerger()
    try:
        total = len(infiles)
        for count, pdf in enumerate(infiles, start=1):
            merger.append(str(pdf))
            print("{} / {}".format(count, total), end='\r')
        merger.write(str(outfile))
    finally:
        # Close the merger even if appending or writing fails.
        merger.close()
def main():
    """Filter every ``*.pptx`` in the current directory and merge them into ``output.pdf``.

    Each presentation is filtered with ``filter_red_title_slides`` (keeping its
    first slide), saved into a temporary directory, converted to PDF there, and
    the resulting PDFs are merged into ``output.pdf`` in the current directory.

    Raises
    ------
    ValueError
        If no filtered presentations or no converted PDFs are found.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Defined before the loop so it exists even when there is nothing to
        # filter; the "no files" case then reaches the ValueError below.
        ptmp = Path(tmpdirname)
        # Glob order is unspecified; sort so the merged PDF has a stable,
        # alphabetical page order.
        ppt_files = sorted(Path('.').glob('*.pptx'))

        # Filter all of the presentations, and save to a temporary directory.
        print("++ Filtering presentations ...")
        for filename in ppt_files:
            p = pptx.Presentation(str(filename))
            filter_red_title_slides(p, keep_first_slide=True)
            outpath = ptmp / '{}_filtered.pptx'.format(filename.name)
            p.save(str(outpath))

        # Convert all the filtered presentations to PDF.
        filtered = sorted(ptmp.glob('*_filtered.pptx'))
        if not filtered:
            raise ValueError("No filtered powerpoint files found.")
        print("++ Converting filtered presentations to PDF ...")
        batch_convert_ppt_to_pdf(infiles=filtered)

        # Merge the PDFs, and save.
        pdf_files = sorted(ptmp.glob('*_filtered.pdf'))
        if not pdf_files:
            raise ValueError("No PDF files found.")
        print("++ Merging and saving to PDF ...")
        merge_pdfs(pdf_files, "output.pdf")


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment