Last active
October 14, 2019 13:37
-
-
Save kaczmarj/71e316792a4bc8e596ba8cc4875597a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Keep only the slides with red titles (plus each deck's first slide) from PowerPoint files, and save them as one PDF file.
This script can only be run on Windows because it requires some Windows-specific libraries.

How to use
----------
- Install comtypes, PyPDF2, and python-pptx using pip. PowerPoint must also be installed.
- Put all of the PowerPoint files to be filtered in one directory.
- Navigate to that directory in a terminal.
- Run this script. After some time, a file named "output.pdf" will be created with the filtered slides.
"""
from pathlib import Path | |
import sys | |
import tempfile | |
import comtypes.client | |
from PyPDF2 import PdfFileMerger | |
import pptx | |
def slide_has_red_title(slide, title_color=(255, 0, 0), max_title_top=50000):
    """Return True if *slide* contains a title-like shape filled with *title_color*.

    A shape counts as a title when its fill foreground colour equals
    *title_color* and its ``top`` offset is strictly less than *max_title_top*.

    Parameters
    ----------
    slide
        Object exposing an iterable ``shapes`` attribute (a python-pptx slide).
    title_color
        RGB triple compared against ``shape.fill.fore_color.rgb``.
        Defaults to pure red.
    max_title_top
        Exclusive upper bound for ``shape.top``.
        NOTE(review): python-pptx reports positions in EMU, so the default is
        a very small offset from the top edge -- confirm it matches the decks.

    Returns
    -------
    bool
        True as soon as one matching shape is found, otherwise False.
    """
    for shape in slide.shapes:
        try:
            is_red = shape.fill.fore_color.rgb == title_color
            is_title = shape.top < max_title_top
        except (AttributeError, TypeError):
            # Shapes that lack these attributes, or whose values cannot be
            # compared (e.g. a ``top`` of None), are not title candidates.
            continue
        if is_red and is_title:
            return True
    return False
def delete_slide(presentation, slide):
    """Remove *slide* from *presentation* in place.

    The slide is located by matching ``slide.slide_id`` against the ``id`` of
    each entry in ``presentation.slides._sldIdLst``. The matching entry's
    relationship is dropped from the presentation part and the entry itself
    is deleted from the list.

    NOTE: this relies on the private ``_sldIdLst`` attribute of the slides
    collection, so it may break with other python-pptx releases.

    Raises
    ------
    KeyError
        If no entry in the slide-id list has the slide's id.
    """
    sld_id_lst = presentation.slides._sldIdLst
    target_id = slide.slide_id

    for index, sld_id in enumerate(sld_id_lst):
        if sld_id.id == target_id:
            break
    else:
        raise KeyError(target_id)

    # Drop the relationship first, then the list entry that referenced it.
    presentation.part.drop_rel(sld_id.rId)
    del sld_id_lst[index]
def filter_red_title_slides(presentation, keep_first_slide=False):
    """Delete every slide of *presentation* that has no red title, in place.

    Parameters
    ----------
    presentation
        Presentation whose ``slides`` are inspected with
        ``slide_has_red_title`` and removed with ``delete_slide``.
    keep_first_slide
        When True, the first slide is always kept, whether or not it has a
        red title.
    """
    # Iterate over a snapshot: delete_slide() mutates the underlying slide
    # list, and removing items from a collection while iterating over it can
    # skip elements.
    for index, slide in enumerate(list(presentation.slides)):
        if keep_first_slide and index == 0:
            continue
        if not slide_has_red_title(slide):
            delete_slide(presentation, slide)
def batch_convert_ppt_to_pdf(infiles, format_type=32):
    """Convert each presentation in *infiles* to PDF by automating PowerPoint.

    Every PDF is written next to its input file, with the suffix replaced by
    ``.pdf``. A running ``done / total`` counter is printed on one line.

    Parameters
    ----------
    infiles
        Sized sequence of ``pathlib.Path`` objects (``len()`` and
        ``.with_suffix()`` are used).
    format_type
        Value passed as the file-format argument of ``SaveAs``. The default,
        32, is ``ppSaveAsPDF`` in PowerPoint's ``PpSaveAsFileType`` enumeration.
    """
    powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
    try:
        powerpoint.Visible = 1
        total = len(infiles)
        for count, infile in enumerate(infiles, start=1):
            deck = powerpoint.Presentations.Open(str(infile))
            try:
                deck.SaveAs(str(infile.with_suffix('.pdf')), format_type)
            finally:
                # Close the deck even if the export failed.
                deck.Close()
            print(" {} / {}".format(count, total), end='\r')
    finally:
        # Always shut PowerPoint down; otherwise an error above would leave
        # the application process running.
        powerpoint.Quit()
def merge_pdfs(infiles, outfile):
    """Concatenate the PDFs in *infiles*, in order, and write them to *outfile*.

    A running ``done / total`` counter is printed on one line.

    Parameters
    ----------
    infiles
        Sized sequence of PDF paths; each is converted with ``str()`` before
        being appended.
    outfile
        Destination path of the merged PDF; converted with ``str()``.
    """
    merger = PdfFileMerger()
    try:
        total = len(infiles)
        for count, pdf in enumerate(infiles, start=1):
            merger.append(str(pdf))
            print("{} / {}".format(count, total), end='\r')
        merger.write(str(outfile))
    finally:
        # Release the merger's resources even if appending or writing failed.
        merger.close()
def main():
    """Filter every ``*.pptx`` in the current directory and merge the results.

    Steps:
    1. Strip each presentation down to its first slide plus the slides with
       a red title, saving the result into a temporary directory.
    2. Convert the filtered presentations to PDF.
    3. Merge the PDFs into ``output.pdf`` in the current directory.

    Raises
    ------
    ValueError
        If there are no presentations to filter, or if the conversion step
        produced no PDF files.
    """
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Resolve the temp directory before the loop so the checks below still
        # work (and raise the intended ValueError) when no .pptx file exists.
        ptmp = Path(tmpdirname)

        # Sorted so that the processing order, and therefore the page order
        # of the merged PDF, does not depend on directory listing order.
        ppt_files = sorted(Path('.').glob('*.pptx'))

        # Filter all of the presentations, and save to a temporary directory.
        print("++ Filtering presentations ...")
        for filename in ppt_files:
            p = pptx.Presentation(str(filename))
            filter_red_title_slides(p, keep_first_slide=True)
            outpath = ptmp / '{}_filtered.pptx'.format(filename.name)
            p.save(str(outpath))

        # Convert all the filtered presentations to PDF.
        filtered = sorted(ptmp.glob('*_filtered.pptx'))
        if not filtered:
            raise ValueError("No filtered powerpoint files found.")
        print("++ Converting filtered presentations to PDF ...")
        batch_convert_ppt_to_pdf(infiles=filtered)

        # Merge the PDFs, and save.
        pdf_files = sorted(ptmp.glob('*_filtered.pdf'))
        if not pdf_files:
            raise ValueError("No PDF files found.")
        print("++ Merging and saving to PDF ...")
        merge_pdfs(pdf_files, "output.pdf")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment