Skip to content

Instantly share code, notes, and snippets.

@michaelosthege
Created December 7, 2022 19:33
Show Gist options
  • Save michaelosthege/a6cc9556ff4e2b64d5f7d3aaee43be70 to your computer and use it in GitHub Desktop.
Save michaelosthege/a6cc9556ff4e2b64d5f7d3aaee43be70 to your computer and use it in GitHub Desktop.
Python script that uses Ghostscript to convert the pages of a PDF that contain only grayscale content to a true grayscale color space, while keeping the remaining pages in color.
# MIT License
# Copyright (c) 2022 Michael Osthege
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This script processes a PDF with ghostscript such that pages are treated as grayscale where possible.
To install ghostscript:
```bash
apt-get update
apt-get install ghostscript -y
```
To run this script:
```bash
python compress_colors.py --fp_in=thesis.pdf --fp_out=thesis-print.pdf --fp_report=print-metrics.log
```
"""
import argparse
import logging
import shlex
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Sequence
# Module-level logger used by all functions below; it is named after this file's path.
_log = logging.getLogger(__file__)
# Configure the root logger so that INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)
def is_grayscale(c: str, m: str, y: str, *, threshold: float) -> bool:
    """
    Decides whether a page's CMY coverage values count as grayscale.

    Parameters
    ----------
    c, m, y
        Cyan, magenta and yellow coverage values, given as strings.
    threshold
        Largest coverage value that is still treated as grayscale.
        Must not be negative.

    Returns
    -------
    bool
        ``True`` if the three strings are identical and their value,
        parsed as a float, does not exceed ``threshold``.

    Raises
    ------
    ValueError
        If ``threshold`` is negative.
    """
    if threshold < 0:
        # ValueError is a subclass of Exception, so existing handlers keep working.
        raise ValueError(
            f"Invalid threshold setting {threshold}. Should be positive."
        )
    # The channels are compared as strings; only one of them needs to be parsed.
    return c == m == y and float(c) <= threshold
def log_composition(pages_is_grayscale: Dict[int, bool]) -> str:
    """
    Builds a one-line summary of how many pages are color vs. grayscale.

    Parameters
    ----------
    pages_is_grayscale
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Returns
    -------
    str
        Summary with the color/grayscale/total counts and the color percentage.
    """
    n = len(pages_is_grayscale)
    ngray = sum(pages_is_grayscale.values())
    ncol = n - ngray
    # An empty mapping would otherwise raise ZeroDivisionError.
    perc = ncol / n * 100 if n else 0.0
    return f"{ncol} color and {ngray} grayscale pages out of {n} in total ({perc:.1f} % color)."
def _parse_inkcov(analysis_stdout: str) -> Dict[int, List[str]]:
    """Maps page numbers to their [C, M, Y] coverage strings found in the inkcov output."""
    lines = analysis_stdout.splitlines()
    coverage = {}
    for idx, line in enumerate(lines):
        if not line.startswith("Page"):
            continue
        header = line.split()
        # Only accept exact "Page <number>" headers; anything else is not a page marker.
        if len(header) != 2 or header[0] != "Page" or not header[1].isdigit():
            continue
        page = int(header[1])
        if idx + 1 >= len(lines):
            raise ValueError(f"Missing coverage values after 'Page {page}'.")
        # split() without arguments tolerates any amount of whitespace between the
        # columns, whereas split(" ") yields empty strings for repeated spaces.
        values = lines[idx + 1].split()[:3]
        if len(values) != 3:
            raise ValueError(
                f"Could not read C, M, Y values for page {page} from {lines[idx + 1]!r}."
            )
        coverage[page] = values
    return coverage


def analyze(fp: Path, threshold: float) -> Dict[int, bool]:
    """
    Determines which pages in a PDF can be considered grayscale,
    based on CMY channel values.

    The raw ghostscript output is cached in a ``cache`` directory next to the PDF.
    NOTE: the cache file is looked up by the PDF's file name only, so it is reused
    even if the content of the PDF has changed since it was written.

    Parameters
    ----------
    fp
        Path to the PDF.
    threshold
        Pages with CMY channel averages above this threshold remain colored.

    Returns
    -------
    dict
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Raises
    ------
    ValueError
        If a page header in the output is not followed by three coverage values.
    subprocess.CalledProcessError
        If the ghostscript call fails.
    """
    dp_cache = fp.parent / "cache"
    dp_cache.mkdir(exist_ok=True)

    # Cache the analysis result
    fp_analysis = dp_cache / f"analysis_{fp.name}.txt"
    if not fp_analysis.exists():
        _log.info("Extracting color information from %s", fp)
        analysis_stdout = subprocess.check_output(
            ["gs", "-o", "-", "-sDEVICE=inkcov", str(fp.absolute())]
        ).decode("utf-8")
        fp_analysis.write_text(analysis_stdout, encoding="utf-8")
    else:
        _log.info("Loading color information from %s", fp_analysis)
        analysis_stdout = fp_analysis.read_text("utf-8")

    _log.info("Parsing output...")
    pages_is_grayscale = {}
    for p, (c, m, y) in _parse_inkcov(analysis_stdout).items():
        pages_is_grayscale[p] = is_grayscale(c, m, y, threshold=threshold)
        _log.info(
            "Page %i (%s, %s, %s) → %s",
            p,
            c,
            m,
            y,
            "grayscale" if pages_is_grayscale[p] else "color",
        )
    return pages_is_grayscale
def extract_page(fp: Path, page: int, fp_out: Path, *, in_gray: bool) -> None:
    """
    Extract one page from a PDF either with original colors, or in grayscale.

    Parameters
    ----------
    fp
        Path to the input PDF.
    page
        Number of the page to extract (passed as ``-dFirstPage``/``-dLastPage``).
    fp_out
        Path for the single-page output PDF.
    in_gray
        If ``True``, ghostscript is asked to convert the page to grayscale.

    Raises
    ------
    subprocess.CalledProcessError
        If the ghostscript call fails.
    """
    # Build the argument list directly instead of splitting a command string,
    # so that paths containing spaces are passed to ghostscript as one argument.
    command = ["gs", "-sDEVICE=pdfwrite", "-dPDFUseOldCMS=false"]
    if in_gray:
        command += [
            "-dProcessColorModel=/DeviceGray",
            "-dColorConversionStrategy=/Gray",
        ]
    command += [
        f"-dFirstPage={page}",
        f"-dLastPage={page}",
        "-o",
        str(fp_out),
        "-f",
        str(fp),
    ]
    subprocess.check_call(command)
def split_convert(fp: Path, pages_is_grayscale: Dict[int, bool]) -> List[Path]:
    """
    Split the input to individual color and grayscale pages.

    Pages are written to a ``cache`` directory next to the input PDF.
    A page file that already exists is reused instead of being extracted again.

    Parameters
    ----------
    fp
        Path to the input PDF.
    pages_is_grayscale
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Returns
    -------
    list
        Paths of the single-page PDFs, in the iteration order of ``pages_is_grayscale``.
    """
    dp_cache = fp.parent / "cache"
    dp_cache.mkdir(exist_ok=True)

    # Extract each page as color or grayscale
    fps_pages = []
    for p, in_gray in pages_is_grayscale.items():
        kind = "gray" if in_gray else "color"
        # The input's stem is part of the name so that several PDFs in the same
        # directory do not pick up each other's cached pages.
        fp_page = dp_cache / f"{fp.stem}_page_{p}_{kind}.pdf"
        if not fp_page.exists():
            extract_page(fp, p, fp_page, in_gray=in_gray)
        fps_pages.append(fp_page)
    return fps_pages
def combine(parts: Sequence[Path], fp_out: Path):
    """
    Merge many PDFs into one.

    The parts are copied under short, zero-padded names into a temporary directory
    next to ``fp_out`` and merged by ghostscript in that order.

    Parameters
    ----------
    parts
        Paths of the PDFs to merge, in the desired output order.
    fp_out
        Path for the merged PDF.

    Raises
    ------
    ValueError
        If ``parts`` is empty.
    subprocess.CalledProcessError
        If the ghostscript call fails.
    """
    if not parts:
        raise ValueError("No parts to merge.")
    _log.info("Merging %i parts to %s", len(parts), fp_out)
    # Zero-pad to at least four digits, or more when needed, so that the
    # wildcard expansion lists the parts in their numeric order.
    width = max(4, len(str(len(parts))))
    try:
        # We must pass input with a wildcard to avoid exceeding the maximum command length.
        # Therefore, we copy all parts to a temporary directory.
        # A uniquely named directory cannot clash with an existing "__temp" and is
        # removed automatically, also when copying or merging fails.
        with tempfile.TemporaryDirectory(
            prefix="__temp", dir=fp_out.parent.absolute()
        ) as dp_temp:
            _log.info("Copying %i parts to %s", len(parts), dp_temp)
            for p, fp in enumerate(parts, start=1):
                shutil.copy2(fp, Path(dp_temp) / f"{p:0{width}d}.pdf")
            _log.info("Merging...")
            # Quote the paths so that spaces or shell metacharacters in them
            # are not interpreted by bash; the wildcard stays unquoted.
            command = (
                "gs -dNOPAUSE -sDEVICE=pdfwrite "
                f"-sOUTPUTFILE={shlex.quote(str(fp_out))} -dQUIET -dSAFER -dBATCH "
                f"{shlex.quote(dp_temp)}/*.pdf"
            )
            # Invoke via bash to get the wildcard expanded
            subprocess.check_call(["bash", "-c", command])
            _log.info("Removing %s", dp_temp)
    except Exception:
        _log.error("Failed to merge.")
        raise
def run(fp_in: Path, fp_out: Path, fp_log: Path, threshold: float):
    """
    Runs the whole conversion: analyze, report, split/convert and merge.

    Parameters
    ----------
    fp_in
        Path to the input PDF.
    fp_out
        Path for the output PDF.
    fp_log
        Path of the text file that receives the composition summary.
    threshold
        Pages with CMY channel averages above this threshold remain colored.
    """
    _log.info("Processing %s → %s", fp_in, fp_out)

    # Classify every page, then report the color/grayscale composition.
    grayscale_by_page = analyze(fp_in, threshold=threshold)
    summary = log_composition(grayscale_by_page)
    _log.info("Input composition is %s", summary)
    fp_log.write_text(summary)

    # Extract the pages one by one and stitch them back together.
    combine(split_convert(fp_in, grayscale_by_page), fp_out)
    _log.info("Conversion completed!")
# Command line entry point: parse the options and run the conversion.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # The three paths are mandatory. Marking them as required gives a clear usage
    # error instead of a TypeError from Path(None) when one of them is missing.
    parser.add_argument("--fp_in", required=True, help="Path to the input PDF")
    parser.add_argument("--fp_out", required=True, help="Path for the output PDF")
    parser.add_argument(
        "--fp_report", required=True, help="Path for the conversion report"
    )
    parser.add_argument(
        "--threshold",
        default=0.004,
        help=(
            "Pages with CMY channel averages above this threshold remain colored."
        ),
        type=float,
    )
    # Unrecognized command line arguments are ignored rather than rejected.
    args, _ = parser.parse_known_args()
    run(
        fp_in=Path(args.fp_in),
        fp_out=Path(args.fp_out),
        fp_log=Path(args.fp_report),
        threshold=args.threshold,
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment