# michaelosthege/compress_colors.py

## compress_colors.py
# MIT License

# Copyright (c) 2022 Michael Osthege

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
This script processes a PDF with ghostscript such that pages are treated as grayscale where possible.

To install ghostscript:

```bash
apt-get update
apt-get install ghostscript -y
```

To run this script:

```bash
python compress_colors.py --fp_in=thesis.pdf --fp_out=thesis-print.pdf --fp_report=print-metrics.log
```
"""
import argparse
import logging
import shutil
import subprocess
from pathlib import Path
from typing import Dict, List, Sequence

# Module-level logger; it is named after the path of this file (``__file__``).
_log = logging.getLogger(__file__)
# Configure the root logger so that INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)


def is_grayscale(c: str, m: str, y: str, *, threshold: float) -> bool:
    """
    Decides whether a page counts as grayscale based on its C, M and Y values.

    Parameters
    ----------
    c, m, y
        Channel values as strings. They are compared textually,
        so all three must be formatted identically to be considered equal.
    threshold
        Largest shared channel value for which the page still counts as grayscale.
        Must not be negative.

    Returns
    -------
    bool
        ``True`` if the three strings are identical and their numeric value
        does not exceed ``threshold``, ``False`` otherwise.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, or if the three identical strings
        can not be converted to a float.
    """
    if threshold < 0:
        # The f-prefix is required for the offending value to show up in the message.
        raise ValueError(
            f"Invalid threshold setting {threshold}. Should be positive."
        )
    # float() is only evaluated when the three strings are identical.
    return c == m == y and float(c) <= threshold


def log_composition(pages_is_grayscale: Dict[int, bool]) -> str:
    """
    Summarizes how many pages are color and how many are grayscale.

    Parameters
    ----------
    pages_is_grayscale
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Returns
    -------
    str
        One-sentence summary with the page counts and the percentage of color pages.
        For an empty mapping all counts and the percentage are reported as zero.
    """
    n = len(pages_is_grayscale)
    ngray = sum(pages_is_grayscale.values())
    ncol = n - ngray
    # Guard against dividing by zero when there are no pages at all.
    perc = ncol / n * 100 if n else 0.0
    return f"{ncol} color and {ngray} grayscale pages out of {n} in total ({perc:.1f} % color)."


def analyze(fp: Path, threshold: float) -> Dict[int, bool]:
    """
    Determines which pages in a PDF can be considered grayscale,
    based on CMY channel values.

    The raw output of the ghostscript ``inkcov`` device is stored in
    ``<directory of fp>/cache/analysis_<file name>.txt`` and re-used when that file exists.
    The cache is looked up by file name only, so it must be deleted
    manually after the PDF was modified.

    Parameters
    ----------
    fp
        Path to the PDF.
    threshold
        Pages with CMY channel averages above this threshold remain colored.

    Returns
    -------
    dict
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Raises
    ------
    subprocess.CalledProcessError
        If the ghostscript call fails.
    ValueError
        If a page number or the channel values of a page can not be parsed.
    """
    dp_cache = fp.parent / "cache"
    dp_cache.mkdir(exist_ok=True)

    # Cache the analysis result
    fp_analysis = dp_cache / f"analysis_{fp.name}.txt"
    if not fp_analysis.exists():
        _log.info("Extracting color information from %s", fp)
        # Undecodable bytes in diagnostic output must not abort the analysis.
        analysis_stdout = subprocess.check_output(
            ["gs", "-o", "-", "-sDEVICE=inkcov", str(fp.absolute())]
        ).decode("utf-8", errors="replace")
        fp_analysis.write_text(analysis_stdout, encoding="utf-8")
    else:
        _log.info("Loading color information from %s", fp_analysis)
        analysis_stdout = fp_analysis.read_text("utf-8")

    _log.info("Parsing output...")
    analysis_lines = analysis_stdout.split("\n")
    pages_is_grayscale: Dict[int, bool] = {}
    for idx, line in enumerate(analysis_lines):
        if not line.startswith("Page"):
            continue
        # Remove the literal prefix; int() ignores the surrounding whitespace.
        p = int(line[len("Page"):])
        # The channel values are on the line following the page header.
        # Splitting on any whitespace does not depend on the column padding.
        values = analysis_lines[idx + 1].split() if idx + 1 < len(analysis_lines) else []
        if len(values) < 3:
            raise ValueError(f"Found no channel values for page {p}.")
        c, m, y = values[:3]
        pages_is_grayscale[p] = is_grayscale(c, m, y, threshold=threshold)
        _log.info(
            "Page %i (%s, %s, %s) → %s",
            p,
            c,
            m,
            y,
            "grayscale" if pages_is_grayscale[p] else "color",
        )

    return pages_is_grayscale


def extract_page(fp: Path, page: int, fp_out: Path, *, in_gray: bool):
    """
    Extract one page from a PDF either with original colors, or in grayscale.

    Parameters
    ----------
    fp
        Path to the input PDF.
    page
        Number of the page to extract (passed as ``-dFirstPage`` and ``-dLastPage``).
    fp_out
        Path of the single-page PDF to write.
    in_gray
        If ``True``, ghostscript is told to convert the page to grayscale.

    Raises
    ------
    subprocess.CalledProcessError
        If ghostscript exits with a non-zero return code.
    """
    # The arguments are assembled as a list (instead of splitting a string)
    # so that paths containing spaces are passed on as one argument each.
    command = ["gs", "-sDEVICE=pdfwrite", "-dPDFUseOldCMS=false"]
    if in_gray:
        command += [
            "-dProcessColorModel=/DeviceGray",
            "-dColorConversionStrategy=/Gray",
        ]
    command += [
        f"-dFirstPage={page}",
        f"-dLastPage={page}",
        "-o",
        str(fp_out),
        "-f",
        str(fp),
    ]
    subprocess.check_call(command)


def split_convert(fp: Path, pages_is_grayscale: Dict[int, bool]) -> List[Path]:
    """
    Split the input to individual color and grayscale pages.

    The single-page PDFs are written to ``<directory of fp>/cache``.
    Pages that already exist there are not extracted again.

    Parameters
    ----------
    fp
        Path to the input PDF.
    pages_is_grayscale
        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

    Returns
    -------
    list
        Paths of the single-page PDFs in the iteration order of ``pages_is_grayscale``.
    """
    dp_cache = fp.parent / "cache"
    dp_cache.mkdir(exist_ok=True)

    # Extract each page as color or grayscale
    fps_pages = []
    for p, in_gray in pages_is_grayscale.items():
        mode = "gray" if in_gray else "color"
        # The name of the input is part of the file name, such that pages
        # from different PDFs in the same directory never share a cache entry.
        fp_page = dp_cache / f"{fp.name}.page_{p}_{mode}.pdf"
        if not fp_page.exists():
            extract_page(fp, p, fp_page, in_gray=in_gray)
        fps_pages.append(fp_page)

    return fps_pages


def combine(parts: Sequence[Path], fp_out: Path):
    """
    Merge many PDFs into one.

    The parts are copied into a temporary directory next to ``fp_out``,
    merged with ghostscript in the order given, and the temporary directory
    is removed again, no matter if the merge succeeded.

    Parameters
    ----------
    parts
        Paths of the PDFs to merge, in output order.
    fp_out
        Path of the merged PDF.

    Raises
    ------
    subprocess.CalledProcessError
        If ghostscript exits with a non-zero return code.
    """
    import tempfile

    _log.info("Merging %i parts to %s", len(parts), fp_out)

    # To keep the command short, all parts are copied to a temporary directory
    # under short names and ghostscript is run from inside that directory.
    # A freshly created, unique directory ensures that the cleanup
    # never removes a directory that existed before.
    with tempfile.TemporaryDirectory(
        prefix="__temp", dir=fp_out.absolute().parent
    ) as temp:
        dp_temp = Path(temp)
        try:
            _log.info("Copying %i parts to %s", len(parts), dp_temp)
            names = []
            for p, fp in enumerate(parts):
                name = f"{p+1:04d}.pdf"
                shutil.copy2(fp, dp_temp / name)
                names.append(name)

            _log.info("Merging...")
            # The inputs are listed explicitly, so no shell is needed and
            # the order does not depend on how a wildcard would be sorted.
            subprocess.check_call(
                [
                    "gs",
                    "-dNOPAUSE",
                    "-sDEVICE=pdfwrite",
                    # Absolute, because ghostscript runs in the temporary directory.
                    f"-sOUTPUTFILE={fp_out.absolute()}",
                    "-dQUIET",
                    "-dSAFER",
                    "-dBATCH",
                    *names,
                ],
                cwd=dp_temp,
            )
        except Exception:
            _log.error("Failed to merge.")
            raise
        finally:
            # The directory itself is deleted when the with-block is left.
            _log.info("Removing %s", dp_temp)


def run(fp_in: Path, fp_out: Path, fp_log: Path, threshold: float):
    """
    Runs the complete conversion of one PDF.

    Parameters
    ----------
    fp_in
        Path to the input PDF.
    fp_out
        Path for the output PDF.
    fp_log
        Path of the text file to which the composition summary is written.
    threshold
        Pages with CMY channel averages above this threshold remain colored.
    """
    _log.info("Processing %s → %s", fp_in, fp_out)

    # Classify the pages and report the result
    gray_by_page = analyze(fp_in, threshold=threshold)
    summary = log_composition(gray_by_page)
    _log.info("Input composition is %s", summary)
    fp_log.write_text(summary)

    # Convert page by page, then put the pages back together
    combine(split_convert(fp_in, gray_by_page), fp_out)

    _log.info("Conversion completed!")


if __name__ == "__main__":
    # Command line entry point: parse the options and run the conversion.
    parser = argparse.ArgumentParser()
    # The three paths are required, such that a missing one is reported
    # as a usage error instead of failing later in Path(None).
    parser.add_argument("--fp_in", required=True, help="Path to the input PDF")
    parser.add_argument("--fp_out", required=True, help="Path for the output PDF")
    parser.add_argument(
        "--fp_report", required=True, help="Path for the conversion report"
    )
    parser.add_argument(
        "--threshold",
        default=0.004,
        help=(
            "Pages with CMY channel averages above this threshold remain colored."
        ),
        type=float,
    )
    # Unknown command line options are ignored.
    args, _ = parser.parse_known_args()

    run(
        fp_in=Path(args.fp_in),
        fp_out=Path(args.fp_out),
        fp_log=Path(args.fp_report),
        threshold=args.threshold,
    )
	# MIT License

	# Copyright (c) 2022 Michael Osthege

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	"""
	This script processes a PDF with ghostscript such that pages are treated as grayscale where possible.

	To install ghostscript:

	```bash
	apt-get update
	apt-get install ghostscript -y
	```

	To run this script:

	```bash
	python compress_colors.py --fp_in=thesis.pdf --fp_out=thesis-print.pdf --fp_report=print-metrics.log
	```
	"""
	import argparse
	import logging
	import shutil
	import subprocess
	from pathlib import Path
	from typing import Dict, List, Sequence

	# Module-level logger; it is named after the path of this file (``__file__``).
	_log = logging.getLogger(__file__)
	# Configure the root logger so that INFO-level messages are emitted.
	logging.basicConfig(level=logging.INFO)


	def is_grayscale(c: str, m: str, y: str, *, threshold: float) -> bool:
	    """
	    Decides whether a page counts as grayscale based on its C, M and Y values.

	    Parameters
	    ----------
	    c, m, y
	        Channel values as strings. They are compared textually,
	        so all three must be formatted identically to be considered equal.
	    threshold
	        Largest shared channel value for which the page still counts as grayscale.
	        Must not be negative.

	    Returns
	    -------
	    bool
	        ``True`` if the three strings are identical and their numeric value
	        does not exceed ``threshold``, ``False`` otherwise.

	    Raises
	    ------
	    ValueError
	        If ``threshold`` is negative, or if the three identical strings
	        can not be converted to a float.
	    """
	    if threshold < 0:
	        # The f-prefix is required for the offending value to show up in the message.
	        raise ValueError(
	            f"Invalid threshold setting {threshold}. Should be positive."
	        )
	    # float() is only evaluated when the three strings are identical.
	    return c == m == y and float(c) <= threshold


	def log_composition(pages_is_grayscale: Dict[int, bool]) -> str:
	    """
	    Summarizes how many pages are color and how many are grayscale.

	    Parameters
	    ----------
	    pages_is_grayscale
	        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

	    Returns
	    -------
	    str
	        One-sentence summary with the page counts and the percentage of color pages.
	        For an empty mapping all counts and the percentage are reported as zero.
	    """
	    n = len(pages_is_grayscale)
	    ngray = sum(pages_is_grayscale.values())
	    ncol = n - ngray
	    # Guard against dividing by zero when there are no pages at all.
	    perc = ncol / n * 100 if n else 0.0
	    return f"{ncol} color and {ngray} grayscale pages out of {n} in total ({perc:.1f} % color)."


	def analyze(fp: Path, threshold: float) -> Dict[int, bool]:
	    """
	    Determines which pages in a PDF can be considered grayscale,
	    based on CMY channel values.

	    The raw output of the ghostscript ``inkcov`` device is stored in
	    ``<directory of fp>/cache/analysis_<file name>.txt`` and re-used when that file exists.
	    The cache is looked up by file name only, so it must be deleted
	    manually after the PDF was modified.

	    Parameters
	    ----------
	    fp
	        Path to the PDF.
	    threshold
	        Pages with CMY channel averages above this threshold remain colored.

	    Returns
	    -------
	    dict
	        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

	    Raises
	    ------
	    subprocess.CalledProcessError
	        If the ghostscript call fails.
	    ValueError
	        If a page number or the channel values of a page can not be parsed.
	    """
	    dp_cache = fp.parent / "cache"
	    dp_cache.mkdir(exist_ok=True)

	    # Cache the analysis result
	    fp_analysis = dp_cache / f"analysis_{fp.name}.txt"
	    if not fp_analysis.exists():
	        _log.info("Extracting color information from %s", fp)
	        # Undecodable bytes in diagnostic output must not abort the analysis.
	        analysis_stdout = subprocess.check_output(
	            ["gs", "-o", "-", "-sDEVICE=inkcov", str(fp.absolute())]
	        ).decode("utf-8", errors="replace")
	        fp_analysis.write_text(analysis_stdout, encoding="utf-8")
	    else:
	        _log.info("Loading color information from %s", fp_analysis)
	        analysis_stdout = fp_analysis.read_text("utf-8")

	    _log.info("Parsing output...")
	    analysis_lines = analysis_stdout.split("\n")
	    pages_is_grayscale: Dict[int, bool] = {}
	    for idx, line in enumerate(analysis_lines):
	        if not line.startswith("Page"):
	            continue
	        # Remove the literal prefix; int() ignores the surrounding whitespace.
	        p = int(line[len("Page"):])
	        # The channel values are on the line following the page header.
	        # Splitting on any whitespace avoids empty fields between padded columns.
	        values = analysis_lines[idx + 1].split() if idx + 1 < len(analysis_lines) else []
	        if len(values) < 3:
	            raise ValueError(f"Found no channel values for page {p}.")
	        c, m, y = values[:3]
	        pages_is_grayscale[p] = is_grayscale(c, m, y, threshold=threshold)
	        _log.info(
	            "Page %i (%s, %s, %s) → %s",
	            p,
	            c,
	            m,
	            y,
	            "grayscale" if pages_is_grayscale[p] else "color",
	        )

	    return pages_is_grayscale


	def extract_page(fp: Path, page: int, fp_out: Path, *, in_gray: bool):
	    """
	    Extract one page from a PDF either with original colors, or in grayscale.

	    Parameters
	    ----------
	    fp
	        Path to the input PDF.
	    page
	        Number of the page to extract (passed as ``-dFirstPage`` and ``-dLastPage``).
	    fp_out
	        Path of the single-page PDF to write.
	    in_gray
	        If ``True``, ghostscript is told to convert the page to grayscale.

	    Raises
	    ------
	    subprocess.CalledProcessError
	        If ghostscript exits with a non-zero return code.
	    """
	    # The arguments are assembled as a list (instead of splitting a string)
	    # so that paths containing spaces are passed on as one argument each.
	    command = ["gs", "-sDEVICE=pdfwrite", "-dPDFUseOldCMS=false"]
	    if in_gray:
	        command += [
	            "-dProcessColorModel=/DeviceGray",
	            "-dColorConversionStrategy=/Gray",
	        ]
	    command += [
	        f"-dFirstPage={page}",
	        f"-dLastPage={page}",
	        "-o",
	        str(fp_out),
	        "-f",
	        str(fp),
	    ]
	    subprocess.check_call(command)


	def split_convert(fp: Path, pages_is_grayscale: Dict[int, bool]) -> List[Path]:
	    """
	    Split the input to individual color and grayscale pages.

	    The single-page PDFs are written to ``<directory of fp>/cache``.
	    Pages that already exist there are not extracted again.

	    Parameters
	    ----------
	    fp
	        Path to the input PDF.
	    pages_is_grayscale
	        Maps page numbers to ``True`` (grayscale) or ``False`` (color).

	    Returns
	    -------
	    list
	        Paths of the single-page PDFs in the iteration order of ``pages_is_grayscale``.
	    """
	    dp_cache = fp.parent / "cache"
	    dp_cache.mkdir(exist_ok=True)

	    # Extract each page as color or grayscale
	    fps_pages = []
	    for p, in_gray in pages_is_grayscale.items():
	        mode = "gray" if in_gray else "color"
	        # The name of the input is part of the file name, such that pages
	        # from different PDFs in the same directory never share a cache entry.
	        fp_page = dp_cache / f"{fp.name}.page_{p}_{mode}.pdf"
	        if not fp_page.exists():
	            extract_page(fp, p, fp_page, in_gray=in_gray)
	        fps_pages.append(fp_page)

	    return fps_pages


	def combine(parts: Sequence[Path], fp_out: Path):
	    """
	    Merge many PDFs into one.

	    The parts are copied into a temporary directory next to ``fp_out``,
	    merged with ghostscript in the order given, and the temporary directory
	    is removed again, no matter if the merge succeeded.

	    Parameters
	    ----------
	    parts
	        Paths of the PDFs to merge, in output order.
	    fp_out
	        Path of the merged PDF.

	    Raises
	    ------
	    subprocess.CalledProcessError
	        If ghostscript exits with a non-zero return code.
	    """
	    import tempfile

	    _log.info("Merging %i parts to %s", len(parts), fp_out)

	    # To keep the command short, all parts are copied to a temporary directory
	    # under short names and ghostscript is run from inside that directory.
	    # A freshly created, unique directory ensures that the cleanup
	    # never removes a directory that existed before.
	    with tempfile.TemporaryDirectory(
	        prefix="__temp", dir=fp_out.absolute().parent
	    ) as temp:
	        dp_temp = Path(temp)
	        try:
	            _log.info("Copying %i parts to %s", len(parts), dp_temp)
	            names = []
	            for p, fp in enumerate(parts):
	                name = f"{p+1:04d}.pdf"
	                shutil.copy2(fp, dp_temp / name)
	                names.append(name)

	            _log.info("Merging...")
	            # The inputs are listed explicitly, so no shell is needed and
	            # the order does not depend on how a wildcard would be sorted.
	            subprocess.check_call(
	                [
	                    "gs",
	                    "-dNOPAUSE",
	                    "-sDEVICE=pdfwrite",
	                    # Absolute, because ghostscript runs in the temporary directory.
	                    f"-sOUTPUTFILE={fp_out.absolute()}",
	                    "-dQUIET",
	                    "-dSAFER",
	                    "-dBATCH",
	                    *names,
	                ],
	                cwd=dp_temp,
	            )
	        except Exception:
	            _log.error("Failed to merge.")
	            raise
	        finally:
	            # The directory itself is deleted when the with-block is left.
	            _log.info("Removing %s", dp_temp)


	def run(fp_in: Path, fp_out: Path, fp_log: Path, threshold: float):
	    """
	    Runs the complete conversion of one PDF.

	    Parameters
	    ----------
	    fp_in
	        Path to the input PDF.
	    fp_out
	        Path for the output PDF.
	    fp_log
	        Path of the text file to which the composition summary is written.
	    threshold
	        Pages with CMY channel averages above this threshold remain colored.
	    """
	    _log.info("Processing %s → %s", fp_in, fp_out)

	    # Classify the pages and report the result
	    pages_is_grayscale = analyze(fp_in, threshold=threshold)
	    msg = log_composition(pages_is_grayscale)
	    _log.info("Input composition is %s", msg)
	    fp_log.write_text(msg)

	    # Convert page by page, then put the pages back together
	    fps_pages = split_convert(fp_in, pages_is_grayscale)
	    combine(fps_pages, fp_out)

	    _log.info("Conversion completed!")


	if __name__ == "__main__":
	    # Command line entry point: parse the options and run the conversion.
	    parser = argparse.ArgumentParser()
	    # The three paths are required, such that a missing one is reported
	    # as a usage error instead of failing later in Path(None).
	    parser.add_argument("--fp_in", required=True, help="Path to the input PDF")
	    parser.add_argument("--fp_out", required=True, help="Path for the output PDF")
	    parser.add_argument(
	        "--fp_report", required=True, help="Path for the conversion report"
	    )
	    parser.add_argument(
	        "--threshold",
	        default=0.004,
	        help=(
	            "Pages with CMY channel averages above this threshold remain colored."
	        ),
	        type=float,
	    )
	    # Unknown command line options are ignored.
	    args, _ = parser.parse_known_args()

	    run(
	        fp_in=Path(args.fp_in),
	        fp_out=Path(args.fp_out),
	        fp_log=Path(args.fp_report),
	        threshold=args.threshold,
	    )