Skip to content

Instantly share code, notes, and snippets.

@martok
Last active March 26, 2023 16:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save martok/403b71fdd9322b3b806f4dfb26c86edf to your computer and use it in GitHub Desktop.
Save martok/403b71fdd9322b3b806f4dfb26c86edf to your computer and use it in GitHub Desktop.
Count Color/BW pages in PDF
import argparse
import sys
from pathlib import Path
import subprocess
import tempfile
import cv2
import numpy as np
# Name of the Ghostscript console executable to launch. The Windows build is
# called gswin64c.exe; on other platforms the binary is plain "gs".
GS = r"gswin64c.exe" if sys.platform == "win32" else "gs"

# Human-readable labels for the page kinds used as keys throughout the script.
KIND_NAMES = {
    "bw": "B/W",
    "gray": "Grayscale",
    "color": "Color",
}
def gs(args):
    """Run Ghostscript with *args* and yield its stdout line by line.

    Args:
        args: Command-line arguments appended after the executable named by
            the module-level ``GS``.

    Yields:
        Each line of standard output as ``str`` with trailing whitespace
        removed. Iteration ends when the process closes its stdout.
    """
    with subprocess.Popen([GS, *args], bufsize=0, stdout=subprocess.PIPE) as p:
        for line in iter(p.stdout.readline, b''):
            # The output is not guaranteed to be valid UTF-8 (e.g. file or
            # font names echoed in a legacy code page). Substitute the
            # offending bytes instead of aborting with UnicodeDecodeError.
            yield line.rstrip().decode("utf-8", errors="replace")
def gs_inkcov(file: Path):
    """Yield the stdout lines of Ghostscript's ``inkcov`` device for *file*.

    Args:
        file: Path of the document handed to Ghostscript via ``-f``.

    Yields:
        The lines produced by ``gs`` for this invocation, unchanged.
    """
    # "-o -" makes Ghostscript write the device output to stdout.
    inkcov_args = ["-o", "-", "-sDEVICE=inkcov", "-f", str(file)]
    yield from gs(inkcov_args)
def gs_page_tiffsep(file: Path, pageno: int, dpi=150):
    """Render one page with Ghostscript's ``tiffsep`` device and load it.

    Args:
        file: Document to render.
        pageno: Page number passed to Ghostscript via ``-sPageList``.
        dpi: Resolution used for both axes (``-r{dpi}x{dpi}``).

    Returns:
        The Cyan, Magenta, Yellow and Black separation images stacked along
        the third axis (in that order), or ``None`` if any of the four
        separation files could not be read.
    """
    channels = ["Cyan", "Magenta", "Yellow", "Black"]
    # A private scratch directory keeps concurrent runs from overwriting each
    # other's files and guarantees that everything Ghostscript writes there
    # (including any separation files beyond the four read here) is removed
    # again, even when rendering or loading fails.
    with tempfile.TemporaryDirectory(prefix="gspagecount") as scratch:
        tmpbase = Path(scratch) / f"gspagecount{pageno}.tif"
        files = [Path(scratch) / f"gspagecount{pageno}({col}).tif"
                 for col in channels]
        # The generator must be drained so that Ghostscript runs to completion
        # before the files are read; its output is not needed here.
        for _ in gs(["-o", str(tmpbase), f"-r{dpi}x{dpi}", "-sDEVICE=tiffsep", "-sPageList=" + str(pageno), "-f", str(file)]):
            pass
        images = [cv2.imread(str(col), cv2.IMREAD_UNCHANGED) for col in files]
    # cv2.imread signals an unreadable or missing file by returning None.
    # Report that to the caller instead of stacking None into an object array.
    if any(img is None for img in images):
        return None
    return np.dstack(images)
def precise_measure(file: Path, current: int, dpi: int):
    """Measure per-channel ink usage of one page from its rendered pixels.

    Args:
        file: Document to render.
        current: Page number to measure.
        dpi: Rendering resolution forwarded to ``gs_page_tiffsep``.

    Returns:
        One mean usage value per channel of the rendered stack (cyan,
        magenta, yellow, black), or ``None`` if ``gs_page_tiffsep`` returned
        ``None``.
    """
    cmyk = gs_page_tiffsep(file, current, dpi)
    if cmyk is None:
        return None
    # Invert the samples so that 255 maps to 0.0 and 0 maps to 1.0
    # (assumes 8-bit separations -- TODO confirm). Dividing by a scalar avoids
    # allocating a page-sized divisor array.
    color_use = 1.0 - cmyk / 255.0
    color_grayvalue = np.mean(color_use[..., :3], axis=2)
    # pixels that are printed with more black than their color grey value don't count as color
    darkened = color_use[..., 3] > color_grayvalue
    color_use[..., :3][darkened] = 0.0
    # Average over the first axis, then the second, leaving one value per channel.
    return color_use.mean(axis=0).mean(axis=0)
def classify(cmyk) -> str:
    """Name the page kind implied by per-channel ink coverage.

    Args:
        cmyk: Indexable sequence whose first three items are the cyan,
            magenta and yellow coverage values.

    Returns:
        ``"bw"`` when the largest of the three colour values is below 1e-4,
        ``"gray"`` when the three values are exactly equal, and ``"color"``
        otherwise.
    """
    no_ink = 1e-4
    strongest = max(cmyk[:3])
    if strongest < no_ink:
        return "bw"
    return "gray" if cmyk[0] == cmyk[1] == cmyk[2] else "color"
def run_file(pdf: Path, *,
             summarize: bool,
             precise_dpi: int):
    """Classify every page of *pdf* and print the result to stdout.

    The ``inkcov`` output is consumed line by line with a small state
    machine: wait for the "Processing pages" banner, then alternate between
    a "Page N" line and the coverage line belonging to that page.

    Args:
        pdf: Document to analyse.
        summarize: If true, print one running-totals line that is rewritten
            in place (carriage return); otherwise print one line per page.
        precise_dpi: If non-zero, each page is re-measured with
            ``precise_measure`` at this resolution and that result replaces
            the inkcov figures.
    """
    total = 0
    counts = {
        "total": 0,
        "bw": 0,
        "gray": 0,
        "color": 0,
    }

    # The printers receive the per-page values explicitly; only the running
    # totals (``counts`` and ``total``) are read from the enclosing scope.
    if summarize:
        def print_head():
            print(f"{'Pages':>9s} {'BW':>6s} {'Gray':>6s} {'Color':>6s}")

        def print_page(page, kind, cmyk):
            print(f"{counts['total']:>4}/{total:>4} {counts['bw']:>6} {counts['gray']:>6} {counts['color']:>6}", end="\r")
    else:
        def print_head():
            print(f"{'Page':>4s} {'Type':<6s} {'Cyan':>5s} {'Mag':>5s} {'Yell':>5s} {'Black':>5s}")

        def print_page(page, kind, cmyk):
            print(f"{page:>4} {KIND_NAMES.get(kind, kind):<6s} {cmyk[0]:.3f} {cmyk[1]:.3f} {cmyk[2]:.3f} {cmyk[3]:.3f}")

    wait_banner, wait_page, wait_coverage = range(3)
    state = wait_banner
    current = 0
    for ln in gs_inkcov(pdf):
        words = ln.split()
        if state == wait_banner:
            if ln.startswith("Processing pages"):
                # Fifth word is the last page number followed by a period.
                total = int(words[4].rstrip("."))
                print_head()
                state = wait_page
        elif state == wait_page:
            if ln.startswith("Page "):
                current = int(words[1])
                state = wait_coverage
        elif state == wait_coverage:
            if ln.startswith("Page "):
                # The previous page produced no coverage line; resynchronise
                # on the new page number instead of misattributing figures.
                current = int(words[1])
                continue
            try:
                cmyk = [float(x) for x in words[0:4]]
            except ValueError:
                # Some other message was printed between the page marker and
                # its coverage line; keep waiting for the numbers.
                continue
            if len(cmyk) < 4:
                continue
            counts["total"] += 1
            if precise_dpi:
                measured = precise_measure(pdf, current, precise_dpi)
                # Fall back to the inkcov figures when the page could not be
                # rendered for the per-pixel measurement.
                if measured is not None:
                    cmyk = measured
            kind = classify(cmyk)
            counts[kind] += 1
            print_page(current, kind, cmyk)
            state = wait_page
    # Terminate the last (possibly carriage-return-only) line.
    print("")
def main():
    """Parse the command line and run the page count on the given PDF.

    Returns:
        The return value of ``run_file``, which the entry-point guard uses as
        the process exit status.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "FILE",
        type=str,
        help="PDF file",
    )
    cli.add_argument(
        "-s", "--summarize",
        action="store_true",
        help="Summarize Totals",
    )
    # A bare "-p" yields 36; omitting the flag leaves the value at 0, which
    # run_file treats as "precise measurement off".
    cli.add_argument(
        "-p", "--precise",
        type=int,
        default=0,
        nargs="?",
        const=36,
        help="Precise per-pixel count (account for overprinting)",
    )
    options = cli.parse_args()
    pdf_path = Path(options.FILE)
    return run_file(pdf_path, summarize=options.summarize, precise_dpi=options.precise)
# Run the command-line interface only when executed as a script; the value
# returned by main() becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment