Skip to content

Instantly share code, notes, and snippets.

@GjjvdBurg
Last active November 8, 2019 17:08
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save GjjvdBurg/22de176179b46678b0b6206dc70a407d to your computer and use it in GitHub Desktop.
Save GjjvdBurg/22de176179b46678b0b6206dc70a407d to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This must be one of the more nit-picky things I've done.
This script requires pdftk to be available on the path:
https://www.pdflabs.com/tools/pdftk-the-pdf-toolkit/
It fixes artifacts in the pdf images by changing the "fill and stroke" paths to
simple "fill" paths.
Author: Gertjan van den Burg
"""
import argparse
import os
import re
import subprocess
import sys
import tempfile
def parse_args(argv=None):
    """Parse the command line options of the script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process arguments, which is
            what the script itself relies on.

    Returns:
        An ``argparse.Namespace`` with the attributes ``input_file`` and
        ``output_file``.

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            an unknown option is given.
    """
    parser = argparse.ArgumentParser()
    # Both paths end up in the pdftk command line, so reject a missing one
    # here with a usage message instead of passing ``None`` along.
    parser.add_argument(
        "-i", "--input-file", required=True, help="Input file"
    )
    parser.add_argument(
        "-o", "--output-file", required=True, help="Output file"
    )
    return parser.parse_args(argv)
# Pattern for one complete block of the form described in fix_stack().
# Written as raw bytes so that the regex escapes (\d, \., ...) reach the re
# module untouched, and compiled once because it is applied to every window
# of the input file.
_FILL_STROKE_BLOCK = re.compile(
    rb"B \n"
    rb"((\d|0\.\d+)\ ){3}RG\ \n"
    rb"((\d|0\.\d+)\ ){3}rg\ \n"
    rb"(\-?(\d+|\d+\.\d+)\ ){2}m\ \n"
    rb"((\-?(\d+|\d+\.\d+)\ ){2}l\ \n){3}"
    rb"h\ \n"
)


def fix_stack(stack):
    """Fix the block in ``stack`` if it is one we are looking for.

    We are specifically looking for blocks of the following form, one
    operator per line:

        B
        float float float RG
        float float float rg
        float float m
        float float l
        float float l
        float float l
        h

    Anything that does not match this format is not affected. The fix is to
    replace the 'B' line with an 'f' line.

    Args:
        stack: Mutable sequence of consecutive lines (bytes, including the
            trailing newline). Entries may be ``None`` while the window has
            not been filled yet; such a window is never modified.

    Returns:
        None. A matching window is modified in place: its first entry is
        replaced by ``b"f \\n"``.
    """
    # A partially filled window cannot contain a complete block.
    if any(entry is None for entry in stack):
        return
    # fullmatch: the window must consist of exactly this block, nothing more.
    if _FILL_STROKE_BLOCK.fullmatch(b"".join(stack)):
        stack[0] = b"f \n"
def uncompress(source, target):
    """Write an uncompressed copy of the pdf ``source`` to ``target``.

    The work is delegated to the external ``pdftk`` program.

    Args:
        source: Path of the pdf file to decompress.
        target: Path the decompressed pdf is written to.

    Raises:
        SystemExit: With exit code 1 when pdftk cannot be started or exits
            with a non-zero status.
    """
    try:
        status = subprocess.call(
            ["pdftk", source, "output", target, "uncompress"]
        )
    except FileNotFoundError:
        # Report a missing executable the same way as a failed run instead
        # of letting a traceback escape.
        print(
            "Could not run pdftk; is it installed and on the PATH?",
            file=sys.stderr,
        )
        raise SystemExit(1) from None
    if status != 0:
        print("Failed to decompress the file with pdftk", file=sys.stderr)
        raise SystemExit(1)
def compress(source, target):
    """Write a compressed copy of the pdf ``source`` to ``target``.

    The work is delegated to the external ``pdftk`` program.

    Args:
        source: Path of the pdf file to compress.
        target: Path the compressed pdf is written to.

    Raises:
        SystemExit: With exit code 2 when pdftk cannot be started or exits
            with a non-zero status.
    """
    try:
        status = subprocess.call(
            ["pdftk", source, "output", target, "compress"]
        )
    except FileNotFoundError:
        # Report a missing executable the same way as a failed run instead
        # of letting a traceback escape.
        print(
            "Could not run pdftk; is it installed and on the PATH?",
            file=sys.stderr,
        )
        raise SystemExit(2) from None
    if status != 0:
        print("Failed to compress the file with pdftk", file=sys.stderr)
        raise SystemExit(2)
def run_fix(uncompressed, fixed):
    """Copy ``uncompressed`` to ``fixed``, replacing blocks where necessary.

    The input is read line by line through a sliding window of eight lines.
    After every new line the window is handed to ``fix_stack``, which may
    rewrite its first entry; lines are written out as they leave the window.

    Args:
        uncompressed: Path of the file to read (opened in binary mode).
        fixed: Path of the file to write (opened in binary mode).
    """
    # ``None`` marks window slots that have not been filled with a line yet.
    window = [None] * 8
    with open(fixed, "wb") as ofp, open(uncompressed, "rb") as ifp:
        for line in ifp:
            oldest = window.pop(0)
            if oldest is not None:
                ofp.write(oldest)
            window.append(line)
            fix_stack(window)
        # Flush what is still in the window. Placeholders remain when the
        # input has fewer than eight lines and must not be written.
        ofp.writelines(entry for entry in window if entry is not None)
def main():
    """Run the whole pipeline: uncompress, fix, and compress again.

    The input and output paths come from the command line. Intermediate
    files live in a temporary directory that is removed when the pipeline
    finishes.
    """
    options = parse_args()
    with tempfile.TemporaryDirectory(prefix="pdf_fix_") as workdir:
        raw_pdf = os.path.join(workdir, "uncompressed.pdf")
        patched_pdf = os.path.join(workdir, "fixed.pdf")

        uncompress(options.input_file, raw_pdf)
        run_fix(raw_pdf, patched_pdf)
        compress(patched_pdf, options.output_file)
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment