Skip to content

Instantly share code, notes, and snippets.

@lukelbd
Last active October 7, 2022 20:48
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save lukelbd/2bdcfd5c766d114bb3231affa1ab3bbb to your computer and use it in GitHub Desktop.
Save lukelbd/2bdcfd5c766d114bb3231affa1ab3bbb to your computer and use it in GitHub Desktop.
Script to change the color of the highlight annotations in PDF document(s).
#!/usr/bin/env python
"""
Change the color of the highlight annotations in PDF document(s).
The destination color is hardcoded in this script.
"""
# Adapted from:
# https://unix.stackexchange.com/a/118492/112647
# PDF format reference:
# http://wwwimages.adobe.com/www.adobe.com/content/dam/Adobe/en/devnet/pdf/pdfs/PDF32000_2008.pdf
# pages 12 to 13 define the character sets
import re
import os
# Color specification
# A recolored entry is written back as PREFIX + padding + COLOR + SUFFIX.
PREFIX = b'/C ['
SUFFIX = b']'
COLOR = b'1.0 0.969 0.416'  # your favorite color here!


# Regular expressions
def _to_char_group(chars):
    """Return a regex character class matching any one character of *chars*."""
    return '[' + re.escape(chars) + ']'


# PDF numeric object: an optional sign followed by either digits with an
# optional fractional part ('1', '4.', '0.5') or a bare fraction ('.5').
# The sign must be a consuming optional group: an optional lookahead would
# never consume it, so signed numbers could not match.
FLOAT = r'[+-]?(?:\d+(?:\.\d*)?|\.\d+)'
# PDF white-space characters: NUL, TAB, LF, FF, CR and SPACE.
WHITESPACE = '\x00\x09\x0A\x0C\x0D\x20'
# PDF delimiter characters.
DELIMITER = '()<>[]{}/%'
WS = _to_char_group(WHITESPACE)
DELIM = _to_char_group(DELIMITER)
SPECIAL = _to_char_group(WHITESPACE + DELIMITER)
# An 'obj << ... >> endobj' span that is preceded and followed by a
# white-space or delimiter character. DOTALL lets '.' cross line breaks.
REGEX_OBJ = re.compile(
    fr'(?<={SPECIAL})obj{WS}*<<.*?>>{WS}*endobj(?={SPECIAL})'.encode(), re.DOTALL
)
# The '/Type /Annot' entry, terminated by white space or a delimiter.
REGEX_ANNOT = re.compile(
    f'/Type{WS}*/Annot(?={SPECIAL})'.encode()
)
# The '/Subtype /Highlight' entry, terminated by white space or a delimiter.
REGEX_HIGH = re.compile(
    f'/Subtype{WS}*/Highlight(?={SPECIAL})'.encode()
)
# A '/C [r g b]' entry holding exactly three numbers.
REGEX_COLOR = re.compile(
    fr'/C{WS}*\[{WS}*({FLOAT}){WS}+({FLOAT}){WS}+({FLOAT}){WS}*\]'.encode()
)
def process_color(match):
    """
    Build the replacement bytes for one matched color entry.

    The result is ``PREFIX``, space padding, ``COLOR`` and ``SUFFIX``, padded
    so that it has exactly the same length as the matched text. This keeps
    the size of the PDF data unchanged.

    Raises ``NotImplementedError`` when the matched text is shorter than the
    unpadded replacement, since no padding can make the lengths equal.
    """
    required = len(PREFIX) + len(COLOR) + len(SUFFIX)
    padding = len(match.group(0)) - required
    if padding >= 0:
        return PREFIX + b' ' * padding + COLOR + SUFFIX
    raise NotImplementedError(
        'Replacement is too long and would require the size of the PDF '
        'file to change. This is not implemented.'
    )
def process_obj(match):
    """
    Return the matched object text with its color entries replaced.

    Only text containing both the annotation type marker and the highlight
    subtype marker is modified; anything else is returned unchanged.
    """
    body = match.group(0)
    if REGEX_ANNOT.search(body) is None or REGEX_HIGH.search(body) is None:
        return body
    return REGEX_COLOR.sub(process_color, body)
def process_file(filename, overwrite=False):
    """
    Recolor the highlight annotations of the file *filename*.

    The file is read as bytes, every matched object is passed through
    ``process_obj``, and the result is written back out. With *overwrite*
    the input file itself is replaced; otherwise the output goes to the
    input path with its extension swapped for ``_recolored.pdf``.
    """
    with open(filename, 'rb') as source:
        original = source.read()
    recolored = REGEX_OBJ.sub(process_obj, original)
    if overwrite:
        target = filename
    else:
        target = os.path.splitext(filename)[0] + '_recolored.pdf'
    with open(target, 'wb') as sink:
        sink.write(recolored)
if __name__ == '__main__':
    import argparse

    # Command-line entry point: process every file named on the command line.
    cli = argparse.ArgumentParser(description=__doc__, add_help=False)
    # The automatic help flag is disabled above so it can be re-added here
    # with a custom help message.
    cli.add_argument(
        '-h', '--help', action='help', help='Show this help message and exit.'
    )
    overwrite_help = (
        'Whether to overwrite the files instead of creating new ones '
        "with the suffix '_recolored.pdf'. Default behavior is the latter."
    )
    cli.add_argument('-o', '--overwrite', action='store_true', help=overwrite_help)
    cli.add_argument('filename', nargs='*', type=str, help='PDF file(s) to process.')
    options = cli.parse_args()
    for path in options.filename:
        print(f'Processing file: {path!r}')
        process_file(path, overwrite=options.overwrite)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment