vonavi/pdf_paint.py

## pdf_paint.py
#!/usr/bin/env python3
import os

import argparse
import random

import fitz

class Config:
    """Painting parameters: fill colour and segment-width generation.

    ``top`` and ``bottom`` hold the vertical extent of the page being
    processed; callers assign them before asking for widths.  While they are
    unset (both 0.0) or coincide, no vertical scaling is applied instead of
    failing with AttributeError / ZeroDivisionError.
    """

    # https://htmlcolorcodes.com/color-chart/material-design-color-chart/
    DEBUG_COLOR = (255, 87, 51)
    PAINT_COLOR = (255, 249, 196)

    def __init__(self, debug):
        """Select the debug or the regular appearance.

        Args:
            debug: when truthy, use the debug colour and deterministic widths
                (the midpoint of the allowed range); otherwise use the regular
                colour and widths drawn uniformly at random from the range.
        """
        self.debug = bool(debug)
        self.color = self.DEBUG_COLOR if self.debug else self.PAINT_COLOR
        # Page extent; overwritten by the caller for every page.
        self.top = 0.0
        self.bottom = 0.0

    def get_mark_range(self, y, mark):
        """Return the ``(low, high)`` width multipliers for a segment at ``y``.

        Args:
            y: vertical coordinate of the segment.
            mark: True for a painted segment, False for an unpainted one.

        Returns:
            A 2-tuple of multipliers that are applied to the line height.
        """
        span = self.bottom - self.top
        # rel_pos is 1 at ``top`` and 0 at ``bottom``; a degenerate page
        # extent falls back to 1 so the base multipliers are used unchanged.
        rel_pos = (self.bottom - y) / span if span else 1.0
        if mark:
            # Painted segments scale from x1 at the top to x2 at the bottom.
            alpha = rel_pos + 2.0 * (1 - rel_pos)
            return (0.5 * alpha, 1.0 * alpha)
        # Unpainted segments keep a constant scale of 1.
        alpha = rel_pos + 1.0 * (1 - rel_pos)
        return (1.0 * alpha, 1.5 * alpha)

    def get_width(self, top, bottom, mark):
        """Return the width of the next segment for a line spanning top..bottom.

        Args:
            top: upper y coordinate of the line.
            bottom: lower y coordinate of the line.
            mark: True for a painted segment, False for an unpainted one.

        Returns:
            The line height times a multiplier from ``get_mark_range``: the
            midpoint of the range in debug mode, a random value otherwise.
        """
        low, high = self.get_mark_range(top, mark)
        height = bottom - top
        if self.debug:
            return height * 0.5 * (low + high)
        return height * random.uniform(low, high)

class Paint:
    """Stateful generator of painted segments that flows across text lines.

    The text is covered by alternating painted ("mark") and unpainted
    segments.  A segment that does not fit on the current line continues at
    the start of the next one, so the object remembers the kind of the
    current segment, its width, and how much of it is already laid out.
    """

    def __init__(self):
        self.__mark = True    # True while the current segment is a painted one
        self.__next_w = None  # width of the current segment; None until first use
        self.__offset = 0     # part of the current segment consumed by earlier lines

    def __fresh_width(self, rect, cfg):
        """Ask ``cfg`` for the width of the current segment kind and validate it."""
        width = cfg.get_width(rect.y0, rect.y1, self.__mark)
        if width <= 0:
            # A non-positive width would never consume the line and the loop
            # in generate() would spin forever.
            raise ValueError(
                'segment width must be positive, got {}'.format(width))
        return width

    def generate(self, rect, cfg):
        """Yield the painted rectangles covering one text line.

        Args:
            rect: bounding box of the line (needs x0/y0/x1/y1/width).
            cfg: object providing ``get_width(top, bottom, mark)``.

        Yields:
            Painted segments clipped to ``rect``.

        Raises:
            ValueError: if ``cfg.get_width`` returns a non-positive width.

        The carried-over state is only updated once the generator has been
        consumed to the end.
        """
        # Degenerate boxes have nothing to paint; a zero height would also
        # produce zero-width segments.
        if rect.width <= 0 or rect.y1 <= rect.y0:
            return

        if self.__next_w is None:
            self.__next_w = self.__fresh_width(rect, cfg)

        # Distance from the start of the current segment to the right edge of
        # the line; the segment may start left of the line (carry-over).
        remaining = rect.width + self.__offset
        while True:
            # remaining == 0 happens on an exact fit; the segment would be
            # clipped to an empty rectangle, so it is not emitted.
            if self.__mark and remaining > 0:
                left = rect.x1 - remaining
                segment = fitz.Rect(left, rect.y0,
                                    left + self.__next_w, rect.y1)
                yield segment & rect

            if remaining < self.__next_w:
                break

            remaining -= self.__next_w
            self.__mark = not self.__mark
            self.__next_w = self.__fresh_width(rect, cfg)

        # The unfinished segment continues on the next line.
        self.__offset = remaining

def page_paint(page, cfg):
    """Paint alternating filled segments over every text line of ``page``.

    Args:
        page: PyMuPDF page; modified in place by inserting images.
        cfg: configuration object; its ``top``/``bottom`` attributes are set
            to the vertical extent of ``page`` and its ``color`` and
            ``get_width`` are used for painting.
    """
    page_rect = page.rect
    # Segment widths depend on the vertical position within the page.
    cfg.top = page_rect.y0
    cfg.bottom = page_rect.y1

    paint = Paint()
    # Loop-invariant: one colorspace object serves every pixmap.
    rgb = fitz.Colorspace(fitz.CS_RGB)

    for block in page.getText('dict')['blocks']:
        # Only blocks of type 0 are processed (text blocks in PyMuPDF's
        # dict output); other types carry no 'lines'.
        if block['type'] != 0:
            continue

        for line in block['lines']:
            for r in paint.generate(fitz.Rect(line['bbox']), cfg):
                # An empty rectangle has nothing to show; skip it rather
                # than hand it to the image insertion.
                if r.x1 <= r.x0 or r.y1 <= r.y0:
                    continue
                # create a pixmap with RGB as colorspace and bounded by irect
                pm = fitz.Pixmap(rgb, r.round())
                pm.clearWith(0xff)
                pm.tintWith(*cfg.color)
                page.insertImage(r, pixmap=pm, overlay=True)

def pdf_paint(pdf, pages, cfg):
    """Paint the selected pages of a PDF and save the result next to it.

    The output is written to ``<name>_paint<ext>`` where ``<name><ext>`` is
    the input path.

    Args:
        pdf: path of the PDF file to process.
        pages: iterable of zero-based page numbers; numbers that are not
            present in the document are ignored.
        cfg: configuration object passed to ``page_paint``.
    """
    doc = fitz.open(pdf)
    try:
        # Restrict pages to those present in the document
        selected = sorted(set(pages) & set(range(doc.pageCount)))

        page_count = len(selected)
        for count, page_num in enumerate(selected, start=1):
            # '\r' keeps the progress report on a single terminal line.
            print('Processing page {} / {}...'.format(count, page_count),
                  end='\r')
            page_paint(doc[page_num], cfg)
        print()

        basename, ext = os.path.splitext(pdf)
        doc.save(basename + '_paint' + ext)
    finally:
        # Release the document even when painting or saving fails.
        doc.close()

if __name__ == '__main__':
    # Command-line entry point: parse the options, then paint the PDF.
    cli = argparse.ArgumentParser()
    cli.add_argument('pdf', type=str, help='PDF file')
    cli.add_argument(
        '-p', '--pages',
        required=True, type=int, nargs='+',
        help='Pages to paint',
    )
    cli.add_argument(
        '-d', '--debug',
        action='store_true',
        help='Debug the script',
    )
    options = cli.parse_args()

    # Page numbers are one-based on the command line and zero-based inside.
    zero_based = {number - 1 for number in options.pages}
    pdf_paint(options.pdf, zero_based, Config(options.debug))

## requirements.txt
PyMuPDF>=1.14,<=1.14.10
	#!/usr/bin/env python3
	import os

	import argparse
	import random

	import fitz

	class Config:
	    """Painting parameters: fill colour and segment-width generation.

	    ``top`` and ``bottom`` hold the vertical extent of the page being
	    processed; callers assign them before asking for widths.  While they are
	    unset (both 0.0) or coincide, no vertical scaling is applied instead of
	    failing with AttributeError / ZeroDivisionError.
	    """

	    # https://htmlcolorcodes.com/color-chart/material-design-color-chart/
	    DEBUG_COLOR = (255, 87, 51)
	    PAINT_COLOR = (255, 249, 196)

	    def __init__(self, debug):
	        """Select the debug or the regular appearance.

	        Args:
	            debug: when truthy, use the debug colour and deterministic widths
	                (the midpoint of the allowed range); otherwise use the regular
	                colour and widths drawn uniformly at random from the range.
	        """
	        self.debug = bool(debug)
	        self.color = self.DEBUG_COLOR if self.debug else self.PAINT_COLOR
	        # Page extent; overwritten by the caller for every page.
	        self.top = 0.0
	        self.bottom = 0.0

	    def get_mark_range(self, y, mark):
	        """Return the ``(low, high)`` width multipliers for a segment at ``y``.

	        Args:
	            y: vertical coordinate of the segment.
	            mark: True for a painted segment, False for an unpainted one.

	        Returns:
	            A 2-tuple of multipliers that are applied to the line height.
	        """
	        span = self.bottom - self.top
	        # rel_pos is 1 at ``top`` and 0 at ``bottom``; a degenerate page
	        # extent falls back to 1 so the base multipliers are used unchanged.
	        rel_pos = (self.bottom - y) / span if span else 1.0
	        if mark:
	            # Painted segments scale from x1 at the top to x2 at the bottom.
	            alpha = rel_pos + 2.0 * (1 - rel_pos)
	            return (0.5 * alpha, 1.0 * alpha)
	        # Unpainted segments keep a constant scale of 1.
	        alpha = rel_pos + 1.0 * (1 - rel_pos)
	        return (1.0 * alpha, 1.5 * alpha)

	    def get_width(self, top, bottom, mark):
	        """Return the width of the next segment for a line spanning top..bottom.

	        Args:
	            top: upper y coordinate of the line.
	            bottom: lower y coordinate of the line.
	            mark: True for a painted segment, False for an unpainted one.

	        Returns:
	            The line height times a multiplier from ``get_mark_range``: the
	            midpoint of the range in debug mode, a random value otherwise.
	        """
	        low, high = self.get_mark_range(top, mark)
	        height = bottom - top
	        if self.debug:
	            return height * 0.5 * (low + high)
	        return height * random.uniform(low, high)

	class Paint:
	    """Stateful generator of painted segments that flows across text lines.

	    The text is covered by alternating painted ("mark") and unpainted
	    segments.  A segment that does not fit on the current line continues at
	    the start of the next one, so the object remembers the kind of the
	    current segment, its width, and how much of it is already laid out.
	    """

	    def __init__(self):
	        self.__mark = True    # True while the current segment is a painted one
	        self.__next_w = None  # width of the current segment; None until first use
	        self.__offset = 0     # part of the current segment consumed by earlier lines

	    def __fresh_width(self, rect, cfg):
	        """Ask ``cfg`` for the width of the current segment kind and validate it."""
	        width = cfg.get_width(rect.y0, rect.y1, self.__mark)
	        if width <= 0:
	            # A non-positive width would never consume the line and the loop
	            # in generate() would spin forever.
	            raise ValueError(
	                'segment width must be positive, got {}'.format(width))
	        return width

	    def generate(self, rect, cfg):
	        """Yield the painted rectangles covering one text line.

	        Args:
	            rect: bounding box of the line (needs x0/y0/x1/y1/width).
	            cfg: object providing ``get_width(top, bottom, mark)``.

	        Yields:
	            Painted segments clipped to ``rect``.

	        Raises:
	            ValueError: if ``cfg.get_width`` returns a non-positive width.

	        The carried-over state is only updated once the generator has been
	        consumed to the end.
	        """
	        # Degenerate boxes have nothing to paint; a zero height would also
	        # produce zero-width segments.
	        if rect.width <= 0 or rect.y1 <= rect.y0:
	            return

	        if self.__next_w is None:
	            self.__next_w = self.__fresh_width(rect, cfg)

	        # Distance from the start of the current segment to the right edge of
	        # the line; the segment may start left of the line (carry-over).
	        remaining = rect.width + self.__offset
	        while True:
	            # remaining == 0 happens on an exact fit; the segment would be
	            # clipped to an empty rectangle, so it is not emitted.
	            if self.__mark and remaining > 0:
	                left = rect.x1 - remaining
	                segment = fitz.Rect(left, rect.y0,
	                                    left + self.__next_w, rect.y1)
	                yield segment & rect

	            if remaining < self.__next_w:
	                break

	            remaining -= self.__next_w
	            self.__mark = not self.__mark
	            self.__next_w = self.__fresh_width(rect, cfg)

	        # The unfinished segment continues on the next line.
	        self.__offset = remaining

	def page_paint(page, cfg):
	    """Paint alternating filled segments over every text line of ``page``.

	    Args:
	        page: PyMuPDF page; modified in place by inserting images.
	        cfg: configuration object; its ``top``/``bottom`` attributes are set
	            to the vertical extent of ``page`` and its ``color`` and
	            ``get_width`` are used for painting.
	    """
	    page_rect = page.rect
	    # Segment widths depend on the vertical position within the page.
	    cfg.top = page_rect.y0
	    cfg.bottom = page_rect.y1

	    paint = Paint()
	    # Loop-invariant: one colorspace object serves every pixmap.
	    rgb = fitz.Colorspace(fitz.CS_RGB)

	    for block in page.getText('dict')['blocks']:
	        # Only blocks of type 0 are processed (text blocks in PyMuPDF's
	        # dict output); other types carry no 'lines'.
	        if block['type'] != 0:
	            continue

	        for line in block['lines']:
	            for r in paint.generate(fitz.Rect(line['bbox']), cfg):
	                # An empty rectangle has nothing to show; skip it rather
	                # than hand it to the image insertion.
	                if r.x1 <= r.x0 or r.y1 <= r.y0:
	                    continue
	                # create a pixmap with RGB as colorspace and bounded by irect
	                pm = fitz.Pixmap(rgb, r.round())
	                pm.clearWith(0xff)
	                pm.tintWith(*cfg.color)
	                page.insertImage(r, pixmap=pm, overlay=True)

	def pdf_paint(pdf, pages, cfg):
	    """Paint the selected pages of a PDF and save the result next to it.

	    The output is written to ``<name>_paint<ext>`` where ``<name><ext>`` is
	    the input path.

	    Args:
	        pdf: path of the PDF file to process.
	        pages: iterable of zero-based page numbers; numbers that are not
	            present in the document are ignored.
	        cfg: configuration object passed to ``page_paint``.
	    """
	    doc = fitz.open(pdf)
	    try:
	        # Restrict pages to those present in the document
	        selected = sorted(set(pages) & set(range(doc.pageCount)))

	        page_count = len(selected)
	        for count, page_num in enumerate(selected, start=1):
	            # '\r' keeps the progress report on a single terminal line.
	            print('Processing page {} / {}...'.format(count, page_count),
	                  end='\r')
	            page_paint(doc[page_num], cfg)
	        print()

	        basename, ext = os.path.splitext(pdf)
	        doc.save(basename + '_paint' + ext)
	    finally:
	        # Release the document even when painting or saving fails.
	        doc.close()

	if __name__ == '__main__':
	    # Command-line entry point: parse the options, then paint the PDF.
	    parser = argparse.ArgumentParser()
	    parser.add_argument('pdf', type=str, help='PDF file')
	    parser.add_argument('-p', '--pages', required=True, type=int, nargs='+',
	                        help='Pages to paint')
	    parser.add_argument('-d', '--debug', action='store_true',
	                        help='Debug the script')
	    args = parser.parse_args()

	    # Page numbers are one-based on the command line; pass zero-based
	    # pages to the function.
	    pages = {n - 1 for n in args.pages}
	    pdf_paint(args.pdf, pages, Config(args.debug))