-
-
Save vonavi/1368605a4700008ba38278b0f6bc2fde to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
import os | |
import argparse | |
import random | |
import fitz | |
class Config:
    """Paint settings: the tint colour and the rule for segment widths.

    Instances expose:

    * ``color`` -- an ``(r, g, b)`` tuple chosen by the ``debug`` flag.
    * ``get_width(top, bottom, mark)`` -- width of the next segment for a
      line spanning ``top``..``bottom``.
    * ``get_mark_range(y, mark)`` -- the ``(low, high)`` factor range that
      ``get_width`` scales the line height by.

    ``top`` and ``bottom`` are NOT set by ``__init__``; the caller must
    assign them before ``get_mark_range`` or ``get_width`` is used.
    """

    def get_mark_range(self, y, mark):
        """Return the ``(low, high)`` width-factor range at vertical position *y*.

        ``rel_pos`` is 1 when ``y == self.top`` and 0 when
        ``y == self.bottom``.  For painted segments (*mark* true) the range
        grows from ``(0.5, 1.0)`` at ``top`` to ``(1.0, 2.0)`` at ``bottom``.
        For unpainted gaps the two ``rel_pos`` terms cancel algebraically,
        so the range is ``(1.0, 1.5)`` regardless of *y* (up to rounding).

        Raises ``ZeroDivisionError`` when ``self.bottom == self.top``.
        """
        rel_pos = (self.bottom - y) / (self.bottom - self.top)
        if mark:
            alpha = rel_pos + 2.0 * (1 - rel_pos)
            mark_range = (0.5 * alpha, 1.0 * alpha)
        else:
            alpha = rel_pos + 1.0 * (1 - rel_pos)
            mark_range = (1.0 * alpha, 1.5 * alpha)
        return mark_range

    def __init__(self, debug):
        """Select colour and width rule.

        With *debug* true the width is the midpoint of the factor range
        (no randomness); otherwise it is drawn with ``random.uniform``.
        """
        if debug:
            # https://htmlcolorcodes.com/color-chart/material-design-color-chart/
            self.color = (255, 87, 51)

            def get_width(top, bottom, mark):
                # Midpoint of the range, scaled by the line height.
                mark_range = self.get_mark_range(top, mark)
                height = bottom - top
                return height * 0.5 * sum(mark_range)

            self.get_width = get_width
        else:
            # https://htmlcolorcodes.com/color-chart/material-design-color-chart/
            self.color = (255, 249, 196)

            def get_width(top, bottom, mark):
                # Random point inside the range, scaled by the line height.
                mark_range = self.get_mark_range(top, mark)
                height = bottom - top
                return height * random.uniform(*mark_range)

            self.get_width = get_width
class Paint:
    """Produce alternating painted / unpainted segments across text lines.

    State is kept between ``generate`` calls, so a segment that does not
    fit at the end of one line carries over to the next one.
    """

    def __init__(self):
        self.__mark = True      # True: the current segment is painted
        self.__next_w = None    # width of the current segment (lazy init)
        self.__offset = 0       # width carried over from the previous rect

    def generate(self, rect, cfg):
        """Yield the painted rectangles that fall inside *rect*.

        *rect* must provide ``x1``, ``y0``, ``y1`` and ``width`` and support
        ``&`` with a ``fitz.Rect``; *cfg* must provide
        ``get_width(top, bottom, mark)``.

        Rects with ``y1 <= y0`` yield nothing and leave the state untouched.
        """
        # Config.get_width scales with the rect height, so a zero or negative
        # height gives a non-positive width.  The loop's exit test
        # (extra_width < next_w) could then never succeed and the generator
        # would run forever.  There is nothing to paint on such a line anyway.
        if rect.y1 <= rect.y0:
            return

        if self.__next_w is None:
            self.__next_w = cfg.get_width(rect.y0, rect.y1, self.__mark)

        # Distance from the start of the current segment to the right edge.
        extra_width = rect.width + self.__offset
        while True:
            if self.__mark:
                mark_left = rect.x1 - extra_width
                r = fitz.Rect(mark_left, rect.y0,
                              mark_left + self.__next_w, rect.y1)
                # Clip to the line so carried-over parts are cut off.
                yield r & rect

            if extra_width < self.__next_w:
                # The current segment runs past the right edge; keep it
                # (same mark flag, same width) for the next rect.
                break

            extra_width -= self.__next_w
            self.__mark = not self.__mark
            self.__next_w = cfg.get_width(rect.y0, rect.y1, self.__mark)

        # Kept exactly as in the original: adds and then subtracts the rect
        # width, so the stored offset is the leftover extra_width.
        extra_width += rect.width
        self.__offset = extra_width - rect.width
def page_paint(page, cfg):
    """Overlay tinted rectangles on the text lines of *page*.

    Sets ``cfg.top`` / ``cfg.bottom`` from the page rectangle, then walks the
    lines of every block whose ``type`` is 0 and inserts one tinted pixmap
    per rectangle produced by ``Paint.generate``.  A single ``Paint``
    instance is used for the whole page, so its state flows from line to line.
    """
    rect = page.rect
    cfg.top = rect.y0
    cfg.bottom = rect.y1

    paint = Paint()
    # The colourspace does not depend on the rectangle; build it once.
    colorspace = fitz.Colorspace(fitz.CS_RGB)

    page_dict = page.getText('dict')
    for block in page_dict['blocks']:
        # NOTE(review): per the PyMuPDF docs type 0 is a text block; other
        # block types are skipped here.
        if block['type'] != 0:
            continue

        for line in block['lines']:
            for r in paint.generate(fitz.Rect(line['bbox']), cfg):
                # create a pixmap with RGB as colorspace and bounded by irect
                pm = fitz.Pixmap(colorspace, r.round())
                pm.clearWith(0xff)
                pm.tintWith(*cfg.color)
                page.insertImage(r, pixmap=pm, overlay=True)
def pdf_paint(pdf, pages, cfg):
    """Paint the selected pages of *pdf* and save a ``_paint`` copy.

    *pdf* is the input path, *pages* a set of zero-based page numbers and
    *cfg* the configuration handed to ``page_paint``.  Page numbers that are
    not in the document are ignored.  The result is written next to the
    input as ``<basename>_paint<ext>``.
    """
    doc = fitz.open(pdf)
    try:
        # Restrict pages to those presented in document
        pages = pages & set(range(doc.pageCount))
        page_count = len(pages)

        for count, page_num in enumerate(sorted(pages)):
            page = doc[page_num]
            # '\r' keeps the progress report on a single terminal line.
            print('Processing page {} / {}...'.format(count + 1, page_count),
                  end='\r')
            page_paint(page, cfg)
        print()

        basename, ext = os.path.splitext(pdf)
        doc.save(basename + '_paint' + ext)
    finally:
        # Release the document even when painting or saving raises.
        doc.close()
if __name__ == '__main__':
    # Command line: pdf_paint.py PDF -p PAGE [PAGE ...] [-d]
    parser = argparse.ArgumentParser()
    parser.add_argument('pdf', type=str, help='PDF file')
    parser.add_argument('-p', '--pages', required=True, type=int, nargs='+',
                        help='Pages to paint')
    parser.add_argument('-d', '--debug', action='store_true',
                        help='Debug the script')
    args = parser.parse_args()

    # Pass zero-based pages to function
    pages = {n - 1 for n in args.pages}
    pdf_paint(args.pdf, pages, Config(args.debug))
PyMuPDF>=1.14,<=1.14.10 |
I have checked that the latest version (1.14.13) of PyMuPDF causes a segmentation fault. Use the previous one, as requirements.txt shows.
Managed to make program work inside Docker (thank you for the idea).
I'm completely new to Docker and a Python novice, so I decided to get some practice by making the program run as a pipe.
Here is the private repo I created, because I cannot (don't know how) share code here.
https://github.com/Tonna/docker-test
sample usage
docker build . -t pdf_paint:1.0
cat sample.pdf | docker run --rm -i pdf_paint:1.0 > sample_print.pdf
The downsides are that it requires building a full image for such a simple task, and that processing takes a significant amount of time.
P.S. Dependencies in program require python version > 3.6
I'm running on Debian 9.
OK, I see that the problem is that Debian 9 doesn't have Python 3.6. I will rewrite the script and throw away dataclasses.
Now it should work. I prefer to use virtualenv to manage Python environments:
vonavi@desktop ~> mkdir -p ~/.virtualenvs
vonavi@desktop ~> virtualenv -p /usr/bin/python3.5 ~/.virtualenvs/mupdf
Running virtualenv with interpreter /usr/bin/python3.5
Using base prefix '/usr'
New python executable in /home/vonavi/.virtualenvs/mupdf/bin/python3.5
Also creating executable in /home/vonavi/.virtualenvs/mupdf/bin/python
Installing setuptools, pip, wheel...done.
vonavi@desktop ~> source ~/.virtualenvs/mupdf/bin/activate
(mupdf) vonavi@desktop ~> git clone https://gist.github.com/1368605a4700008ba38278b0f6bc2fde.git
Cloning into '1368605a4700008ba38278b0f6bc2fde'...
remote: Enumerating objects: 18, done.
remote: Counting objects: 100% (18/18), done.
remote: Compressing objects: 100% (13/13), done.
remote: Total 18 (delta 3), reused 11 (delta 1), pack-reused 0
Unpacking objects: 100% (18/18), done.
(mupdf) vonavi@desktop ~> cd 1368605a4700008ba38278b0f6bc2fde
(mupdf) vonavi@desktop ~/1368605a4700008ba38278b0f6bc2fde> pip install -r requirements.txt master
Collecting PyMuPDF==1.14.12 (from -r requirements.txt (line 1))
Using cached https://files.pythonhosted.org/packages/11/2d/80deffcca33da2321ef28d32f4dd401e8fcac8cb0fef0cf6d6f2f2b46600/PyMuPDF-1.14.12-cp35-cp35m-manylinux1_x86_64.whl
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.14.12
(mupdf) vonavi@desktop ~/1368605a4700008ba38278b0f6bc2fde> ./pdf_paint.py ~/sample.pdf -p $(seq 1 10) master
Processing page 2 / 2...
(mupdf) vonavi@desktop ~/1368605a4700008ba38278b0f6bc2fde> xdg-open ~/sample_paint.pdf
I fixed PyMuPDF versions which work correctly.
I followed your virtualenv guide and script worked fine.
Thank you!
Tried to launch script today.
-
Needed to upgrade dependency from "PyMuPDF>=1.14,<=1.14.10" to "PyMuPDF>=1.14,<=1.14.20".
-
Also tried to replace line 13 with "mark_range = (0.3 * alpha, 0.2 * alpha)" - easier for new person, less shock :)
Can "difficulty" be passed in script on launch time?
Sorry for the delay. I will check item 1 soon and fix the script accordingly. As for item 2, I agree to add a parameter such as "difficulty". The problem is how to formulate the difficulty itself when we need to control four parameters. Do you have a hint?
No worries. This script depends on the library, so issue with version will happen again and again I guess.
The problem is how to formulate the difficulty itself when we need to control four parameters. Do you have a hint?
I couldn't wrap my head around those parameters. I just tweaked them till result was acceptable. Image below generated by parameters - "mark_range = (1.0 * alpha, 0.3 * alpha)" - if I remember correctly, code was lost.
I think by experimenting one could come up to 3-4 configurations corresponding to easy-...-hard difficulty. I can do that later (by later I mean "don't know when"). No need for you to make any changes for now.
I'm not suggesting for you to make actual git repo, but it will allow making pull requests.
Hi Anton! What is the problem with pip? Can I ask you to show the output with error?
To check how the script works, I can prepare Dockerfile, build MuPDF and upload it in PAA.