Last active
August 6, 2017 11:44
-
-
Save davidfoerster/b1c59e58a532b7ae4458d3f205810bb5 to your computer and use it in GitHub Desktop.
Crop random regions from a series of PDF documents (https://askubuntu.com/q/932193/175814)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Pick a random square crop region inside a bounding box.

Usage: crop-calc.py CROPSIZE ["LLX LLY URX URY"]

CROPSIZE is the side length of the square.  The bounding box is given as
four whitespace-separated numbers (lower-left x/y, upper-right x/y); when it
is not passed as the second argument, one line is read from standard input.
The chosen region is printed as "llx lly urx ury" with six decimal places.
"""
import random
import sys


def random_crop_box(cropsize, bounds):
    """Return a random square of side *cropsize* lying inside *bounds*.

    Args:
        cropsize: Side length of the square.
        bounds: ``(llx, lly, urx, ury)`` of the enclosing box.

    Returns:
        ``(cropllx, croplly, cropurx, cropury)`` of the chosen square.

    Raises:
        ValueError: If the square does not fit into the box.
    """
    llx, lly, urx, ury = bounds
    width = urx - llx
    height = ury - lly
    if width < cropsize or height < cropsize:
        raise ValueError('Crop size too large for bounds')
    # Draw the lower-left corner so that the whole square stays in the box.
    cropllx = random.uniform(0, width - cropsize) + llx
    croplly = random.uniform(0, height - cropsize) + lly
    return cropllx, croplly, cropllx + cropsize, croplly + cropsize


def main(argv=None):
    """Run the command line interface and return the process exit status.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 on success, 1 if no usable bounding box is available or the crop
        does not fit, 2 if CROPSIZE is missing.
    """
    args = list(sys.argv if argv is None else argv)
    if len(args) < 2:
        print('Usage: crop-calc.py CROPSIZE ["LLX LLY URX URY"]',
              file=sys.stderr)
        return 2
    if len(args) < 3:
        # No bounds on the command line: take them from stdin instead.
        try:
            args.append(input())
        except EOFError:
            print('No bounding box given', file=sys.stderr)
            return 1

    cropsize = float(args[1])
    llx, lly, urx, ury = map(float, args[2].split(None, 3))
    try:
        box = random_crop_box(cropsize, (llx, lly, urx, ury))
    except ValueError as err:
        print(err, llx, lly, urx, ury, file=sys.stderr)
        return 1

    print(*(format(value, '.6f') for value in box))
    return 0


if __name__ == '__main__':
    sys.exit(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Stop on any failing command, on expansion of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Default settings; each can be replaced by the long command line option of
# the same name.
size='256'       # both dimensions of the -resize/-crop geometry given to convert
cropsize='50'    # side length handed to crop-calc.py
density='600'    # value of convert's -density option
quality='75%'    # value of convert's -quality option
dstext='jpg'     # file name extension of the generated images
# get_cropbox PDF CROPSIZE
#
# Print a random square of side CROPSIZE ("llx lly urx ury") chosen inside
# the high-resolution bounding box that Ghostscript reports for PDF.
# Relies on the global $exedir to locate crop-calc.py.
get_cropbox()
{
  # The bounding box lines are taken from Ghostscript's stderr, so stderr is
  # routed into the pipe while stdout is discarded.  stdout goes to /dev/null
  # rather than being closed, so that a write to it cannot fail.
  gs -q -dSAFER -dBATCH -dNOPAUSE -dNOPROMPT -sDEVICE=bbox -f "$1" 2>&1 >/dev/null |
  # Keep only the numbers of the %%HiResBoundingBox lines ([[:space:]] is the
  # portable spelling of GNU sed's \s).
  sed -ne 's/^%%HiResBoundingBox:[[:space:]]*//p' |
  # crop-calc.py reads the bounds from stdin when given only the crop size.
  python3 "$exedir/crop-calc.py" "$2"
}
# cat_pdf_page SRC DST PAGE
#
# Use Ghostscript's pdfwrite device to write page number PAGE of the PDF
# file SRC to the new PDF file DST.
cat_pdf_page()
{
  local src="$1" dst="$2" page="$3"

  # First and last page are the same, so exactly one page is emitted.
  gs -q -dSAFER -dBATCH -dNOPAUSE -dNOPROMPT -sDEVICE=pdfwrite \
    -dFirstPage="$page" -dLastPage="$page" -sOutputFile="$dst" -f "$src"
}
# crop_pdf PDF IMAGE
#
# Set a random crop box (side length: global $cropsize) on PDF and render
# the cropped result to the image file IMAGE.  Uses the globals $density,
# $size and $quality for the conversion.
# Returns non-zero if the crop box cannot be determined or the conversion
# pipeline fails.
crop_pdf()
{
  # Compute the crop box up front: a failure inside a command substitution
  # that is merely part of another command's arguments would go unnoticed
  # and Ghostscript would be run with an empty /CropBox.
  local cropbox
  cropbox="$(get_cropbox "$1" "$cropsize")" || return

  # Ghostscript rewrites the PDF with the crop box applied via pdfmark and
  # streams it to convert, which honours the crop box, rasterises, resizes
  # and crops the image to ${size}x${size}.
  gs -q -dSAFER -dBATCH -dNOPAUSE -dNOPROMPT -sDEVICE=pdfwrite \
    -sOutputFile=%stdout% \
    -c "[/CropBox [$cropbox] /PAGES pdfmark" \
    -f "$1" |
  convert -define pdf:use-cropbox=true -colorspace sRGB -density "$density" pdf:- \
    -flatten -resize "${size}x${size}^" -crop "${size}x${size}+0+0!" \
    -quality "$quality" "$2"
}
# get_random_area PDF IMAGE
#
# Pick a random page of PDF, extract it to "<PDF stem>.p<page>.<PDF ext>"
# unless an extract newer than PDF already exists, and render a random crop
# of that page to IMAGE.
# Returns non-zero if the page count cannot be determined or any step fails.
get_random_area()
{
  # Declare first and assign afterwards: "local var=$(cmd)" would always
  # succeed and hide a failure of cmd.
  local -i pagecount page
  local pagefile

  pagecount="$(pdfinfo "$1" | sed -ne 's/^Pages:[[:space:]]*//p')" || return
  if ((pagecount < 1)); then
    # Also guards the modulo below against a division by zero.
    echo "Cannot determine the page count of '$1'" >&2
    return 1
  fi

  page=$((RANDOM % pagecount + 1))
  pagefile="${1%.*}.p${page}.${1##*.}"

  # Re-extract the page only if there is no extract newer than the source.
  if ! [ "$1" -ot "$pagefile" ]; then
    cat_pdf_page "$1" "$pagefile" "$page" || return
  fi

  crop_pdf "$pagefile" "$2"
}
# get_random_areas PDF COUNT
#
# Render COUNT random crops of PDF to "<PDF stem>.<n>.$dstext" with n
# counting from 1.  Stops at the first failing crop and returns its status.
get_random_areas()
{
  local -i total="$2" idx

  for ((idx = 1; idx <= total; idx++)); do
    get_random_area "$1" "${1%.*}.${idx}.${dstext}" || return
  done
}
# Resolve the real location of this script; crop-calc.py is looked up in the
# same directory.
exe="$(exec readlink -f -- "$0")"
exedir="${exe%/*}"

# Let getopt normalise the command line, then install the result as the new
# positional parameters.
args="$(exec getopt -s bash -n "${0##*/}" -o 's:c:d:q:' -l 'size:,cropsize:,density:,quality:,dstext:' -- "$@")" || exit 64
eval set -- "$args"
unset args

# Apply the options.  The short options declared to getopt above are handled
# here as well; previously they fell through to the "Internal error!" branch.
while :; do
  case "$1" in
    -s|--size)
      size="$2"; shift 2;;
    -c|--cropsize)
      cropsize="$2"; shift 2;;
    -d|--density)
      density="$2"; shift 2;;
    -q|--quality)
      quality="$2"; shift 2;;
    --dstext)
      dstext="$2"; shift 2;;
    --)
      shift; break;;
    -*)
      echo 'Internal error!' "$1" >&2; exit 127;;
    *)
      break;;
  esac
done

# The first operand is the number of crops per document; fail with a usage
# message instead of an "unbound variable" error when it is missing.
if (($# < 1)); then
  echo "Usage: ${0##*/} [-s|--size N] [-c|--cropsize N] [-d|--density N] [-q|--quality Q] [--dstext EXT] AREACOUNT [PDF...]" >&2
  exit 64
fi
declare -i areacount="$1"
shift

# All remaining operands are the PDF documents to process.
for f; do
  get_random_areas "$f" "$areacount" || exit
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment