-
-
Save TurnerSoftwareDev/784feb557fe0094c3878b6c7cede8696 to your computer and use it in GitHub Desktop.
Extracts images from PDF while preserving PNG transparency
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Extracts images from a PDF into a directory, preserving transparency.
#
# Inspired by https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
#
# Options:
#
#   --help, -h         Display help
#   --pdf, -p          The PDF to extract images from
#   --directory, -d    The directory to output images to
#
# Examples:
#
#   extract-pdf-images -p ./foo.pdf -d ./foo
#

# Exit on first error. pipefail makes a pipeline fail when any of its stages
# fails (not only the last one), so a failing `pdfimages -list` feeding a
# loop is noticed instead of being treated as "no images".
set -eo pipefail

# Debug
#set -x
# Checks that a path names a readable regular file.
# Arguments: $1 - path to check
# Returns:   0 when $1 is a regular file (or a symlink to one) that can be
#            read; non-zero otherwise, including for an empty path or a
#            directory (which `-r` alone would accept).
validfile() { [[ -f ${1-} && -r ${1-} ]]; }
# Checks that a path names a writable directory.
# Arguments: $1 - path to check
# Returns:   0 when $1 is an existing directory that can be written to;
#            non-zero otherwise, including for an empty path or a writable
#            regular file (which `-w` alone would accept).
validdir() { [[ -d ${1-} && -w ${1-} ]]; }
# Checks that every named command is installed, i.e. resolvable by
# `command -v`.
# Arguments: $@ - one or more command names
# Outputs:   an error message on stderr naming the first missing command
# Exits:     ends the script with status 1 when a command is missing
installed() {
  local cmd
  for cmd in "$@"; do
    if ! command -v "${cmd}" >/dev/null 2>&1; then
      >&2 echo "Cannot execute the ${cmd} command"
      exit 1
    fi
  done
}
# Prints the usage text.
# Arguments: none
# Outputs:   the usage text on stdout, headed by the name this script was
#            invoked as (the last path component of $0)
help() {
  # Local, so that calling help does not leave a stray global behind.
  local script_name=${0##*/}

  printf '%s\n' \
    "${script_name} [options]" \
    "" \
    "Options:" \
    "" \
    "--help, -h Prints this help menu" \
    "--pdf, -p The PDF file to extract images from" \
    "--directory, -d The directory to extract images into" \
    "" \
    "Example:" \
    "" \
    "${script_name} --pdf ./foo.pdf --directory ./foo" \
    ""
}
# Exit if no parameters were provided. This is checked before probing for
# the external tools so the usage text is shown even where they are missing.
if [[ $# -eq 0 ]]; then
  help
  exit 1
fi

# External tools the rest of the script calls
installed pdfimages # Install poppler-utils to get pdfimages
installed convert   # ImageMagick's convert

# Filled in from the command line
pdf=
outputDir=
# Parses the command-line parameters.
# Globals:   pdf (written), outputDir (written)
# Arguments: the script's own parameters ("$@")
# Outputs:   the usage text for -h/--help; an error on stderr when an
#            option is missing its value
# Exits:     0 after printing the usage text, 1 when a value is missing
parse_parameters() {
  while [[ $# -gt 0 ]]; do
    case "${1}" in
      -h|--help)
        help
        exit 0
        ;;
      -p|--pdf)
        # `shift 2` fails when the value is missing, which would otherwise
        # end the script without any explanation.
        if [[ $# -lt 2 ]]; then
          >&2 echo "${1} requires a value"
          exit 1
        fi
        pdf="${2}"
        shift 2
        ;;
      -d|--directory)
        if [[ $# -lt 2 ]]; then
          >&2 echo "${1} requires a value"
          exit 1
        fi
        outputDir="${2}"
        shift 2
        ;;
      *)
        # Anything unrecognised is skipped
        shift
        ;;
    esac
  done
}

# Parse the parameters
parse_parameters "$@"
# Accept pdf name on stdin too
if [[ -z "${pdf}" ]]; then
  # Waiting on a terminal looks like a hang, so say what is expected.
  if [[ -t 0 ]]; then
    >&2 echo "No --pdf given; type the PDF path and press Ctrl-D"
  fi
  pdf=$(cat)
fi

# Check the parameters. Missing values are reported separately from
# unusable ones so the message never starts with an empty name.
if [[ -z "${pdf}" ]]; then
  >&2 echo "No PDF was given (use --pdf, or pass its name on stdin)"
  exit 1
fi
if ! validfile "${pdf}"; then
  >&2 echo "${pdf} cannot be read"
  exit 1
fi
if [[ -z "${outputDir}" ]]; then
  >&2 echo "No output directory was given (use --directory)"
  exit 1
fi
if ! validdir "${outputDir}"; then
  >&2 echo "${outputDir} cannot be written to"
  exit 1
fi
# Work in a private scratch directory
tmpdir=$(mktemp -d /tmp/extract-pdf-images-XXXXXX)

# Remove the tmpdir when we're done. Only the EXIT trap deletes; the signal
# traps turn an interrupt into an exit (which then runs the EXIT trap), so
# the script stops instead of carrying on with its scratch directory gone.
trap 'rm -rf -- "${tmpdir:?}"' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

mkdir "${tmpdir}/extracted"

# Writes the images as ${tmpdir}/extracted/image-NNN.<ext>
pdfimages -all "${pdf}" "${tmpdir}/extracted/image" || exit 1
# Rename images based on object id and whether or not they are a mask
#
# The first two lines of `pdfimages -list` are skipped as headers. Of each
# remaining row, column 2 (image number), column 3 (type) and column 11
# (object id) are used; `read` splits the row, so no awk is needed per field.
pdfimages -list "${pdf}" | tail -n +3 |
  while read -r _ num imgtype _ _ _ _ _ _ _ objectid _; do
    # The paths are passed as printf arguments, never as part of the format.
    printf -v src_stem '%s/extracted/image-%03d' "${tmpdir}" "${num}"
    if [[ "${imgtype}" == "smask" ]]; then
      printf -v dest_stem '%s/image-%03d-mask' "${tmpdir}" "${objectid}"
    else
      printf -v dest_stem '%s/image-%03d' "${tmpdir}" "${objectid}"
    fi

    # Move whatever was actually written for this image number, keeping its
    # extension. `pdfimages -all` is not limited to .jpg and .png, so the
    # extension cannot be derived from "jpeg or not".
    moved=0
    for src in "${src_stem}".*; do
      # An unmatched glob is left as the literal pattern
      [[ -e "${src}" ]] || continue
      #echo "${src} -> ${dest_stem}.${src##*.}"
      mv -- "${src}" "${dest_stem}.${src##*.}" || exit 1
      moved=1
    done
    if [[ "${moved}" -eq 0 ]]; then
      >&2 echo "No extracted file found for image ${num}"
      exit 1
    fi
  done
rmdir "${tmpdir}/extracted"
## Merge the images that have a mask
#
# Column 3 of each `pdfimages -list` row is the type and column 11 the
# object id; only "smask" rows are acted on.
pdfimages -list "${pdf}" | tail -n +3 |
  while read -r _ _ imgtype _ _ _ _ _ _ _ objectid _; do
    if [[ "${imgtype}" != "smask" ]]; then
      continue
    fi
    printf -v stem '%s/image-%03d' "${tmpdir}" "${objectid}"

    # Locate the mask by name rather than by a guessed extension.
    mask=
    for candidate in "${stem}-mask".*; do
      if [[ -r "${candidate}" ]]; then
        mask=${candidate}
        break
      fi
    done

    # Use the image that corresponds to this mask. Sometimes the image and
    # its mask are different types; a PNG is preferred when there is one.
    img=
    for candidate in "${stem}.png" "${stem}".*; do
      if [[ -r "${candidate}" ]]; then
        img=${candidate}
        break
      fi
    done

    if [[ -z "${mask}" || -z "${img}" ]]; then
      >&2 echo "Missing image or mask for object ${objectid}"
      exit 1
    fi

    # Always write the result as PNG: JPEG has no alpha channel, so writing
    # the composite back into a .jpg would throw the transparency away.
    merged="${stem}.png"
    #echo "convert ${img} ${mask} -> ${merged}"
    convert "${img}" "${mask}" -alpha off -compose copy-opacity -composite "${merged}" || exit 1
    if [[ "${img}" != "${merged}" ]]; then
      rm -f -- "${img}"
    fi
  done
# The mask files are not part of the output
rm -f -- "${tmpdir}"/image-*-mask.*

# Move the results into the output directory. They are collected first
# because, with no images, an unmatched glob would be handed to mv literally
# and make the script fail.
shopt -s nullglob
results=("${tmpdir}"/*)
shopt -u nullglob
if [[ ${#results[@]} -gt 0 ]]; then
  mv -- "${results[@]}" "${outputDir}/"
else
  >&2 echo "No images found in ${pdf}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment