Skip to content

Instantly share code, notes, and snippets.

@TurnerSoftwareDev
Forked from bendavis78/extract_pdf_images.sh
Last active October 21, 2023 13:40
Show Gist options
  • Save TurnerSoftwareDev/784feb557fe0094c3878b6c7cede8696 to your computer and use it in GitHub Desktop.
Save TurnerSoftwareDev/784feb557fe0094c3878b6c7cede8696 to your computer and use it in GitHub Desktop.
Extracts images from PDF while preserving PNG transparency
#!/usr/bin/env bash
# Extracts images from a PDF into a directory, preserving transparency.
#
# Inspired by https://gist.github.com/bendavis78/ed22a974c2b4534305eabb2522956359
#
# Options:
#
# --help, -h Display help
# --pdf, -p The PDF to extract images from (if omitted, the path is read from stdin)
# --directory, -d The directory to output images to
#
# Examples:
#
# extract-pdf-images -p ./foo.pdf -d ./foo
#
# Abort the script as soon as any command fails
set -o errexit
# Uncomment the next line to trace every command while debugging
#set -x
# Succeeds when the path given as $1 can be read by the current user
validfile() {
  test -r "${1}"
}
# Succeeds only when $1 is an existing directory the current user can write to.
# The -d test keeps a writable regular file from being accepted as a directory.
validdir() { [[ -d ${1} && -w ${1} ]]; }
# Verifies that the command named in $1 can be found. When it cannot, the
# problem is reported on stderr and the script terminates with status 1.
installed() {
  if ! command -v "${1}" >/dev/null 2>&1; then
    echo "Cannot execute the ${1} command" >&2
    exit 1
  fi
}
# Prints the usage text for this script to stdout.
# The script name shown in the text is taken from $0, so it stays correct if
# the file is renamed.
help() {
  # local keeps the name out of the global scope; ${0##*/} drops the directory
  # part of $0 without forking basename
  local scriptName=${0##*/}
  printf '%s\n' \
    "${scriptName} [options]" \
    "" \
    "Options:" \
    "" \
    "--help, -h Prints this help menu" \
    "--pdf, -p The PDF file to extract images from" \
    "--directory, -d The directory to extract images into" \
    "" \
    "Example:" \
    "" \
    "${scriptName} --pdf ./foo.pdf --directory ./foo" \
    ""
}
# Both external tools must be available before any work starts.
# Install poppler-utils to get pdfimages; convert is expected to be
# ImageMagick's command-line tool.
for tool in pdfimages convert; do
  installed "${tool}"
done
# Values filled in while the command line is parsed
pdf=""
outputDir=""
# Without any parameters there is nothing to do: show the usage text and fail
if (( $# == 0 )); then
  help
  exit 1
fi
# Aborts with a diagnostic when an option is the last word on the command line
# and therefore has no value to consume. Without this check `shift 2` fails
# and `set -e` ends the script without saying why.
#   $1 - the option as typed
#   $2 - number of words still unparsed, the option itself included
needsvalue() {
  if (( $2 < 2 )); then
    >&2 echo "Option ${1} requires a value"
    exit 1
  fi
}
# Parse the parameters
while [[ $# != 0 ]]; do
  case "${1}" in
    -h|--help)
      help
      exit 0
      ;;
    -p|--pdf)
      needsvalue "${1}" "$#"
      pdf="${2}"
      shift 2
      ;;
    -d|--directory)
      needsvalue "${1}" "$#"
      outputDir="${2}"
      shift 2
      ;;
    *)
      # Unrecognised words are skipped rather than rejected
      shift
      ;;
  esac
done
# No PDF given as an option: take its path from standard input instead
[[ -n ${pdf} ]] || pdf=$(cat)
# Refuse to continue unless the input can be read and the output written
validfile "${pdf}" || {
  >&2 echo "${pdf} cannot be read"
  exit 1
}
validdir "${outputDir}" || {
  >&2 echo "${outputDir} cannot be written to"
  exit 1
}
# Removes the scratch directory. The path is quoted and preceded by -- so a
# temp location containing spaces or a leading dash cannot make rm touch
# anything else.
cleanuptmpdir() { rm -rf -- "${tmpdir}"; }
# Work in a private scratch directory, created under $TMPDIR when that is set
# and under /tmp otherwise
tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/extract-pdf-images-XXXXXX")
# Remove the tmpdir when we're done, whichever way the script ends. The trap
# is installed before anything is created inside the directory.
trap cleanuptmpdir EXIT
# An interrupt must stop the script instead of letting it carry on with a
# deleted scratch directory; exiting here also fires the EXIT trap.
trap 'exit 130' INT
mkdir "${tmpdir}/extracted"
# Every path below is built from tmpdir, so refuse to run without it
: "${tmpdir:?tmpdir must be set}"
# Extract every image into <tmpdir>/extracted/image-NNN.<ext>
pdfimages -all "${pdf}" "${tmpdir}/extracted/image" || exit 1
# Rename images based on object id and whether or not they are a mask.
# Fields used from each `pdfimages -list` row: 2 = image number, 3 = type,
# 11 = object id. read splits the row itself, so no awk process is needed.
pdfimages -list "${pdf}" | tail -n +3 | while read -r _ num imgtype _ _ _ _ _ _ _ objectid _; do
  # Skip rows that carry no image number (e.g. a blank line)
  [[ -n "${num}" ]] || continue
  if [[ "${imgtype}" == "smask" ]]; then
    suffix="-mask"
  else
    suffix=""
  fi
  # Fixed format strings: tmpdir is passed as data, never as part of the format
  printf -v srcstem '%s/extracted/image-%03d' "${tmpdir}" "${num}"
  printf -v deststem '%s/image-%03d%s' "${tmpdir}" "${objectid}" "${suffix}"
  # Take whatever pdfimages actually wrote for this image number and keep its
  # extension. With -all the result is not always .jpg or .png (JPEG2000,
  # JBIG2, CCITT and CMYK images use other extensions), so guessing the name
  # from the encoding column makes mv fail on such files.
  moved=0
  for src in "${srcstem}".*; do
    # An unmatched glob is left as the literal pattern; ignore it
    [[ -e "${src}" ]] || continue
    #echo "$src -> ${deststem}.${src##*.}";
    mv -- "${src}" "${deststem}.${src##*.}" || exit 1
    moved=1
  done
  if (( moved == 0 )); then
    >&2 echo "No extracted file found for image ${num}"
    exit 1
  fi
done
rmdir "${tmpdir}/extracted"
# The globs and the final mv below are destructive when these are empty
: "${tmpdir:?tmpdir must be set}" "${outputDir:?outputDir must be set}"
## Merge the images that have a mask
# Fields used from each `pdfimages -list` row: 3 = type, 9 = encoding,
# 11 = object id. read splits the row itself, so no awk process is needed.
pdfimages -list "${pdf}" | tail -n +3 | while read -r _ _ imgtype _ _ _ _ _ imgenc _ objectid _; do
  # Only soft-mask rows trigger a merge
  [[ "${imgtype}" == "smask" ]] || continue
  if [[ "${imgenc}" == "jpeg" ]]; then
    ext="jpg"
  else
    ext="png"
  fi
  # Fixed format string: tmpdir is passed as data, never as part of the format
  printf -v stem '%s/image-%03d' "${tmpdir}" "${objectid}"
  mask="${stem}-mask.${ext}"
  # Use the image that corresponds to this mask. Sometimes the image and its mask are different types.
  if validfile "${stem}.png"; then
    img="${stem}.png"
  else
    img="${stem}.jpg"
  fi
  #echo "convert $img $mask";
  # Always write the composite as PNG: JPEG has no alpha channel, so writing
  # the result back to a .jpg would throw away the mask that was just applied.
  convert "${img}" "${mask}" -alpha off -compose copy-opacity -composite "${stem}.png" || exit 1
  # The PNG replaces a JPEG source; drop the now redundant original
  if [[ "${img}" != "${stem}.png" ]]; then
    rm -f -- "${img}"
  fi
done
rm -f -- "${tmpdir}"/image-*-mask.*
# Hand the results over. An unmatched glob is left as the literal pattern, so
# check that something exists before calling mv (a PDF may contain no images).
results=("${tmpdir}"/*)
if [[ -e "${results[0]}" ]]; then
  mv -- "${results[@]}" "${outputDir}/"
else
  >&2 echo "No images were extracted from ${pdf}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment