Skip to content

Instantly share code, notes, and snippets.

@alanorth
Created January 26, 2020 12:05
Show Gist options
  • Save alanorth/1c7c8b2131a19559e273fbc1e58d6a71 to your computer and use it in GitHub Desktop.
Save alanorth/1c7c8b2131a19559e273fbc1e58d6a71 to your computer and use it in GitHub Desktop.
Script to download PDFs from a DSpace repository and create thumbnails for local quality comparison
#!/usr/bin/env bash
#
# create-thumbnails.sh v2020-01-26.0
#
# For every DSpace handle listed in HANDLES:
#   1. download the item's ORIGINAL bitstream (a PDF) via the DSpace REST API,
#      unless a file with the derived name already exists locally;
#   2. create "<handle>-default.jpg", a thumbnail generated directly from the
#      first page of the PDF;
#   3. create "<handle>-new.jpg", a thumbnail generated from a 4x super sample
#      of the first page, for side-by-side quality comparison.
#
# PDFs whose first page is reported as CMYK by identify are converted with
# explicit Ghostscript ICC profiles.
#
# Requires: curl, jq, ImageMagick (convert, identify) and the Ghostscript ICC
# profiles at the paths configured below.
#
# Exit status: 0 if every handle was processed, 1 if any handle failed.

# No `set -e` on purpose: a failure on one handle must not stop the others, so
# every critical command is checked explicitly instead.
set -u -o pipefail

readonly HANDLES=(
  10568/103447
  10568/3149
  10568/51999
  10568/53155
  10568/68624
  10568/68680
  10568/71249
  10568/71259
  10568/72646
  10568/76976
  10568/77628
)
readonly THUMBNAIL_SIZE='600x600'
readonly CURL_BIN_PATH='/usr/bin/curl'
readonly CONVERT_BIN_PATH='/usr/bin/convert'
readonly IDENTIFY_BIN_PATH='/usr/bin/identify'
readonly GHOSTSCRIPT_RGB_PROFILE_PATH='/usr/share/ghostscript/9.50/iccprofiles/default_rgb.icc'
readonly GHOSTSCRIPT_CMYK_PROFILE_PATH='/usr/share/ghostscript/9.50/iccprofiles/default_cmyk.icc'
readonly DSPACE_REST_URL='https://dspacetest.cgiar.org/rest'

# Private scratch directory for the intermediate super sample; a fixed name
# inside it keeps the ".jpg" extension that tells convert which format to
# write. Removed on any exit path.
tmp_dir=$(mktemp -d) || {
  printf 'error: could not create a temporary directory\n' >&2
  exit 1
}
readonly tmp_dir
trap 'rm -rf -- "$tmp_dir"' EXIT

failures=0

for handle in "${HANDLES[@]}"; do
  echo "Processing $handle..."

  # construct PDF filename from handle, ie 10568/3149 → 10568-3149.pdf
  pdf_filename=${handle/\//-}.pdf

  # check if PDF has already been downloaded, download if not
  if ! [[ -f "$pdf_filename" ]]; then
    # ask DSpace for the item's bitstreams
    request_url="${DSPACE_REST_URL}/handle/${handle}?expand=bitstreams"
    # -f makes curl fail on HTTP errors instead of returning the error page
    if ! json_response=$("$CURL_BIN_PATH" -sf "$request_url"); then
      printf 'warning: could not fetch metadata for %s\n' "$handle" >&2
      failures=$((failures + 1))
      continue
    fi

    # extract the retrieveLink of the first ORIGINAL bitstream; -r prints the
    # raw string (no quotes) and "// empty" yields nothing when there is none
    retrieve_link=$(printf '%s' "$json_response" \
      | jq -r '[.bitstreams[] | select(.bundleName == "ORIGINAL") | .retrieveLink][0] // empty')
    if [[ -z "$retrieve_link" ]]; then
      printf 'warning: no ORIGINAL bitstream found for %s\n' "$handle" >&2
      failures=$((failures + 1))
      continue
    fi

    # fetch the PDF (link includes a leading "/")
    request_url="${DSPACE_REST_URL}${retrieve_link}"
    if "$CURL_BIN_PATH" -sf "$request_url" -o "$pdf_filename"; then
      echo "Downloaded $pdf_filename"
    else
      # do not leave a partial file behind: it would be mistaken for a
      # finished download on the next run
      rm -f -- "$pdf_filename"
      printf 'warning: could not download %s\n' "$pdf_filename" >&2
      failures=$((failures + 1))
      continue
    fi
  fi

  # check if PDF uses CMYK colorspace (stderr is discarded because only the
  # colorspace reported on stdout is of interest here)
  cmyk='no'
  identify_output=$("$IDENTIFY_BIN_PATH" "${pdf_filename}[0]" 2> /dev/null)
  if [[ "$identify_output" =~ CMYK ]]; then
    cmyk='yes'
  fi

  # Low-quality thumbnail (aka DSpace default)
  lq_filename="${pdf_filename%.pdf}-default.jpg"
  if [[ "$cmyk" == 'yes' ]]; then
    # Generate low-quality thumbnail (CMYK), pay attention to the order of the
    # profiles: CMYK before reading the PDF, RGB before writing the JPEG
    "$CONVERT_BIN_PATH" -profile "$GHOSTSCRIPT_CMYK_PROFILE_PATH" \
      -thumbnail "$THUMBNAIL_SIZE" -flatten "${pdf_filename}[0]" \
      -profile "$GHOSTSCRIPT_RGB_PROFILE_PATH" "$lq_filename" 2> /dev/null
  else
    # Generate low-quality thumbnail
    "$CONVERT_BIN_PATH" -thumbnail "$THUMBNAIL_SIZE" -flatten \
      "${pdf_filename}[0]" "$lq_filename" 2> /dev/null
  fi
  if [[ $? -ne 0 ]]; then
    printf 'warning: could not create %s\n' "$lq_filename" >&2
    failures=$((failures + 1))
  fi

  # High-quality thumbnail
  # temporary file to write the high-quality super sample to
  temp_file="${tmp_dir}/supersample.jpg"
  hq_filename="${pdf_filename%.pdf}-new.jpg"
  if [[ "$cmyk" == 'yes' ]]; then
    # Generate 4x super sample (CMYK), pay attention to the order of the
    # profiles...
    "$CONVERT_BIN_PATH" -profile "$GHOSTSCRIPT_CMYK_PROFILE_PATH" \
      -density 288 -filter lagrange -resize 25% -flatten "${pdf_filename}[0]" \
      -profile "$GHOSTSCRIPT_RGB_PROFILE_PATH" "$temp_file" 2> /dev/null
  else
    # Generate 4x super sample
    "$CONVERT_BIN_PATH" -density 288 -filter lagrange -resize 25% -flatten \
      "${pdf_filename}[0]" "$temp_file" 2> /dev/null
  fi

  # Generate thumbnail from super sample, but only if the super sample was
  # actually written
  if [[ -s "$temp_file" ]] \
    && "$CONVERT_BIN_PATH" -thumbnail "$THUMBNAIL_SIZE" "$temp_file" "$hq_filename" 2> /dev/null; then
    :
  else
    printf 'warning: could not create %s\n' "$hq_filename" >&2
    failures=$((failures + 1))
  fi

  # remove the super sample so a later handle can never pick up a stale one
  rm -f -- "$temp_file"
done

if ((failures > 0)); then
  printf '%d step(s) failed\n' "$failures" >&2
  exit 1
fi

# Handy jq filters for inspecting the REST response by hand:
#jq '.bitstreams[] | select(.bundleName=="THUMBNAIL") | .retrieveLink'
#jq '.bitstreams[] | select(.bundleName=="ORIGINAL") | .retrieveLink'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment