Created
January 26, 2020 12:05
-
-
Save alanorth/1c7c8b2131a19559e273fbc1e58d6a71 to your computer and use it in GitHub Desktop.
Script to download PDFs from a DSpace repository and create thumbnails for local quality comparison
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# create-thumbnails.sh v2020-01-26.0
#
# For every DSpace item handle listed in HANDLES, download the item's PDF via
# the DSpace REST API (unless a local copy already exists) and render two JPEG
# thumbnails of its first page for a local quality comparison:
#
#   <handle>-default.jpg  single-pass `convert -thumbnail` (aka DSpace default)
#   <handle>-new.jpg      4x supersample first, then `convert -thumbnail`
#
# where <handle> is the handle with its "/" replaced by "-". All files are
# written to the current working directory.
#
# Requires: curl, jq, ImageMagick (convert, identify) and the Ghostscript ICC
# profiles at the paths configured below.

# -e is deliberately not enabled: a failure for one handle is reported and the
# loop moves on to the next handle instead of aborting the whole run.
set -uo pipefail

readonly -a HANDLES=(
  '10568/103447'
  '10568/3149'
  '10568/51999'
  '10568/53155'
  '10568/68624'
  '10568/68680'
  '10568/71249'
  '10568/71259'
  '10568/72646'
  '10568/76976'
  '10568/77628'
)

readonly THUMBNAIL_SIZE='600x600'
readonly CURL_BIN_PATH='/usr/bin/curl'
readonly CONVERT_BIN_PATH='/usr/bin/convert'
readonly IDENTIFY_BIN_PATH='/usr/bin/identify'
readonly GHOSTSCRIPT_RGB_PROFILE_PATH='/usr/share/ghostscript/9.50/iccprofiles/default_rgb.icc'
readonly GHOSTSCRIPT_CMYK_PROFILE_PATH='/usr/share/ghostscript/9.50/iccprofiles/default_cmyk.icc'
readonly DSPACE_REST_URL='https://dspacetest.cgiar.org/rest'

# Private scratch directory for the intermediate supersample; set in main().
WORK_DIR=''

# Print a diagnostic message to stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# EXIT trap: remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Make sure the PDF for a handle exists locally, downloading it if needed.
# Globals:   CURL_BIN_PATH, DSPACE_REST_URL (read)
# Arguments: $1 - item handle, e.g. 10568/3149
#            $2 - local filename to store the PDF under
# Outputs:   "Downloaded <file>" on stdout after a successful download;
#            diagnostics on stderr
# Returns:   0 if the file exists afterwards, 1 otherwise
#######################################
fetch_pdf() {
  local handle=$1
  local pdf_filename=$2
  local request_url json_response retrieve_link

  # Already downloaded on a previous run: nothing to do.
  if [[ -f "$pdf_filename" ]]; then
    return 0
  fi

  # Ask DSpace for the item including its bitstreams.
  request_url="${DSPACE_REST_URL}/handle/${handle}?expand=bitstreams"
  # --fail turns HTTP errors into a non-zero exit status instead of handing
  # an error page to jq.
  if ! json_response=$("$CURL_BIN_PATH" -sS --fail "$request_url"); then
    warn "Could not fetch item metadata for $handle"
    return 1
  fi

  # Take the retrieveLink of the first bitstream in the ORIGINAL bundle.
  # -r prints the raw string (no surrounding quotes); "// empty" yields no
  # output at all when there is no such bitstream.
  if ! retrieve_link=$(jq -r \
    '[.bitstreams[]? | select(.bundleName=="ORIGINAL") | .retrieveLink][0] // empty' \
    <<<"$json_response"); then
    warn "Could not parse item metadata for $handle"
    return 1
  fi
  if [[ -z "$retrieve_link" ]]; then
    warn "No ORIGINAL bitstream found for $handle"
    return 1
  fi

  # Fetch the PDF (link includes a leading "/").
  request_url="${DSPACE_REST_URL}${retrieve_link}"
  if ! "$CURL_BIN_PATH" -sS --fail "$request_url" -o "$pdf_filename"; then
    # Do not leave a partial file behind: the next run would treat it as an
    # already-downloaded PDF.
    rm -f -- "$pdf_filename"
    warn "Could not download $pdf_filename"
    return 1
  fi

  echo "Downloaded $pdf_filename"
}

#######################################
# Check whether the first page of a PDF is reported as CMYK by identify.
# Globals:   IDENTIFY_BIN_PATH (read)
# Arguments: $1 - PDF filename
# Returns:   0 if the identify output mentions CMYK, 1 otherwise (including
#            when identify itself fails and prints nothing)
#######################################
uses_cmyk() {
  local identify_output
  identify_output=$("$IDENTIFY_BIN_PATH" "${1}[0]" 2> /dev/null)
  [[ "$identify_output" == *CMYK* ]]
}

#######################################
# Render the low-quality and high-quality thumbnails for one PDF.
# Globals:   CONVERT_BIN_PATH, THUMBNAIL_SIZE, GHOSTSCRIPT_CMYK_PROFILE_PATH,
#            GHOSTSCRIPT_RGB_PROFILE_PATH, WORK_DIR (read)
# Arguments: $1 - PDF filename (expected to end in .pdf)
# Outputs:   <name>-default.jpg and <name>-new.jpg next to the PDF;
#            diagnostics on stderr
# Returns:   0 if both thumbnails were written, 1 otherwise
#######################################
render_thumbnails() {
  local pdf_filename=$1
  local first_page="${pdf_filename}[0]"
  local lq_filename="${pdf_filename%.pdf}-default.jpg"
  local hq_filename="${pdf_filename%.pdf}-new.jpg"
  local supersample="${WORK_DIR}/supersample.jpg"
  local cmyk='no'
  local status=0

  if uses_cmyk "$pdf_filename"; then
    cmyk='yes'
  fi

  # NOTE(review): stderr of convert is discarded, as in the earlier version of
  # this script (presumably to hide noisy delegate warnings); drop the
  # redirections when debugging a failing conversion.

  # Low-quality thumbnail (aka DSpace default)
  if [[ "$cmyk" == 'yes' ]]; then
    # CMYK source: pay attention to the order of the profiles...
    "$CONVERT_BIN_PATH" -profile "$GHOSTSCRIPT_CMYK_PROFILE_PATH" \
      -thumbnail "$THUMBNAIL_SIZE" -flatten "$first_page" \
      -profile "$GHOSTSCRIPT_RGB_PROFILE_PATH" "$lq_filename" 2> /dev/null
  else
    "$CONVERT_BIN_PATH" -thumbnail "$THUMBNAIL_SIZE" -flatten "$first_page" \
      "$lq_filename" 2> /dev/null
  fi || {
    warn "Could not create $lq_filename"
    status=1
  }

  # High-quality thumbnail: render a 4x supersample (288 dpi scaled to 25%)
  # into the scratch directory first. Remove any leftover from a previous
  # handle so a failed render can never reuse a stale image.
  rm -f -- "$supersample"
  if [[ "$cmyk" == 'yes' ]]; then
    # CMYK source: pay attention to the order of the profiles...
    "$CONVERT_BIN_PATH" -profile "$GHOSTSCRIPT_CMYK_PROFILE_PATH" \
      -density 288 -filter lagrange -resize 25% -flatten "$first_page" \
      -profile "$GHOSTSCRIPT_RGB_PROFILE_PATH" "$supersample" 2> /dev/null
  else
    "$CONVERT_BIN_PATH" -density 288 -filter lagrange -resize 25% -flatten \
      "$first_page" "$supersample" 2> /dev/null
  fi || {
    warn "Could not create supersample for $pdf_filename"
    return 1
  }

  # Generate thumbnail from super sample
  if ! "$CONVERT_BIN_PATH" -thumbnail "$THUMBNAIL_SIZE" "$supersample" \
    "$hq_filename" 2> /dev/null; then
    warn "Could not create $hq_filename"
    status=1
  fi
  rm -f -- "$supersample"

  return "$status"
}

#######################################
# Process every handle in HANDLES: download, then render both thumbnails.
# Returns: 0 if every handle succeeded, 1 if at least one failed
#######################################
main() {
  local handle pdf_filename
  local failures=0

  # Unpredictable private scratch directory, removed on any exit path.
  if ! WORK_DIR=$(mktemp -d); then
    warn 'Could not create temporary directory'
    return 1
  fi
  trap cleanup EXIT

  for handle in "${HANDLES[@]}"; do
    echo "Processing $handle..."

    # construct PDF filename from handle, ie 10568/3149 -> 10568-3149.pdf
    pdf_filename="${handle/\//-}.pdf"

    if ! fetch_pdf "$handle" "$pdf_filename"; then
      failures=$((failures + 1))
      continue
    fi

    if ! render_thumbnails "$pdf_filename"; then
      failures=$((failures + 1))
    fi
  done

  if ((failures > 0)); then
    warn "$failures handle(s) could not be fully processed"
    return 1
  fi
}

main "$@"

# Reference jq filters for the bitstream bundles of an item:
#jq '.bitstreams[] | select(.bundleName=="THUMBNAIL") | .retrieveLink'
#jq '.bitstreams[] | select(.bundleName=="ORIGINAL") | .retrieveLink'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment