Shell script to retrieve sitemaps from a CN to a search instance, correcting URLs and preserving file timestamps.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Retrieve and transform metacat generated sitemaps.
#
# Downloads the sitemap index and the sitemaps it lists from SOURCE_HOST,
# rewrites the URLs they contain so they refer to DEST_HOST, and writes the
# results to DEST carrying the modification times of the downloaded copies.
#
# Requires wget, xmlstarlet
#
# Run with:
#   sudo bash getsitemap.sh

# Sitemap namespace binding (prefix=URI), passed to xmlstarlet via -N.
SM_NS="sm=http://www.sitemaps.org/schemas/sitemap/0.9"

# Path to the xmlstarlet executable.
XML="/usr/bin/xmlstarlet"

# Hostname to retrieve sitemaps from.
SOURCE_HOST="cn-stage-ucsb-1.test.dataone.org"

# Hostname for where the sitemaps will be served.
DEST_HOST="search-stage.test.dataone.org"

# URL of the sitemap index on the source host.
SM_INDEX="https://${SOURCE_HOST}/metacat/sitemaps/sitemap_index.xml"

# Folder where transformed sitemaps will be placed.
DEST="/var/www/search-stage.test.dataone.org/sitemaps"

# Folder holding the untransformed downloads. It is kept between runs so
# that wget's timestamp comparison has a previous copy to compare against.
WORK_DIR="/tmp/${DEST_HOST}/originals"

# Prefix for logger messages: script name plus the source host.
APP="${0##*/}/${SOURCE_HOST}"

# Set up the workspace; nothing below can work without these folders.
if ! mkdir -p "${DEST}" "${WORK_DIR}/index"; then
  logger "${APP} ERROR: Unable to create ${DEST} or ${WORK_DIR}/index. Aborting."
  exit 1
fi
logger "${APP} INFO: Start."

# Retrieve the index and preserve its timestamp. With -N the document is only
# downloaded when the copy on the host is newer than the local one.
# wget ignores -N when -O is given, so the target folder is selected with -P
# and the file keeps the name from the URL (sitemap_index.xml), which is the
# name the rest of the script expects.
if ! wget -q -N -P "${WORK_DIR}/index" "${SM_INDEX}"; then
  logger "${APP} ERROR: Unable to retrieve ${SM_INDEX}. Aborting."
  exit 1
fi

# -nt is also true when the published index does not exist yet.
if [[ "${WORK_DIR}/index/sitemap_index.xml" -nt "${DEST}/sitemap_index.xml" ]]; then
  logger "${APP} INFO: New sitemap index."
fi
# Get the sitemap entries: xmlstarlet prints the <loc> value of every
# <sitemap> element in the index, one per line. Reading line by line (rather
# than word-splitting a command substitution) keeps glob characters in a URL
# from being expanded; read also trims surrounding whitespace.
sitemaps=()
while read -r sm_loc; do
  if [[ -n "${sm_loc}" ]]; then
    sitemaps+=("${sm_loc}")
  fi
done < <("${XML}" sel -N "${SM_NS}" -t -m "//sm:sitemap" -v "sm:loc" -n \
  "${WORK_DIR}/index/sitemap_index.xml")

if (( ${#sitemaps[@]} == 0 )); then
  logger "${APP} WARNING: No sitemap entries found in ${WORK_DIR}/index/sitemap_index.xml."
fi

for sm in "${sitemaps[@]}"; do
  # The index advertises locations containing "metacatui"; the files are
  # fetched from the corresponding "metacat/sitemaps" location instead.
  sm_actual="${sm//metacatui/metacat/sitemaps}"
  # -N skips the download when the local copy is already up to date.
  if ! wget -q -N -P "${WORK_DIR}" "${sm_actual}"; then
    # Best effort: one missing sitemap must not stop the others.
    logger "${APP} ERROR: Unable to retrieve ${sm_actual}. Continuing."
  fi
done
# Generate the sitemap index and preserve date modified.
# Dots in the hostname are escaped so sed matches them literally instead of
# treating them as "any character".
source_host_re="${SOURCE_HOST//./\\.}"
if sed "s,${source_host_re}/metacatui,${DEST_HOST}/sitemaps,g" \
  "${WORK_DIR}/index/sitemap_index.xml" > "${DEST}/sitemap_index.xml"; then
  # Carry the downloaded file's modification time over to the published copy.
  touch -r "${WORK_DIR}/index/sitemap_index.xml" "${DEST}/sitemap_index.xml"
else
  logger "${APP} ERROR: Unable to write ${DEST}/sitemap_index.xml."
fi
# Generate the sitemaps and preserve date modified.
# Dots in the hostname are escaped so sed matches them literally instead of
# treating them as "any character".
source_host_re="${SOURCE_HOST//./\\.}"
docs=("${WORK_DIR}"/*.xml)
for doc in "${docs[@]}"; do
  # When nothing matches, the glob is left as the literal pattern; skip it
  # rather than creating a file named "*.xml" in the destination.
  [[ -e "${doc}" ]] || continue

  dest_doc="${DEST}/${doc##*/}"
  # -nt is also true when the published copy does not exist yet.
  if [[ "${doc}" -nt "${dest_doc}" ]]; then
    logger "${APP} INFO: ${dest_doc} updated."
  fi

  # Every document is rewritten on each run; the check above only logs.
  if sed "s,${source_host_re}/metacatui,${DEST_HOST},g" "${doc}" > "${dest_doc}"; then
    # Carry the downloaded file's modification time over to the published copy.
    touch -r "${doc}" "${dest_doc}"
  else
    logger "${APP} ERROR: Unable to write ${dest_doc}."
  fi
done

logger "${APP} INFO: Done."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment