Solr dump-and-restore script in plain sh and bc; paginates over each core, dumps every page to a file, and forks the insertion into the target to speed things up.
#!/bin/sh
#
# Pages through the documents of one or more Solr cores and loads them
# into the same cores on another Solr instance.
#
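# Example invocation (script name, hosts and core names are placeholders):
#   sh solr-copy.sh http://source.solr:8983/solr http://target.solr:8983/solr 100000 core1 core2
#
# Requires curl and bc. Pages are pulled sequentially; each POST to the
# target runs as a forked background job.
#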
if [ $# -lt 4 ]; then
  echo "Usage: $0 <source> <target> <limit> [core1 core2...]"
  echo "       $0 http://source.solr:8983/solr http://target.solr:8983/solr 100000 core1 core2"
  exit 1
fi
SOURCE_URL=$1
TARGET_URL=$2
LIMIT=$3
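
# sync_core: pull one page of documents from a source core and push it into
# the target core from a forked background job.
#   $1 = source core URL   $2 = target core URL   $3 = core name
#   $4 = start offset      $5 = page number       $6 = comma-separated field list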
sync_core()
{
  _source=$1
  _target=$2
  _core=$3
  _offset=$4
  _page=$5
  _fields=$6
  _outfile="solr-dump.${_core}.${_page}.json"

  # Dump one page of query results as JSON, keeping only the docs array
  _url="${_source}/select?q=*:*&wt=json&fl=${_fields}&start=${_offset}&rows=${LIMIT}"
  curl -s "${_url}" | sed -e 's;^.*,"docs":;;' -e 's;]}}$;];' > "${_outfile}" && \
    echo "[${_core}] [Page ${_page}] Pulling data from source... ok" || \
    echo "[${_core}] [Page ${_page}] Pulling data from source... failed!"
  # Import the dump into the target core from a forked script, so the next
  # page can be pulled while this one uploads
  _tmp=$(mktemp)
  cat > "${_tmp}" <<EOF
# post to target and report the result
curl -qsm 900 "${_target}/update/json?commit=true" --data-binary @${_outfile} -H 'Content-type:application/json; charset=utf-8' > /dev/null && \
  echo "[${_core}] [Page ${_page}] Storing data in target... ok" || \
  echo "[${_core}] [Page ${_page}] Storing data in target... failed!"
# clean up the dump file
rm "${_outfile}"
EOF
  sh "${_tmp}" && rm "${_tmp}" &
}

CORE=4
while [ ${CORE} -le $# ]; do
  eval _core="\${${CORE}}"

  # count the number of documents, for pagination
  _docs=$(curl -s "${SOURCE_URL}/${_core}/select?q=*:*&rows=0&wt=json" | grep -Eo '"numFound":[0-9]+' | sed -e 's;^.*:;;g')
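  # e.g. a response containing ..."response":{"numFound":123456,"start":0,...
  # is reduced by the grep/sed pair above to just 123456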
  # Ask for zero rows in CSV format: the response is just the header line,
  # i.e. the field list; drop the internal _version_ field from it
  _fields=$(curl -s "${SOURCE_URL}/${_core}/select?q=*:*&fl=*&wt=csv&rows=0" | sed -e 's/,_version_,/,/' -e 's/,_version_//' -e 's/_version_,//')
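  # e.g. a core with fields id, name and _version_ yields the header
  # "id,name,_version_", which the sed expressions reduce to "id,name"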

  _page=0
  # bc truncates the integer division, so add one page for the remainder
  _pages=$(echo "(${_docs}/${LIMIT})+1" | bc)
  echo "[${_core}] ${_docs} docs, ${_pages} pages"
  while [ ${_page} -lt ${_pages} ]; do
    _offset=$(echo "${_page}*${LIMIT}" | bc)
    _page=$(echo "${_page}+1" | bc)
    sync_core "${SOURCE_URL}/${_core}" "${TARGET_URL}/${_core}" "${_core}" "${_offset}" "${_page}" "${_fields}"
  done

  CORE=$(echo "${CORE}+1" | bc)
done