Solr dump-and-restore script in plain sh and bc; paginates over each core, dumps every page to a file, and forks the insertion into the target to speed things up.
#!/bin/sh
#
# Pages through the documents of one or more Solr cores and loads them
# into the same cores on another Solr instance.
#
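# Example invocation (script name, hosts and core names are placeholders):
#   sh solr-copy.sh http://source.solr:8983/solr http://target.solr:8983/solr 100000 core1 core2
#
# Requires curl and bc. Pages are pulled sequentially; each POST to the
# target runs as a forked background job.
#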
if [ $# -lt 4 ]; then
  echo "Usage: $0 <source> <target> <limit> [core1 core2...]"
  echo "       $0 http://source.solr:8983/solr http://target.solr:8983/solr 100000 core1 core2"
  exit 1
fi
SOURCE_URL=$1
TARGET_URL=$2
LIMIT=$3
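
# sync_core: pull one page of documents from a source core and push it into
# the target core from a forked background job.
#   $1 = source core URL   $2 = target core URL   $3 = core name
#   $4 = start offset      $5 = page number       $6 = comma-separated field list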
sync_core()
{
  _source=$1
  _target=$2
  _core=$3
  _offset=$4
  _page=$5
  _fields=$6
  _outfile="solr-dump.${_core}.${_page}.json"

  # Dump one page of query results as JSON, keeping only the docs array
  _url="${_source}/select?q=*:*&wt=json&fl=${_fields}&start=${_offset}&rows=${LIMIT}"
  curl -s "${_url}" | sed -e 's;^.*,"docs":;;' -e 's;]}}$;];' > "${_outfile}" && \
    echo "[${_core}] [Page ${_page}] Pulling data from source... ok" || \
    echo "[${_core}] [Page ${_page}] Pulling data from source... failed!"
  # Import the dump into the target core from a forked script, so the next
  # page can be pulled while this one uploads
  _tmp=$(mktemp)
  cat > "${_tmp}" <<EOF
# post to target and report the result
curl -qsm 900 "${_target}/update/json?commit=true" --data-binary @${_outfile} -H 'Content-type:application/json; charset=utf-8' > /dev/null && \
  echo "[${_core}] [Page ${_page}] Storing data in target... ok" || \
  echo "[${_core}] [Page ${_page}] Storing data in target... failed!"
# clean up the dump file
rm "${_outfile}"
EOF
  sh "${_tmp}" && rm "${_tmp}" &
}

CORE=4
while [ ${CORE} -le $# ]; do
  eval _core="\${${CORE}}"

  # count the number of documents, for pagination
  _docs=$(curl -s "${SOURCE_URL}/${_core}/select?q=*:*&rows=0&wt=json" | grep -Eo '"numFound":[0-9]+' | sed -e 's;^.*:;;g')
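  # e.g. a response containing ..."response":{"numFound":123456,"start":0,...
  # is reduced by the grep/sed pair above to just 123456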
  # Ask for zero rows in CSV format: the response is just the header line,
  # i.e. the field list; drop the internal _version_ field from it
  _fields=$(curl -s "${SOURCE_URL}/${_core}/select?q=*:*&fl=*&wt=csv&rows=0" | sed -e 's/,_version_,/,/' -e 's/,_version_//' -e 's/_version_,//')
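  # e.g. a core with fields id, name and _version_ yields the header
  # "id,name,_version_", which the sed expressions reduce to "id,name"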

  _page=0
  # bc truncates the integer division, so add one page for the remainder
  _pages=$(echo "(${_docs}/${LIMIT})+1" | bc)
  echo "[${_core}] ${_docs} docs, ${_pages} pages"
  while [ ${_page} -lt ${_pages} ]; do
    _offset=$(echo "${_page}*${LIMIT}" | bc)
    _page=$(echo "${_page}+1" | bc)
    sync_core "${SOURCE_URL}/${_core}" "${TARGET_URL}/${_core}" "${_core}" "${_offset}" "${_page}" "${_fields}"
  done

  CORE=$(echo "${CORE}+1" | bc)
done