Skip to content

Instantly share code, notes, and snippets.

@Fedalto
Last active January 16, 2019 07:23
Show Gist options
  • Save Fedalto/5658639 to your computer and use it in GitHub Desktop.
Save Fedalto/5658639 to your computer and use it in GitHub Desktop.
This script monitors the health of a Solr server. https://github.com/Fedalto/solr-monitor
The MIT License (MIT)
Copyright (c) 2015 Leonardo Fedalto
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
#!/bin/bash
# Solr health check.
# This script gets the list of active cores, pings each master core and checks
# the replication state of each slave core.
# If a single check fails, the script exits with status 2.
# All checks must pass for it to exit with status 0.
# Global state shared by the functions below.
# Exit status reported at the end of the run; failing checks set it to 2.
EXIT_CODE=0
# Host and port of the Solr server to query (overridable with --host / --port).
SOLR_HOST="localhost"
SOLR_HOST_PORT="8983"
# The last successful replication must be at most this many seconds old
# (overridable with --diff); an older one is accepted only if the master and
# slave index versions match.
REPLICATION_TIME_TOLERANCE=900 # = 15 min
# Accumulated report text. Entries end in a literal "\n" escape that is
# expanded when the report is printed at the end of main.
OUT=""
# Treat the expansion of an unset variable as an error.
set -u
#######################################
# Parse command-line options into the global settings.
# Globals:   SOLR_HOST, SOLR_HOST_PORT, REPLICATION_TIME_TOLERANCE (written)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   usage text (via pod2usage) or an "UNKNOWN: ..." line on stdout
# Exits:     1 after printing usage / help / manual,
#            2 when an option is missing its value or the host is empty
#######################################
parse_args() {
  # Running with no arguments at all only prints the usage summary.
  if [ "$#" -eq 0 ]; then
    pod2usage "$0"
    exit 1
  fi

  while [ "$#" -gt 0 ]; do
    case "$1" in
      --help|-h|-\?)
        pod2usage -verbose 1 "$0"
        exit 1
        ;;
      --man)
        pod2usage -verbose 2 "$0"
        exit 1
        ;;
      --host|-H|--port|-P|--diff|-D)
        # Each of these options takes a value. Without this guard a trailing
        # option trips `set -u` on "$2" with a cryptic "unbound variable"
        # error, or (with nounset off) loops forever because a failed
        # `shift 2` consumes nothing.
        if [ "$#" -lt 2 ]; then
          echo "UNKNOWN: option $1 requires an argument"
          exit 2
        fi
        case "$1" in
          --host|-H) SOLR_HOST="$2" ;;
          --port|-P) SOLR_HOST_PORT="$2" ;;
          --diff|-D) REPLICATION_TIME_TOLERANCE="$2" ;;
        esac
        shift 2
        ;;
      *)
        # First non-option argument: stop parsing.
        break
        ;;
    esac
  done

  # An explicitly empty host (e.g. --host "") cannot be queried.
  [ -n "$SOLR_HOST" ] || {
    echo "UNKNOWN: --host argument is not set"
    exit 2
  }
}
#######################################
# Issue a GET request against the Solr HTTP API.
# Globals:   SOLR_HOST, SOLR_HOST_PORT (read)
# Arguments: $1 - path below /solr/ (may carry a query string)
# Outputs:   curl's output on stdout; a CRITICAL line on stderr on failure
# Exits:     2 when curl reports failure. When this function runs inside
#            $(...), that exit ends only the command-substitution subshell.
#######################################
call_solr_api() {
  local url="http://${SOLR_HOST}:${SOLR_HOST_PORT}/solr/${1}"

  if ! curl --retry 3 --max-time 15 --fail --silent "$url"; then
    echo "CRITICAL: server \"$SOLR_HOST\" is not responding or returned incorrect data." >&2
    exit 2
  fi
}
# Request the "admin/cores" endpoint through call_solr_api.
# Outputs: whatever call_solr_api writes for that endpoint.
get_cores_status() {
  local endpoint="admin/cores"
  call_solr_api "$endpoint"
}
# Request the replication "details" report of one core through call_solr_api.
# Arguments: $1 - core name
# Outputs:   whatever call_solr_api writes for that endpoint.
get_replication_details() {
  local core_name="${1}"
  call_solr_api "${core_name}/replication?command=details"
}
# Request the "admin/ping" endpoint of one core through call_solr_api.
# Arguments: $1 - core name
# Outputs:   whatever call_solr_api writes for that endpoint.
ping_solr_core() {
  local core_name="${1}"
  call_solr_api "${core_name}/admin/ping"
}
#######################################
# Extract core names from a core-status XML document.
# Globals:   SOLR_HOST (read, for the error message)
# Arguments: $1 - XML text
# Outputs:   for every line of the xmllint-formatted XML that contains
#            <str name="name", that line with the opening tag (plus leading
#            spaces) and the closing </str> removed; one per line on stdout.
#            If nothing is found, a CRITICAL line on stdout instead.
# Exits:     2 when no name was found
#######################################
get_solr_cores() {
  local names
  names=$(
    xmllint --format - <<< "$1" \
      | grep '<str name="name"' \
      | sed -e 's/[ ]*<str name="name">//g' -e 's/<\/str>//g'
  )

  if [ -n "$names" ]; then
    echo "$names"
    return
  fi

  echo "CRITICAL: server \"$SOLR_HOST\" returned an empty list of cores."
  exit 2
}
# Tell whether a replication-details document describes a master core.
# Arguments: $1 - XML text
# Returns:   0 if the "isMaster" string under the "details" list is exactly
#            "true", 1 otherwise
is_master() {
  local master_flag
  master_flag=$(xmlstarlet sel -t -v "/response/lst[@name='details']/str[@name='isMaster']" <<< "$1")
  [[ "$master_flag" == "true" ]]
}
#######################################
# Report every core listed under "initFailures" in the core-status document.
# Globals:   OUT (appended to), EXIT_CODE (set to 2 if any core failed)
# Arguments: $1 - core-status XML text
#######################################
check_init_failures() {
  local cores_status="$1"
  local failed_core problem

  # xmlstarlet prints one failed core name per line. Reading line by line
  # keeps a name intact even if it contains whitespace, and keeps the loop
  # variable local instead of leaking it into the global scope.
  while IFS= read -r failed_core || [[ -n "$failed_core" ]]; do
    # Skip the blank separator lines xmlstarlet may emit.
    [[ -n "$failed_core" ]] || continue

    # NOTE(review): the name is interpolated into the XPath as-is; a name
    # containing a single quote would break the expression.
    problem=$(xmlstarlet sel -t -v "/response/lst[@name='initFailures']/str[@name='${failed_core}']" <<< "$cores_status")

    OUT="${OUT}Core \"${failed_core}\" could not initialize: ${problem}\n"
    EXIT_CODE=2
  done < <(xmlstarlet sel -t -m "/response/lst[@name='initFailures']/str" -m "@name" -v . -n <<< "$cores_status")
}
#######################################
# Ping a master core and record the status it reports.
# Globals:   OUT (appended to), EXIT_CODE (set to 2 unless the status is "OK")
# Arguments: $1 - core name
#######################################
check_master_core() {
  local core_name="$1"
  local ping_xml status

  ping_xml=$(ping_solr_core "$core_name")
  status=$(xmlstarlet sel -t -v "/response/str[@name='status']" <<< "$ping_xml")

  OUT="${OUT}Core \"${core_name}\" returned \"${status}\".\n"
  if [ "$status" != "OK" ]; then
    EXIT_CODE=2
  fi
}
#######################################
# Check the replication health of a slave core.
#
# The checks run in order and the first one that decides the outcome ends
# the function:
#   1) the master URL must be defined;
#   2) the slave must know the master's index version;
#   3) a core that is replicating right now and has no replication history
#      yet is accepted (first replication);
#   4) a core that has never replicated successfully is an error;
#   5) the last successful replication must be newer than
#      REPLICATION_TIME_TOLERANCE seconds, or the master and slave index
#      versions must match.
#
# Globals:   OUT (appended to), EXIT_CODE (set to 2 on any problem),
#            REPLICATION_TIME_TOLERANCE (read)
# Arguments: $1 - core name
#            $2 - replication-details XML text for that core
#######################################
check_slave_core() {
  local core="$1"
  local replication_details="$2"
  local details_xpath="/response/lst[@name='details']"
  local slave_xpath="${details_xpath}/lst[@name='slave']"
  local master_url master_index_version slave_index_version
  local replicated_at failed_at is_replicating
  local last_replication_epoch tolerance_epoch
  local fresh=0

  # URL of the corresponding master core.
  master_url=$(xmlstarlet sel -t -v "${slave_xpath}/str[@name='masterUrl']" <<< "$replication_details")
  # Index version of the master, as seen by the slave.
  master_index_version=$(xmlstarlet sel -t -v "${slave_xpath}/lst[@name='masterDetails']/long[@name='indexVersion']" <<< "$replication_details")
  # Index version of the slave itself.
  slave_index_version=$(xmlstarlet sel -t -v "${details_xpath}/long[@name='indexVersion']" <<< "$replication_details")
  # First entry of the successful-replication list.
  replicated_at=$(xmlstarlet sel -t -v "${slave_xpath}/arr[@name='indexReplicatedAtList']/str[1]" <<< "$replication_details")
  # First entry of the failed-replication list.
  failed_at=$(xmlstarlet sel -t -v "${slave_xpath}/arr[@name='replicationFailedAtList']/str[1]" <<< "$replication_details")
  # Whether the core is pulling the index from the master right now.
  is_replicating=$(xmlstarlet sel -t -v "${slave_xpath}/str[@name='isReplicating']" <<< "$replication_details")

  # Every early exit below uses `return`: this function contains no loop, so
  # `continue` is an error on current bash and lets execution fall through
  # into the remaining checks.

  # masterUrl is not set: replication is broken.
  if [[ -z "$master_url" ]]; then
    OUT="${OUT}Core \"${core}\" has no masterUrl set, solr.xml misconfiguration or orphaned core.\n"
    EXIT_CODE=2
    return
  fi

  # The slave could not get the master index version (master down, network).
  if [[ -z "$master_index_version" ]]; then
    OUT="${OUT}Core \"$core\" could not get master index version.\n"
    EXIT_CODE=2
    return
  fi

  # No history at all but a replication in progress: this is the very first
  # replication, which is not an error.
  if [[ -z "$failed_at" && -z "$replicated_at" && "$is_replicating" == "true" ]]; then
    OUT="${OUT}Core \"${core}\" is replicating for the first time.\n"
    return
  fi

  # No successful replication on record (whether or not a failure is).
  if [[ -z "$replicated_at" ]]; then
    OUT="${OUT}Core \"${core}\" has problems replicating.\n"
    EXIT_CODE=2
    return
  fi

  last_replication_epoch=$(date -d "$replicated_at" +%s)
  tolerance_epoch=$(date -d "now - $REPLICATION_TIME_TOLERANCE seconds" +%s)

  # Only trust the timestamps if both conversions produced a number; an
  # unparseable date then counts as "not fresh" instead of a test(1) error.
  if [[ "$last_replication_epoch" =~ ^[0-9]+$ && "$tolerance_epoch" =~ ^[0-9]+$ ]] \
    && (( last_replication_epoch > tolerance_epoch )); then
    fresh=1
  fi

  # Recent enough, or old but at the same index version as the master.
  if (( fresh )) || [[ "$master_index_version" == "$slave_index_version" ]]; then
    OUT="${OUT}Core \"${core}\" is up to date.\n"
  else
    OUT="${OUT}Core \"${core}\" is behind master more then $REPLICATION_TIME_TOLERANCE seconds.\n"
    EXIT_CODE=2
  fi
}
#######################################
# Entry point: parse options, check every core, print the report and exit.
# Globals:   OUT (read), EXIT_CODE (read, set to 2 when a core's replication
#            details cannot be fetched)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   the accumulated report on stdout
# Exits:     EXIT_CODE after a full run; 2 right away when the core list
#            cannot be obtained
#######################################
main() {
  parse_args "$@"

  # Declared separately from the assignments: `local x=$(cmd)` always
  # succeeds and hides cmd's status, and an `exit` inside $(...) only ends
  # the subshell, so each failure has to be checked here.
  local cores_status solr_cores core replication_details

  # On failure call_solr_api has already written the reason to stderr.
  cores_status=$(get_cores_status) || exit 2

  if ! solr_cores=$(get_solr_cores "$cores_status"); then
    # The captured text is an error message, not a list of cores; show it
    # instead of checking each of its words as if it were a core name.
    [ -z "$solr_cores" ] || echo "$solr_cores"
    exit 2
  fi

  check_init_failures "$cores_status"

  # Word splitting is intentional: solr_cores holds one core name per line.
  for core in $solr_cores; do
    if ! replication_details=$(get_replication_details "$core"); then
      OUT="${OUT}Core \"${core}\" did not return replication details.\n"
      EXIT_CODE=2
      continue
    fi

    if is_master "$replication_details"; then
      check_master_core "$core"
    else
      check_slave_core "$core" "$replication_details"
    fi
  done

  # %b expands the literal "\n" escapes stored in OUT.
  printf '%b' "$OUT"
  exit "$EXIT_CODE"
}
# Run the check. main always ends with `exit`, so bash never goes on to
# interpret the POD text that follows; pod2usage reads that text from "$0".
main "$@"
__END__
=pod
=head1 NAME
solr-monitor - Simple script to check the status of all defined cores on a solr server
=head1 SYNOPSIS
solr-monitor [OPTIONS]
=head1 OPTIONS
=over 4
=item B<--help> | B<-h>
Print the brief help message and exit.
=item B<--man>
Print the manual page and exit.
=item B<--host> | B<-H> HOST
Check this host instead of localhost.
=item B<--port> | B<-P> Port
Use this port instead of the default(8983) to connect.
=item B<--diff> | B<-D> Time difference between now and when solr last replicated
Use this option to set the maximum difference in seconds between the time when the solr slave replicated and now.
=back
Use '--' to separate options and argument if it starts with '-'.
=head1 DESCRIPTION
Simple script to run that will query the solr server for a list of defined cores
and then verify that all of them respond to ping requests
=head1 EXAMPLES
solr-monitor --host qa-c1-solrmst1
solr-monitor -H qa-c1-solrmst1 -P 80 -D 900
=head1 AUTHOR
Alexander V. Chykysh <ochykysh@magus.org.ua>
Leonardo Fedalto <lfedalto@gmail.com>
=cut
@Fedalto
Copy link
Author

Fedalto commented May 27, 2013

Dependencies: curl, xmllint, xmlstarlet.
Tested on Linux and OS X, Solr 4.1 API.

By default, it will check Solr cores in localhost on port 8983 and will have a replication tolerance of 15 minutes.
Meaning that if a slave core is behind the master and didn't replicate in 15 minutes, the script will return 2.

Return codes:

  • 0, if everything is OK.
  • 2, if we have a problem in a core (replication issues, core is down, Solr is not responding at all, returned an empty list of cores, etc)

Example usage:

$ ./solr-monitor -h
Usage:
    solr-monitor [OPTIONS]

Options:
    --help | -h
        Print the brief help message and exit.

    --man
        Print the manual page and exit.

    --host | -H HOST
        Check this host instead of localhost.

    --port | -P Port
        Use this port instead of the default(8983) to connect.

    --diff | -D Time difference in seconds between now and when solr last replicated
        Use this option to set the maximum difference in seconds between the
        time when the solr slave replicated and now.
$ ./solr-monitor --host solrmaster1
Core "core0" returned "OK".
Core "core1" returned "OK".
Core "core2" returned "OK".
$ echo $?
0

$ ./solr-monitor -H slave.solr.com -P 8080 --diff 3600
Core "slave0" is up to date.
Core "slave1" is up to date.
Core "slave2" could not get master index version.
Core "slave3" is up to date.
$ echo $?
2

@rajeshpal007
Copy link

Hi, I can help you improve this.

Let me know if you are interested.
skype : pal.rajesh

@SarthakTA3
Copy link

Hi, I am trying to use this script in dev server. I have only edited Line 68 "http://${SOLR_HOST}:${SOLR_HOST_PORT}/solr/${1}" to http://172.35.43.10:9001/solr/. The other local host and port detail configurations are same. While executing the script with ./checksolr.sh --host localhost, i am getting this error:

-:162.1: Premature end of data in tag link line 43
-:162.1: Premature end of data in tag link line 42
-:162.1: Premature end of data in tag link line 41
-:162.1: Premature end of data in tag link line 40
-:162.1: Premature end of data in tag link line 39
-:162.1: Premature end of data in tag link line 38
-:162.1: Premature end of data in tag link line 37
-:162.1: Premature end of data in tag link line 36
-:162.1: Premature end of data in tag link line 35
-:162.1: Premature end of data in tag link line 34
-:162.1: Premature end of data in tag link line 33
-:162.1: Premature end of data in tag link line 32
-:162.1: Premature end of data in tag link line 31
-:162.1: Premature end of data in tag link line 30
-:162.1: Premature end of data in tag link line 29
-:162.1: Premature end of data in tag link line 28
-:162.1: Premature end of data in tag link line 27
-:162.1: Premature end of data in tag link line 25
-:162.1: Premature end of data in tag head line 21
-:162.1: Premature end of data in tag html line 2
-:56.8: Opening and ending tag mismatch: meta line 45 and head

^ -:65.19: Entity 'nbsp' not defined

 

^ -:82.17: Entity 'nbsp' not defined   ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">

Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2 -:56.8: Opening and ending tag mismatch: meta line 45 and head ^ -:65.19: Entity 'nbsp' not defined

 

^ -:82.17: Entity 'nbsp' not defined   ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">

Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2 -:56.8: Opening and ending tag mismatch: meta line 45 and head ^ -:65.19: Entity 'nbsp' not defined

 

^ -:82.17: Entity 'nbsp' not defined   ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">

Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2

.Core "CRITICAL:" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "server" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core ""localhost"" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "returned" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "an" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "empty" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "list" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "of" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "cores." has no masterUrl set, solr.xml misconfiguration or orphaned core.[ecommadm@k1c20050so101vl solr_script]$ ls -ltr
total 8
-rwxr-xr-x 1 ecommadm sorianahyb 8027 Apr 7 00:44 checksolr.sh
[ecommadm@k1c20050so101vl solr_script]$

Can someone please help me in resolving this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment