Skip to content

Instantly share code, notes, and snippets.

@Martin91
Last active April 24, 2016 13:03
Show Gist options
  • Save Martin91/4701798f9074c6a00fe0f49540e84c49 to your computer and use it in GitHub Desktop.
Save Martin91/4701798f9074c6a00fe0f49540e84c49 to your computer and use it in GitHub Desktop.
Check Solr cores and their replication status with a script
#!/usr/bin/env ruby
# Author: Martin Hong
# Date: 2016/04/24
#
# Monitor Solr health including replication status
# Inspired by the script at https://issues.apache.org/jira/browse/SOLR-1855
require 'net/http'
require 'optparse'
require 'ostruct'
require 'json'
require 'date'
# require 'byebug'
# Global runtime configuration shared by every check in this script.
$options = OpenStruct.new
# Exit status reported when the script finishes; checks change it when they
# detect a problem.
$options.exit_code = 0
$options.solr_host = "localhost"
$options.solr_host_port = "8983"
# A slave core counts as up to date when its last replication happened
# less than this many seconds ago.
$options.replication_time_tolerance = 3600 # = 1h

# Parses command line arguments into $options.
#
# argv - Array of argument strings; recognised options are removed from it.
#
# Returns the remaining (non-option) arguments.
# Raises OptionParser::ParseError for unknown options or malformed values,
# e.g. a non-numeric value for --difference.
def parse_options(argv = ARGV)
  OptionParser.new do |opts|
    opts.banner = "Usage: solr_check [options]"
    opts.separator ""
    opts.separator "Specific options:"
    opts.on("-h", "--host=HOST", "Solr Host") do |host|
      $options.solr_host = host
    end
    opts.on("-p", "--port=PORT", "Solr Port") do |port|
      $options.solr_host_port = port
    end
    # Float coercion rejects non-numeric input up front instead of letting a
    # later `to_f` silently turn it into a tolerance of 0 seconds.
    opts.on("-d", "--difference=DIFFERENCE", Float, "replication difference time tolerance(unit: seconds)") do |difference|
      $options.replication_time_tolerance = difference
    end
    opts.on("-H", "--help", "Show this message") do
      puts opts
      exit
    end
    opts.separator ""
    opts.separator "Example:"
    opts.separator " ./solr_check.rb -H"
    opts.separator " ./solr_check.rb --help"
    opts.separator " ./solr_check.rb -h martin91.com -p 8983"
    opts.separator " ./solr_check.rb --host=martin91.com --port=8983"
    opts.separator " ./solr_check.rb -d 30"
  end.parse!(argv)
end

# parse options
begin
  parse_options
rescue OptionParser::ParseError => e
  # Same exit status (1) as the uncaught exception used to produce, but with a
  # readable message instead of a stack trace.
  abort "#{e.message}\nRun with --help for usage."
end
# Sends a GET request to the Solr HTTP API and returns the decoded JSON body.
#
# path   - Path below "/solr/", e.g. "admin/cores".
# params - Hash of query parameters. It is not modified; `wt=json` is always
#          added (overriding any caller-supplied :wt) on a copy.
#
# Returns the parsed JSON response.
# Any error (connection problem, invalid URI, unparsable body) is retried up
# to three times; after that a CRITICAL message is printed and the process
# exits with status 4.
def call_solr_api(path, params = {})
  max_retries = 3
  attempts = 0
  begin
    attempts += 1
    uri = URI("http://#{$options.solr_host}:#{$options.solr_host_port}/solr/#{path}")
    # Non-destructive merge: the caller's hash must not gain a :wt entry.
    uri.query = URI.encode_www_form(params.merge(wt: :json))
    JSON.parse(Net::HTTP.get_response(uri).body)
  rescue StandardError
    retry if attempts <= max_retries
    puts "CRITICAL: server #{$options.solr_host} is not responding or returned incorrect data."
    exit 4
  end
end
# Requests the "admin/cores" endpoint and returns whatever call_solr_api
# returns for it.
def get_cores_overview
  endpoint = "admin/cores"
  call_solr_api(endpoint)
end
# Requests "<core>/replication" with command=details and returns whatever
# call_solr_api returns for it.
#
# core - Name of the core to query.
def get_replication_details(core)
  endpoint = "#{core}/replication"
  query = { command: 'details' }
  call_solr_api(endpoint, query)
end
# Requests "<core>/admin/ping" and returns whatever call_solr_api returns
# for it.
#
# core - Name of the core to ping.
def ping_solr_core(core)
  endpoint = "#{core}/admin/ping"
  call_solr_api(endpoint)
end
# Reports cores listed under "initFailures" in the cores overview.
#
# cores_overview - Parsed response of the "admin/cores" request.
#
# Prints one line per failed core and sets $options.exit_code to 2 when at
# least one failure is listed. A response without an "initFailures" entry is
# treated as "no failures" instead of raising NoMethodError.
def check_init_failures(cores_overview)
  init_failures = cores_overview["initFailures"] || {}
  return if init_failures.empty?

  init_failures.each do |core, reason|
    puts "Core #{core} could not initialize: #{reason}"
  end
  $options.exit_code = 2
end
# Extracts the core names from the cores overview.
#
# cores_overview - Parsed response of the "admin/cores" request.
#
# Returns the keys of the "status" entry.
# When "status" is missing, not a mapping, or empty, prints a CRITICAL
# message and exits with status 2 (a missing entry used to raise
# NoMethodError instead).
def get_cores(cores_overview)
  status = cores_overview["status"]
  cores = status.respond_to?(:keys) ? status.keys : []
  if cores.empty?
    puts "CRITICAL: server #{$options.solr_host} returned an empty list of cores."
    exit 2
  end
  cores
end
# Tells whether the replication details mark the core as a master, i.e.
# whether details.isMaster equals the string "true".
#
# core_replication_details - Parsed response of the replication details request.
def is_master?(core_replication_details)
  details = core_replication_details["details"]
  master_flag = details["isMaster"]
  master_flag == "true"
end
# Pings a master core and reports the returned status.
#
# core - Name of the core to check.
#
# Prints the ping status. When it is not "OK", $options.exit_code is raised
# to at least 1; a more severe status recorded earlier is never lowered.
def check_master_core(core)
  result = ping_solr_core(core)["status"]
  puts "Core #{core} acts as master returned #{result}\n"
  return if result == "OK"

  # Keep the worst status seen so far (e.g. 2 from init failures).
  $options.exit_code = [$options.exit_code.to_i, 1].max
end
# Walks nested Hash keys and returns nil as soon as a level is missing or is
# not a Hash, so incomplete responses never raise.
def replication_detail(details, *path)
  path.reduce(details) { |node, key| node.is_a?(Hash) ? node[key] : nil }
end

# Checks the replication state of a slave core and reports the findings.
#
# core                     - Name of the core being checked.
# core_replication_details - Parsed response of the replication details request.
#
# Prints one line per finding. Every problem raises $options.exit_code to at
# least 1 without lowering a more severe status recorded earlier.
# A core is considered up to date when its last replication is younger than
# $options.replication_time_tolerance seconds or when its index version
# equals the master's.
def check_slave_core(core, core_replication_details)
  master_url = replication_detail(core_replication_details, "details", "slave", "masterUrl")
  master_index_version = replication_detail(core_replication_details, "details", "slave", "masterDetails", "indexVersion")
  slave_index_version = replication_detail(core_replication_details, "details", "indexVersion")
  slave_replicated_at = replication_detail(core_replication_details, "details", "slave", "indexReplicatedAt")
  slave_replicating = replication_detail(core_replication_details, "details", "slave", "isReplicating")

  # Keep the worst status seen so far (e.g. 2 from init failures).
  flag_problem = -> { $options.exit_code = [$options.exit_code.to_i, 1].max }

  if master_url.to_s == ""
    puts "Core #{core} has not set masterUrl!"
    flag_problem.call
  end
  if master_index_version.to_s == ""
    puts "Core #{core} could not get master index version!"
    flag_problem.call
  end
  if slave_index_version.to_s == "" && slave_replicated_at.to_s == "" && slave_replicating == "true"
    puts "Core #{core} is replicating for the first time."
  end

  # Without a usable replication timestamp the freshness check below cannot
  # run. The original two-branch condition on the failure list reduced to
  # exactly "no replication timestamp", so that is tested directly.
  replicated_time =
    if slave_replicated_at.to_s == ""
      nil
    else
      begin
        DateTime.parse(slave_replicated_at.to_s).to_time
      rescue ArgumentError
        nil
      end
    end

  if replicated_time.nil?
    puts "Core #{core} has problems replicating."
    flag_problem.call
    return
  end

  tolerance = $options.replication_time_tolerance.to_f
  if (Time.now - replicated_time) < tolerance || master_index_version == slave_index_version
    puts "Core #{core} is up to date"
  else
    puts "Core #{core} is out of date"
    flag_problem.call
  end
end
# Runs all checks: fetches the cores overview, reports init failures, then
# checks every core as a master or as a slave depending on its replication
# details.
def main
  overview = get_cores_overview
  check_init_failures(overview)

  get_cores(overview).each do |core_name|
    details = get_replication_details(core_name)
    next check_master_core(core_name) if is_master?(details)

    check_slave_core(core_name, details)
  end
end
# Run every check, then hand the accumulated status to the calling process.
main
exit($options.exit_code)
1. First, download this script:
$ wget https://gist.github.com/Martin91/4701798f9074c6a00fe0f49540e84c49
2. Make the script executable by running the command below in your favourite shell:
$ chmod +x solr_check.rb
3. Run it according to the usage help:
$ ./solr_check.rb -H
Full usage documentation:
================================================================================================================
Usage: solr_check [options]
Specific options:
-h, --host=HOST Solr Host
-p, --port=PORT Solr Port
-d, --difference=DIFFERENCE replication difference time tolerance(unit: seconds)
-H, --help Show this message
Example:
./solr_check.rb -H
./solr_check.rb --help
./solr_check.rb -h martin91.com -p 8983
./solr_check.rb --host=martin91.com --port=8983
./solr_check.rb -d 30
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment