Last active
April 24, 2016 13:03
-
-
Save Martin91/4701798f9074c6a00fe0f49540e84c49 to your computer and use it in GitHub Desktop.
check solr cores and their replications through script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
# Author: Martin Hong | |
# Date: 2016/04/24 | |
# | |
# Monitor Solr health including replication status | |
# Inspired by the script at https://issues.apache.org/jira/browse/SOLR-1855
require 'net/http' | |
require 'optparse' | |
require 'ostruct' | |
require 'json' | |
require 'date' | |
# require 'byebug' | |
# Global runtime configuration shared by every check in this script.
# exit_code starts at 0 and is raised by checks that detect a problem.
$options = OpenStruct.new(
  exit_code: 0,
  solr_host: "localhost",
  solr_host_port: "8983",
  # A slave counts as up to date when its last replication happened fewer
  # than this many seconds ago.
  replication_time_tolerance: 3600 # = 1h
)
# parse options | |
# Command-line parser. Every handler writes its value into the global $options.
parser = OptionParser.new do |opts|
  opts.banner = "Usage: solr_check [options]"
  opts.separator ""
  opts.separator "Specific options:"

  opts.on("-h", "--host=HOST", "Solr Host") do |host|
    $options.solr_host = host
  end

  opts.on("-p", "--port=PORT", "Solr Port") do |port|
    $options.solr_host_port = port
  end

  # Float coercion makes OptionParser reject non-numeric input up front
  # instead of letting a later String#to_f silently turn it into 0.
  opts.on("-d", "--difference=DIFFERENCE", Float,
          "replication difference time tolerance(unit: seconds)") do |difference|
    $options.replication_time_tolerance = difference
  end

  opts.on("-H", "--help", "Show this message") do
    puts opts
    exit
  end

  opts.separator ""
  opts.separator "Example:"
  opts.separator " ./solr_check.rb -H"
  opts.separator " ./solr_check.rb --help"
  opts.separator " ./solr_check.rb -h martin91.com -p 8983"
  opts.separator " ./solr_check.rb --host=martin91.com --port=8983"
  opts.separator " ./solr_check.rb -d 30"
end

# Consume the recognised switches from ARGV.
parser.parse!
# Performs a GET request against the Solr HTTP API and returns the parsed
# JSON response body.
#
# path   - path below "/solr/" on the configured host (e.g. "admin/cores").
# params - Hash of query parameters. "wt=json" is always added; the caller's
#          hash is left untouched.
#
# Any StandardError raised while building the request, talking to the server
# or parsing the body causes up to three retries. When those are exhausted a
# CRITICAL line is printed and the process exits with status 4.
def call_solr_api(path, params = {})
  retried_count = 0
  begin
    # Non-destructive merge: the original merge! leaked wt: :json into the
    # hash owned by the caller.
    query = params.merge(wt: :json)
    uri = URI("http://#{$options.solr_host}:#{$options.solr_host_port}/solr/#{path}")
    uri.query = URI.encode_www_form(query)
    res = Net::HTTP.get_response(uri)
    JSON.parse(res.body)
  rescue StandardError
    if retried_count < 3
      retried_count += 1
      retry
    else
      puts "CRITICAL: server #{$options.solr_host} is not responding or returned incorrect data."
      exit 4
    end
  end
end
# Requests "admin/cores" through call_solr_api and returns the parsed response.
def get_cores_overview
  call_solr_api("admin/cores")
end
# Requests "<core>/replication" with command=details through call_solr_api
# and returns the parsed response.
def get_replication_details(core)
  replication_path = "#{core}/replication"
  call_solr_api(replication_path, command: 'details')
end
# Requests "<core>/admin/ping" through call_solr_api and returns the parsed
# response.
def ping_solr_core(core)
  ping_path = "#{core}/admin/ping"
  call_solr_api(ping_path)
end
# Prints one line per entry of cores_overview["initFailures"] and sets the
# script exit code to 2 when there is at least one.
#
# cores_overview - parsed "admin/cores" response (Hash).
#
# A response without an "initFailures" key is treated as "no failures"
# instead of raising NoMethodError on nil.
def check_init_failures(cores_overview)
  failures = cores_overview["initFailures"] || {}
  return if failures.empty?

  failures.each do |core, reason|
    puts "Core #{core} could not initialize: #{reason}"
  end
  $options.exit_code = 2
end
# Returns the core names, i.e. the keys of cores_overview["status"].
#
# cores_overview - parsed "admin/cores" response (Hash).
#
# Prints a CRITICAL line and exits with status 2 when no cores are listed.
# A response without a "status" key is handled the same way rather than
# raising NoMethodError on nil.
def get_cores(cores_overview)
  cores = (cores_overview["status"] || {}).keys
  if cores.empty?
    puts "CRITICAL: server #{$options.solr_host} returned an empty list of cores."
    exit 2
  end
  cores
end
# Returns true when the replication details report "isMaster" as the string
# "true", false otherwise.
#
# core_replication_details - parsed "<core>/replication?command=details"
#                            response (Hash).
#
# A missing or non-Hash "details" entry yields false instead of raising.
def is_master?(core_replication_details)
  details = core_replication_details["details"]
  details.is_a?(Hash) && details["isMaster"] == "true"
end
# Pings a core acting as master, prints the reported status and raises the
# script exit code to at least 1 when the status is not "OK".
#
# core - core name (String).
def check_master_core(core)
  result = ping_solr_core(core)["status"]
  puts "Core #{core} acts as master returned #{result}\n"
  return if result == "OK"

  # Never lower an exit code that an earlier check already raised (e.g. the
  # 2 set for init failures).
  $options.exit_code = [$options.exit_code, 1].max
end
# Returns value when it is a Hash and an empty Hash otherwise, so nested
# lookups on incomplete responses yield nil instead of raising.
def solr_hash_or_empty(value)
  value.is_a?(Hash) ? value : {}
end

# Checks the replication state of a core that is not a master, prints the
# findings and raises the script exit code to at least 1 on any problem.
#
# core                     - core name (String).
# core_replication_details - parsed "<core>/replication?command=details"
#                            response (Hash).
#
# Problems reported: missing masterUrl, missing master index version, no
# (or unparsable) last replication time, and an index that is both older
# than the configured tolerance and at a different version than the master.
def check_slave_core(core, core_replication_details)
  details = solr_hash_or_empty(solr_hash_or_empty(core_replication_details)["details"])
  slave = solr_hash_or_empty(details["slave"])

  master_url = slave["masterUrl"]
  master_index_version = solr_hash_or_empty(slave["masterDetails"])["indexVersion"]
  slave_index_version = details["indexVersion"]
  slave_replicated_at = slave["indexReplicatedAt"]
  slave_replicating = slave["isReplicating"]

  # Raise the exit code without lowering one set by an earlier check.
  flag_problem = -> { $options.exit_code = [$options.exit_code, 1].max }

  if master_url.to_s == ""
    puts "Core #{core} has not set masterUrl!"
    flag_problem.call
  end

  if master_index_version.to_s == ""
    puts "Core #{core} could not get master index version!"
    flag_problem.call
  end

  if slave_index_version.to_s == "" && slave_replicated_at.to_s == "" && slave_replicating == "true"
    puts "Core #{core} is replicating for the first time."
  end

  # The original two-branch condition on replicationFailedAtList reduced to
  # "no replication time recorded". Stop here: there is no timestamp to
  # compare, and parsing nil would raise.
  if slave_replicated_at.to_s == ""
    puts "Core #{core} has problems replicating."
    flag_problem.call
    return
  end

  begin
    replicated_at = DateTime.parse(slave_replicated_at.to_s).to_time
  rescue ArgumentError
    puts "Core #{core} reported an unparsable replication time: #{slave_replicated_at}"
    flag_problem.call
    return
  end

  recently_replicated = (Time.now - replicated_at) < $options.replication_time_tolerance.to_f
  if recently_replicated || master_index_version == slave_index_version
    puts "Core #{core} is up to date"
  else
    puts "Core #{core} is out of date"
    flag_problem.call
  end
end
# Entry point of the checks: reports core initialization failures, then
# inspects every listed core as either a master or a slave depending on its
# replication details.
def main
  overview = get_cores_overview
  check_init_failures(overview)

  get_cores(overview).each do |core|
    details = get_replication_details(core)
    is_master?(details) ? check_master_core(core) : check_slave_core(core, details)
  end
end
# Run all checks, then terminate with the exit code they accumulated.
main
exit($options.exit_code)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. First, download this script:
$ wget https://gist.github.com/Martin91/4701798f9074c6a00fe0f49540e84c49 | |
2. Make the script executable:
$ chmod +x solr_check.rb | |
3. Run it according to the usage help: | |
$ ./solr_check.rb -H | |
Full usage:
================================================================================================================ | |
Usage: solr_check [options] | |
Specific options: | |
-h, --host=HOST Solr Host | |
-p, --port=PORT Solr Port | |
-d, --difference=DIFFERENCE replication difference time tolerance(unit: seconds) | |
-H, --help Show this message | |
Example: | |
./solr_check.rb -H | |
./solr_check.rb --help | |
./solr_check.rb -h martin91.com -p 8983 | |
./solr_check.rb --host=martin91.com --port=8983 | |
./solr_check.rb -d 30 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment