Created
February 22, 2012 17:33
-
-
Save hcoyote/1886212 to your computer and use it in GitHub Desktop.
Parse the `hadoop dfsadmin -report` output and summarize cluster health; alert on missing blocks, dead datanodes, too few available datanodes, corrupt blocks, etc.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
#
# This script is managed by puppet.
#
use strict; | |
use warnings; | |
use IO::File; | |
use Getopt::Long; | |
# Re-execute this script through sudo as the hdfs user when it was not started
# via sudo at all (SUDO_USER unset) or when SUDO_USER is 'nagios'.
# NOTE(review): sudo normally sets SUDO_USER to the invoking user, so a run
# started by nagios presumably still satisfies this test after the first
# re-exec and hops through sudo a second time -- confirm that is intended.
if (not exists $ENV{SUDO_USER} or $ENV{SUDO_USER} eq 'nagios') {
    # List-form exec bypasses the shell, so every element of @ARGV reaches the
    # re-executed script unchanged even if it contains spaces or shell
    # metacharacters.
    #
    # exec only returns on failure.  Report that as UNKNOWN (status 3, the
    # same status used when the report command cannot be started) instead of
    # silently carrying on as the current user.
    exec('/usr/bin/sudo', '-u', 'hdfs', $0, @ARGV)
        or do {
            print "UNKNOWN: could not exec /usr/bin/sudo: $!\n";
            exit 3;
        };
}
# Option flags.  Only $help is wired to a command-line switch in the visible
# script; $verbose is declared but not referenced.
my ($verbose, $help);

# Working state.
my @errors;        # problem descriptions collected by the checks
my %dfs_status;    # parsed report: {total}{<field>} and {nodes}{total|avail|dead}

# Command used to produce the report (joined with a space when it is run).
my $hadoop_command = '/usr/bin/hadoop';
my $dfs_report     = 'dfsadmin -report';

# Thresholds.
my $warn            = '90';        # not referenced in the visible script
my $crit_disk       = '95';        # limit for the "DFS Used%" field
my $avail_threshold = 80 / 100;    # minimum fraction of datanodes available
# Command-line handling: the only switch is --help (short form -h).
GetOptions('help|h' => \$help);

# When help was requested, print the usage text and leave with status 3.
if (defined $help) {
    print "$0 [options]\n",
          "--help\n",
          "-h This.\n";
    exit 3;
}
# Start the report command and read its output through a pipe (the trailing
# "|" makes IO::File open the string as a command).  If the handle cannot be
# created, say so on stdout and exit with status 3.
my $fh = IO::File->new("$hadoop_command $dfs_report|")
    or do {
        print "UNKNOWN: could not run $hadoop_command $dfs_report: $!\n";
        exit 3;
    };
# First pass: read the "key: value" lines at the top of the report into
# $dfs_status{total}, stopping at the first line that starts with "-----".
# Keys are stored lower-cased with all whitespace removed, e.g.
# "Under replicated blocks" becomes "underreplicatedblocks".
while (my $line = <$fh>) {
    chomp $line;
    next if $line =~ /^$/;
    next if $line =~ /last contact/i;
    last if $line =~ /^-----/;    # break out to the next section at the delimiter

    # Split on the first colon only, so a value that itself contains ':' is
    # kept whole instead of being truncated.
    my ($key, $value) = split /:/, $line, 2;

    # A line without any colon is not a "key: value" pair; skip it rather
    # than storing an undefined value.
    next unless defined $value;

    $key   =~ s/\s+//g;
    $value =~ s/^\s+//;
    $value =~ s/\s+$//;

    # "DFS Used%: 61.1%" -> 61.1 so the value can be compared numerically.
    if ($key =~ /dfsused%/i) {
        $value =~ s/%//;
    }

    $dfs_status{total}{lc $key} = $value;
}
# Second pass: scan whatever is left of the report for the datanode
# availability line and record its three counts in $dfs_status{nodes}.
while (my $line = <$fh>) {
    chomp $line;

    # Example line:
    #   Datanodes available: 12 (12 total, 0 dead)
    # The counts are stored only when the whole pattern matches, so a line in
    # an unexpected format can no longer leave undefined counts behind.
    next unless $line =~ /Datanodes available:\s+(\d+)\s+\((\d+) total, (\d+) dead\)/;

    $dfs_status{nodes}{avail} = $1;
    $dfs_status{nodes}{total} = $2;
    $dfs_status{nodes}{dead}  = $3;
}

# Closing a piped handle waits for the command and returns false when it
# exited with a non-zero status (or when the close itself failed).  Treat
# that as UNKNOWN rather than trusting a possibly incomplete report.
if (not close $fh) {
    my $reason = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
    print "UNKNOWN: $hadoop_command $dfs_report failed ($reason)\n";
    exit 3;
}
# Evaluate the parsed report and append one message to @errors per problem.
# If no summary section was parsed at all, nothing can be verified, so the
# result is UNKNOWN (status 3) instead of a misleading OK.
if (exists $dfs_status{total}) {
    #Key:Configured Capacity Value: 118481750196224 (107.76 TB)
    #Key:Present Capacity Value: 118481750196224 (107.76 TB)
    #Key:DFS Remaining Value: 46083857448971 (41.91 TB)
    #Key:DFS Used Value: 72397892747253 (65.85 TB)
    #Key:DFS Used% Value: 61.1%
    #Key:Under replicated blocks Value: 0
    #Key:Blocks with corrupt replicas Value: 0
    #Key:Missing blocks Value: 0
    my $summary = $dfs_status{total};

    # Block counters that must all be zero, in reporting order.
    my @block_checks = (
        [ 'underreplicatedblocks',     "Under replicated blocks > 0" ],
        [ 'blockswithcorruptreplicas', "Blocks with corrupt replicas > 0" ],
        [ 'missingblocks',             "Missing blocks > 0" ],
    );
    for my $check (@block_checks) {
        my ($field, $message) = @{$check};
        my $count = $summary->{$field};
        push @errors, $message if defined $count and $count > 0;
    }

    # Datanode counts; any of them may be missing if the availability line
    # was not found in the report.
    my $nodes = $dfs_status{nodes} || {};
    my ($avail, $total, $dead) = @{$nodes}{qw(avail total dead)};

    if (defined $avail and defined $total) {
        # A total of zero counts as "nothing available" instead of dividing
        # by zero.
        my $ratio = $total > 0 ? $avail / $total : 0;
        if ($ratio < $avail_threshold) {
            # $avail_threshold is a fraction; show it as a percentage.
            push @errors, sprintf("Less than %.0f%% of datanodes available",
                                  $avail_threshold * 100);
        }
    }

    if (defined $dead and $dead > 0) {
        push @errors, "Dead datanodes found in cluster";
    }

    # "defined" binds tighter than ">", so writing "defined $x > $limit"
    # compares the result of defined() with the limit and never fires; test
    # definedness and the numeric comparison separately.
    my $used_pct = $summary->{'dfsused%'};
    if (defined $used_pct) {
        $used_pct =~ s/\s+//g;    # tolerate padding around the number
        if ($used_pct > $crit_disk) {
            push @errors, "DFS % space used $used_pct > threshold $crit_disk %";
        }
    }
}
else {
    print "UNKNOWN: no cluster summary found in $hadoop_command $dfs_report output\n";
    exit 3;
}
# Print the single status line and exit: status 0 with an OK message when no
# problems were collected, otherwise status 2 with every message joined by
# "; ".
unless (@errors) {
    print "OK: dfsadmin report is happy.\n";
    exit 0;
}

print "CRITICAL: " . join("; ", @errors) . "\n";
exit 2;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment