Skip to content

Instantly share code, notes, and snippets.

@hcoyote
Created February 22, 2012 17:33
Show Gist options
  • Save hcoyote/1886212 to your computer and use it in GitHub Desktop.
Save hcoyote/1886212 to your computer and use it in GitHub Desktop.
Parse the dfsadmin report and give us some info about the cluster; warn if blocks are missing, if we have dead nodes, if we don't have enough datanodes, if there are corrupt blocks, etc.
#!/usr/bin/perl
#
# This script is managed by puppet.
#
use strict;
use warnings;
use IO::File;
use Getopt::Long;
# Re-run this script as the hdfs user through sudo when it was started
# without sudo (SUDO_USER is not set) or through sudo by the nagios user.
if (not exists $ENV{SUDO_USER} or $ENV{SUDO_USER} eq 'nagios') {
    # List-form exec bypasses the shell, so arguments containing spaces or
    # shell metacharacters reach the re-executed script unchanged.  The bare
    # block around exec is the documented way to follow exec with another
    # statement without triggering the "unlikely to be reached" warning.
    { exec('/usr/bin/sudo', '-u', 'hdfs', $0, @ARGV) };

    # exec only returns on failure.  Report that instead of silently carrying
    # on as the current user.
    print "UNKNOWN: could not exec /usr/bin/sudo -u hdfs $0: $!\n";
    exit 3;
}
# --- Settings and shared state -------------------------------------------

# Declared but not referenced anywhere else in this script (no --verbose
# option is registered below).
my $verbose;
my $help;

# Problem descriptions collected by the checks further down.
my @errors;

# Parsed report data:
#   $dfs_status{total}{<key>}  cluster summary values; the key is the report
#                              label lower-cased with all whitespace removed
#   $dfs_status{nodes}{total|avail|dead}  datanode counts
my %dfs_status;

my $hadoop_command = "/usr/bin/hadoop";
my $dfs_report = "dfsadmin -report";

# Declared but not referenced anywhere else in this script.
my $warn = "90";
# DFS Used% value above which the disk-usage check reports a problem.
my $crit_disk = "95";
# Minimum fraction (0..1) of datanodes that must be available.
my $avail_threshold = 80 / 100;

# --- Command line handling -----------------------------------------------

my $usage = <<EOF;
$0 [options]
--help
-h This.
EOF

# GetOptions returns false (after warning on STDERR) when it sees an option it
# does not know; treat that as a usage error instead of running the check.
if (not GetOptions("help|h" => \$help)) {
    print "UNKNOWN: invalid command line options\n", $usage;
    exit 3;
}

if (defined $help) {
    print $usage;
    exit 3;
}
# --- Run the report command and parse its output into %dfs_status ---------

# List-form piped open: the command is executed directly, without a shell.
my $fh;
if (not open($fh, '-|', $hadoop_command, split(' ', $dfs_report))) {
    print "UNKNOWN: could not run $hadoop_command $dfs_report: $!\n";
    exit 3;
}

# First section: "Label: value" summary lines, up to the dashed delimiter.
SUMMARY_LINE:
while (my $line = <$fh>) {
    chomp $line;
    next SUMMARY_LINE if $line =~ /^$/;
    next SUMMARY_LINE if $line =~ /last contact/i;
    # Break out to the next section when we encounter the delimiter.
    last SUMMARY_LINE if $line =~ /^-----/;

    # Limit the split to two fields so a value containing ':' stays intact,
    # and skip lines that are not "label: value" pairs at all.
    my ($key, $value) = split(/:/, $line, 2);
    next SUMMARY_LINE if not defined $key or not defined $value;

    $key   =~ s/\s+//g;
    $value =~ s/^\s+//;
    $value =~ s/\s+$//;
    if ($key =~ /dfsused%/i) {
        # Keep only the number so it can be compared numerically later.
        $value =~ s/%//;
    }
    $dfs_status{total}{lc $key} = $value;
}

# Second section: pick up the datanode counts from a line such as
#   Datanodes available: 12 (12 total, 0 dead)
# Counts are stored only when the line matches in full, so later checks never
# see undefined values.
while (my $line = <$fh>) {
    next unless $line =~ /Datanodes available:\s+(\d+)\s+\((\d+) total, (\d+) dead\)/;
    my ($avail_nodes, $total_nodes, $dead_nodes) = ($1, $2, $3);
    $dfs_status{nodes}{total} = $total_nodes;
    $dfs_status{nodes}{avail} = $avail_nodes;
    $dfs_status{nodes}{dead}  = $dead_nodes;
}

# Closing a piped handle waits for the command; close returns false when the
# command exited non-zero (with $! set to 0) or when closing itself failed.
if (not close($fh)) {
    my $reason = $!         ? "error closing pipe: $!"
               : ($? & 127) ? "killed by signal " . ($? & 127)
               :              "exit status " . ($? >> 8);
    print "UNKNOWN: $hadoop_command $dfs_report failed: $reason\n";
    exit 3;
}
# --- Evaluate the parsed report and collect problems in @errors -----------
#
# Example of the parsed summary (key => value before key normalisation):
#   Configured Capacity          => 118481750196224 (107.76 TB)
#   Present Capacity             => 118481750196224 (107.76 TB)
#   DFS Remaining                => 46083857448971 (41.91 TB)
#   DFS Used                     => 72397892747253 (65.85 TB)
#   DFS Used%                    => 61.1%
#   Under replicated blocks      => 0
#   Blocks with corrupt replicas => 0
#   Missing blocks               => 0
if (exists $dfs_status{total}) {
    my $summary = $dfs_status{total};

    # Block counters that must all be zero, checked in this order.
    my @block_checks = (
        [ 'underreplicatedblocks',     "Under replicated blocks > 0" ],
        [ 'blockswithcorruptreplicas', "Blocks with corrupt replicas > 0" ],
        [ 'missingblocks',             "Missing blocks > 0" ],
    );
    for my $check (@block_checks) {
        my ($key, $message) = @{$check};
        my $count = $summary->{$key};
        push @errors, $message if defined $count and $count > 0;
    }

    # Datanode counts may be absent when the report had no matching line.
    my $nodes = $dfs_status{nodes} || {};
    my ($total, $avail, $dead) = @{$nodes}{qw(total avail dead)};

    if (defined $total and defined $avail) {
        if ($total == 0) {
            # Guard the division below; a report with zero datanodes is
            # recorded as a problem instead of dying on a division by zero.
            push @errors, "No datanodes reported in cluster";
        }
        elsif (($avail / $total) < $avail_threshold) {
            # $avail_threshold is a fraction; show it as a percentage.
            push @errors, sprintf("Less than %g%% of datanodes available",
                                  $avail_threshold * 100);
        }
    }

    if (defined $dead and $dead > 0) {
        push @errors, "Dead datanodes found in cluster";
    }

    # "defined" binds tighter than ">", so test definedness and the threshold
    # as two separate conditions.
    my $used_pct = $summary->{'dfsused%'};
    if (defined $used_pct) {
        $used_pct =~ s/^\s+//;
        $used_pct =~ s/\s+$//;
        if ($used_pct > $crit_disk) {
            push @errors, "DFS % space used $used_pct > threshold $crit_disk %";
        }
    }
}
else {
    # Nothing was parsed from the report, so the cluster state is unknown;
    # do not let the script fall through and report OK.
    print "UNKNOWN: no cluster summary found in $hadoop_command $dfs_report output\n";
    exit 3;
}
# --- Print the final status line and exit ---------------------------------
# Exit code 0 with an OK line when no problems were collected; otherwise exit
# code 2 with a CRITICAL line listing every collected problem.
my $exit_code   = 0;
my $status_line = "OK: dfsadmin report is happy.\n";

if (@errors) {
    $exit_code   = 2;
    $status_line = "CRITICAL: " . join("; ", @errors) . "\n";
}

print $status_line;
exit $exit_code;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment