||
- #!/usr/bin/perl -w
- # check_status.pl Nagios Plugin - Version 1.3
- # Last Updated: 1/9/2003
- #
- # Report any bugs/questions to Russell Scibetti at russell@quadrix.com
- #
- # check_status Change Log:
- #
- # To do for 1.4
- # - Better help and documentation (separate doc?)
- # - Take argument (patterns to match) from a separate spec file
- #
- # New Addition to 1.3
- # - Added ChangeLog information and updated --help output
- # - hostdown (hd) argument for how a service check should respond
- # when its host is Down/Unreachable
- # (--hostdown="ok|warning|critical|unknown")
- # - Changed name from check_state to check_status
- # - Set hostdown to default to OK when the argument isn't specified
- # - Number of Hosts checked is now output in OK result
- #
- # Version 1.2 additions:
- #
- # - Added ability to handle ack'd and downtimed services differently
- # depending on argument provided
- # (--ack="ok|warning|critical|unknown|down|unreachable"
- # --dt="ok|warning|critical|unknown|down|unreachable")
- #
- # Version 1.1 additions:
- #
- # - Added --host=<regex>, --servhost=<regex> to allow for specific field
- # matching (host for matching hostname in host checks, servhost for
- # matching the hostname in service checks, service for matching the
- # service name in service checks)
- # - Output the number of OK services for an OK output
- #
- # Version 1.0 features:
- #
- # - Freshness check of status.log (timestamp)
- # - Match service or host checks
- # - Can ignore acknowledged or downtimed services/hosts (--ack, --dt)
- # - Can output different levels of detail dependent on # of problems
- # - Can check for number of critical, warning, or unknowns
- #
- #############################################################
- use Getopt::Long;
- use File::stat;
- # Option parsing and validation.
- #
- # Nagios exit codes and the state names used as hash keys throughout the
- # script. They are defined before the command line is parsed so that a bad
- # command line can already exit with UNKNOWN.
- my $OK = 0;
- my $WARNING = 1;
- my $CRITICAL = 2;
- my $UNKNOWN = 3;
- my $crit = "CRITICAL";
- my $warn = "WARNING";
- my $unk = "UNKNOWN";
- my $down = "DOWN";
- my $unreach = "UNREACHABLE";
- # Command-line options. They are file-scoped because the subroutines defined
- # later in the file read several of them ($ok, $host, $details, $ack, ...).
- my ($version, $help, $verbose, $warning, $critical, $unknown, $pattern,
-     $service, $status, $dir, $details, $host, $freshness, $servhost,
-     $ack, $dt, $hdown, $ok);
- my $usage = "Usage: $0 -s <status file> | -d <Nagios log dir>\n"
-           . "Use the --help option for full list of arguments\n";
- Getopt::Long::Configure('bundling');
- my $parsed = GetOptions(
-     "V|version"     => \$version,
-     "h|help"        => \$help,
-     "v|verbose"     => \$verbose,
-     "w|warning=s"   => \$warning,
-     "c|critical=s"  => \$critical,
-     "u|unknown=s"   => \$unknown,
-     "p|pattern=s"   => \$pattern,
-     "S|service:s"   => \$service,
-     "s|status=s"    => \$status,
-     "d|dir=s"       => \$dir,
-     "D|details=s"   => \$details,
-     "H|host:s"      => \$host,
-     "f|freshness=s" => \$freshness,
-     "servhost=s"    => \$servhost,
-     "a|ack:s"       => \$ack,
-     "dt|downtime:s" => \$dt,
-     "hd|hostdown:s" => \$hdown,
-     "ok"            => \$ok,
- );
- # GetOptions has already printed what was wrong; do not carry on with a
- # half-understood command line.
- if (!$parsed) {
-     print $usage;
-     exitcheck($UNKNOWN);
- }
- # Print out Help information
- if ($help) {
-     printVersion();
-     printHelp();
-     exitcheck($UNKNOWN);
- }
- # Print out version information
- if ($version) {
-     printVersion();
-     exitcheck($UNKNOWN);
- }
- # Check for status log or directory argument or print usage
- if (!$status) {
-     if (!$dir) {
-         print $usage;
-         exitcheck($UNKNOWN);
-     }
-     # Join directory and file name with exactly one slash.
-     $status = ($dir =~ m{/$}) ? $dir . "status.log" : $dir . "/status.log";
- }
- # Default field filters.
- my $match_any  = "[^\\s]*";
- my $match_none = "(?!)";     # a regex that can never match
- # --host given without a value: check every host.
- if (defined $host && !$host) {
-     $host = $match_any;
- }
- if (!$host) {
-     # Service mode. The scan accepts a line when --pattern matches OR when
-     # both field filters match. If --pattern is the only selector the user
-     # gave, the field filters must not match everything, otherwise the
-     # pattern would have no effect at all.
-     my $pattern_only = $pattern && !defined($servhost) && !defined($service);
-     my $fallback = $pattern_only ? $match_none : $match_any;
-     $servhost = $fallback if !$servhost;
-     $service  = $fallback if !$service;
- }
- # The help text and change log both state that services on a down host are
- # counted as OK when --hostdown is not given.
- $hdown = "ok" if !defined $hdown;
- # --ack, --dt and --hostdown share one set of legal values; an option given
- # without a value means "ok". Values are lower-cased and must match one of
- # the legal words exactly (not merely contain one).
- my $legal_states = "ok|critical|warning|unknown|down|unreachable";
- for my $check ([\$ack, "ack"], [\$dt, "dt"], [\$hdown, "hostdown"]) {
-     my ($value, $label) = @$check;
-     next if !defined $$value;
-     if (!$$value) {
-         $$value = "ok";
-         next;
-     }
-     $$value = lc $$value;
-     if ($$value !~ /^(?:$legal_states)$/) {
-         print "Invalid value for $label\n";
-         exitcheck($UNKNOWN);
-     }
- }
- # Working state for the scan, and parsing of the -w/-c/-u/-f/-D arguments.
- #
- # Detail limits and the state-name alternations matched against the log.
- my $much_details = 0;
- my $ServiceNotOK = "CRITICAL|WARNING|UNKNOWN";
- my $HostNotOK = "DOWN|UNREACHABLE";
- # Number of problems found per state.
- my %numprob = map { $_ => 0 } qw(WARNING CRITICAL UNKNOWN DOWN UNREACHABLE);
- # Single-number thresholds (0 means "a combined threshold was given instead").
- my ($CritOnly, $WarnOnly, $UnkOnly) = (0, 0, 0);
- # Raw pieces of a comma-separated threshold argument.
- my (@wlev, @clev, @ulev);
- # Combined thresholds, one entry per service state.
- my %warnlevel = (WARNING => 0, CRITICAL => 0, UNKNOWN => 0);
- my %critlevel = (WARNING => 0, CRITICAL => 0, UNKNOWN => 0);
- my %unklevel  = (WARNING => 0, CRITICAL => 0, UNKNOWN => 0);
- my %hostlevel = (DOWN => 0, UNREACHABLE => 0);
- # Store Hosts in downtime
- my @hostdowntime;
- my $numdowntime = 0;
- # Store Hosts in a Down/Unreachable state
- my @hostdown;
- my $numdown = 0;
- # Hash for storing state-change to OK times for hosts:
- my %hostoktimes;
- # Number of matches in parsing
- my $nummatch = 0;
- # -w, -c and -u are parsed the same way: no value means a single threshold
- # of 1, a plain value is a single threshold, and a comma-separated value
- # fills the combined WARNING,CRITICAL[,UNKNOWN] thresholds.
- for my $spec ([$warning,  \@wlev, \%warnlevel, \$WarnOnly],
-               [$critical, \@clev, \%critlevel, \$CritOnly],
-               [$unknown,  \@ulev, \%unklevel,  \$UnkOnly]) {
-     my ($arg, $parts, $levels, $single) = @$spec;
-     if (!$arg) {
-         $$single = 1;
-     }
-     elsif ($arg !~ /,/) {
-         $$single = $arg;
-     }
-     else {
-         @$parts = split /,/, $arg;
-         $levels->{"WARNING"} = $parts->[0];
-         $levels->{"CRITICAL"} = $parts->[1];
-         $levels->{"UNKNOWN"} = $parts->[2] if $parts->[2];
-     }
- }
- # Freshness is given in minutes (default 30) and kept in seconds.
- $freshness = ($freshness ? $freshness : 30) * 60;
- # Per-state counters and text collected for the two levels of detail.
- my @all_states = qw(CRITICAL WARNING UNKNOWN DOWN UNREACHABLE);
- my %ct          = map { $_ => 0 } @all_states;
- my %much_ct     = map { $_ => 0 } @all_states;
- my %output      = map { $_ => "" } @all_states;
- my %much_output = map { $_ => "" } @all_states;
- # -D full,brief: the first number is the limit for full details, the second
- # the limit for brief details.
- if ($details && $details =~ /,/) {
-     my @limits = split /,/, $details;
-     ($much_details, $details) = @limits[0, 1];
- }
- # Read the status log: check that it is fresh, then collect matching host or
- # service entries.
- #
- # Three-argument open with a lexical handle, so a status path containing
- # mode or pipe characters is always treated as a plain file name. A failure
- # is reported through exitcheck() so the exit code follows the plugin
- # convention (and honours --ok) instead of whatever die() would produce.
- my $sta;
- if (!open($sta, '<', $status)) {
-     print "State UNKNOWN - Cannot open status file $status: $!\n";
-     exitcheck($UNKNOWN);
- }
- my $file_info = stat($status);
- if (!$file_info) {
-     print "State UNKNOWN - Cannot stat status file $status: $!\n";
-     exitcheck($UNKNOWN);
- }
- # A log older than the freshness limit means Nagios is not updating it.
- my $curr_time = time;
- my $file_time = $file_info->mtime;
- if ($curr_time - $file_time > $freshness) {
-     print "State CRITICAL - Status file is stale!!!\n";
-     exitcheck($CRITICAL);
- }
- while (my $line = <$sta>) {
-     chomp $line;
-     if ($line =~ /^[^\s]+[\s]+HOST;/) {
-         my @hdata = split /;/, $line;
-         # If you care about matching hosts (not services):
-         if ($host && $hdata[1] =~ /$host/) {
-             $nummatch++;
-             if ($hdata[2] =~ /$HostNotOK/) {
-                 addproblem($line, $hdata[2]);
-             }
-         }
-         # If you are matching services, gather host information:
-         else {
-             if ($hdata[2] =~ /$HostNotOK/) {
-                 push @hostdown, $hdata[1];
-                 $numdown++;
-             }
-             else {
-                 # Field 4 is stored as the host's state-change time; the
-                 # service handling compares it with the service check time.
-                 $hostoktimes{$hdata[1]} = $hdata[4];
-             }
-             # Field 17 is compared with "0" to detect scheduled downtime.
-             # A short line without that field is not treated as downtime.
-             if (defined $hdata[17] && $hdata[17] ne "0") {
-                 push @hostdowntime, $hdata[1];
-                 $numdowntime++;
-             }
-         }
-     }
-     elsif (!$host && $line =~ /^[^\s]+[\s]+SERVICE;/) {
-         my @servdata = split /;/, $line;
-         # Selected either by the whole-line pattern or by the two field filters.
-         my $by_pattern = $pattern && ($line =~ /$pattern/);
-         if ($by_pattern ||
-             (($servdata[1] =~ /$servhost/) && ($servdata[2] =~ /$service/))) {
-             $nummatch++;
-             # Only HARD states count as problems.
-             if (($servdata[5] eq "HARD") && ($servdata[3] =~ /$ServiceNotOK/)) {
-                 addproblem($line, $servdata[3]);
-             }
-         }
-     }
- }
- close($sta);
- if ($nummatch == 0) {
-     print "Nothing Matches your criteria!\n";
-     exitcheck($UNKNOWN);
- }
- # Decide the plugin result from the problem counts and print one status line.
- #
- # Count the number of problems (for reference):
- my $total = $host
-     ? $numprob{"DOWN"} + $numprob{"UNREACHABLE"}
-     : $numprob{"WARNING"} + $numprob{"CRITICAL"} + $numprob{"UNKNOWN"};
- my $numok = $nummatch - $total;
- # True when any per-state count reaches its combined threshold. A threshold
- # that is 0 or unset (the value left for every position the user did not
- # supply, e.g. the UNKNOWN slot of "-c 5,2") is skipped: comparing a count
- # with ">= 0" is always true and would make that result fire on every run.
- my $level_reached = sub {
-     my ($levels) = @_;
-     for my $state ("WARNING", "CRITICAL", "UNKNOWN") {
-         my $limit = $levels->{$state};
-         next if !$limit;
-         return 1 if $numprob{$state} >= $limit;
-     }
-     return 0;
- };
- # If this is a host state check:
- if ($host) {
-     if ($numprob{"DOWN"} > 0 || $numprob{"UNREACHABLE"} > 0) {
-         if ($details && ($total <= $details)) {
-             print "State CRITICAL - $total Host Problems: $output{$down} $output{$unreach}\n";
-         }
-         else {
-             print "State CRITICAL - $numprob{$down} Hosts Down, $numprob{$unreach} Hosts Unreachable\n";
-         }
-         exitcheck($CRITICAL);
-     }
-     else {
-         print "State OK - $numok Hosts Up, $total Problems\n";
-         exitcheck($OK);
-     }
- }
- #If you only defined a Critical level in terms of # of criticals...
- elsif ($CritOnly && ($numprob{"CRITICAL"} >= $CritOnly)) {
-     countAndPrint($crit, $numprob{$crit}, 0);
-     exitcheck($CRITICAL);
- }
- #Critical in terms of # criticals and # warnings...
- elsif (!$CritOnly && $level_reached->(\%critlevel)) {
-     countAndPrint($crit, $total, 1);
-     exitcheck($CRITICAL);
- }
- #Warning in terms of # warnings only...
- elsif ($WarnOnly && ($numprob{"WARNING"} >= $WarnOnly)) {
-     countAndPrint($warn, $numprob{$warn}, 0);
-     exitcheck($WARNING);
- }
- #Warning in terms of # warnings and # criticals...
- elsif (!$WarnOnly && $level_reached->(\%warnlevel)) {
-     countAndPrint($warn, $total, 1);
-     exitcheck($WARNING);
- }
- #Unknown in terms of # unknown only...
- elsif ($UnkOnly && ($numprob{"UNKNOWN"} >= $UnkOnly)) {
-     countAndPrint($unk, $numprob{$unk}, 0);
-     exitcheck($UNKNOWN);
- }
- #Unknown in terms of # warning, critical, and unknown...
- elsif (!$UnkOnly && $level_reached->(\%unklevel)) {
-     countAndPrint($unk, $total, 1);
-     exitcheck($UNKNOWN);
- }
- # Everything is OK!
- else {
-     print "State OK - $numok OK, $total problems\n";
-     exitcheck($OK);
- }
- ############################
- # Subroutines
- ############################
- # Leave the program with the given plugin exit code.
- # When the --ok flag is set the exit code is always 0, whatever was asked for.
- sub exitcheck {
-     my ($code) = @_;
-     exit 0 if $ok;
-     exit $code;
- }
- # Print the one-line result for a service check.
- #   $_[0]  state name shown after "State" (also the key used for single-state detail)
- #   $_[1]  number of problems to report
- #   $_[2]  true when the line should cover critical, warning and unknown together
- # Without --details only the count is printed. With it, the count is compared
- # with the two detail limits to choose between the full text, the brief text,
- # or plain numbers.
- sub countAndPrint {
-     my ($state, $count, $alltypes) = @_;
-     my $summary;
-     if (!$details) {
-         $summary = "$count problems";
-     }
-     else {
-         # Pick the collected text that fits the number of problems, if any.
-         my $texts;
-         if ($count<=$much_details) {
-             $texts = \%much_output;
-         }
-         elsif ($count<=$details) {
-             $texts = \%output;
-         }
-         if ($texts && $alltypes) {
-             $summary = "$count problems: $texts->{$crit} $texts->{$warn} $texts->{$unk}";
-         }
-         elsif ($texts) {
-             $summary = "$count \L$state\E: $texts->{$state}";
-         }
-         elsif ($alltypes) {
-             $summary = "$numprob{$crit} critical, $numprob{$warn} warning, $numprob{$unk} unknown";
-         }
-         else {
-             $summary = "$count \L$state\E";
-         }
-     }
-     print "State $state - $summary\n";
- }
-
- # Record one non-OK entry from the status log.
- #   $_[0]  the complete status log line (fields separated by ';')
- #   $_[1]  the state reported on that line
- # The entry may be dropped or counted under a different state, depending on
- # the --ack, --hostdown and --dt settings. When it is kept, its text is added
- # to the detail output (if --details is in use) and the counter for its final
- # state in %numprob is incremented.
- sub addproblem {
-     my ($line, $state) = @_;
-     my $test = 1;         # stays 1 while the entry still counts as a problem
-     my $type = $state;    # state the entry is finally counted under
-     my $diffout = "";     # replacement text for a service whose host is down
-     my @values = split /;/, $line;
-     # Apply an --ack/--dt/--hostdown setting: "ok" drops the entry, any other
-     # value becomes the state it is counted under.
-     my $reclassify = sub {
-         my ($setting) = @_;
-         if ($setting =~ "ok") {
-             $test = 0;
-             return 0;
-         }
-         $type = "\U$setting";
-         return 1;
-     };
-     if (!$host) {
-         my $namehold = $values[1];
-         # Host names are compared for equality. Using the name as a regex
-         # would let "web1" match "web10" and treat "." as a wildcard.
-         my $host_is_down = scalar(grep { $_ eq $namehold } @hostdown);
-         my $host_in_downtime = scalar(grep { $_ eq $namehold } @hostdowntime);
-         if ($ack && ($values[13] eq "1")) {
-             $reclassify->($ack);
-         }
-         elsif ($hdown && $host_is_down) {
-             if ($reclassify->($hdown)) {
-                 $diffout = "$values[1] is down";
-             }
-         }
-         elsif ($dt && (($values[27] ne "0") || $host_in_downtime)) {
-             $reclassify->($dt);
-         }
-         elsif (exists $hostoktimes{$namehold}) {
-             # If the state change time of the host is more recent than the last
-             # service check, must wait until the next service check runs!
-             if ($hostoktimes{$namehold} > $values[6]) {
-                 $test = 0;
-             }
-         }
-     }
-     else {
-         if ($ack && $values[5]) {
-             $reclassify->($ack);
-         }
-         elsif ($dt && ($values[17] ne "0")) {
-             $reclassify->($dt);
-         }
-     }
-     if ($details && $test) {
-         if (!$host) {
-             if ($diffout) {
-                 $much_output{$type} .= " $diffout;";
-                 $output{$type} .= "$diffout;";
-                 $much_ct{$type}++;
-                 $ct{$type}++;
-             }
-             else {
-                 # Stop collecting text once the detail limit is reached.
-                 if ($much_details && $much_ct{$type}<$much_details) {
-                     $much_output{$type} .= " $values[2] on $values[1] $values[31];";
-                     $much_ct{$type}++;
-                 }
-                 if ($ct{$type} < $details) {
-                     $output{$type} .= " $values[2] on $values[1];";
-                     $ct{$type}++;
-                 }
-             }
-         }
-         else {
-             $much_output{$type} .= " $values[1] $state $values[20],";
-             # The key is $type; the bareword "type" counted under a literal
-             # key that nothing else reads.
-             $much_ct{$type}++;
-             $output{$type} .= " $values[1] HOST $state,";
-             $ct{$type}++;
-         }
-     }
-     if ($test) {
-         $numprob{$type}++;
-     }
-     return;
- }
- ################################
- #
- # Version and Help Information
- #
- ################################
- # Print the program name, version and warranty notice.
- # print is used instead of printf: the text contains the interpolated program
- # name, which must not be interpreted as a format string.
- sub printVersion {
- print <<EndVersion;
- $0 (nagios-plugins) 1.3
- The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute
- copies of the plugins under the terms of the GNU General Public License.
- For more information about these matters, see the file named COPYING.
- EndVersion
- }
- # Print the full usage and option description.
- # print is used instead of printf: the text contains the interpolated program
- # name, which must not be interpreted as a format string. The short option
- # letters listed for --service and --host are the ones the option parser
- # accepts (-S and -H).
- sub printHelp {
- print <<EOF;
- This plugin parses through the Nagios status log and will return a
- Critical, Warning, or Unknown state depending on the number of
- Critical, Warning, and/or Unknown services found in the log
- (or Down/Unreachable hosts when matching against hosts)
- Usage: $0 -s <Status File> | -d <Nagios Log Directory>
- [-w #[,#][,#]] [-c #[,#][,#]] [-u #[,#][,#]]
- [--service=<RegEx> | --servhost=<RegEx> | --pattern=<RegEx> |
- --host | --host=<RegEx>]
- [--ack[=string]] [--dt[=string]] [--hostdown[=string]]
- [-D #[,#]] [--ok] [-f <Log freshness in # minutes>]
- $0 --help
- $0 --version
- NOTE: One of -s and -d must be specified
- Options:
- -s, --status=FILE_NAME
- Location and name of status log (e.g. /usr/local/nagios/var/status.log)
- -d, --dir=DIRECTORY_NAME
- Directory that contains the nagios logs (e.g. /usr/local/nagios/var/)
- -w, --warning=INTEGER[,INTEGER][,INTEGER]
- #: Number of warnings to result in a WARNING state
- OR
- #,#: Warning,Criticals to result in a WARNING state
- OR
- #,#,#: Warning,Critical,Unknown to result in a WARNING state
- Default: -w=1
- -c, --critical=INTEGER[,INTEGER][,INTEGER]
- #: Number of criticals to result in a CRITICAL state
- OR
- #,#: Warning,Criticals to result in a CRITICAL state
- OR
- #,#,#: Warning,Critical,Unknown to result in a CRITICAL state
- Default: -c=1
- -u, --unknown=INTEGER[,INTEGER][,INTEGER]
- #: Number of unknowns to result in a UNKNOWN state
- OR
- #,#: Warning,Criticals to result in a UNKNOWN state
- OR
- #,#,#: Warning,Critical,Unknown to result in a UNKNOWN state
- Default: -u=1
- -S, --service[=REGEX]
- Only match services [that match the RegEx]
- (--service is default setting if no other matching arguments provided)
- --servhost=REGEX
- Only match services whose host match the RegEx
- -p, --pattern=REGEX
- Only parse for this regular expression (services only, not hosts)
- -H, --host[=REGEX]
- Report on the state of hosts (whose name matches the RegEx if provided)
- -a, --ack[=ok|warning|critical|unknown|down|unreachable]
- Handle Acknowledged problems [--ack defaults to ok]
- --dt, --downtime[=ok|warning|critical|unknown|down|unreachable]
- Handle problems in scheduled downtime [--dt defaults to ok]
- --hd, --hostdown[=ok|warning|critical|unknown|down|unreachable]
- Handle services whose Host is down [--hd defaults to ok]
- -D, --details=INTEGER[,INTEGER]
- Amount of verbosity to output
- If # problems:
- <= 1st integer, return full details (each plugin's output)
- <= 2nd integer, return some details (list each service host pair)
- > 2nd integer, return the # of problems
- -f, --freshness=INTEGER
- Number of minutes old the log can be to make sure Nagios is running
- (Default = 30 minutes)
- --ok
- Return an OK exit code, regardless of number of problems found
- -h, --help
- Print detailed help screen
- -V, --version
- Print version information
- For service checking (use --service and/or --servhost):
- 1. The values of warning, critical, and unknown default to 1, i.e.
- $0 will return CRITICAL if there is at least 1 critical service,
- WARNING if there is at least 1 warning service, and UNKNOWN if there is
- at least one unknown service.
- 2. If a service's host is DOWN or UNREACHABLE, $0 will use the
- value of --hostdown to determine how to treat the service. Without that
- argument, $0 will count the service as OK.
- 3. If a service's host is OK, but the last host-state change occurred more
- recently than the last service check, $0 will ignore that service
- (want to wait until the service has been checked after a host has recovered
- or you may get service alert for services that still need to be checked)
- 4. If the --dt, --ack, or --hd tags are used, $0 will use the value
- of the arguments to determine how to handle services in downtime, acknowledged,
- or with down hosts (default=OK). For service checks, --dt will also check
- if the service's host is in a downtime.
- For host checking (use --host):
- 1. Using the --host argument, $0 will look for DOWN and UNREACHABLE
- hosts. If any are found, $0 will return a CRITICAL. You can provide
- an REGEX for --host to only check hosts with matching host names.
- 2. If the --dt or --ack tags are used, $0 will use the value of the
- --dt/--ack arguments to determine the state of the host (default is OK)
- EOF
- }
|