|
|
@@ -19,7 +19,8 @@ use lib qw( /opt/nagios/libexec /usr/local/libexec );
|
|
|
use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage);
|
|
|
use SNMP 5.0;
|
|
|
use Getopt::Long;
|
|
|
-use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats %processes $snmp_session $PROGNAME $TIMEOUT );
|
|
|
+use Storable;
|
|
|
+use vars qw( $exit $opt_version $opt_timeout $opt_help $opt_command $opt_host $opt_community $opt_verbose $opt_warning $opt_critical $opt_memory $opt_cpu $opt_port $opt_regex $opt_stats $opt_cache $opt_nocache $cache_exp $interpreters $snmp_session $PROGNAME $TIMEOUT );
|
|
|
|
|
|
$PROGNAME = "snmp_process_monitor.pl";
|
|
|
$opt_verbose = undef;
|
|
|
@@ -31,8 +32,13 @@ $opt_critical = [ 1, -1 ];
|
|
|
$opt_memory = undef;
|
|
|
$opt_cpu = undef;
|
|
|
$opt_port = 161;
|
|
|
-%processes = ();
|
|
|
-$exit = 'OK';
|
|
|
+$opt_cache = 1;
|
|
|
+$opt_nocache = undef;
|
|
|
+$cache_exp = 600;
|
|
|
+$exit = $ERRORS{OK};
|
|
|
+$interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)';
|
|
|
+our $cachefile = '/var/opt/nagios/tmp/'; # completed later
|
|
|
+our %processes = ();
|
|
|
|
|
|
sub process_options {
|
|
|
my( $opt_crit, $opt_warn ) = ();
|
|
|
@@ -42,6 +48,7 @@ sub process_options {
|
|
|
'v' => \$opt_verbose, 'verbose' => \$opt_verbose,
|
|
|
'h' => \$opt_help, 'help' => \$opt_help,
|
|
|
's' => \$opt_stats, 'statistics' => \$opt_stats,
|
|
|
+ 'nocache' => \$opt_nocache,
|
|
|
'H:s' => \$opt_host, 'hostname:s' => \$opt_host,
|
|
|
'p:i' => \$opt_port, 'port:i' => \$opt_port,
|
|
|
'C:s' => \$opt_community, 'community:s' => \$opt_community,
|
|
|
@@ -75,6 +82,12 @@ sub process_options {
|
|
|
$opt_warning = [ $opt_crit, -1 ];
|
|
|
}
|
|
|
}
|
|
|
+ if ( defined($opt_memory) ) { $opt_memory = 0 }
|
|
|
+ if ( defined($opt_cpu) ) { $opt_cpu = 0 }
|
|
|
+ if ( defined($opt_nocache)) { $opt_cache = 0 }
|
|
|
+
|
|
|
+ # complete the cachefile's name
|
|
|
+ $cachefile .= $opt_host . '.proc';
|
|
|
}
|
|
|
|
|
|
sub local_print_revision {
|
|
|
@@ -82,7 +95,7 @@ sub local_print_revision {
|
|
|
}
|
|
|
|
|
|
# Print the one-line command synopsis for this plugin to the currently
# selected output handle.  $PROGNAME is interpolated into the text.
sub print_usage {
    my $synopsis = "Usage: $PROGNAME -H <host> -C <snmp_community> -e <command> [-w <low>,<high>] [-c <low>,<high>] [-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n";
    print $synopsis;
}
|
|
|
|
|
|
sub print_help {
|
|
|
@@ -107,6 +120,10 @@ sub print_help {
|
|
|
minimum and maximum number of processes before a warning is issued (Default 1,-1)
|
|
|
-c, --critical=INTEGER[,INTEGER]
|
|
|
minimum and maximum number of processes before a critical is issued (Default 1,-1)
|
|
|
+--memory
|
|
|
+ combined with '-s', will print the number of bytes of real memory used by process
|
|
|
+--cpu
|
|
|
+ combined with '-s', will print the number of seconds of cpu time consumed by process
|
|
|
EOT
|
|
|
}
|
|
|
|
|
|
@@ -117,11 +134,129 @@ sub verbose (@) {
|
|
|
|
|
|
# Abort the plugin when the most recent SNMP request failed.
#
# Reads the global $snmp_session.  If its ErrorNum field is true, the global
# %processes table is emptied, the session's ErrorStr is printed as an
# UNKNOWN status line and the script exits with $ERRORS{UNKNOWN}.
# Otherwise the sub simply returns to the caller.
sub check_for_errors {
    my $error_num = $snmp_session->{ErrorNum};

    if ($error_num) {
        # discard the in-memory table before bailing out
        %processes = ();
        print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n";
        exit $ERRORS{UNKNOWN};
    }
}
|
|
|
|
|
|
# Load the on-disk process cache into the global %processes table.
#
# Globals used:
#   $opt_cache  - 1 by default, reset to 0 when --nocache is given
#   $cachefile  - path of the Storable file for the monitored host
#   %processes  - replaced by this sub
#
# %processes is always left in a usable state: it holds the cached table
# when one could be read, and is empty when caching is disabled, the file
# is missing/unreadable, or the file does not contain a hash.
sub init_cache {
    # start from a clean slate so every early return leaves an empty table
    %processes = ();

    # $opt_cache is always defined (it is initialised to 1 and set to 0 by
    # --nocache), so its truth value - not definedness - decides whether the
    # cache file may be consulted.
    return unless $opt_cache;

    return unless ( -r $cachefile );

    verbose("loading cache from $cachefile\n");

    # retrieve() may die on a corrupt file or return undef on an I/O error
    my $cached = eval { retrieve($cachefile) };
    if ( $@ || ref($cached) ne 'HASH' ) {
        verbose("cache loading failed - using blank cache: $@\n");
        return;
    }

    %processes = %{$cached};
    return;
}
|
|
|
+
|
|
|
# Fetch a single value from the remote agent through the global
# $snmp_session.
#
# Takes one argument, handed unchanged to SNMP::Varbind->new (callers in
# this file pass an array ref of [ object name, instance ]).
# check_for_errors() is run after the request, so a failed request ends the
# script there.  Returns the value held by the varbind after the request.
sub snmpget {
    my ($oid_spec) = @_;

    my $varbind = SNMP::Varbind->new($oid_spec);
    $snmp_session->get($varbind);
    check_for_errors();

    return $varbind->val;
}
|
|
|
+
|
|
|
# Make sure the global %processes table is populated and current.
#
# Returns 1 when caching is enabled and the table already in memory was
# updated less than $cache_exp seconds ago (nothing is fetched).
# Returns 0 after the table has been rebuilt from the remote agent.
#
# Exits with $ERRORS{UNKNOWN} when the bulkwalk returns more than 10 rows
# fewer than hrSystemProcesses reports, or (via check_for_errors) when an
# SNMP request fails.
#
# Each table entry is a hash ref with:
#   name     - the string later compared against -e / the regex
#   abs_name - the unmodified hrSWRunName value
# plus the special key __last_update holding the refresh time.
sub update_cache {
    # expire the cache after $cache_exp seconds
    if (   $opt_cache != 0
        && exists( $processes{__last_update} )
        && $processes{__last_update} >= time - $cache_exp )
    {
        verbose("cache file is recent enough - using it\n");
        return 1;
    }

    verbose("retrieving full listing of processes from $opt_host\n");
    my $process_count = snmpget( [ 'hrSystemProcesses', 0 ] );

    # retrieve the data from the remote host
    my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [ ['hrSWRunName'] ] );
    check_for_errors();
    $names = [] unless defined $names;    # treat "nothing returned" as an empty walk

    # make sure the number of processes from the bulkwalk is close to hrSystemProcesses
    if ( scalar(@$names) + 10 < $process_count ) {
        print "UNKNOWN - only ", scalar(@$names), " of ", $process_count, " processes returned\n";
        exit $ERRORS{UNKNOWN};
    }

    # Throw away whatever an expired (or unwanted) cache left in memory.
    # Without this, rows for processes that no longer exist would survive the
    # reload and could be counted as matches.
    %processes = ();

    # sort through the process names and create a nice hash of processes
    foreach my $row (@$names) {
        my $run_name = $row->val;
        my %hash     = (
            name     => $run_name,
            abs_name => $run_name,
        );
        $hash{name} =~ s#.*/##;    # strip path

        # remember which interpreter (if any) the process name ends with;
        # a lexical is used because $1 is not reliable further down
        my ($interpreter) = $run_name =~ m#$interpreters$#;

        if ( defined($opt_regex)
            || ( defined($interpreter) && $opt_command !~ m#$interpreters$# ) )
        {
            # fetch the runtime parameters of the process
            my $parameters = snmpget( [ 'hrSWRunParameters', $row->iid ] );

            # only strip if we're looking for a specific command
            if ( defined($opt_command) ) {
                verbose( "process ", $row->iid, " uses $interpreter as an interpreter - getting parameters\n" )
                    if defined $interpreter;
                $hash{name} = $parameters;
                $hash{name} =~ s#.*/##;       # strip path name off the front
                $hash{name} =~ s/\s+.*$//;    # strip everything from the first space to the end
            }
            else {
                # use the full 'ps -efl' style listing for regular expression matching
                my $path = snmpget( [ 'hrSWRunPath', $row->iid ] );
                $hash{name} = "$path $parameters";
            }
        }

        # store in the global hash
        $processes{ $row->iid } = \%hash;
    }

    # update the timestamp so the cache can expire
    $processes{__last_update} = time;
    return 0;
}
|
|
|
+
|
|
|
# Walk the global %processes table and count the entries that match the
# requested command (-e, exact string comparison) or regular expression (-r).
#
# When caching is enabled ($opt_cache != 0) every candidate is verified with
# a fresh hrSWRunName lookup and dropped if the agent no longer reports the
# same name for that index.
#
# Side effects: for each confirmed match, hrSWRunPerfMem is added to
# $opt_memory and hrSWRunPerfCPU to $opt_cpu, provided the respective
# variable is defined.
#
# Returns the number of confirmed matches.
sub check_for_matches {
    my $ret_match = 0;

    foreach my $key ( keys(%processes) ) {
        next if ( $key eq '__last_update' );

        my $entry = $processes{$key};
        next if ( ref($entry) ne 'HASH' );    # ignore anything that is not a process record

        my $match = 0;

        # static matches are letter-for-letter (-e)
        if ( defined($opt_command) && $entry->{name} eq $opt_command ) { $match++; }

        # use /o to make sure the user-supplied regex (-r) is only compiled once
        elsif ( defined($opt_regex) && $entry->{name} =~ /$opt_regex/o ) { $match++; }

        # verify the cache's entry by doing an snmpget
        if ( $match > 0 && $opt_cache != 0 ) {
            my $proc = snmpget( [ 'hrSWRunName', $key ] );
            --$match if ( !$proc || $proc ne $entry->{abs_name} );
        }

        # get the process memory usage if requested
        if ( $match > 0 && defined($opt_memory) ) {
            $opt_memory += snmpget( [ 'hrSWRunPerfMem', $key ] ) || 0;
        }

        # get the process cpu usage if requested
        if ( $match > 0 && defined($opt_cpu) ) {
            $opt_cpu += snmpget( [ 'hrSWRunPerfCPU', $key ] ) || 0;
        }

        # only the table index is reported: the records hold no pid field
        verbose("process '$entry->{name}' matched at index $key\n")
            if ( $match > 0 );

        $ret_match += $match;
    }

    return $ret_match;
}
|
|
|
# =========================================================================== #
|
|
|
# =====> MAIN
|
|
|
# =========================================================================== #
|
|
|
@@ -129,6 +264,10 @@ process_options();
|
|
|
|
|
|
alarm( $TIMEOUT ); # make sure we don't hang Nagios
|
|
|
|
|
|
+# initialize the cache, if it's enabled
|
|
|
+init_cache();
|
|
|
+
|
|
|
+# create a session for conversing with the remote SNMP agent
|
|
|
$snmp_session = new SNMP::Session(
|
|
|
DestHost => $opt_host,
|
|
|
Community => $opt_community,
|
|
|
@@ -136,92 +275,57 @@ $snmp_session = new SNMP::Session(
|
|
|
Version => '2c'
|
|
|
);
|
|
|
|
|
|
# --------------------------------------------------------------------------- #
# Count the matching processes, derive the plugin status, print the status
# line (plus optional statistics), persist the cache and exit.
# --------------------------------------------------------------------------- #

# update_cache() returns 1 when the cached table was reused, 0 after a reload
my $usage = update_cache();
my $count = check_for_matches();

# always try twice if caching is enabled - once with cache and once without
if ( $usage != 0 && $opt_cache != 0 && $count <= 0 ) {
    verbose("did not find process in cache - trying a refresh\n");
    %processes = ();
    update_cache();
    $count = check_for_matches();
}

my $wanted = ( $opt_command || $opt_regex );

# the default, OK message
my $message = "OK - $count process(es) found resembling '" . $wanted;

# A threshold can trip because there are too FEW or too MANY processes, so
# the alert text reports the real count instead of always claiming that
# nothing was found.
my $found = $count > 0 ? "$count process(es) found" : "no processes found";

# warning, critical
if (   ( $opt_warning->[0] > 0 && $opt_warning->[0] > $count )
    || ( $opt_warning->[1] > 0 && $opt_warning->[1] <= $count ) )
{
    $message = "WARNING - $found resembling '" . $wanted;
    $exit    = $ERRORS{WARNING};
}
if (   ( $opt_critical->[0] > 0 && $opt_critical->[0] > $count )
    || ( $opt_critical->[1] > 0 && $opt_critical->[1] <= $count ) )
{
    $message = "CRITICAL - $found resembling '" . $wanted;
    $exit    = $ERRORS{CRITICAL};
}

# output the status message (the closing quote completes the pattern)
print $message, "'";

# print the number of processes if statistics are requested
if ( defined($opt_stats) ) {
    print "|count=$count";
    if ( defined($opt_memory) ) {
        print ":memory=", $opt_memory;
    }
    if ( defined($opt_cpu) ) {
        # the accumulated hrSWRunPerfCPU total is scaled by 1/100 for display
        $opt_cpu = $opt_cpu / 100;
        printf ":cpu=%.2f", $opt_cpu;
    }
}

# finish the status line before any diagnostics can be written
print "\n";

# store a copy of the %processes hash if we're using caching
if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) {
    eval {
        unlink($cachefile) if ( -e $cachefile );
        store( \%processes, $cachefile )
            or die "store() reported a failure\n";
    };

    # a cache that cannot be written must not change the check result,
    # but the failure is reported through verbose() instead of being lost
    verbose("could not write cache file $cachefile: $@\n") if ($@);
}

exit $exit;
|
|
|
|
|
|
|