check_snmp_process_monitor.pl 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. #!/usr/local/bin/perl
  2. # author: Al Tobey <albert.tobey@priority-health.com>
  3. # what: monitor a process using the host-resources mib
  4. # license: GPL - http://www.fsf.org/licenses/gpl.txt
  5. #
  6. # Todo:
  7. # * implement memory and cpu utilization checks
  8. # * maybe cache pids in DBM files if snmp agents get overworked
  9. ###############################################################################
  10. # to get a list of processes over snmp try this command:
  11. # snmptable -v2c -c public hostname hrSWRunTable
  12. # for just a list of valid arguments for the '-e' option:
  13. # snmpwalk -v2c -c public hostname hrSWRunName |perl -pe 's:.*/::'
  14. ###############################################################################
  15. use strict;
  16. require 5.6.0;
  17. use lib qw( /opt/nagios/libexec /usr/local/libexec );
  18. use utils qw(%ERRORS $TIMEOUT &print_revision &support &usage);
  19. use SNMP 5.0;
  20. use Getopt::Long;
  21. use Storable;
  # ---------------------------------------------------------------------------
  # Package globals shared by every subroutine in this file.
  # ---------------------------------------------------------------------------

  # Declared only; these keep whatever value they already have.  $TIMEOUT is
  # also named in the utils import list above.
  our (
      $opt_version, $opt_timeout, $opt_help, $opt_stats, $opt_regex,
      $snmp_session, $TIMEOUT,
  );

  # Options that start out explicitly undefined.
  our (
      $opt_verbose, $opt_host, $opt_command,
      $opt_memory,  $opt_cpu,  $opt_nocache,
  ) = ();

  # Options and settings with a default value.
  our $PROGNAME      = "snmp_process_monitor.pl";
  our $opt_community = 'public';
  our $opt_port      = 161;
  our $opt_warning   = [ 1, -1 ];    # [ minimum, maximum ] process count
  our $opt_critical  = [ 1, -1 ];    # [ minimum, maximum ] process count
  our $opt_cache     = 1;
  our $cache_exp     = 600;          # cache lifetime, in seconds
  our $exit          = $ERRORS{OK};

  # Run names ending in one of these are treated as script interpreters.
  our $interpreters = '(perl|/bin/sh|/usr/bin/sh|/bin/bash|/bin/ksh|python)';

  our $cachefile = '/var/opt/nagios/tmp/'; # completed later
  our %processes = ();
  # Turn a "-w"/"-c" argument into a two element [ minimum, maximum ] list.
  # A bare number sets only the minimum.  A missing or empty maximum becomes -1
  # and a missing or empty minimum becomes 0; the threshold tests in the main
  # script only act on values greater than zero, so both mean "not checked".
  sub _parse_threshold {
      my ($spec) = @_;
      my ( $minimum, $maximum ) = split( /,/, $spec, 2 );
      $minimum = 0  if ( !defined($minimum) || $minimum eq '' );
      $maximum = -1 if ( !defined($maximum) || $maximum eq '' );
      return [ $minimum, $maximum ];
  }

  # Parse the command line into the $opt_* globals.
  #
  # Exits UNKNOWN after printing the usage line when an option cannot be
  # parsed, exits OK after printing the revision for -V, and exits UNKNOWN
  # after printing the help text when -h is given, when no host is given, or
  # when neither -e nor -r is given.  On success the host name is appended to
  # $cachefile to form the per-host cache file name.
  sub process_options {
      my ( $opt_crit, $opt_warn );

      Getopt::Long::Configure( 'bundling' );
      my $parsed_ok = GetOptions(
          'V'     => \$opt_version,   'version'     => \$opt_version,
          'v'     => \$opt_verbose,   'verbose'     => \$opt_verbose,
          'h'     => \$opt_help,      'help'        => \$opt_help,
          's'     => \$opt_stats,     'statistics'  => \$opt_stats,
          'nocache' => \$opt_nocache,
          'H:s'   => \$opt_host,      'hostname:s'  => \$opt_host,
          'p:i'   => \$opt_port,      'port:i'      => \$opt_port,
          'C:s'   => \$opt_community, 'community:s' => \$opt_community,
          'c:s'   => \$opt_crit,      'critical:s'  => \$opt_crit,
          'w:s'   => \$opt_warn,      'warning:s'   => \$opt_warn,
          't:i'   => \$TIMEOUT,       'timeout:i'   => \$TIMEOUT,
          'e:s'   => \$opt_command,   'command:s'   => \$opt_command,
          'r:s'   => \$opt_regex,     'regex:s'     => \$opt_regex,
          'cpu:i' => \$opt_cpu,       'memory:i'    => \$opt_memory,
      );

      # an option that could not be parsed must not be silently ignored
      if ( !$parsed_ok ) {
          print_usage();
          exit $ERRORS{UNKNOWN};
      }

      # -V only reports the revision; it never runs a check
      if ( defined($opt_version) ) {
          local_print_revision();
          exit $ERRORS{OK};
      }

      if ( defined($opt_verbose) ) { $SNMP::debugging = 1; }

      if ( !defined($opt_host) || defined($opt_help)
          || ( !defined($opt_command) && !defined($opt_regex) ) ) {
          print_help();
          exit $ERRORS{UNKNOWN};
      }

      # each threshold is parsed from its own option
      $opt_critical = _parse_threshold($opt_crit) if ( defined($opt_crit) );
      $opt_warning  = _parse_threshold($opt_warn) if ( defined($opt_warn) );

      # --memory/--cpu act as switches: any value given with them is dropped
      # and the variables are reused as running totals starting at zero
      if ( defined($opt_memory) )  { $opt_memory = 0 }
      if ( defined($opt_cpu) )     { $opt_cpu    = 0 }
      if ( defined($opt_nocache) ) { $opt_cache  = 0 }

      # complete the cachefile's name
      $cachefile .= $opt_host . '.proc';
  }
  # Report this plugin's name and revision through the print_revision()
  # helper imported from utils.
  sub local_print_revision {
      my $revision = '$Revision: 84 $ ';
      return print_revision( $PROGNAME, $revision );
  }
  # Print the one-line usage summary.  Lists every selector the option parser
  # accepts: -e and -r are alternatives, -C and -p have defaults.
  sub print_usage {
      print "Usage: $PROGNAME -H <host> [-C <snmp_community>] [-p <port>] ",
            "(-e <command> | -r <regex>) [-w <low>,<high>] [-c <low>,<high>] ",
            "[-t <timeout>] [-s|--statistics] [--memory] [--cpu] [--nocache]\n";
  }
  # Print the revision, copyright notice, usage line and a description of
  # every command line option accepted by process_options().
  sub print_help {
      local_print_revision();
      print "Copyright (c) 2002 Al Tobey <albert.tobey\@priority-health.com>\n\n",
            "SNMP Process Monitor plugin for Nagios\n\n";
      print_usage();

      # [ option synopsis, description ] in the order they are printed
      my @options = (
          [ '-V, --version',
            'print the plugin revision' ],
          [ '-v, --verbose',
            'print extra debugging information' ],
          [ '-h, --help',
            'print this help message' ],
          [ '-H, --hostname=HOST',
            'name or IP address of host to check' ],
          [ '-p, --port=INTEGER',
            "port of the host's SNMP agent (Default 161)" ],
          [ '-C, --community=COMMUNITY NAME',
            "community name for the host's SNMP agent (Default public)" ],
          [ '-e, --command=COMMAND NAME (ps -e style)',
            'what command should be monitored?' ],
          [ '-r, --regex=Perl RE',
            'use a perl regular expression to find your process' ],
          [ '-w, --warning=INTEGER[,INTEGER]',
            'minimum and maximum number of processes before a warning is issued (Default 1,-1)' ],
          [ '-c, --critical=INTEGER[,INTEGER]',
            'minimum and maximum number of processes before a critical is issued (Default 1,-1)' ],
          [ '-t, --timeout=INTEGER',
            'number of seconds before the check is aborted' ],
          [ '-s, --statistics',
            'append the number of matching processes as performance data' ],
          # hrSWRunPerfMem is reported by the agent in KBytes
          [ '--memory',
            "combined with '-s', will print the number of kilobytes of real memory used by process" ],
          [ '--cpu',
            "combined with '-s', will print the number of seconds of cpu time consumed by process" ],
          [ '--nocache',
            'disable caching of the process list' ],
      );

      foreach my $option (@options) {
          my ( $synopsis, $description ) = @$option;
          print " $synopsis\n    $description\n";
      }
  }
  # Debug output helper: prints its arguments only when $opt_verbose is
  # defined (set by -v/--verbose), otherwise prints nothing.
  sub verbose (@) {
      my @text = @_;
      if ( defined($opt_verbose) ) {
          return print @text;
      }
      return;
  }
  # Abort the check when the last SNMP request failed: if the session's
  # ErrorNum is set, empty the in-memory process table, report the session's
  # ErrorStr and exit UNKNOWN.  Returns quietly when there was no error.
  sub check_for_errors {
      return if ( !$snmp_session->{ErrorNum} );

      %processes = ();
      print "UNKNOWN - error retrieving SNMP data: $snmp_session->{ErrorStr}\n";
      exit $ERRORS{UNKNOWN};
  }
  # Load the per-host process cache from $cachefile into %processes.
  #
  # %processes is left empty when caching is switched off (--nocache sets
  # $opt_cache to 0), when the cache file is not readable, or when its
  # contents cannot be loaded as a hash.
  sub init_cache {
      %processes = ();

      # $opt_cache is always defined (1 or 0), so test its truth, not defined()
      return if ( !$opt_cache );
      return if ( !-r $cachefile );

      verbose( "loading cache from $cachefile\n" );
      my $cached = eval { retrieve( $cachefile ) };

      # retrieve() may die or may return something that is not a hash reference
      if ( $@ || ref($cached) ne 'HASH' ) {
          my $reason = $@ || "unexpected cache contents\n";
          verbose( "cache loading failed - using blank cache: $reason\n" );
          return;
      }

      %processes = %$cached;
      return;
  }
  # Fetch a single value from the agent.  The argument is handed unchanged to
  # SNMP::Varbind->new (callers in this file pass [ name, instance ] pairs).
  # check_for_errors() ends the plugin if the request failed; otherwise the
  # value stored in the varbind is returned.
  sub snmpget {
      my ($oid) = @_;

      my $varbind = SNMP::Varbind->new( $oid );
      $snmp_session->get( $varbind );
      check_for_errors();

      return $varbind->val;
  }
  # Make sure %processes holds a usable process table.
  #
  # Returns 1 when the table loaded from the cache is recent enough and was
  # kept as-is.  Otherwise the full hrSWRunName table is fetched from the
  # agent, %processes is rebuilt from it, and 0 is returned.  Exits UNKNOWN
  # when the agent returns far fewer rows than hrSystemProcesses announces.
  #
  # Each entry is keyed by the row's instance id and holds:
  #   abs_name - the run name exactly as returned by the agent
  #   name     - the string matched later against -e / -r
  sub update_cache {
      # expire the cache after $cache_exp seconds
      if ( $opt_cache != 0 && exists($processes{__last_update})
          && $processes{__last_update} >= time - $cache_exp ) {
          verbose( "cache file is recent enough - using it\n" );
          return 1;
      }

      verbose( "retrieving full listing of processes from $opt_host\n" );
      my $process_count = snmpget( ['hrSystemProcesses', 0] );

      # retrieve the data from the remote host
      my ($names) = $snmp_session->bulkwalk( 0, $process_count + 1, [['hrSWRunName']] );
      check_for_errors();
      $names = [] if ( !defined($names) );    # never dereference a failed walk

      # make sure the number of processes from the bulkwalk is close to hrSystemProcesses
      if ( scalar(@$names) + 10 < $process_count ) {
          print "UNKNOWN - only ", scalar(@$names), " of ", $process_count, " processes returned\n";
          exit $ERRORS{UNKNOWN};
      }

      # start from an empty table so processes that have gone away since the
      # cache was written do not linger in it
      %processes = ();

      # sort through the process names and create a nice hash of processes
      foreach my $row ( @$names ) {
          my $run_name = $row->val;
          my %entry = (
              name     => $run_name,
              abs_name => $run_name,
          );
          $entry{name} =~ s#.*/##; # strip path

          # remember which interpreter matched instead of relying on $1 later
          my ($interpreter) = ( $run_name =~ m#$interpreters$# );

          if ( defined($opt_regex)
              || ( defined($interpreter) && $opt_command !~ m#$interpreters$# ) ) {
              # fetch the runtime parameters of the process
              my $parameters = snmpget( ['hrSWRunParameters', $row->iid] );

              # only strip if we're looking for a specific command
              if ( defined($opt_command) ) {
                  if ( defined($interpreter) ) {
                      verbose( "process ", $row->iid,
                          " uses $interpreter as an interpreter - getting parameters\n" );
                  }
                  else {
                      verbose( "process ", $row->iid, " - getting parameters\n" );
                  }
                  # NOTE(review): the path strip is greedy, so a slash in a
                  # later argument also removes the script name - confirm
                  # whether that is acceptable before changing the order.
                  $entry{name} = $parameters;
                  $entry{name} =~ s#.*/##;    # strip path name off the front
                  $entry{name} =~ s/\s+.*$//; # strip everything from the first space to the end
              }
              else {
                  # use the full 'ps -efl' style listing for regular expression matching
                  my $path = snmpget( ['hrSWRunPath', $row->iid] );
                  $entry{name} = "$path $parameters";
              }
          }

          # store in the global hash
          $processes{$row->iid} = \%entry;
      }

      # update the timestamp so the cache can expire
      $processes{__last_update} = time;
      return 0;
  }
  205. # process the %processes hash and see if there any matches for our command or regex
  # Count the entries of %processes that match the requested command (-e,
  # exact string comparison) or regular expression (-r).
  #
  # When caching is enabled every candidate is re-checked with a live
  # hrSWRunName request and dropped if the agent no longer reports the same
  # run name for that index.  For each confirmed match the memory and cpu
  # counters are added to $opt_memory / $opt_cpu when those were requested.
  # Returns the number of confirmed matches.
  sub check_for_matches {
      my $ret_match = 0;

      foreach my $key ( keys(%processes) ) {
          next if ( $key eq '__last_update' );

          # ignore anything in the table that is not a process entry
          my $entry = $processes{$key};
          next if ( ref($entry) ne 'HASH' );

          my $match = 0;
          # static matches are letter-for-letter (-e)
          if ( defined($opt_command) && $entry->{name} eq $opt_command ) { $match = 1; }
          # use /o to make sure the user-supplied regex (-r) is only compiled once
          elsif ( defined($opt_regex) && $entry->{name} =~ /$opt_regex/o ) { $match = 1; }

          # verify the cache's entry by doing an snmpget
          if ( $match && $opt_cache != 0 ) {
              my $proc = snmpget( ['hrSWRunName', $key] );
              $match = 0 if ( !$proc || $proc ne $entry->{abs_name} );
          }
          next if ( !$match );

          # get the process memory usage if requested
          if ( defined($opt_memory) ) {
              $opt_memory += snmpget( ['hrSWRunPerfMem', $key] );
          }
          # get the process cpu usage if requested
          if ( defined($opt_cpu) ) {
              $opt_cpu += snmpget( ['hrSWRunPerfCPU', $key] );
          }

          # entries carry no separate pid field, so report the table index only
          verbose( "process '$entry->{name}' has index $key\n" );
          $ret_match++;
      }

      return $ret_match;
  }
  234. # =========================================================================== #
  235. # =====> MAIN
  236. # =========================================================================== #
  process_options();

  # make sure we don't hang Nagios - and say why when the timeout fires,
  # instead of being killed by SIGALRM without any plugin output
  $SIG{ALRM} = sub {
      print "UNKNOWN - plugin timed out after $TIMEOUT seconds\n";
      exit $ERRORS{UNKNOWN};
  };
  alarm( $TIMEOUT );

  # initialize the cache, if it's enabled
  init_cache();

  # create a session for conversing with the remote SNMP agent
  $snmp_session = SNMP::Session->new(
      DestHost   => $opt_host,
      Community  => $opt_community,
      RemotePort => $opt_port,
      Version    => '2c'
  );
  if ( !defined($snmp_session) ) {
      print "UNKNOWN - unable to create an SNMP session with $opt_host\n";
      exit $ERRORS{UNKNOWN};
  }

  # update_cache() returns non-zero when the cached table was used as-is
  my $cache_used = update_cache();
  my $count      = check_for_matches();

  # always try twice if caching is enabled - once with cache and once without
  if ( $cache_used != 0 && $opt_cache != 0 && $count <= 0 ) {
      verbose( "did not find process in cache - trying a refresh\n" );
      %processes = ();
      update_cache();
      $count = check_for_matches();
  }

  # Work out the status.  A threshold only applies when it is greater than
  # zero; critical is tested last so it overrides warning.
  my $status = 'OK';
  if ( ($opt_warning->[0] > 0 && $opt_warning->[0] > $count)
      || ($opt_warning->[1] > 0 && $opt_warning->[1] <= $count) ) {
      $status = 'WARNING';
      $exit   = $ERRORS{WARNING};
  }
  if ( ($opt_critical->[0] > 0 && $opt_critical->[0] > $count)
      || ($opt_critical->[1] > 0 && $opt_critical->[1] <= $count) ) {
      $status = 'CRITICAL';
      $exit   = $ERRORS{CRITICAL};
  }

  # output the status message; the real count is reported for every status
  # because a threshold can also be crossed by having too many processes
  print "$status - $count process(es) found resembling '", ($opt_command || $opt_regex), "'";

  # print the number of processes if statistics are requested
  # NOTE(review): the values are joined with ':' as before; confirm whether
  # the consumers of this performance data expect a different separator.
  if ( defined($opt_stats) ) {
      print "|count=$count";
      if ( defined($opt_memory) ) {
          print ":memory=", $opt_memory;
      }
      if ( defined($opt_cpu) ) {
          # the accumulated counter is divided by 100 to print seconds
          $opt_cpu = $opt_cpu / 100;
          printf ":cpu=%.2f", $opt_cpu;
      }
  }
  print "\n";

  # store a copy of the %processes hash if we're using caching; write to a
  # temporary file and rename it so a concurrent check never reads a
  # half-written cache
  if ( $exit == $ERRORS{OK} && $opt_cache != 0 ) {
      my $tempfile = "$cachefile.$$";
      eval {
          store( \%processes, $tempfile ) or die "store failed: $!\n";
          rename( $tempfile, $cachefile ) or die "rename failed: $!\n";
      };
      if ( $@ ) {
          # caching is best-effort: a failed save never changes the result
          verbose( "unable to save cache to $cachefile: $@" );
          unlink( $tempfile );
      }
  }

  exit $exit;