#!/bin/bash
#
# Check_procl.sh
#
# Program: Process load check plugin for Nagios
# License : GPL
# Copyright (c) 2002 Jerome Tytgat (j.tytgat@sioban.net)
#
# check_procl.sh,v 1.1 2002/07/04 09:35
#
# Description :
#
# This plugin checks the %cpu, %mem or cputime of one or more processes.
#
# Usage :
#
# check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu
# check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
# check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
#
# check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
# check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
#
# Example :
#
# To monitor the memory used by the HTTPD processes, with a warning when it
# reaches 50% and a critical state when it reaches 75% :
# check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
# > OK - total %MEM for process httpd : 46.1
#
# To find the process which consumes the most cpu time, excluding the kapmd
# processes (we are under Linux and are using kapm) :
# check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
# > CRITICAL - total CPUTIME for process named : 02:32:10
#
# Tested on Solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
#
# BUGS : problems with handling time on Solaris...
#######################################
# Print the command-line synopsis of the plugin.
# Globals:   none ($0 is shown as the invoked script name)
# Arguments: none
# Outputs:   five usage lines on stdout
#######################################
help_usage() {
  printf '%s\n' \
    "Usage:" \
    " $0 -p <process_name1,process_name2,... | %all% | %max%>" \
    " [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>" \
    " $0 (-v | --version)" \
    " $0 (-h | --help)"
}
#######################################
# Print the version banner, licence notice and credits.
# Arguments: none
# Outputs:   seven lines on stdout
#######################################
help_version() {
  printf '%s\n' \
    "check_procl.sh (nagios-plugins) 1.1" \
    "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute" \
    "copies of the plugins under the terms of the GNU General Public License." \
    "For more information about these matters, see the file named COPYING." \
    "Copyright (c) 2002 Jerome Tytgat - j.tytgat@sioban.net" \
    "Greetings goes to Websurg which kindly let me took time to develop this" \
    " Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
}
#######################################
# Verify that every command the plugin relies on can be resolved by the shell.
# `type` is used so that builtins (let, test) and external programs are both
# accepted.
# Arguments: optional list of command names to check; when omitted the
#            plugin's standard dependency list is used.
# Outputs:   for the first missing command, two explanatory lines on stdout
# Returns:   0 when everything is found; otherwise exits the script with 3
#######################################
verify_dep() {
  local dep

  if [ $# -eq 0 ]; then
    set -- bash cut egrep expr grep let ps sed sort tail test tr wc
  fi

  for dep in "$@"; do
    # Only the name under test is handed to `type`; any non-zero status
    # (not just 1) means the command is unavailable.
    if ! type "$dep" >/dev/null 2>&1; then
      echo "I am missing an important component : $dep"
      echo "Cannot continue, sorry, try to find the missing one..."
      exit 3
    fi
  done
}
# Name used further down to filter the plugin's own entry out of the `ps`
# listing.  Only the base name is kept: the listing is matched on the command
# name column, which presumably never contains the directory part that $0 may
# carry (e.g. "./check_procl.sh" or an absolute path).
myself=${0##*/}

# Stop right away (exit 3) if a required tool is missing.
verify_dep
# Informational switches are only recognised as the first argument.  Both
# print to stdout and leave with status 3, like every other non-check exit
# of this plugin.
case "$1" in
  -h | --help)
    help_version
    printf '%s\n' \
      "" \
      "This plugin will check either the cumulative %cpu, %mem or cputime" \
      "of a process." \
      ""
    help_usage
    printf '%s\n' \
      "" \
      "Required Arguments:" \
      " -p, --process STRING1,STRING2,..." \
      " names of the processes we want to monitor," \
      " you can add as many processes as you want, separated by commas," \
      " they will be cumulated" \
      " -p, --process %all%" \
      " The special keyword %all% will check the cumulative cpu/mem/time of all processes" \
      " WARNING : Can be very slow on heavy loaded servers, watch your timeout !" \
      " -p, --process %max%" \
      " The special keyword %max% will check the process which eats the most" \
      " WARNING : only selects the process which eats the most, not the cumulative," \
      " but return the cumulative" \
      " -w, --warning INTEGER.INTEGER or INTEGER:INTEGER:INTEGER" \
      " generate warning state if the cumulated value reaches this threshold" \
      " -c, --critical INTEGER.INTEGER or INTEGER:INTEGER:INTEGER" \
      " generate critical state if the cumulated value reaches this threshold" \
      " --cpu" \
      " return the current cpu usage for the given process" \
      " --mem" \
      " return the current memory usage for the given process" \
      " --cputime" \
      " return the total cputime usage for the given process" \
      "" \
      "Optional Argument:" \
      " -e, --exclude-process STRING1,STRING2,..." \
      " names of the processes we don't want to monitor" \
      " only useful when associated with %all% or %max% keywords, else ignored" \
      " ex : kapm-idled on linux is a process which eats memory / cputime but not really... ;-)" \
      ""
    exit 3
    ;;
  -v | --version)
    help_version
    exit 3
    ;;
esac
# Coarse sanity check before parsing: a valid call has at least seven words
# (-p NAME -w WARN -c CRIT --metric).  '=' is turned into a blank so that the
# "--option=value" spelling counts as two words.  The arguments are passed to
# printf as data, so a leading -e/-n is not eaten as an echo option and no
# glob expansion takes place.
arg_word_count=$(printf '%s\n' "$*" | tr "=" " " | wc -w)
if ((arg_word_count < 7)); then
  echo "Bad arguments number (need at least 7)!"
  help_usage
  exit 3
fi
# Results of the command-line parsing (globals read by the rest of the script):
#   tt                    selected metric: 0 none yet, 1 --cpu, 2 --mem, 3 --cputime
#   process_name          process names joined with '|' (regex alternation)
#   exclude_process_name  excluded names joined with '|'
#   wt / ct               raw warning / critical thresholds
#   type_arg              ps output field (pcpu, pmem, time)
#   type_arg_aff          label of that field (%CPU, %MEM, TIME)
#   delim                 separator used inside values ('.' or ':')
tt=0
process_name=""
exclude_process_name=""
wt=""
ct=""

# Print an error message followed by the usage text, then exit 3.
usage_error() {
  echo "$1"
  help_usage
  exit 3
}

#######################################
# Parse the plugin's command line into the globals listed above.
# Arguments: the script's arguments ("$@")
# Outputs:   an error message plus usage on stdout for invalid input
# Returns:   0 on success; exits the script with 3 on any invalid argument
#######################################
parse_arguments() {
  while [ $# -gt 0 ]; do
    # Accept "--option=value" by splitting it into two words.
    case "$1" in
      --*=*) set -- "${1%%=*}" "${1#*=}" "${@:2}" ;;
    esac

    case "$1" in
      -p | --process)
        [ -z "$process_name" ] || usage_error "Only one --process argument is useful..."
        [ $# -ge 2 ] || usage_error "Missing value for argument $1"
        shift
        # comma list -> regex alternation
        process_name=${1//,/|}
        ;;
      -e | --exclude-process)
        [ -z "$exclude_process_name" ] || usage_error "Only one --exclude-process argument is useful..."
        [ $# -ge 2 ] || usage_error "Missing value for argument $1"
        shift
        exclude_process_name=${1//,/|}
        ;;
      -w | --warning)
        [ -z "$wt" ] || usage_error "Only one --warning argument needed... Trying to test bad things ? :-)"
        [ $# -ge 2 ] || usage_error "Missing value for argument $1"
        shift
        wt=$1
        ;;
      -c | --critical)
        [ -z "$ct" ] || usage_error "Only one --critical argument needed... Trying to test bad things ? :-)"
        [ $# -ge 2 ] || usage_error "Missing value for argument $1"
        shift
        ct=$1
        ;;
      --cpu | --mem | --cputime)
        # The three metrics are mutually exclusive.
        [ "$tt" -eq 0 ] || usage_error "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
        case "$1" in
          --cpu)
            tt=1 type_arg_aff="%CPU" type_arg="pcpu" delim="."
            ;;
          --mem)
            tt=2 type_arg_aff="%MEM" type_arg="pmem" delim="."
            ;;
          --cputime)
            tt=3 type_arg_aff="TIME" type_arg="time" delim=":"
            ;;
        esac
        ;;
      *)
        usage_error "Unknown argument $1"
        ;;
    esac
    shift
  done
}

# Test of the command lines arguments
parse_arguments "$@"
# Is the process running ?
# The keywords %all% and %max% are not process names, so `ps` is only
# consulted for real names.  $process_name is an extended regular expression
# (names joined with '|'); the trailing '?' is kept from the original pattern
# and makes the very last character of that expression optional.
# NOTE(review): the message says WARNING but the exit status is 3, not 1 -
# confirm which of the two is intended before changing either.
if [ "$process_name" != "%all%" ] && [ "$process_name" != "%max%" ] &&
  ! ps -e | grep -E -q -- "$process_name?"; then
  echo "WARNING: process $process_name not running !"
  exit 3
fi
#######################################
# Split one raw threshold into its three numeric fields.
#   --cputime ($tt = 3): "h:m:s", all three fields are mandatory.
#   --cpu / --mem      : "a.b"; a bare integer "a" is read as "a.0".
#                        The third field is always 0.
# Fields beyond the expected ones are ignored.
# Globals:   tt, delim (read)
# Arguments: $1 - raw threshold as given on the command line
# Outputs:   "field1 field2 field3" on stdout when the threshold is valid
# Returns:   0 if valid, 1 if a field is missing or not made of digits only
#######################################
split_threshold() {
  local raw=$1 first second third rest field

  if [ "$tt" -eq 3 ]; then
    IFS=$delim read -r first second third rest <<<"$raw"
  else
    case "$raw" in
      *"$delim"*)
        IFS=$delim read -r first second rest <<<"$raw"
        ;;
      *)
        # No separator at all: the whole value is the integer part.
        first=$raw
        second=0
        ;;
    esac
    third=0
  fi

  # Integrity check: every field must be a non-empty string of digits.
  for field in "$first" "$second" "$third"; do
    case "$field" in
      "" | *[!0-9]*) return 1 ;;
    esac
  done

  echo "$first $second $third"
}

# Cut and check the warning threshold
if ! threshold_fields=$(split_threshold "$wt"); then
  echo "Bad expression in the WARNING field : $wt"
  help_usage
  exit 3
fi
read -r wt_value1 wt_value2 wt_value3 <<<"$threshold_fields"

# Cut and check the critical threshold
if ! threshold_fields=$(split_threshold "$ct"); then
  echo "Bad expression in the CRITICAL field : $ct"
  help_usage
  exit 3
fi
read -r ct_value1 ct_value2 ct_value3 <<<"$threshold_fields"
# ps line construction set...
# Produces $psline: the list of values (one per matching process) of the
# selected ps field.  For %max%, $process_name is replaced by the name of the
# process that was picked.

# Keep only the value column of "VALUE COMMAND" lines read from stdin
# (leading padding in front of the number is removed first).
value_column() {
  sed "s/^ *\([0-9]\)/\1/" | cut -d" " -f1
}

# Lines to drop for %all% / %max%: the plugin itself, the ps header and the
# user-supplied exclusions.  Only the base name of the script is used because
# the listing contains command names, not paths.  The trailing '?' is kept
# from the original pattern (it makes the last character optional).
self_name=${myself##*/}
skip_pattern="$self_name|$type_arg_aff"
if [ -n "$exclude_process_name" ]; then
  skip_pattern="$skip_pattern|$exclude_process_name"
fi
skip_pattern="$skip_pattern?"

case "$process_name" in
  %all%)
    psline=$(ps -eo "$type_arg,comm" | grep -E -v -- "$skip_pattern" | value_column)
    ;;
  %max%)
    # Numeric sort so that e.g. 100 ranks above 99.9 whatever the locale;
    # equal leading numbers (hh of hh:mm:ss) fall back to a byte-wise
    # comparison of the whole line.
    # NOTE(review): a DD-hh:mm:ss cputime is still ranked by its day count
    # only - confirm whether that matters for the monitored hosts.
    pstmp=$(ps -eo "$type_arg,comm" | grep -E -v -- "$skip_pattern" |
      LC_ALL=C sort -n | tail -n 1 | sed "s/^ *\([0-9]\)/\1/" | cut -d" " -f2)
    if [ -n "$pstmp" ]; then
      # Fixed-string match: the command name is data, not a pattern.
      psline=$(ps -eo "$type_arg,comm" | grep -F -- "$pstmp" | value_column)
    else
      psline=""
    fi
    process_name=$pstmp
    ;;
  *)
    psline=$(ps -eo "$type_arg,comm" | grep -E -- "$process_name?" | value_column)
    ;;
esac
# Running totals.
#   --cpu / --mem : total1 = integer part, total2 = tenths (0-9), total3 unused
#   --cputime     : total1 = hours, total2 = minutes, total3 = seconds
total1=0
total2=0
total3=0

#######################################
# Add one value reported by ps to the running totals.
# Accepted forms:
#   --cputime : [DD-]hh:mm:ss, or a short form of at most 6 characters
#               (mm:ss) which gets "00:" prepended
#   --cpu/mem : N.N, or a bare integer N (read as N.0); only the first
#               decimal digit is used
# Values that are not made of digits in the expected places are skipped.
# Globals:   tt (read); total1, total2, total3 (updated)
# Arguments: $1 - one value of the selected ps field
# Returns:   0
#######################################
add_sample() {
  local sample=$1 days=0 high low last rest field

  if [ "$tt" -eq 3 ]; then
    # Special case for solaris - several format exist for the time function...
    if [ ${#sample} -le 6 ]; then
      sample="00:$sample"
    fi
    # Day prefix of long-running processes: counted as 24 hours each.
    case "$sample" in
      *-*)
        days=${sample%%-*}
        sample=${sample#*-}
        ;;
    esac
    IFS=: read -r high low last rest <<<"$sample"
    last=${last:-0}
  else
    case "$sample" in
      *.*)
        high=${sample%%.*}
        low=${sample#*.}
        low=${low:0:1}
        ;;
      *)
        # No decimal part (some ps print e.g. "100"): do not reuse the
        # integer part as the fraction.
        high=$sample
        low=0
        ;;
    esac
    high=${high:-0}
    low=${low:-0}
    last=0
  fi

  for field in "$days" "$high" "$low" "$last"; do
    case "$field" in
      "" | *[!0-9]*) return 0 ;;
    esac
  done

  # 10# forces base 10 so that "08" and "09" are not rejected as octal.
  total1=$((total1 + 10#$days * 24 + 10#$high))
  total2=$((total2 + 10#$low))
  total3=$((total3 + 10#$last))

  # Carry the overflow into the next bigger unit.
  if [ "$tt" -eq 3 ]; then
    total2=$((total2 + total3 / 60))
    total3=$((total3 % 60))
    total1=$((total1 + total2 / 60))
    total2=$((total2 % 60))
  else
    total1=$((total1 + total2 / 10))
    total2=$((total2 % 10))
  fi
}

# fetching the values
# $psline is deliberately unquoted: it is a whitespace-separated list.
for i in $psline; do
  add_sample "$i"
done
warn=0
crit=0

#######################################
# Tell whether the cumulated total has reached a threshold (total >= threshold).
#   --cputime : both sides are converted to seconds, so thresholds that are
#               not normalised (e.g. 0:90:0) compare correctly.
#   --cpu/mem : integer parts are compared first; on a tie the total's tenths
#               digit is zero-padded to the width of the threshold's fraction,
#               so that ".05" and ".25" are read as decimals.
# Globals:   tt, total1, total2, total3 (read)
# Arguments: $1 $2 $3 - the three fields of the threshold
# Returns:   0 when the threshold is reached, 1 otherwise
#######################################
threshold_reached() {
  local limit1=$1 limit2=$2 limit3=$3 fraction

  # 10# forces base 10: fields such as "08" must not be parsed as octal.
  if [ "$tt" -eq 3 ]; then
    ((10#$total1 * 3600 + 10#$total2 * 60 + 10#$total3 >= 10#$limit1 * 3600 + 10#$limit2 * 60 + 10#$limit3))
    return
  fi

  fraction=$total2
  while [ ${#fraction} -lt ${#limit2} ]; do
    fraction="${fraction}0"
  done
  ((10#$total1 > 10#$limit1 || (10#$total1 == 10#$limit1 && 10#$fraction >= 10#$limit2)))
}

# evaluation of the cumulative values vs warning and critical values
case "$tt" in
  1 | 2)
    return_total="$total1.$total2"
    ;;
  3)
    # hh:mm:ss, each field padded to at least two digits
    return_total=$(printf '%02d:%02d:%02d' "$((10#$total1))" "$((10#$total2))" "$((10#$total3))")
    ;;
esac
case "$tt" in
  1 | 2 | 3)
    if threshold_reached "$ct_value1" "$ct_value2" "$ct_value3"; then
      crit=1
    fi
    if threshold_reached "$wt_value1" "$wt_value2" "$wt_value3"; then
      warn=1
    fi
    ;;
esac

# last check ...
# Critical reached without warning means the thresholds are inverted.
if [ "$crit" -eq 1 ] && [ "$warn" -eq 0 ]; then
  echo "Critical value must be greater than warning value !"
  help_usage
  exit 3
fi
# Finally Inform Nagios of what we found...
# Exit status: 2 critical, 1 warning, 0 OK.  Critical wins over warning.
# The '|' separated names are shown as a comma list again.
shown_names=${process_name//|/,}
case "$crit$warn" in
  1*)
    status_label="CRITICAL"
    status_code=2
    ;;
  01)
    status_label="WARNING"
    status_code=1
    ;;
  *)
    status_label="OK"
    status_code=0
    ;;
esac
echo "$status_label - total $type_arg_aff for process $shown_names : $return_total"
exit "$status_code"
|