check_procl.sh 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. #!/bin/bash
  2. #
  3. # Check_procl.sh
  4. #
  5. # Program: Process load check plugin for Nagios
  6. # License : GPL
  7. # Copyright (c) 2002 Jerome Tytgat (j.tytgat@sioban.net)
  8. #
  9. # check_procl.sh,v 1.1 2002/07/04 09:35
  10. #
  11. # Description :
  12. #
  13. # This plugin checks the %cpu, %mem or cputime of one or more processes
  14. #
  15. # Usage :
  16. #
  17. # check_procl.sh -p process1,process2,... -w a.b -c c.d --cpu
  18. # check_procl.sh -p process1,process2,... -w a.b -c c.d --mem
  19. # check_procl.sh -p process1,process2,... -w a:b:c -c d:e:f --cputime
  20. #
  21. # check_procl.sh -p %all% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
  22. # check_procl.sh -p %max% -e process1,process2,... -w <a.b | a:b:c> -c <c.d | d:e:f> <--cpu | --mem | --cputime>
  23. #
  24. # Example :
  25. #
  26. # To know the memory eaten by HTTPD processes, be warned when it reaches 50% and be critical when it reaches 75%
  27. # check_procl.sh -p httpd -w 50.0 -c 75.0 --mem
  28. # > OK - total %MEM for process httpd : 46.1
  29. #
  30. # To know the process which eats the most cpu time (as we are under linux and are using kapm, we exclude the kapmd processes) :
  31. # check_procl.sh -p %max% -e kapmd-idle,kapmd -w 0:1:0 -c 0:2:0 --cputime
  32. # > CRITICAL - total CPUTIME for process named : 02:32:10
  33. #
  34. # Tested on solaris 7/8, Linux Redhat 7.3 and Linux Suse 7.1
  35. #
  36. # BUGS : problems with handling time on solaris...
  # Print the command-line synopsis of the plugin on stdout.
  # $0 is interpolated so the synopsis shows the name the plugin was run as.
  help_usage() {
    printf '%s\n' \
      "Usage:" \
      " $0 -p <process_name1,process_name2,... | %all% | %max%>" \
      " [-e <process_name1,process_name2,...>] -w warning -c critical < --cpu | --mem | --cputime>" \
      " $0 (-v | --version)" \
      " $0 (-h | --help)"
  }
  # Print the version banner, licence notice and credits on stdout.
  help_version() {
    printf '%s\n' \
      "check_procl.sh (nagios-plugins) 1.1" \
      "The nagios plugins come with ABSOLUTELY NO WARRANTY. You may redistribute" \
      "copies of the plugins under the terms of the GNU General Public License." \
      "For more information about these matters, see the file named COPYING." \
      "Copyright (c) 2002 Jerome Tytgat - j.tytgat@sioban.net" \
      "Greetings goes to Websurg which kindly let me took time to develop this" \
      " Manu Feig and Jacques Kern who were my beta testers, thanks to them !"
  }
  # Verify that every command or builtin the plugin relies on is available.
  #
  # Usage:   verify_dep [command ...]
  #          Without arguments the plugin's own dependency list is checked.
  # Outputs: for the first missing command, a two-line explanation on stdout.
  # Exits:   3 (Nagios UNKNOWN) as soon as a command is missing;
  #          returns 0 when everything was found.
  verify_dep() {
    local cmd

    # No explicit list given: fall back to the tools used by this plugin.
    if [ $# -eq 0 ]
    then
      set -- bash cut egrep expr grep let ps sed sort tail test tr wc
    fi

    for cmd in "$@"
    do
      # Only the dependency itself is handed to `type`. A stray "/dev/null"
      # operand used to follow the redirections here; since /dev/null is not
      # a command, `type` failed for every dependency on shells where it
      # fails as soon as one operand is not found.
      if ! type "$cmd" > /dev/null 2>&1
      then
        echo "I am missing an important component : $cmd"
        echo "Cannot continue, sorry, try to find the missing one..."
        exit 3
      fi
    done
  }
  # Remember how we were invoked; the name is used further down to keep our
  # own process out of the ps listings.
  myself=$0

  # Stop right away (exit 3) when a required tool is missing.
  verify_dep

  # Informational switches are only recognised as the first argument.
  # Both print their text on stdout and leave with exit code 3.
  case "$1" in
    -h|--help)
      help_version
      echo ""
      echo "This plugin will check either the cumulative %cpu, %mem or cputime"
      echo "of a process."
      echo ""
      help_usage
      echo ""
      echo "Required Arguments:"
      echo " -p, --process STRING1,STRING2,..."
      echo " names of the processes we want to monitor,"
      echo " you can add as much as process as you want, separated by comma,"
      echo " they will be cumulated"
      echo " -p, --process %all%"
      echo " The special keyword %all% will check the cumulative cpu/mem/time of all process"
      echo " WARNING : Can be very slow on heavy loaded servers, watch your timeout !"
      echo " -p, --process %max%"
      echo " The special keyword %max% will check the process which eat the most"
      echo " WARNING : only select the process which eat the more, not the cumulative,"
      echo " but return the cumulative"
      echo " -w, --warning INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
      # The plugin compares a cumulated value with a threshold; it never
      # counts processes, so the text no longer speaks of a "process count".
      echo " generate warning state if the cumulated value reaches this threshold"
      echo " -c, --critical INTEGER.INTEGER or INTEGER:INTEGER:INTEGER"
      echo " generate critical state if the cumulated value reaches this threshold"
      echo " --cpu"
      echo " return the current cpu usage for the given process"
      echo " --mem"
      echo " return the current memory usage for the given process"
      echo " --cputime"
      echo " return the total cputime usage for the given process"
      echo ""
      echo "Optional Argument:"
      echo " -e, --exclude-process STRING1,STRING2,..."
      echo " names of the processes we don't want to monitor"
      echo " only useful when associated with %all% or %max% keywords, else ignored"
      echo " ex : kapm-idled on linux is a process which eat memory / cputime but not really... ;-)"
      echo ""
      exit 3
      ;;
    -v|--version)
      help_version
      exit 3
      ;;
  esac
  # Rough sanity check of the command line: once every "=" is turned into a
  # blank, at least 7 words must remain.
  # $@ is intentionally left unquoted so that echo flattens it to one line.
  arg_words=$(echo $@ | tr "=" " " | wc -w)
  [ $arg_words -ge 7 ] || {
    echo "Bad arguments number (need at least 7)!"
    help_usage
    exit 3
  }
  # Test of the command lines arguments
  #
  # Results of the parsing:
  #   process_name          names given to -p, joined with "|" (regex alternation)
  #   exclude_process_name  names given to -e, joined with "|"
  #   wt / ct               raw warning / critical thresholds
  #   tt                    1 = --cpu, 2 = --mem, 3 = --cputime (0 = none yet)
  #   type_arg_aff          label used in the plugin output
  #   type_arg              ps output field to query
  #   delim                 separator used inside values of that type
  tt=0
  process_name=""
  exclude_process_name=""
  wt=""
  ct=""

  # fail_usage MESSAGE
  #   Print MESSAGE and the synopsis on stdout, then exit 3 (Nagios UNKNOWN).
  fail_usage() {
    echo "$1"
    help_usage
    exit 3
  }

  # select_type CODE LABEL PS_FIELD DELIM
  #   Record the measurement type; a second type switch is refused.
  select_type() {
    [ "$tt" -eq 0 ] || fail_usage "Only one of the arguments --cpu/--mem/--cputime can be used at a time !"
    tt=$1
    type_arg_aff=$2
    type_arg=$3
    delim=$4
  }

  while [ $# -gt 0 ]
  do
    case "$1" in
      -p|--process)
        [ -z "$process_name" ] || fail_usage "Only one --process argument is useful..."
        # An option that takes a value must actually be followed by one.
        [ $# -ge 2 ] || fail_usage "Missing value for the $1 argument !"
        shift
        # Parameter expansion instead of an unquoted echo|tr: the name list
        # can no longer be mangled by word splitting or globbing.
        process_name=${1//,/|}
        ;;
      -e|--exclude-process)
        [ -z "$exclude_process_name" ] || fail_usage "Only one --exclude-process argument is useful..."
        [ $# -ge 2 ] || fail_usage "Missing value for the $1 argument !"
        shift
        exclude_process_name=${1//,/|}
        ;;
      -w|--warning)
        [ -z "$wt" ] || fail_usage "Only one --warning argument needed... Trying to test bad things ? :-)"
        [ $# -ge 2 ] || fail_usage "Missing value for the $1 argument !"
        shift
        wt=$1
        ;;
      -c|--critical)
        [ -z "$ct" ] || fail_usage "Only one --critical argument needed... Trying to test bad things ? :-)"
        [ $# -ge 2 ] || fail_usage "Missing value for the $1 argument !"
        shift
        ct=$1
        ;;
      --cpu)
        select_type 1 "%CPU" "pcpu" "."
        ;;
      --mem)
        select_type 2 "%MEM" "pmem" "."
        ;;
      --cputime)
        select_type 3 "TIME" "time" ":"
        ;;
      *)
        fail_usage "Unknown argument $1"
        ;;
    esac
    shift
  done

  # Without a process selection or a measurement type nothing sensible can be
  # computed; say so here instead of failing later with an unrelated message.
  [ -n "$process_name" ] || fail_usage "A --process argument is required !"
  [ "$tt" -ne 0 ] || fail_usage "One of the arguments --cpu/--mem/--cputime is required !"
  # Is the process running ?
  # The %all% and %max% keywords are not process names, so they always pass.
  case "$process_name" in
    "%all%"|"%max%")
      ;;
    *)
      running=$(ps -e | egrep "$process_name?")
      if [ -z "$running" ]
      then
        echo "WARNING: process $process_name not running !"
        exit 3
      fi
      ;;
  esac
  # Cut of warning and critical values
  # Thresholds are written a.b (cpu / mem) or a:b:c (cputime); $delim holds
  # the separator that belongs to the selected measurement type.

  # threshold_field STRING N
  #   Print field N of STRING, split on $delim. STRING is echoed unquoted, so
  #   it goes through the usual word splitting first.
  threshold_field() {
    echo $1 | cut -d"$delim" -f$2
  }

  wt_value1=$(threshold_field "$wt" 1)
  wt_value2=$(threshold_field "$wt" 2)
  ct_value1=$(threshold_field "$ct" 1)
  ct_value2=$(threshold_field "$ct" 2)

  # Only cputime thresholds carry a third component; it is 0 otherwise.
  case "$tt" in
    3)
      wt_value3=$(threshold_field "$wt" 3)
      ct_value3=$(threshold_field "$ct" 3)
      ;;
    *)
      wt_value3=0
      ct_value3=0
      ;;
  esac
  # Integrity check of warning and critical values
  #
  # Every threshold component must be a non-empty string of decimal digits.
  # The test uses shell pattern matching rather than `echo $value | tr -d`:
  # echo takes values such as "-n" or "-e" for its own options and prints
  # nothing, which made the former check accept them as valid numbers.

  # is_uint VALUE
  #   Succeed when VALUE is non-empty and made of the digits 0-9 only.
  is_uint() {
    case "$1" in
      ''|*[!0-9]*) return 1 ;;
    esac
    return 0
  }

  if ! is_uint "$wt_value1" || ! is_uint "$wt_value2" || ! is_uint "$wt_value3"
  then
    echo "Bad expression in the WARNING field : $wt"
    help_usage
    exit 3
  fi

  if ! is_uint "$ct_value1" || ! is_uint "$ct_value2" || ! is_uint "$ct_value3"
  then
    echo "Bad expression in the CRITICAL field : $ct"
    help_usage
    exit 3
  fi
  # ps line construction set...
  #
  # Builds $psline, the list of raw $type_arg values (one word per process)
  # that is summed afterwards:
  #   %all%  every process left after the filter below
  #   %max%  the processes whose line contains the name of the top consumer;
  #          process_name is replaced by that name for the final message
  #   other  the processes whose line matches the requested names

  # Our own entry is filtered on the basename of $0. When the plugin is
  # started through a path, a comm column that only holds the command name
  # (as with Linux procps) never contains that path, so filtering on $0
  # itself let the plugin measure itself.
  # NOTE(review): confirm the comm format on Solaris.
  self_name=${myself##*/}

  # Lines removed from the listing: ourselves, the header line and, when
  # given, the excluded processes. The trailing "?" of the historical
  # patterns is kept so that the matching of names is unchanged.
  if [ -z "$exclude_process_name" ]
  then
    skip_regex="$self_name|$type_arg_aff?"
  else
    skip_regex="$self_name|$type_arg_aff|$exclude_process_name?"
  fi

  # strip_lead
  #   Filter: drop the blanks in front of a leading number, so that cut sees
  #   the value as field 1 and the command name as field 2.
  strip_lead() {
    sed "s/^ *\([0-9]\)/\1/"
  }

  case "$process_name" in
    "%all%")
      psline=$(ps -eo "$type_arg,comm" | egrep -v "$skip_regex" | strip_lead | cut -d" " -f1)
      ;;
    "%max%")
      pstmp=$(ps -eo "$type_arg,comm" | egrep -v "$skip_regex" | sort | tail -1 | strip_lead | cut -d" " -f2)
      if [ -n "$pstmp" ]
      then
        # The name comes straight from ps output: match it as a fixed
        # string, quoted, so regex or glob characters in it are harmless.
        psline=$(ps -eo "$type_arg,comm" | grep -F -- "$pstmp" | strip_lead | cut -d" " -f1)
      else
        # Nothing survived the filter; do not run grep without a pattern.
        psline=""
      fi
      process_name=$pstmp
      ;;
    *)
      psline=$(ps -eo "$type_arg,comm" | egrep "$process_name?" | strip_lead | cut -d" " -f1)
      ;;
  esac
  # fetching the values
  #
  # The raw values listed in $psline are summed into total1 / total2 / total3:
  #   cputime (tt = 3)  total1:total2:total3 = hours:minutes:seconds
  #   cpu / mem         total1.total2        = integer part . tenths

  # add_ps_value VALUE
  #   Add one raw ps value to the running totals.
  #   Globals read:    tt, delim
  #   Globals updated: total1, total2, total3
  #   Returns:         0 when VALUE was added; 1 when VALUE is not numeric,
  #                    in which case the totals are left untouched.
  add_ps_value() {
    local raw=$1
    local days=0 first second third rest=""

    if [ "$tt" -eq 3 ]
    then
      # Long CPU times are printed as DD-HH:MM:SS: split off the day count,
      # it is converted to hours below.
      case "$raw" in
        *-*)
          days=${raw%%-*}
          raw=${raw#*-}
          ;;
      esac
      # Special case for solaris - short times come as MM:SS, without hours.
      case "$raw" in
        *:*:*) ;;
        *) raw="00:$raw" ;;
      esac
    fi

    # Split on $delim. A missing component counts as 0, so a value printed
    # without a fractional part (e.g. "100") is read as 100.0.
    first=${raw%%"$delim"*}
    case "$raw" in
      *"$delim"*) rest=${raw#*"$delim"} ;;
    esac
    second=${rest%%"$delim"*}
    case "$rest" in
      *"$delim"*) rest=${rest#*"$delim"} ;;
      *) rest="" ;;
    esac
    third=${rest%%"$delim"*}

    days=${days:-0}
    first=${first:-0}
    second=${second:-0}
    third=${third:-0}

    # Refuse anything that is not purely numeric instead of corrupting the sums.
    case "$days$first$second$third" in
      *[!0-9]*) return 1 ;;
    esac

    # 10# forces base 10: components such as "08" must not be read as octal.
    total1=$(( total1 + 10#$days * 24 + 10#$first ))
    total2=$(( total2 + 10#$second ))
    total3=$(( total3 + 10#$third ))

    # Propagate the carries.
    if [ "$tt" -eq 3 ]
    then
      total2=$(( total2 + total3 / 60 ))
      total3=$(( total3 % 60 ))
      total1=$(( total1 + total2 / 60 ))
      total2=$(( total2 % 60 ))
    else
      total1=$(( total1 + total2 / 10 ))
      total2=$(( total2 % 10 ))
    fi
    return 0
  }

  total1=0
  total2=0
  total3=0
  # $psline is a whitespace separated list: splitting it is intended.
  for i in $psline
  do
    # Values that cannot be parsed are ignored.
    add_ps_value "$i"
  done
  # evaluation of the cumulative values vs warning and critical values
  #
  # Sets crit / warn to 1 when the totals reach the matching threshold and
  # builds return_total, the value shown in the plugin output. Values are
  # compared component by component, most significant component first.
  crit=0
  warn=0
  case "$tt" in
    1|2)
      # %CPU and %MEM share the same "integer.tenths" layout.
      return_total="$total1.$total2"

      if [ "$total1" -gt "$ct_value1" ]; then crit=1; fi
      if [ "$total1" -eq "$ct_value1" ] && [ "$total2" -ge "$ct_value2" ]; then crit=1; fi

      if [ "$total1" -gt "$wt_value1" ]; then warn=1; fi
      if [ "$total1" -eq "$wt_value1" ] && [ "$total2" -ge "$wt_value2" ]; then warn=1; fi
      ;;
    3)
      # Single-digit components get a leading zero for display only.
      shown1=$total1
      shown2=$total2
      shown3=$total3
      if [ ${#shown1} -eq 1 ]; then shown1="0$shown1"; fi
      if [ ${#shown2} -eq 1 ]; then shown2="0$shown2"; fi
      if [ ${#shown3} -eq 1 ]; then shown3="0$shown3"; fi
      return_total="$shown1:$shown2:$shown3"

      if [ "$total1" -gt "$ct_value1" ]; then crit=1; fi
      if [ "$total1" -eq "$ct_value1" ] && [ "$total2" -gt "$ct_value2" ]; then crit=1; fi
      if [ "$total1" -eq "$ct_value1" ] && [ "$total2" -eq "$ct_value2" ] && [ "$total3" -ge "$ct_value3" ]; then crit=1; fi

      if [ "$total1" -gt "$wt_value1" ]; then warn=1; fi
      if [ "$total1" -eq "$wt_value1" ] && [ "$total2" -gt "$wt_value2" ]; then warn=1; fi
      if [ "$total1" -eq "$wt_value1" ] && [ "$total2" -eq "$wt_value2" ] && [ "$total3" -ge "$wt_value3" ]; then warn=1; fi
      ;;
  esac
  # last check ...
  # Reaching the critical threshold without reaching the warning one means
  # the two thresholds were given in the wrong order.
  if [ "$crit" -eq 1 ] && [ "$warn" -eq 0 ]
  then
    echo "Critical value must be greater than warning value !"
    help_usage
    exit 3
  fi

  # Finally Inform Nagios of what we found...
  # Exit codes: 2 = CRITICAL, 1 = WARNING, 0 = OK.
  if [ "$crit" -eq 1 ]
  then
    status_label="CRITICAL"
    status_code=2
  elif [ "$warn" -eq 1 ]
  then
    status_label="WARNING"
    status_code=1
  else
    status_label="OK"
    status_code=0
  fi

  # The names are stored joined with "|"; show them comma separated again.
  shown_names=$(echo $process_name | tr "|" ",")
  echo "$status_label - total $type_arg_aff for process $shown_names : $return_total"
  exit $status_code

  # Hey what are we doing here ???
  exit 3