4
0

wd.c 20 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761
  1. /*
  2. * Copyright (c) 2010-2012 Red Hat, Inc.
  3. *
  4. * All rights reserved.
  5. *
  6. * Author: Angus Salkeld <asalkeld@redhat.com>
  7. *
  8. * This software licensed under BSD license, the text of which follows:
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions are met:
  12. *
  13. * - Redistributions of source code must retain the above copyright notice,
  14. * this list of conditions and the following disclaimer.
  15. * - Redistributions in binary form must reproduce the above copyright notice,
  16. * this list of conditions and the following disclaimer in the documentation
  17. * and/or other materials provided with the distribution.
  18. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  19. * contributors may be used to endorse or promote products derived from this
  20. * software without specific prior written permission.
  21. *
  22. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  23. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  26. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  32. * THE POSSIBILITY OF SUCH DAMAGE.
  33. */
  34. #include <config.h>
  35. #include <unistd.h>
  36. #include <fcntl.h>
  37. #include <sys/ioctl.h>
  38. #include <linux/types.h>
  39. #include <linux/watchdog.h>
  40. #include <sys/reboot.h>
  41. #include <corosync/corotypes.h>
  42. #include <corosync/corodefs.h>
  43. #include <corosync/coroapi.h>
  44. #include <corosync/list.h>
  45. #include <corosync/logsys.h>
  46. #include <corosync/icmap.h>
  47. #include "fsm.h"
  48. #include "service.h"
  /*
   * Coarse health classification of a watched resource.
   * NOTE(review): nothing in the visible part of this file uses this type;
   * the per-resource state machine uses enum wd_resource_state instead.
   */
  typedef enum {
      WD_RESOURCE_GOOD = 0,
      WD_RESOURCE_FAILED = 1,
      WD_RESOURCE_STATE_UNKNOWN = 2,
      WD_RESOURCE_NOT_MONITORED = 3
  } wd_resource_state_t;
  55. struct resource {
  56. char res_path[ICMAP_KEYNAME_MAXLEN];
  57. char *recovery;
  58. char name[CS_MAX_NAME_LENGTH];
  59. time_t last_updated;
  60. struct cs_fsm fsm;
  61. corosync_timer_handle_t check_timer;
  62. uint64_t check_timeout;
  63. icmap_track_t icmap_track;
  64. };
  65. LOGSYS_DECLARE_SUBSYS("WD");
  66. /*
  67. * Service Interfaces required by service_message_handler struct
  68. */
  69. static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api);
  70. static int wd_exec_exit_fn (void);
  71. static void wd_resource_check_fn (void* resource_ref);
  72. static struct corosync_api_v1 *api;
  73. #define WD_DEFAULT_TIMEOUT_SEC 6
  74. #define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC)
  75. #define WD_MIN_TIMEOUT_MS 500
  76. #define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC)
  77. static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC;
  78. static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2);
  79. static int dog = -1;
  80. static corosync_timer_handle_t wd_timer;
  81. static int watchdog_ok = 1;
  82. static char *watchdog_device = "/dev/watchdog";
  83. struct corosync_service_engine wd_service_engine = {
  84. .name = "corosync watchdog service",
  85. .id = WD_SERVICE,
  86. .priority = 1,
  87. .private_data_size = 0,
  88. .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
  89. .lib_init_fn = NULL,
  90. .lib_exit_fn = NULL,
  91. .lib_engine = NULL,
  92. .lib_engine_count = 0,
  93. .exec_engine = NULL,
  94. .exec_engine_count = 0,
  95. .confchg_fn = NULL,
  96. .exec_init_fn = wd_exec_init_fn,
  97. .exec_exit_fn = wd_exec_exit_fn,
  98. .exec_dump_fn = NULL
  99. };
  100. static DECLARE_LIST_INIT (confchg_notify);
  101. /*
  102. * F S M
  103. */
  104. static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data);
  105. static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);
  106. enum wd_resource_state {
  107. WD_S_RUNNING,
  108. WD_S_FAILED,
  109. WD_S_STOPPED
  110. };
  111. enum wd_resource_event {
  112. WD_E_FAILURE,
  113. WD_E_CONFIG_CHANGED
  114. };
  115. const char * wd_running_str = "running";
  116. const char * wd_failed_str = "failed";
  117. const char * wd_failure_str = "failure";
  118. const char * wd_stopped_str = "stopped";
  119. const char * wd_config_changed_str = "config_changed";
  120. struct cs_fsm_entry wd_fsm_table[] = {
  121. { WD_S_STOPPED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_STOPPED, WD_S_RUNNING, -1} },
  122. { WD_S_STOPPED, WD_E_FAILURE, NULL, {-1} },
  123. { WD_S_RUNNING, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
  124. { WD_S_RUNNING, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
  125. { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
  126. { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
  127. };
  128. struct corosync_service_engine *wd_get_service_engine_ver0 (void)
  129. {
  130. return (&wd_service_engine);
  131. }
  132. static const char * wd_res_state_to_str(struct cs_fsm* fsm,
  133. int32_t state)
  134. {
  135. switch (state) {
  136. case WD_S_STOPPED:
  137. return wd_stopped_str;
  138. break;
  139. case WD_S_RUNNING:
  140. return wd_running_str;
  141. break;
  142. case WD_S_FAILED:
  143. return wd_failed_str;
  144. break;
  145. }
  146. return NULL;
  147. }
  148. static const char * wd_res_event_to_str(struct cs_fsm* fsm,
  149. int32_t event)
  150. {
  151. switch (event) {
  152. case WD_E_CONFIG_CHANGED:
  153. return wd_config_changed_str;
  154. break;
  155. case WD_E_FAILURE:
  156. return wd_failure_str;
  157. break;
  158. }
  159. return NULL;
  160. }
  161. static void wd_fsm_cb (struct cs_fsm *fsm, int cb_event, int32_t curr_state,
  162. int32_t next_state, int32_t fsm_event, void *data)
  163. {
  164. switch (cb_event) {
  165. case CS_FSM_CB_EVENT_PROCESS_NF:
  166. log_printf (LOGSYS_LEVEL_ERROR, "Fsm:%s could not find event \"%s\" in state \"%s\"",
  167. fsm->name, fsm->event_to_str(fsm, fsm_event), fsm->state_to_str(fsm, curr_state));
  168. corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
  169. break;
  170. case CS_FSM_CB_EVENT_STATE_SET:
  171. log_printf (LOGSYS_LEVEL_INFO, "Fsm:%s event \"%s\", state \"%s\" --> \"%s\"",
  172. fsm->name,
  173. fsm->event_to_str(fsm, fsm_event),
  174. fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
  175. fsm->state_to_str(fsm, next_state));
  176. break;
  177. case CS_FSM_CB_EVENT_STATE_SET_NF:
  178. log_printf (LOGSYS_LEVEL_CRIT, "Fsm:%s Can't change state from \"%s\" to \"%s\" (event was \"%s\")",
  179. fsm->name,
  180. fsm->state_to_str(fsm, fsm->table[fsm->curr_entry].curr_state),
  181. fsm->state_to_str(fsm, next_state),
  182. fsm->event_to_str(fsm, fsm_event));
  183. corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
  184. break;
  185. default:
  186. log_printf (LOGSYS_LEVEL_CRIT, "Fsm: Unknown callback event!");
  187. corosync_exit_error(COROSYNC_DONE_FATAL_ERR);
  188. break;
  189. }
  190. }
  191. /*
  192. * returns (CS_TRUE == OK, CS_FALSE == failed)
  193. */
  194. static int32_t wd_resource_state_is_ok (struct resource *ref)
  195. {
  196. char* state = NULL;
  197. uint64_t last_updated;
  198. uint64_t my_time;
  199. uint64_t allowed_period;
  200. char key_name[ICMAP_KEYNAME_MAXLEN];
  201. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "last_updated");
  202. if (icmap_get_uint64(key_name, &last_updated) != CS_OK) {
  203. /* key does not exist.
  204. */
  205. return CS_FALSE;
  206. }
  207. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state");
  208. if (icmap_get_string(key_name, &state) != CS_OK || strcmp(state, "disabled") == 0) {
  209. /* key does not exist.
  210. */
  211. if (state != NULL)
  212. free(state);
  213. return CS_FALSE;
  214. }
  215. if (last_updated == 0) {
  216. /* initial value */
  217. free(state);
  218. return CS_TRUE;
  219. }
  220. my_time = cs_timestamp_get();
  221. /*
  222. * Here we check that the monitor has written a timestamp within the poll_period
  223. * plus a grace factor of (0.5 * poll_period).
  224. */
  225. allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2;
  226. if ((last_updated + allowed_period) < my_time) {
  227. log_printf (LOGSYS_LEVEL_ERROR,
  228. "last_updated %"PRIu64" ms too late, period:%"PRIu64".",
  229. (uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((last_updated + allowed_period) / MILLI_2_NANO_SECONDS)),
  230. ref->check_timeout);
  231. free(state);
  232. return CS_FALSE;
  233. }
  234. if (strcmp (state, wd_failed_str) == 0) {
  235. free(state);
  236. return CS_FALSE;
  237. }
  238. free(state);
  239. return CS_TRUE;
  240. }
  241. static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
  242. {
  243. char *state;
  244. uint64_t tmp_value;
  245. uint64_t next_timeout;
  246. struct resource *ref = (struct resource*)data;
  247. char key_name[ICMAP_KEYNAME_MAXLEN];
  248. next_timeout = ref->check_timeout;
  249. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "poll_period");
  250. if (icmap_get_uint64(ref->res_path, &tmp_value) == CS_OK) {
  251. if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
  252. log_printf (LOGSYS_LEVEL_DEBUG,
  253. "poll_period changing from:%"PRIu64" to %"PRIu64".",
  254. ref->check_timeout, tmp_value);
  255. /*
  256. * To easy in the transition between poll_period's we are going
  257. * to make the first timeout the bigger of the new and old value.
  258. * This is to give the monitoring system time to adjust.
  259. */
  260. next_timeout = CS_MAX(tmp_value, ref->check_timeout);
  261. ref->check_timeout = tmp_value;
  262. } else {
  263. log_printf (LOGSYS_LEVEL_WARNING,
  264. "Could NOT use poll_period:%"PRIu64" ms for resource %s",
  265. tmp_value, ref->name);
  266. }
  267. }
  268. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "recovery");
  269. if (icmap_get_string(key_name, &ref->recovery) != CS_OK) {
  270. /* key does not exist.
  271. */
  272. log_printf (LOGSYS_LEVEL_WARNING,
  273. "resource %s missing a recovery key.", ref->name);
  274. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
  275. return;
  276. }
  277. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", ref->res_path, "state");
  278. if (icmap_get_string(key_name, &state) != CS_OK) {
  279. /* key does not exist.
  280. */
  281. log_printf (LOGSYS_LEVEL_WARNING,
  282. "resource %s missing a state key.", ref->name);
  283. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
  284. return;
  285. }
  286. if (ref->check_timer) {
  287. api->timer_delete(ref->check_timer);
  288. ref->check_timer = 0;
  289. }
  290. if (strcmp(wd_stopped_str, state) == 0) {
  291. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref, wd_fsm_cb);
  292. } else {
  293. api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS,
  294. ref, wd_resource_check_fn, &ref->check_timer);
  295. cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
  296. }
  297. free(state);
  298. }
  299. static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
  300. {
  301. struct resource* ref = (struct resource*)data;
  302. if (ref->check_timer) {
  303. api->timer_delete(ref->check_timer);
  304. ref->check_timer = 0;
  305. }
  306. log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
  307. ref->recovery, (char*)ref->name);
  308. if (strcmp (ref->recovery, "watchdog") == 0 ||
  309. strcmp (ref->recovery, "quit") == 0) {
  310. watchdog_ok = 0;
  311. }
  312. else if (strcmp (ref->recovery, "reboot") == 0) {
  313. reboot(RB_AUTOBOOT);
  314. }
  315. else if (strcmp (ref->recovery, "shutdown") == 0) {
  316. reboot(RB_POWER_OFF);
  317. }
  318. cs_fsm_state_set(fsm, WD_S_FAILED, data, wd_fsm_cb);
  319. }
  320. static void wd_key_changed(
  321. int32_t event,
  322. const char *key_name,
  323. struct icmap_notify_value new_val,
  324. struct icmap_notify_value old_val,
  325. void *user_data)
  326. {
  327. struct resource* ref = (struct resource*)user_data;
  328. char *last_key_part;
  329. if (ref == NULL) {
  330. return ;
  331. }
  332. last_key_part = strrchr(key_name, '.');
  333. if (last_key_part == NULL) {
  334. return ;
  335. }
  336. last_key_part++;
  337. if (event == ICMAP_TRACK_ADD || event == ICMAP_TRACK_MODIFY) {
  338. if (strcmp(last_key_part, "last_updated") == 0 ||
  339. strcmp(last_key_part, "current") == 0) {
  340. return;
  341. }
  342. cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref, wd_fsm_cb);
  343. }
  344. if (event == ICMAP_TRACK_DELETE && ref != NULL) {
  345. if (strcmp(last_key_part, "state") != 0) {
  346. return ;
  347. }
  348. log_printf (LOGSYS_LEVEL_WARNING,
  349. "resource \"%s\" deleted from cmap!",
  350. ref->name);
  351. api->timer_delete(ref->check_timer);
  352. ref->check_timer = 0;
  353. icmap_track_delete(ref->icmap_track);
  354. free(ref);
  355. }
  356. }
  357. static void wd_resource_check_fn (void* resource_ref)
  358. {
  359. struct resource* ref = (struct resource*)resource_ref;
  360. if (wd_resource_state_is_ok (ref) == CS_FALSE) {
  361. cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref, wd_fsm_cb);
  362. return;
  363. }
  364. api->timer_add_duration(ref->check_timeout*MILLI_2_NANO_SECONDS,
  365. ref, wd_resource_check_fn, &ref->check_timer);
  366. }
  367. /*
  368. * return 0 - fully configured
  369. * return -1 - partially configured
  370. */
  371. static int32_t wd_resource_create (char *res_path, char *res_name)
  372. {
  373. char *state;
  374. uint64_t tmp_value;
  375. struct resource *ref = calloc (1, sizeof (struct resource));
  376. char key_name[ICMAP_KEYNAME_MAXLEN];
  377. strcpy(ref->res_path, res_path);
  378. ref->check_timeout = WD_DEFAULT_TIMEOUT_MS;
  379. ref->check_timer = 0;
  380. strcpy(ref->name, res_name);
  381. ref->fsm.name = ref->name;
  382. ref->fsm.table = wd_fsm_table;
  383. ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
  384. ref->fsm.curr_entry = 0;
  385. ref->fsm.curr_state = WD_S_STOPPED;
  386. ref->fsm.state_to_str = wd_res_state_to_str;
  387. ref->fsm.event_to_str = wd_res_event_to_str;
  388. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "poll_period");
  389. if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
  390. icmap_set_uint64(key_name, ref->check_timeout);
  391. } else {
  392. if (tmp_value >= WD_MIN_TIMEOUT_MS && tmp_value <= WD_MAX_TIMEOUT_MS) {
  393. ref->check_timeout = tmp_value;
  394. } else {
  395. log_printf (LOGSYS_LEVEL_WARNING,
  396. "Could NOT use poll_period:%"PRIu64" ms for resource %s",
  397. tmp_value, ref->name);
  398. }
  399. }
  400. icmap_track_add(res_path,
  401. ICMAP_TRACK_ADD | ICMAP_TRACK_MODIFY | ICMAP_TRACK_DELETE | ICMAP_TRACK_PREFIX,
  402. wd_key_changed,
  403. ref, &ref->icmap_track);
  404. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "recovery");
  405. if (icmap_get_string(key_name, &ref->recovery) != CS_OK) {
  406. /* key does not exist.
  407. */
  408. log_printf (LOGSYS_LEVEL_WARNING,
  409. "resource %s missing a recovery key.", ref->name);
  410. return -1;
  411. }
  412. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "state");
  413. if (icmap_get_string(key_name, &state) != CS_OK) {
  414. /* key does not exist.
  415. */
  416. log_printf (LOGSYS_LEVEL_WARNING,
  417. "resource %s missing a state key.", ref->name);
  418. return -1;
  419. }
  420. snprintf(key_name, ICMAP_KEYNAME_MAXLEN, "%s%s", res_path, "last_updated");
  421. if (icmap_get_uint64(key_name, &tmp_value) != CS_OK) {
  422. /* key does not exist.
  423. */
  424. ref->last_updated = 0;
  425. } else {
  426. ref->last_updated = tmp_value;
  427. }
  428. /*
  429. * delay the first check to give the monitor time to start working.
  430. */
  431. tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS);
  432. api->timer_add_duration(tmp_value * MILLI_2_NANO_SECONDS,
  433. ref,
  434. wd_resource_check_fn, &ref->check_timer);
  435. cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref, wd_fsm_cb);
  436. return 0;
  437. }
  438. static void wd_tickle_fn (void* arg)
  439. {
  440. ENTER();
  441. if (watchdog_ok) {
  442. if (dog > 0) {
  443. ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
  444. }
  445. api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
  446. wd_tickle_fn, &wd_timer);
  447. }
  448. else {
  449. log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
  450. }
  451. }
  452. static void wd_resource_created_cb(
  453. int32_t event,
  454. const char *key_name,
  455. struct icmap_notify_value new_val,
  456. struct icmap_notify_value old_val,
  457. void *user_data)
  458. {
  459. char res_name[ICMAP_KEYNAME_MAXLEN];
  460. char res_type[ICMAP_KEYNAME_MAXLEN];
  461. char tmp_key[ICMAP_KEYNAME_MAXLEN];
  462. int res;
  463. if (event != ICMAP_TRACK_ADD) {
  464. return ;
  465. }
  466. res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
  467. if (res != 3) {
  468. return ;
  469. }
  470. if (strcmp(tmp_key, "state") != 0) {
  471. return ;
  472. }
  473. snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
  474. wd_resource_create (tmp_key, res_name);
  475. }
  476. static void wd_scan_resources (void)
  477. {
  478. int res_count = 0;
  479. icmap_track_t icmap_track = NULL;
  480. icmap_iter_t iter;
  481. const char *key_name;
  482. int res;
  483. char res_name[ICMAP_KEYNAME_MAXLEN];
  484. char res_type[ICMAP_KEYNAME_MAXLEN];
  485. char tmp_key[ICMAP_KEYNAME_MAXLEN];
  486. ENTER();
  487. iter = icmap_iter_init("resources.");
  488. while ((key_name = icmap_iter_next(iter, NULL, NULL)) != NULL) {
  489. res = sscanf(key_name, "resources.%[^.].%[^.].%[^.]", res_type, res_name, tmp_key);
  490. if (res != 3) {
  491. continue ;
  492. }
  493. if (strcmp(tmp_key, "state") != 0) {
  494. continue ;
  495. }
  496. snprintf(tmp_key, ICMAP_KEYNAME_MAXLEN, "resources.%s.%s.", res_type, res_name);
  497. if (wd_resource_create (tmp_key, res_name) == 0) {
  498. res_count++;
  499. }
  500. }
  501. icmap_iter_finalize(iter);
  502. icmap_track_add("resources.process.", ICMAP_TRACK_ADD | ICMAP_TRACK_PREFIX,
  503. wd_resource_created_cb, NULL, &icmap_track);
  504. icmap_track_add("resources.system.", ICMAP_TRACK_ADD | ICMAP_TRACK_PREFIX,
  505. wd_resource_created_cb, NULL, &icmap_track);
  506. if (res_count == 0) {
  507. log_printf (LOGSYS_LEVEL_INFO, "no resources configured.");
  508. }
  509. }
  510. static void watchdog_timeout_apply (uint32_t new)
  511. {
  512. struct watchdog_info ident;
  513. uint32_t original_timeout = 0;
  514. if (dog > 0) {
  515. ioctl(dog, WDIOC_GETTIMEOUT, &original_timeout);
  516. }
  517. if (new == original_timeout) {
  518. return;
  519. }
  520. watchdog_timeout = new;
  521. if (dog > 0) {
  522. ioctl(dog, WDIOC_GETSUPPORT, &ident);
  523. if (ident.options & WDIOF_SETTIMEOUT) {
  524. /* yay! the dog is trained.
  525. */
  526. ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout);
  527. }
  528. ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
  529. }
  530. if (watchdog_timeout == new) {
  531. tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2;
  532. /* reset the tickle timer in case it was reduced.
  533. */
  534. api->timer_delete (wd_timer);
  535. api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
  536. wd_tickle_fn, &wd_timer);
  537. log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds", watchdog_timeout);
  538. log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms", tickle_timeout);
  539. } else {
  540. log_printf (LOGSYS_LEVEL_WARNING,
  541. "Could not change the Watchdog timeout from %d to %d seconds",
  542. original_timeout, new);
  543. }
  544. }
  545. static int setup_watchdog(void)
  546. {
  547. struct watchdog_info ident;
  548. char *str;
  549. ENTER();
  550. if (icmap_get_string("resources.watchdog_device", &str) == CS_OK) {
  551. if (strcmp (str, "off") == 0) {
  552. log_printf (LOGSYS_LEVEL_WARNING, "Watchdog disabled by configuration");
  553. free(str);
  554. dog = -1;
  555. return -1;
  556. } else {
  557. watchdog_device = str;
  558. }
  559. }
  560. if (access (watchdog_device, W_OK) != 0) {
  561. log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog %s, try modprobe <a watchdog>", watchdog_device);
  562. dog = -1;
  563. return -1;
  564. }
  565. /* here goes, lets hope they have "Magic Close"
  566. */
  567. dog = open(watchdog_device, O_WRONLY);
  568. if (dog == -1) {
  569. log_printf (LOGSYS_LEVEL_WARNING, "Watchdog %s exists but couldn't be opened.", watchdog_device);
  570. dog = -1;
  571. return -1;
  572. }
  573. /* Right we have the dog.
  574. * Lets see what breed it is.
  575. */
  576. ioctl(dog, WDIOC_GETSUPPORT, &ident);
  577. log_printf (LOGSYS_LEVEL_INFO, "Watchdog %s is now been tickled by corosync.", watchdog_device);
  578. log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
  579. watchdog_timeout_apply (watchdog_timeout);
  580. ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
  581. return 0;
  582. }
  583. static void wd_top_level_key_changed(
  584. int32_t event,
  585. const char *key_name,
  586. struct icmap_notify_value new_val,
  587. struct icmap_notify_value old_val,
  588. void *user_data)
  589. {
  590. uint32_t tmp_value_32;
  591. ENTER();
  592. if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) == CS_OK) {
  593. if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
  594. watchdog_timeout_apply (tmp_value_32);
  595. return;
  596. }
  597. }
  598. log_printf (LOGSYS_LEVEL_WARNING,
  599. "Set watchdog_timeout is out of range (2..120).");
  600. icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
  601. }
  602. static void watchdog_timeout_get_initial (void)
  603. {
  604. uint32_t tmp_value_32;
  605. icmap_track_t icmap_track = NULL;
  606. ENTER();
  607. if (icmap_get_uint32("resources.watchdog_timeout", &tmp_value_32) != CS_OK) {
  608. watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
  609. icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
  610. }
  611. else {
  612. if (tmp_value_32 >= 2 && tmp_value_32 <= 120) {
  613. watchdog_timeout_apply (tmp_value_32);
  614. }
  615. else {
  616. log_printf (LOGSYS_LEVEL_WARNING,
  617. "Set watchdog_timeout is out of range (2..120).");
  618. log_printf (LOGSYS_LEVEL_INFO,
  619. "use default value %d seconds.", WD_DEFAULT_TIMEOUT_SEC);
  620. watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
  621. icmap_set_uint32("resources.watchdog_timeout", watchdog_timeout);
  622. }
  623. }
  624. icmap_track_add("resources.watchdog_timeout", ICMAP_TRACK_MODIFY,
  625. wd_top_level_key_changed, NULL, &icmap_track);
  626. }
  627. static char *wd_exec_init_fn (struct corosync_api_v1 *corosync_api)
  628. {
  629. ENTER();
  630. api = corosync_api;
  631. watchdog_timeout_get_initial();
  632. setup_watchdog();
  633. wd_scan_resources();
  634. return NULL;
  635. }
  636. static int wd_exec_exit_fn (void)
  637. {
  638. char magic = 'V';
  639. ENTER();
  640. if (dog > 0) {
  641. log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog.");
  642. write (dog, &magic, 1);
  643. }
  644. return 0;
  645. }