wd.c 21 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838
  1. /*
  2. * Copyright (c) 2010 Red Hat, Inc.
  3. *
  4. * All rights reserved.
  5. *
  6. * Author: Angus Salkeld <asalkeld@redhat.com>
  7. *
  8. * This software licensed under BSD license, the text of which follows:
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions are met:
  12. *
  13. * - Redistributions of source code must retain the above copyright notice,
  14. * this list of conditions and the following disclaimer.
  15. * - Redistributions in binary form must reproduce the above copyright notice,
  16. * this list of conditions and the following disclaimer in the documentation
  17. * and/or other materials provided with the distribution.
  18. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  19. * contributors may be used to endorse or promote products derived from this
  20. * software without specific prior written permission.
  21. *
  22. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  23. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  24. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  25. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  26. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  27. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  28. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  29. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  30. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  31. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  32. * THE POSSIBILITY OF SUCH DAMAGE.
  33. */
  34. #include <config.h>
  35. #include <unistd.h>
  36. #include <fcntl.h>
  37. #include <sys/ioctl.h>
  38. #include <linux/types.h>
  39. #include <linux/watchdog.h>
  40. #include <sys/reboot.h>
  41. #include <corosync/corotypes.h>
  42. #include <corosync/corodefs.h>
  43. #include <corosync/lcr/lcr_comp.h>
  44. #include <corosync/engine/coroapi.h>
  45. #include <corosync/list.h>
  46. #include <corosync/engine/logsys.h>
  47. #include "../exec/fsm.h"
  /*
   * Coarse health classification for a monitored resource.
   * NOTE(review): nothing in the visible part of this file refers to this
   * type - confirm whether it is still needed.
   */
  typedef enum {
      WD_RESOURCE_GOOD = 0,
      WD_RESOURCE_FAILED = 1,
      WD_RESOURCE_STATE_UNKNOWN = 2,
      WD_RESOURCE_NOT_MONITORED = 3
  } wd_resource_state_t;
  54. struct resource {
  55. hdb_handle_t handle;
  56. char *recovery;
  57. char name[CS_MAX_NAME_LENGTH];
  58. time_t last_updated;
  59. struct cs_fsm fsm;
  60. corosync_timer_handle_t check_timer;
  61. uint64_t check_timeout;
  62. };
  63. LOGSYS_DECLARE_SUBSYS("WD");
  64. /*
  65. * Service Interfaces required by service_message_handler struct
  66. */
  67. static int wd_exec_init_fn (
  68. struct corosync_api_v1 *corosync_api);
  69. static int wd_exec_exit_fn (void);
  70. static void wd_resource_check_fn (void* resource_ref);
  71. static struct corosync_api_v1 *api;
  72. #define WD_DEFAULT_TIMEOUT_SEC 6
  73. #define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC)
  74. #define WD_MIN_TIMEOUT_MS 500
  75. #define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC)
  76. static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC;
  77. static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2);
  78. static int dog = -1;
  79. static corosync_timer_handle_t wd_timer;
  80. static hdb_handle_t resources_obj;
  81. static int watchdog_ok = 1;
  82. struct corosync_service_engine wd_service_engine = {
  83. .name = "corosync watchdog service",
  84. .id = WD_SERVICE,
  85. .priority = 1,
  86. .private_data_size = 0,
  87. .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
  88. .lib_init_fn = NULL,
  89. .lib_exit_fn = NULL,
  90. .lib_engine = NULL,
  91. .lib_engine_count = 0,
  92. .exec_engine = NULL,
  93. .exec_engine_count = 0,
  94. .confchg_fn = NULL,
  95. .exec_init_fn = wd_exec_init_fn,
  96. .exec_exit_fn = wd_exec_exit_fn,
  97. .exec_dump_fn = NULL,
  98. .sync_mode = CS_SYNC_V2
  99. };
  static DECLARE_LIST_INIT (confchg_notify);

  /*
   * F S M
   *
   * Transition handlers referenced by wd_fsm_table; both are defined
   * further down in this file.
   */
  static void wd_config_changed (struct cs_fsm *fsm, int32_t event, void *data);
  static void wd_resource_failed (struct cs_fsm *fsm, int32_t event, void *data);
  /* States a resource's state machine can be in. */
  enum wd_resource_state {
      WD_S_RUNNING = 0,
      WD_S_FAILED = 1,
      WD_S_STOPPED = 2
  };

  /* Events fed into a resource's state machine. */
  enum wd_resource_event {
      WD_E_FAILURE = 0,
      WD_E_CONFIG_CHANGED = 1
  };

  /* Printable names for the states and events above. */
  const char *wd_running_str = "running";
  const char *wd_failed_str = "failed";
  const char *wd_failure_str = "failure";
  const char *wd_stopped_str = "stopped";
  const char *wd_config_changed_str = "config_changed";
  120. struct cs_fsm_entry wd_fsm_table[] = {
  121. { WD_S_STOPPED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_STOPPED, WD_S_RUNNING, -1} },
  122. { WD_S_STOPPED, WD_E_FAILURE, NULL, {-1} },
  123. { WD_S_RUNNING, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
  124. { WD_S_RUNNING, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
  125. { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
  126. { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
  127. };
  128. /*
  129. * Dynamic loading descriptor
  130. */
  131. static struct corosync_service_engine *wd_get_service_engine_ver0 (void);
  132. static struct corosync_service_engine_iface_ver0 wd_service_engine_iface = {
  133. .corosync_get_service_engine_ver0 = wd_get_service_engine_ver0
  134. };
  135. static struct lcr_iface corosync_wd_ver0[1] = {
  136. {
  137. .name = "corosync_wd",
  138. .version = 0,
  139. .versions_replace = 0,
  140. .versions_replace_count = 0,
  141. .dependencies = 0,
  142. .dependency_count = 0,
  143. .constructor = NULL,
  144. .destructor = NULL,
  145. .interfaces = NULL,
  146. }
  147. };
  148. static struct lcr_comp wd_comp_ver0 = {
  149. .iface_count = 1,
  150. .ifaces = corosync_wd_ver0
  151. };
  152. static struct corosync_service_engine *wd_get_service_engine_ver0 (void)
  153. {
  154. return (&wd_service_engine);
  155. }
  156. #ifdef COROSYNC_SOLARIS
  157. void corosync_lcr_component_register (void);
  158. void corosync_lcr_component_register (void) {
  159. #else
  160. __attribute__ ((constructor)) static void corosync_lcr_component_register (void) {
  161. #endif
  162. lcr_interfaces_set (&corosync_wd_ver0[0], &wd_service_engine_iface);
  163. lcr_component_register (&wd_comp_ver0);
  164. }
  165. static int object_find_or_create (
  166. hdb_handle_t parent_object_handle,
  167. hdb_handle_t *object_handle,
  168. const void *object_name,
  169. size_t object_name_len)
  170. {
  171. hdb_handle_t obj_finder;
  172. hdb_handle_t obj;
  173. int ret = -1;
  174. api->object_find_create (
  175. parent_object_handle,
  176. object_name,
  177. object_name_len,
  178. &obj_finder);
  179. if (api->object_find_next (obj_finder, &obj) == 0) {
  180. /* found it */
  181. *object_handle = obj;
  182. ret = 0;
  183. }
  184. else {
  185. ret = api->object_create (parent_object_handle,
  186. object_handle,
  187. object_name, object_name_len);
  188. }
  189. api->object_find_destroy (obj_finder);
  190. return ret;
  191. }
  192. static cs_error_t str_to_uint64_t(const char* str, uint64_t *out_value, uint64_t min, uint64_t max)
  193. {
  194. char *endptr;
  195. errno = 0;
  196. *out_value = strtol(str, &endptr, 0);
  197. /* Check for various possible errors */
  198. if (errno != 0 || endptr == str) {
  199. return CS_ERR_INVALID_PARAM;
  200. }
  201. if (*out_value > max || *out_value < min) {
  202. return CS_ERR_INVALID_PARAM;
  203. }
  204. return CS_OK;
  205. }
  206. static const char * wd_res_state_to_str(struct cs_fsm* fsm,
  207. int32_t state)
  208. {
  209. switch (state) {
  210. case WD_S_STOPPED:
  211. return wd_stopped_str;
  212. break;
  213. case WD_S_RUNNING:
  214. return wd_running_str;
  215. break;
  216. case WD_S_FAILED:
  217. return wd_failed_str;
  218. break;
  219. }
  220. return NULL;
  221. }
  222. static const char * wd_res_event_to_str(struct cs_fsm* fsm,
  223. int32_t event)
  224. {
  225. switch (event) {
  226. case WD_E_CONFIG_CHANGED:
  227. return wd_config_changed_str;
  228. break;
  229. case WD_E_FAILURE:
  230. return wd_failure_str;
  231. break;
  232. }
  233. return NULL;
  234. }
  235. /*
  236. * returns (CS_TRUE == OK, CS_FALSE == failed)
  237. */
  238. static int32_t wd_resource_state_is_ok (struct resource *ref)
  239. {
  240. hdb_handle_t resource = ref->handle;
  241. int res;
  242. char* state;
  243. size_t state_len;
  244. objdb_value_types_t type;
  245. uint64_t *last_updated;
  246. uint64_t my_time;
  247. uint64_t allowed_period;
  248. size_t last_updated_len;
  249. res = api->object_key_get_typed (resource,
  250. "last_updated", (void*)&last_updated, &last_updated_len, &type);
  251. if (res != 0) {
  252. /* key does not exist.
  253. */
  254. return CS_FALSE;
  255. }
  256. res = api->object_key_get_typed (resource,
  257. "state", (void**)&state, &state_len, &type);
  258. if (res != 0 || strncmp (state, "disabled", strlen ("disabled")) == 0) {
  259. /* key does not exist.
  260. */
  261. return CS_FALSE;
  262. }
  263. if (*last_updated == 0) {
  264. /* initial value */
  265. return CS_TRUE;
  266. }
  267. my_time = cs_timestamp_get();
  268. /*
  269. * Here we check that the monitor has written a timestamp within the poll_period
  270. * plus a grace factor of (0.5 * poll_period).
  271. */
  272. allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2;
  273. if ((*last_updated + allowed_period) < my_time) {
  274. log_printf (LOGSYS_LEVEL_ERROR,
  275. "last_updated %"PRIu64" ms too late, period:%"PRIu64".",
  276. (uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((*last_updated + allowed_period) / MILLI_2_NANO_SECONDS)),
  277. ref->check_timeout);
  278. return CS_FALSE;
  279. }
  280. if (strcmp (state, wd_failed_str) == 0) {
  281. return CS_FALSE;
  282. }
  283. return CS_TRUE;
  284. }
  285. static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
  286. {
  287. int res;
  288. size_t len;
  289. char *state;
  290. objdb_value_types_t type;
  291. char *str;
  292. uint64_t tmp_value;
  293. uint64_t next_timeout;
  294. struct resource *ref = (struct resource*)data;
  295. char str_copy[256];
  296. next_timeout = ref->check_timeout;
  297. res = api->object_key_get_typed (ref->handle,
  298. "poll_period",
  299. (void**)&str, &len,
  300. &type);
  301. if (res == 0) {
  302. memcpy(str_copy, str, len);
  303. str_copy[len] = '\0';
  304. if (str_to_uint64_t(str_copy, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) {
  305. log_printf (LOGSYS_LEVEL_DEBUG,
  306. "poll_period changing from:%"PRIu64" to %"PRIu64".",
  307. ref->check_timeout, tmp_value);
  308. /*
  309. * To easy in the transition between poll_period's we are going
  310. * to make the first timeout the bigger of the new and old value.
  311. * This is to give the monitoring system time to adjust.
  312. */
  313. next_timeout = CS_MAX(tmp_value, ref->check_timeout);
  314. ref->check_timeout = tmp_value;
  315. } else {
  316. log_printf (LOGSYS_LEVEL_WARNING,
  317. "Could NOT use poll_period:%s ms for resource %s",
  318. str, ref->name);
  319. }
  320. }
  321. res = api->object_key_get_typed (ref->handle,
  322. "recovery", (void*)&ref->recovery, &len, &type);
  323. if (res != 0) {
  324. /* key does not exist.
  325. */
  326. log_printf (LOGSYS_LEVEL_WARNING,
  327. "resource %s missing a recovery key.", ref->name);
  328. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
  329. return;
  330. }
  331. res = api->object_key_get_typed (ref->handle,
  332. "state", (void*)&state, &len, &type);
  333. if (res != 0) {
  334. /* key does not exist.
  335. */
  336. log_printf (LOGSYS_LEVEL_WARNING,
  337. "resource %s missing a state key.", ref->name);
  338. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
  339. return;
  340. }
  341. if (ref->check_timer) {
  342. api->timer_delete(ref->check_timer);
  343. ref->check_timer = NULL;
  344. }
  345. if (strcmp(wd_stopped_str, state) == 0) {
  346. cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
  347. } else {
  348. api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS,
  349. ref, wd_resource_check_fn, &ref->check_timer);
  350. cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref);
  351. }
  352. }
  353. static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
  354. {
  355. struct resource* ref = (struct resource*)data;
  356. if (ref->check_timer) {
  357. api->timer_delete(ref->check_timer);
  358. ref->check_timer = NULL;
  359. }
  360. log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
  361. ref->recovery, (char*)ref->name);
  362. if (strcmp (ref->recovery, "watchdog") == 0 ||
  363. strcmp (ref->recovery, "quit") == 0) {
  364. watchdog_ok = 0;
  365. }
  366. else if (strcmp (ref->recovery, "reboot") == 0) {
  367. reboot(RB_AUTOBOOT);
  368. }
  369. else if (strcmp (ref->recovery, "shutdown") == 0) {
  370. reboot(RB_POWER_OFF);
  371. }
  372. cs_fsm_state_set(fsm, WD_S_FAILED, data);
  373. }
  374. static void wd_key_changed(object_change_type_t change_type,
  375. hdb_handle_t parent_object_handle,
  376. hdb_handle_t object_handle,
  377. const void *object_name_pt, size_t object_name_len,
  378. const void *key_name_pt, size_t key_len,
  379. const void *key_value_pt, size_t key_value_len,
  380. void *priv_data_pt)
  381. {
  382. struct resource* ref = (struct resource*)priv_data_pt;
  383. if (strncmp(key_name_pt, "last_updated", key_len) == 0 ||
  384. strncmp(key_name_pt, "current", key_len) == 0) {
  385. return;
  386. }
  387. if (ref == NULL) {
  388. return;
  389. }
  390. cs_fsm_process(&ref->fsm, WD_E_CONFIG_CHANGED, ref);
  391. }
  392. static void wd_object_destroyed(
  393. hdb_handle_t parent_object_handle,
  394. const void *name_pt, size_t name_len,
  395. void *priv_data_pt)
  396. {
  397. struct resource* ref = (struct resource*)priv_data_pt;
  398. if (ref) {
  399. log_printf (LOGSYS_LEVEL_WARNING,
  400. "resource \"%s\" deleted from objdb!",
  401. ref->name);
  402. api->timer_delete(ref->check_timer);
  403. ref->check_timer = NULL;
  404. free(ref);
  405. }
  406. }
  407. static void wd_resource_check_fn (void* resource_ref)
  408. {
  409. struct resource* ref = (struct resource*)resource_ref;
  410. if (wd_resource_state_is_ok (ref) == CS_FALSE) {
  411. cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref);
  412. return;
  413. }
  414. api->timer_add_duration(ref->check_timeout*MILLI_2_NANO_SECONDS,
  415. ref, wd_resource_check_fn, &ref->check_timer);
  416. }
  417. /*
  418. * return 0 - fully configured
  419. * return -1 - partially configured
  420. */
  421. static int32_t wd_resource_create (hdb_handle_t resource_obj)
  422. {
  423. int res;
  424. size_t len;
  425. char *state;
  426. objdb_value_types_t type;
  427. char period_str[32];
  428. char str_copy[256];
  429. char *str;
  430. uint64_t tmp_value;
  431. struct resource *ref = malloc (sizeof (struct resource));
  432. ref->handle = resource_obj;
  433. ref->check_timeout = WD_DEFAULT_TIMEOUT_MS;
  434. ref->check_timer = NULL;
  435. api->object_name_get (resource_obj,
  436. ref->name,
  437. &len);
  438. ref->name[len] = '\0';
  439. ref->fsm.name = ref->name;
  440. ref->fsm.table = wd_fsm_table;
  441. ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
  442. ref->fsm.curr_entry = 0;
  443. ref->fsm.curr_state = WD_S_STOPPED;
  444. ref->fsm.state_to_str = wd_res_state_to_str;
  445. ref->fsm.event_to_str = wd_res_event_to_str;
  446. api->object_priv_set (resource_obj, NULL);
  447. res = api->object_key_get_typed (resource_obj,
  448. "poll_period",
  449. (void**)&str, &len,
  450. &type);
  451. if (res != 0) {
  452. len = snprintf (period_str, 32, "%"PRIu64"", ref->check_timeout);
  453. api->object_key_create_typed (resource_obj,
  454. "poll_period", &period_str,
  455. len,
  456. OBJDB_VALUETYPE_STRING);
  457. }
  458. else {
  459. memcpy(str_copy, str, len);
  460. str_copy[len] = '\0';
  461. if (str_to_uint64_t(str_copy, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) {
  462. ref->check_timeout = tmp_value;
  463. } else {
  464. log_printf (LOGSYS_LEVEL_WARNING,
  465. "Could NOT use poll_period:%s ms for resource %s",
  466. str, ref->name);
  467. }
  468. }
  469. api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_RECURSIVE,
  470. wd_key_changed, NULL, wd_object_destroyed,
  471. NULL, ref);
  472. res = api->object_key_get_typed (resource_obj,
  473. "recovery", (void*)&ref->recovery, &len, &type);
  474. if (res != 0) {
  475. /* key does not exist.
  476. */
  477. log_printf (LOGSYS_LEVEL_WARNING,
  478. "resource %s missing a recovery key.", ref->name);
  479. return -1;
  480. }
  481. res = api->object_key_get_typed (resource_obj,
  482. "state", (void*)&state, &len, &type);
  483. if (res != 0) {
  484. /* key does not exist.
  485. */
  486. log_printf (LOGSYS_LEVEL_WARNING,
  487. "resource %s missing a state key.", ref->name);
  488. return -1;
  489. }
  490. res = api->object_key_get_typed (resource_obj,
  491. "last_updated", (void*)&ref->last_updated, &len, &type);
  492. if (res != 0) {
  493. /* key does not exist.
  494. */
  495. ref->last_updated = 0;
  496. }
  497. /*
  498. * delay the first check to give the monitor time to start working.
  499. */
  500. tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS);
  501. api->timer_add_duration(tmp_value * MILLI_2_NANO_SECONDS,
  502. ref,
  503. wd_resource_check_fn, &ref->check_timer);
  504. cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref);
  505. return 0;
  506. }
  507. static void wd_tickle_fn (void* arg)
  508. {
  509. ENTER();
  510. if (watchdog_ok) {
  511. if (dog > 0) {
  512. ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
  513. }
  514. api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
  515. wd_tickle_fn, &wd_timer);
  516. }
  517. else {
  518. log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
  519. }
  520. }
  521. static void wd_resource_object_created(hdb_handle_t parent_object_handle,
  522. hdb_handle_t object_handle,
  523. const void *name_pt, size_t name_len,
  524. void *priv_data_pt)
  525. {
  526. wd_resource_create (object_handle);
  527. }
  528. static void wd_scan_resources (void)
  529. {
  530. hdb_handle_t obj_finder;
  531. hdb_handle_t obj_finder2;
  532. hdb_handle_t resource_type;
  533. hdb_handle_t resource;
  534. int res_count = 0;
  535. ENTER();
  536. api->object_find_create (
  537. OBJECT_PARENT_HANDLE,
  538. "resources", strlen ("resources"),
  539. &obj_finder);
  540. api->object_find_next (obj_finder, &resources_obj);
  541. api->object_find_destroy (obj_finder);
  542. /* this will be the system or process level
  543. */
  544. api->object_find_create (
  545. resources_obj,
  546. NULL, 0,
  547. &obj_finder);
  548. while (api->object_find_next (obj_finder,
  549. &resource_type) == 0) {
  550. api->object_find_create (
  551. resource_type,
  552. NULL, 0,
  553. &obj_finder2);
  554. while (api->object_find_next (obj_finder2,
  555. &resource) == 0) {
  556. if (wd_resource_create (resource) == 0) {
  557. res_count++;
  558. }
  559. }
  560. api->object_find_destroy (obj_finder2);
  561. api->object_track_start (resource_type, OBJECT_TRACK_DEPTH_ONE,
  562. NULL, wd_resource_object_created, NULL,
  563. NULL, NULL);
  564. }
  565. api->object_find_destroy (obj_finder);
  566. if (res_count == 0) {
  567. log_printf (LOGSYS_LEVEL_INFO, "no resources configured.");
  568. }
  569. }
  570. static void watchdog_timeout_apply (uint32_t new)
  571. {
  572. struct watchdog_info ident;
  573. uint32_t original_timeout = watchdog_timeout;
  574. if (new == original_timeout) {
  575. return;
  576. }
  577. watchdog_timeout = new;
  578. if (dog > 0) {
  579. ioctl(dog, WDIOC_GETSUPPORT, &ident);
  580. if (ident.options & WDIOF_SETTIMEOUT) {
  581. /* yay! the dog is trained.
  582. */
  583. ioctl(dog, WDIOC_SETTIMEOUT, &watchdog_timeout);
  584. }
  585. ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
  586. }
  587. if (watchdog_timeout == new) {
  588. tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2;
  589. /* reset the tickle timer in case it was reduced.
  590. */
  591. api->timer_delete (wd_timer);
  592. api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
  593. wd_tickle_fn, &wd_timer);
  594. log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout);
  595. log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms\n", tickle_timeout);
  596. } else {
  597. log_printf (LOGSYS_LEVEL_WARNING,
  598. "Could not change the Watchdog timeout from %d to %d seconds\n",
  599. original_timeout, new);
  600. }
  601. }
  602. static int setup_watchdog(void)
  603. {
  604. struct watchdog_info ident;
  605. ENTER();
  606. if (access ("/dev/watchdog", W_OK) != 0) {
  607. log_printf (LOGSYS_LEVEL_WARNING, "No Watchdog, try modprobe <a watchdog>");
  608. dog = -1;
  609. return -1;
  610. }
  611. /* here goes, lets hope they have "Magic Close"
  612. */
  613. dog = open("/dev/watchdog", O_WRONLY);
  614. if (dog == -1) {
  615. log_printf (LOGSYS_LEVEL_WARNING, "Watchdog exists but couldn't be opened.");
  616. dog = -1;
  617. return -1;
  618. }
  619. /* Right we have the dog.
  620. * Lets see what breed it is.
  621. */
  622. ioctl(dog, WDIOC_GETSUPPORT, &ident);
  623. log_printf (LOGSYS_LEVEL_INFO, "Watchdog is now been tickled by corosync.");
  624. log_printf (LOGSYS_LEVEL_DEBUG, "%s", ident.identity);
  625. watchdog_timeout_apply (watchdog_timeout);
  626. ioctl(dog, WDIOC_SETOPTIONS, WDIOS_ENABLECARD);
  627. return 0;
  628. }
  629. static void wd_top_level_key_changed(object_change_type_t change_type,
  630. hdb_handle_t parent_object_handle,
  631. hdb_handle_t object_handle,
  632. const void *object_name_pt, size_t object_name_len,
  633. const void *key_name_pt, size_t key_len,
  634. const void *key_value_pt, size_t key_value_len,
  635. void *priv_data_pt)
  636. {
  637. uint64_t tmp_value;
  638. int32_t tmp_value_32;
  639. char str_copy[256];
  640. ENTER();
  641. if (change_type != OBJECT_KEY_DELETED &&
  642. strncmp ((char*)key_name_pt, "watchdog_timeout", key_len) == 0) {
  643. memcpy(str_copy, key_name_pt, key_len);
  644. str_copy[key_len] = '\0';
  645. if (str_to_uint64_t(str_copy, &tmp_value, 2, 120) == CS_OK) {
  646. tmp_value_32 = tmp_value;
  647. watchdog_timeout_apply (tmp_value_32);
  648. }
  649. }
  650. else {
  651. watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
  652. }
  653. }
  654. static void watchdog_timeout_get_initial (void)
  655. {
  656. int32_t res;
  657. char watchdog_timeout_str[32];
  658. size_t watchdog_timeout_len;
  659. objdb_value_types_t watchdog_timeout_type;
  660. uint32_t tmp_value_32;
  661. uint64_t tmp_value;
  662. ENTER();
  663. res = api->object_key_get_typed (resources_obj,
  664. "watchdog_timeout",
  665. (void**)&watchdog_timeout_str, &watchdog_timeout_len,
  666. &watchdog_timeout_type);
  667. if (res != 0) {
  668. watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
  669. watchdog_timeout_len = snprintf (watchdog_timeout_str, 32, "%d", watchdog_timeout);
  670. api->object_key_create_typed (resources_obj,
  671. "watchdog_timeout", &watchdog_timeout_str,
  672. watchdog_timeout_len,
  673. OBJDB_VALUETYPE_STRING);
  674. }
  675. else {
  676. if (str_to_uint64_t(watchdog_timeout_str, &tmp_value, 2, 120) == CS_OK) {
  677. tmp_value_32 = tmp_value;
  678. watchdog_timeout_apply (tmp_value_32);
  679. } else {
  680. watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
  681. }
  682. }
  683. api->object_track_start (resources_obj, OBJECT_TRACK_DEPTH_ONE,
  684. wd_top_level_key_changed, NULL, NULL,
  685. NULL, NULL);
  686. }
  687. static int wd_exec_init_fn (
  688. struct corosync_api_v1 *corosync_api)
  689. {
  690. hdb_handle_t obj;
  691. ENTER();
  692. #ifdef COROSYNC_SOLARIS
  693. logsys_subsys_init();
  694. #endif
  695. api = corosync_api;
  696. object_find_or_create (OBJECT_PARENT_HANDLE,
  697. &resources_obj,
  698. "resources", strlen ("resources"));
  699. object_find_or_create (resources_obj,
  700. &obj,
  701. "system", strlen ("system"));
  702. object_find_or_create (resources_obj,
  703. &obj,
  704. "process", strlen ("process"));
  705. watchdog_timeout_get_initial();
  706. setup_watchdog();
  707. wd_scan_resources();
  708. api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
  709. wd_tickle_fn, &wd_timer);
  710. return 0;
  711. }
  712. static int wd_exec_exit_fn (void)
  713. {
  714. char magic = 'V';
  715. ENTER();
  716. if (dog > 0) {
  717. log_printf (LOGSYS_LEVEL_INFO, "magically closing the watchdog.");
  718. write (dog, &magic, 1);
  719. }
  720. return 0;
  721. }