amfnode.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. /** @file amfnode.c
  2. *
  3. * Copyright (c) 2006 Ericsson AB.
  4. * Author: Hans Feldt, Anders Eriksson, Lars Holm
  5. * - Constructors/destructors
  6. * - Serializers/deserializers
  7. *
  8. * All rights reserved.
  9. *
  10. *
  11. * This software licensed under BSD license, the text of which follows:
  12. *
  13. * Redistribution and use in source and binary forms, with or without
  14. * modification, are permitted provided that the following conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright notice,
  17. * this list of conditions and the following disclaimer.
  18. * - Redistributions in binary form must reproduce the above copyright notice,
  19. * this list of conditions and the following disclaimer in the documentation
  20. * and/or other materials provided with the distribution.
  21. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  22. * contributors may be used to endorse or promote products derived from this
  23. * software without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  26. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  29. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  30. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  31. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  32. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  33. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  35. * THE POSSIBILITY OF SUCH DAMAGE.
  36. *
  37. * AMF Node Class Implementation
  38. *
  39. * This file contains functions for handling AMF nodes. It can be
  40. * viewed as the implementation of the AMF Node class (called NODE)
  41. * as described in SAI-Overview-B.02.01. The SA Forum specification
  42. * SAI-AIS-AMF-B.02.01 has been used as specification of the behaviour
  43. * and is referred to as 'the spec' below.
  44. *
  45. * The functions in this file are responsible for:
  46. * - controlling the instantiation of the SUs hosted on current node and
  47. * controlling the assigning of workload to them when a node joins the
  48. * cluster (cluster start is controlled by the Cluster Class)
  49. * - controlling node level recovery and repair functions
  50. * - implementing error escalation level 2 and 3 (paragraph 3.12.2.2 and
  51. * 3.12.2.3 in the spec)
  52. * - handling run time attributes of the AMF NODE; cached
  53. * attributes are stored as variables and sent to the IMM service (future)
  54. * upon the changes described in the specification
  55. *
  56. * The node class contains the following state machines:
  57. * - administrative state machine (ADSM)
  58. * - operational state machine (OPSM)
  59. * - availability control state machine (ACSM)
  60. *
  61. * The administrative state machine will be implemented in the future.
  62. *
  63. * The operational state machine is primarily used to report status of the
  64. * node.
  65. *
  66. * The availability control state machine is used for control purposes.
  67. * ACSM contains three states of which two are composite.
  68. * Being a composite state means that the state contains substates.
  69. * ACSM states are:
  70. * - REPAIR_NEEDED
  71. * - IDLE (ESCALATION_LEVEL_0, ESCALATION_LEVEL_2 and ESCALATION_LEVEL_3)
  72. * - MANAGING_HOSTED_SERVICE_UNITS (
  73. * . FAILING_FAST (REBOOTING_NODE and ACTIVATING_STANDBY_NODE)
  74. * . FAILING_GRACEFULLY (SWITCHING_OVER, FAILING_OVER and REBOOTING_NODE)
  75. * . LEAVING_SPONTANEOUSLY (SWITCHING_OVER, FAILING_OVER and
  76. * WAITING_FOR_NODE_TO_JOIN)
  77. * . JOINING (STARTING_APPLICATIONS and ASSIGNING_WORKLOAD)
  78. *
  79. * REPAIR_NEEDED indicates the node needs a manual repair and this state will be
  80. * maintained until the administrative command REPAIRED is entered (implemented
  81. * in the future)
  82. *
  83. * IDLE is a composite state in which no actions are actually performed;
  84. * it is used only to remember the escalation level. Substate LEVEL_0 indicates
  85. * no escalation. LEVEL_2 indicates that so many component restarts have been
  86. * executed recently that a new component restart request will escalate
  87. * to service unit restart action. Node will request a service unit restart
  88. * from SU.
  89. * LEVEL_3 will be entered if either too many service unit restarts have
  90. * been made or a component failover recovery action is requested. On level 3
  91. * the recovery action performed is service unit failover (paragraph 3.12.1.3).
  92. *
  93. * FAILING_FAST state executes a node re-boot and waits for the node to join
  94. * the cluster again. (not implemented)
  95. *
  96. * FAILING_GRACEFULLY state requests all SGs which have SUs hosted on current
  97. * node to switch or failover according to the procedures described in
  98. * paragraphs 3.12.1.3 before re-boot is executed. Then the confirmation is
  99. * awaited from all concerned SGs and finally a node re-boot is executed as
  100. * the repair action (see paragraph 2.12.1.4).
  101. *
  102. * LEAVING_SPONTANEOUSLY state handles the spontaneous leave of a node.
  103. *
  104. * JOINING state handles the start of a node in all cases except cluster start,
  105. * which is handled by the CLUSTER class.
  106. *
  107. * 1. Node Availability Control State Machine
  108. * ==========================================
  109. *
  110. * 1.1 State Transition Table
  111. *
  112. * State: Event: Action: New state:
  113. * ============================================================================
  114. * ESCALATION_LEVEL_X node_sync_ready A6 JOINING_STARTING_APPLS
  115. * ESCALATION_LEVEL_X node_leave A9,A8 LEAVING_SP_FAILING_OVER
  116. * ESCALATION_LEVEL_X failover A11 GRACEFULLY_FAILING_OVER
  117. * ESCALATION_LEVEL_2 comp_restart_req [!C6]A13 ESCALATION_LEVEL_2
  118. * ESCALATION_LEVEL_2 comp_restart_req [C6]A14 ESCALATION_LEVEL_3
  119. * ESCALATION_LEVEL_3 comp_restart_req [!C7]A14 ESCALATION_LEVEL_3
  120. * ESCALATION_LEVEL_3 comp_failover_req [!C7]A14 ESCALATION_LEVEL_3
  121. * ESCALATION_LEVEL_3 comp_restart_req [C7]A15 ESCALATION_LEVEL_3
  122. * ESCALATION_LEVEL_3 comp_failover_req [C7]A15 ESCALATION_LEVEL_3
  123. * JOINING_STARTING_APPLS appl_started [C4] A7 JOINING_ASSIGNING_WL
  124. * JOINING_ASSIGNING_WL appl_assigned [C5] ESCALATION_LEVEL_X
  125. * LEAVING_SP_FAILING_OVER sg_failed_over [C1] LEAVING_SP_WAIT_FOR_JOIN
  126. * LEAVING_SP_WAIT_FOR_JOIN node_sync_ready A6 JOINING_STARTING_APPLS
  127. * GRACEFULLY_FAILING_OVER sg_failed_over [C1] A12 GRACEFULLY_REBOOTING
  128. * GRACEFULLY_REBOOTING node_leave ESCALATION_LEVEL_X
  129. *
  130. * 1.2 State Description
  131. * =====================
  132. * ESCALATION_LEVEL_X - Node is synchronized and idle (X = 0,2 or 3).
  133. * JOINING_STARTING_APPLS - JOINING_STARTING_APPLICATIONS
  134. * Node has ordered all applications to start its SUs
  135. * hosted on current node and is now waiting for them
  136. * to acknowledge that they have started.
  137. * GRACEFULLY_FAILING_OVER - FAILING_GRACEFULLY_FAILING_OVER
  138. * Node has ordered all SGs in the cluster to
  139. * failover all SUs that are hosted on a specific
  140. * node and waits for the SGs to confirm the
  141. * failover is completed.
  142. * GRACEFULLY_REBOOTING - FAILING_GRACEFULLY_REBOOTING_NODE
  143. * Node has ordered reboot and waits for the rebooted
  144. * node to join the cluster again.
  145. * JOINING_ASSIGNING_WL - JOINING_ASSIGNING_WORKLOAD
  146. * Node has ordered all applications to assign workload
  147. * to all its SUs which currently have no workload and
  148. * is now waiting for the applications to acknowledge.
  149. *
  150. * LEAVING_SP_FAILING_OVER - LEAVING_SPONTANEOUSLY_FAILING_OVER
  151. * Node has received an event telling that this node
  152. * has left the cluster and has ordered all service
  153. * groups to failover those of its SUs that were
  154. * hosted on current node.
  155. *
  156. * LEAVING_SP_WAIT_FOR_JOIN - LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN
  157. * Node is waiting for current node to join again.
  158. *
  159. * 1.3 Actions
  160. * ===========
  161. * A1 -
  162. * A2 -
  163. * A3 -
  164. * A4 -
  165. * A5 -
  166. * A6 - [foreach application in cluster]start application
  167. * A7 - [foreach application in cluster]assign workload to application
  168. * A8 - [foreach application in cluster]
  169. * [foreach SG in application ]failover node
  170. * A9 - [foreach application in cluster]
  171. * [foreach SG in application ]
  172. * [foreach SU in SG where the SU is hosted on current node]
  173. * [foreach comp in such an SU]indicate that the node has left the cluster
  174. * A10-
  175. * A11- [foreach SG in cluster]failover node
  176. * A12- reboot node
  177. * A13- restart SU
  178. * A14- failover SU
  179. * A15- failover node
  180. *
  181. * 1.4 Guards
  182. * ==========
  183. * C1 - All SG availability control state machines (ACSM) == IDLE
  184. * C2 -
  185. * C3 -
  186. * C4 - No applications are in ACSM state == STARTING_SGS
  187. * C5 - All applications have ACSM state == WORKLOAD_ASSIGNED
  188. * C6 - Specified number of SU restarts have been done.
  189. * C7 - Specified number of SU failover actions have been done.
  190. */
  #include <assert.h>
  #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>

  #include "amf.h"
  #include "util.h"
  #include "logsys.h"
  #include "main.h"
  198. LOGSYS_DECLARE_SUBSYS ("AMF", LOG_INFO)
  199. /******************************************************************************
  200. * Internal (static) utility functions
  201. *****************************************************************************/
  202. static void node_acsm_enter_leaving_spontaneously(struct amf_node *node)
  203. {
  204. ENTER("'%s'", node->name.value);
  205. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
  206. node->nodeid = 0;
  207. }
  208. static void node_acsm_enter_failing_over (struct amf_node *node)
  209. {
  210. struct amf_application *app;
  211. struct amf_sg *sg;
  212. struct amf_su *su;
  213. struct amf_comp *component = NULL;
  214. ENTER("'%s'", node->name.value);
  215. node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER;
  216. /*
  217. * Indicate to each component object in the model that current
  218. * node has left the cluster
  219. */
  220. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  221. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  222. for (su = sg->su_head; su != NULL; su = su->next) {
  223. if (name_match(&node->name, &su->saAmfSUHostedByNode)) {
  224. for (component = su->comp_head; component != NULL;
  225. component = component->next) {
  226. amf_comp_node_left(component);
  227. }
  228. }
  229. }
  230. }
  231. }
  232. /*
  233. * Let all service groups with service units hosted on current node failover
  234. * its workload
  235. */
  236. for (app = amf_cluster->application_head; app != NULL; app =
  237. app->next) {
  238. for (sg = app->sg_head; sg != NULL; sg =
  239. sg->next) {
  240. amf_sg_failover_node_req(sg, node);
  241. }
  242. }
  243. }
  244. static void failover_all_sg_on_node (amf_node_t *node)
  245. {
  246. amf_application_t *app;
  247. amf_sg_t *sg;
  248. amf_su_t *su;
  249. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  250. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  251. for (su = sg->su_head; su != NULL; su = su->next) {
  252. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  253. amf_sg_failover_node_req (sg, node);
  254. break;
  255. }
  256. }
  257. }
  258. }
  259. }
  260. static void node_acsm_enter_failing_gracefully_failing_over (amf_node_t *node)
  261. {
  262. ENTER("");
  263. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER;
  264. failover_all_sg_on_node (node);
  265. }
  266. static int has_all_sg_on_node_failed_over (amf_node_t *node)
  267. {
  268. amf_application_t *app;
  269. amf_sg_t *sg;
  270. amf_su_t *su;
  271. int has_all_sg_on_node_failed_over = 1;
  272. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  273. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  274. for (su = sg->su_head; su != NULL; su = su->next) {
  275. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  276. if (sg->avail_state != SG_AC_Idle) {
  277. TRACE1("%s %s",sg->name.value, su->name.value);
  278. has_all_sg_on_node_failed_over = 0;
  279. goto out;
  280. }
  281. break;
  282. }
  283. }
  284. }
  285. }
  286. out:
  287. return has_all_sg_on_node_failed_over;
  288. }
  289. static void repair_node (amf_node_t *node)
  290. {
  291. ENTER("");
  292. char hostname[256];
  293. gethostname (hostname, 256);
  294. if (!strcmp (hostname, (const char*)node->saAmfNodeClmNode.value)) {
  295. /* TODO if(saAmfAutoRepair == SA_TRUE) */
  296. #ifdef DEBUG
  297. exit (0);
  298. #else
  299. system ("reboot");
  300. #endif
  301. }
  302. }
  303. static void enter_failing_gracefully_rebooting_node (amf_node_t *node)
  304. {
  305. ENTER("");
  306. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE;
  307. repair_node (node);
  308. }
  309. static void node_acsm_enter_idle (amf_node_t *node)
  310. {
  311. ENTER ("history_state=%d",node->history_state);
  312. node->acsm_state = node->history_state;
  313. }
  314. static void node_acsm_enter_joining_assigning_workload (struct amf_node *node,
  315. struct amf_application *app)
  316. {
  317. log_printf(LOG_NOTICE,
  318. "Node=%s: all applications started, assigning workload.",
  319. node->name.value);
  320. ENTER("");
  321. node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
  322. for (app = app->cluster->application_head; app != NULL;
  323. app = app->next) {
  324. amf_application_assign_workload (app, node);
  325. }
  326. }
  327. /******************************************************************************
  328. * Event methods
  329. *****************************************************************************/
  330. /**
  331. * This event indicates that a node has unexpectedly left the cluster. Node
  332. * leave event is obtained from amf_confchg_fn.
  333. *
  334. * @param node
  335. */
  336. void amf_node_leave (struct amf_node *node)
  337. {
  338. assert (node != NULL);
  339. ENTER("'%s', CLM node '%s'", node->name.value,
  340. node->saAmfNodeClmNode.value);
  341. switch (node->acsm_state) {
  342. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  343. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  344. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  345. node_acsm_enter_leaving_spontaneously(node);
  346. node_acsm_enter_failing_over (node);
  347. break;
  348. case NODE_ACSM_REPAIR_NEEDED:
  349. break;
  350. case NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE:
  351. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  352. node_acsm_enter_idle (node);
  353. break;
  354. default:
  355. log_printf (LOG_LEVEL_ERROR, "amf_node_leave called in state = %d"
  356. " (should have been deferred)", node->acsm_state);
  357. openais_exit_error (AIS_DONE_FATAL_ERR);
  358. break;
  359. }
  360. }
  361. /**
  362. * This function handles a detected error that by a pre-analysis executed
  363. * elsewhere has been decided to be recovered by a node fail over.
  364. * @param node
  365. */
  366. void amf_node_failover (struct amf_node *node)
  367. {
  368. assert (node != NULL);
  369. ENTER("'%s', CLM node '%s'", node->name.value,
  370. node->saAmfNodeClmNode.value);
  371. switch (node->acsm_state) {
  372. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  373. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  374. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  375. node_acsm_enter_failing_gracefully_failing_over (node);
  376. break;
  377. case NODE_ACSM_REPAIR_NEEDED:
  378. break;
  379. default:
  380. log_printf (LOG_LEVEL_ERROR, "amf_node_leave()called in state = %d"
  381. " (should have been deferred)", node->acsm_state);
  382. openais_exit_error (AIS_DONE_FATAL_ERR);
  383. break;
  384. }
  385. }
  /**
   * Node switchover request. Not implemented: the function is currently a
   * no-op.
   *
   * @param node unused
   */
  void amf_node_switchover (struct amf_node *node)
  {
      (void) node;
  }
  /**
   * Node failfast request. Not implemented: the function is currently a
   * no-op.
   *
   * @param node unused
   */
  void amf_node_failfast (struct amf_node *node)
  {
      (void) node;
  }
  400. /**
  401. * This event is a request to restart a component which has been escalated,
  402. * because the component has already been restarted the number of times
  403. * specified by the configuration.
  404. * This function evaluates which recovery measure shall now be
  405. * taken and initiates the action which result from the evaluation.
  406. * @param node
  407. * @param comp
  408. */
  409. void amf_node_comp_restart_req (struct amf_node *node, struct amf_comp *comp)
  410. {
  411. amf_su_t *su = comp->su;
  412. ENTER("");
  413. switch (node->acsm_state) {
  414. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  415. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_2;
  416. amf_node_comp_restart_req (node, comp);
  417. break;
  418. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  419. if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
  420. SaNameT dn;
  421. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_3;
  422. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  423. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  424. amf_comp_dn_make (comp, &dn);
  425. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  426. "action:\n\t\tSU failover", dn.value);
  427. amf_sg_failover_su_req (su->sg, su, node);
  428. } else {
  429. amf_su_restart (su);
  430. }
  431. break;
  432. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  433. if (su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  434. SaNameT dn;
  435. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  436. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  437. amf_comp_dn_make (comp, &dn);
  438. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  439. "action:\n\t\tSU failover", dn.value);
  440. amf_sg_failover_su_req (su->sg, su, node);
  441. return;
  442. } else {
  443. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  444. amf_node_failover (node);
  445. }
  446. break;
  447. default:
  448. dprintf("%d",node->acsm_state);
  449. assert (0);
  450. break;
  451. }
  452. }
  453. /**
  454. * This event is a request to failover the specified component.
  455. * This function evaluates which recovery measure shall actually be
  456. * taken considering the escalation policy and initiates the action
  457. * which result from the evaluation.
  458. * @param node
  459. * @param comp
  460. */
  461. void amf_node_comp_failover_req (amf_node_t *node, amf_comp_t *comp)
  462. {
  463. ENTER("");
  464. switch (node->acsm_state) {
  465. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  466. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  467. if (comp->su->saAmfSUFailover) {
  468. /* SU failover */
  469. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  470. }
  471. break;
  472. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  473. if (comp->su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  474. if (comp->su->saAmfSUFailover) {
  475. /* SU failover */
  476. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  477. }
  478. } else {
  479. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  480. amf_node_failover (node);
  481. }
  482. break;
  483. default:
  484. dprintf("%d",node->acsm_state);
  485. assert (0);
  486. break;
  487. }
  488. }
  489. /**
  490. * This event indicates that current node has joined and its cluster model has
  491. * been synchronized with the other nodes cluster models.
  492. *
  493. * @param node
  494. */
  495. void amf_node_sync_ready (struct amf_node *node)
  496. {
  497. struct amf_application *app;
  498. assert (node != NULL);
  499. log_printf(LOG_NOTICE, "Node=%s: sync ready, starting hosted SUs.",
  500. node->name.value);
  501. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  502. switch (node->acsm_state) {
  503. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  504. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  505. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  506. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  507. node->acsm_state = NODE_ACSM_JOINING_STARTING_APPLICATIONS;
  508. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  509. amf_application_start (app, node);
  510. }
  511. break;
  512. case NODE_ACSM_REPAIR_NEEDED:
  513. break;
  514. default:
  515. log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready() was called in "
  516. "state = %d (should have been deferred)",
  517. node->acsm_state);
  518. openais_exit_error (AIS_DONE_FATAL_ERR);
  519. break;
  520. }
  521. }
  522. /******************************************************************************
  523. * Event response methods
  524. *****************************************************************************/
  525. /**
  526. * This event indicates that an application has started. Started in this context
  527. * means that none of its contained service units is in an -ING state with other
  528. * words successfully instantiated, instantiation has failed or instantiation
  529. * was not possible (due to the node on which the SU was to be hosted is not
  530. * operational).
  531. *
  532. * @param node
  533. * @param application which has been started
  534. */
  535. void amf_node_application_started (struct amf_node *node,
  536. struct amf_application *app)
  537. {
  538. assert (node != NULL && app != NULL );
  539. ENTER ("Node=%s: application '%s' started", node->name.value,
  540. app->name.value);
  541. switch (node->acsm_state) {
  542. case NODE_ACSM_JOINING_STARTING_APPLICATIONS:
  543. if (amf_cluster_applications_started_with_no_starting_sgs(
  544. app->cluster)) {
  545. node_acsm_enter_joining_assigning_workload(node, app);
  546. }
  547. break;
  548. default:
  549. log_printf (LOG_LEVEL_ERROR, "amf_node_application_started()"
  550. "called in state = %d (unexpected !!)", node->acsm_state);
  551. openais_exit_error (AIS_DONE_FATAL_ERR);
  552. break;
  553. }
  554. }
  555. /**
  556. * This event indicates that an application has been assigned workload.
  557. *
  558. * @param node
  559. * @param app - Application which has been assigned workload
  560. */
  561. void amf_node_application_workload_assigned (struct amf_node *node,
  562. struct amf_application *app)
  563. {
  564. assert (node != NULL && app != NULL );
  565. ENTER ("Node=%s: application '%s' started", node->name.value,
  566. app->name.value);
  567. switch (node->acsm_state) {
  568. case NODE_ACSM_JOINING_ASSIGNING_WORKLOAD:
  569. if (amf_cluster_applications_assigned (amf_cluster)) {
  570. log_printf(LOG_NOTICE, "Node=%s: all workload assigned",
  571. node->name.value);
  572. node_acsm_enter_idle (node);
  573. }
  574. break;
  575. default:
  576. log_printf (LOG_LEVEL_ERROR, "amf_node_application_workload_assigned()"
  577. "called in state = %d (unexpected !!)", node->acsm_state);
  578. openais_exit_error (AIS_DONE_FATAL_ERR);
  579. break;
  580. }
  581. }
  582. /**
  583. * This event indicates that an SG has failed over its workload after a node
  584. * failure.
  585. *
  586. * @param node
  587. * @param sg_in SG which is now ready with its failover
  588. */
  589. void amf_node_sg_failed_over (struct amf_node *node, struct amf_sg *sg_in)
  590. {
  591. assert (node != NULL);
  592. ENTER ("Node=%s: SG '%s' started %d", node->name.value,
  593. sg_in->name.value,node->acsm_state);
  594. switch (node->acsm_state) {
  595. case NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER:
  596. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  597. node->acsm_state =
  598. NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
  599. }
  600. break;
  601. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  602. /* Accept reports of failed over sg that has completed. */
  603. break;
  604. case NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER:
  605. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  606. enter_failing_gracefully_rebooting_node (node);
  607. }
  608. break;
  609. default:
  610. log_printf (LOG_LEVEL_ERROR, "amf_node_sg_failed_over()"
  611. "called in state = %d (unexpected !!)", node->acsm_state);
  612. openais_exit_error (AIS_DONE_FATAL_ERR);
  613. break;
  614. }
  615. }
  616. /******************************************************************************
  617. * General methods
  618. *****************************************************************************/
  619. /**
  620. * Node constructor
  621. * @param cluster
  622. * @param name - RDN of node
  623. */
  624. struct amf_node *amf_node_new (struct amf_cluster *cluster, char *name) {
  625. struct amf_node *node = amf_calloc (1, sizeof (struct amf_node));
  626. setSaNameT (&node->name, name);
  627. node->saAmfNodeAdminState = SA_AMF_ADMIN_UNLOCKED;
  628. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  629. node->saAmfNodeAutoRepair = SA_TRUE;
  630. node->saAmfNodeSuFailOverProb = -1;
  631. node->saAmfNodeSuFailoverMax = ~0;
  632. node->cluster = cluster;
  633. node->next = cluster->node_head;
  634. cluster->node_head = node;
  635. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  636. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  637. return node;
  638. }
  639. void *amf_node_serialize (struct amf_node *node, int *len)
  640. {
  641. char *buf = NULL;
  642. int offset = 0, size = 0;
  643. TRACE8 ("%s", node->name.value);
  644. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->name);
  645. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->saAmfNodeClmNode);
  646. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  647. node->saAmfNodeSuFailOverProb);
  648. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  649. node->saAmfNodeSuFailoverMax);
  650. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  651. node->saAmfNodeAutoRepair);
  652. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  653. node->saAmfNodeRebootOnInstantiationFailure);
  654. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  655. node->saAmfNodeRebootOnTerminationFailure);
  656. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  657. node->saAmfNodeAdminState);
  658. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  659. node->saAmfNodeOperState);
  660. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  661. node->nodeid);
  662. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  663. node->acsm_state);
  664. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  665. node->history_state);
  666. *len = offset;
  667. return buf;
  668. }
  669. struct amf_node *amf_node_deserialize (struct amf_cluster *cluster, char *buf) {
  670. char *tmp = buf;
  671. struct amf_node *node = amf_node_new (cluster, "");
  672. tmp = amf_deserialize_SaNameT (tmp, &node->name);
  673. tmp = amf_deserialize_SaNameT (tmp, &node->saAmfNodeClmNode);
  674. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailOverProb);
  675. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailoverMax);
  676. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAutoRepair);
  677. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnInstantiationFailure);
  678. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnTerminationFailure);
  679. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAdminState);
  680. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeOperState);
  681. tmp = amf_deserialize_SaUint32T (tmp, &node->nodeid);
  682. tmp = amf_deserialize_SaUint32T (tmp, &node->acsm_state);
  683. tmp = amf_deserialize_SaUint32T (tmp, &node->history_state);
  684. return node;
  685. }
  686. struct amf_node *amf_node_find (SaNameT *name) {
  687. struct amf_node *node;
  688. assert (name != NULL && amf_cluster != NULL);
  689. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  690. if (name_match (&node->name, name)) {
  691. return node;
  692. }
  693. }
  694. dprintf ("node %s not found in configuration!", name->value);
  695. return NULL;
  696. }
  697. struct amf_node *amf_node_find_by_nodeid (unsigned int nodeid) {
  698. struct amf_node *node;
  699. assert (amf_cluster != NULL);
  700. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  701. if (node->nodeid == nodeid) {
  702. return node;
  703. }
  704. }
  705. dprintf ("node %u not found in configuration!", nodeid);
  706. return NULL;
  707. }
  708. struct amf_node *amf_node_find_by_hostname (const char *hostname) {
  709. struct amf_node *node;
  710. assert (hostname != NULL && amf_cluster != NULL);
  711. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  712. if (strcmp ((char*)node->saAmfNodeClmNode.value, hostname) == 0) {
  713. return node;
  714. }
  715. }
  716. dprintf ("node %s not found in configuration!", hostname);
  717. return NULL;
  718. }