amfnode.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799
  1. /** @file amfnode.c
  2. *
  3. * Copyright (c) 2006 Ericsson AB.
  4. * Author: Hans Feldt, Anders Eriksson, Lars Holm
  5. * - Constructors/destructors
  6. * - Serializers/deserializers
  7. *
  8. * All rights reserved.
  9. *
  10. *
  11. * This software licensed under BSD license, the text of which follows:
  12. *
  13. * Redistribution and use in source and binary forms, with or without
  14. * modification, are permitted provided that the following conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright notice,
  17. * this list of conditions and the following disclaimer.
  18. * - Redistributions in binary form must reproduce the above copyright notice,
  19. * this list of conditions and the following disclaimer in the documentation
  20. * and/or other materials provided with the distribution.
  21. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  22. * contributors may be used to endorse or promote products derived from this
  23. * software without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  26. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  29. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  30. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  31. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  32. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  33. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  35. * THE POSSIBILITY OF SUCH DAMAGE.
  36. *
  37. * AMF Node Class Implementation
  38. *
  39. * This file contains functions for handling AMF nodes. It can be
  40. * viewed as the implementation of the AMF Node class (called NODE)
  41. * as described in SAI-Overview-B.02.01. The SA Forum specification
  42. * SAI-AIS-AMF-B.02.01 has been used as specification of the behaviour
  43. * and is referred to as 'the spec' below.
  44. *
  45. * The functions in this file are responsible for:
  46. * - controlling the instantiation of the SUs hosted on current node and
  47. * controlling the assigning of workload to them when a node joins the
  48. * cluster (cluster start is controlled by the Cluster Class)
  49. * - controlling node level recovery and repair functions
  50. * - implementing error escalation level 2 and 3 (paragraph 3.12.2.2 and
  51. * 3.12.2.3 in the spec)
  52. * - handling run time attributes of the AMF NODE; cached
  53. * attributes are stored as variables and sent to the IMM service (future)
  54. * upon the changes described in the specification
  55. *
  56. * The node class contains the following state machines:
  57. * - administrative state machine (ADSM)
  58. * - operational state machine (OPSM)
  59. * - availability control state machine (ACSM)
  60. *
  61. * The administrative state machine will be implemented in the future.
  62. *
  63. * The operational state machine is primarily used to report status of the
  64. * node.
  65. *
  66. * The availability control state machine is used for control purposes.
  67. * ACSM contains three states of which two are composite.
  68. * Being a composite state means that the state contains substates.
  69. * ACSM states are:
  70. * - REPAIR_NEEDED
  71. * - IDLE (ESCALATION_LEVEL_0, ESCALATION_LEVEL_2 and ESCALATION_LEVEL_3)
  72. * - MANAGING_HOSTED_SERVICE_UNITS (
  73. * . FAILING_FAST (REBOOTING_NODE and ACTIVATING_STANDBY_NODE)
  74. * . FAILING_GRACEFULLY (SWITCHING_OVER, FAILING_OVER and REBOOTING_NODE)
  75. * . LEAVING_SPONTANEOUSLY (SWITCHING_OVER, FAILING_OVER and
  76. * WAITING_FOR_NODE_TO_JOIN)
  77. * . JOINING (STARTING_APPLICATIONS and ASSIGNING_WORKLOAD)
  78. *
  79. * REPAIR_NEEDED indicates the node needs a manual repair and this state will be
  80. * maintained until the administrative command REPAIRED is entered (implemented
  81. * in the future)
  82. *
  83. * IDLE is a composite state where no actions are actually performed
  84. * and used only to remember the escalation level. Substate LEVEL_0 indicates
  85. * no escalation. LEVEL_2 indicates that so many component restarts have been
  86. * executed recently that a new component restart request will escalate
  87. * to service unit restart action. Node will request a service unit restart
  88. * from SU.
  89. * LEVEL_3 will be entered if either too many service unit restarts have
  90. * been made or a component failover recovery action is requested. On level 3
  91. * the recovery action performed is service unit failover (paragraph 3.12.1.3).
  92. *
  93. * FAILING_FAST state executes a node re-boot and waits for the node to join
  94. * the cluster again. (not implemented)
  95. *
  96. * FAILING_GRACEFULLY state requests all SGs which have SUs hosted on current
  97. * node to switch or failover according to the procedures described in
  98. * paragraphs 3.12.1.3 before re-boot is executed. Then the confirmation is
  99. * awaited from all concerned SGs and finally a node re-boot is executed as
  100. * the repair action (see paragraph 2.12.1.4).
  101. *
  102. * LEAVING_SPONTANEOUSLY state handles the spontaneous leave of a node.
  103. *
  104. * JOINING state handles the start of a node in all cases except cluster start,
  105. * which is handled by the CLUSTER class.
  106. *
  107. * 1. Node Availability Control State Machine
  108. * ==========================================
  109. *
  110. * 1.1 State Transition Table
  111. *
  112. * State: Event: Action: New state:
  113. * ============================================================================
  114. * ESCALATION_LEVEL_X node_sync_ready A6 JOINING_STARTING_APPLS
  115. * ESCALATION_LEVEL_X node_leave A9,A8 LEAVING_SP_FAILING_OVER
  116. * ESCALATION_LEVEL_X failover A11 GRACEFULLY_FAILING_OVER
  117. * ESCALATION_LEVEL_2 comp_restart_req [!C6]A13 ESCALATION_LEVEL_2
  118. * ESCALATION_LEVEL_2 comp_restart_req [C6]A14 ESCALATION_LEVEL_3
  119. * ESCALATION_LEVEL_3 comp_restart_req [!C7]A14 ESCALATION_LEVEL_3
  120. * ESCALATION_LEVEL_3 comp_failover_req [!C7]A14 ESCALATION_LEVEL_3
  121. * ESCALATION_LEVEL_3 comp_restart_req [C7]A15 ESCALATION_LEVEL_3
  122. * ESCALATION_LEVEL_3 comp_failover_req [C7]A15 ESCALATION_LEVEL_3
  123. * JOINING_STARTING_APPLS appl_started [C4] A7 JOINING_ASSIGNING_WL
  124. * JOINING_ASSIGNING_WL appl_assigned [C5] ESCALATION_LEVEL_X
  125. * LEAVING_SP_FAILING_OVER sg_failed_over [C1] LEAVING_SP_WAIT_FOR_JOIN
  126. * LEAVING_SP_WAIT_FOR_JOIN node_sync_ready A6 JOINING_STARTING_APPLS
  127. * GRACEFULLY_FAILING_OVER sg_failed_over [C1] A12 GRACEFULLY_REBOOTING
  128. * GRACEFULLY_REBOOTING node_leave ESCALATION_LEVEL_X
  129. *
  130. * 1.2 State Description
  131. * =====================
  132. * ESCALATION_LEVEL_X - Node is synchronized and idle (X = 0,2 or 3).
  133. * JOINING_STARTING_APPLS - JOINING_STARTING_APPLICATIONS
  134. * Node has ordered all applications to start its SUs
  135. * hosted on current node and is now waiting for them
  136. * to acknowledge that they have started.
  137. * GRACEFULLY_FAILING_OVER - FAILING_GRACEFULLY_FAILING_OVER
  138. * Node has ordered all SGs in the cluster to
  139. * failover all SUs that are hosted on a specific
  140. * node and waits for the SGs to confirm the
  141. * failover is completed.
  142. * GRACEFULLY_REBOOTING - FAILING_GRACEFULLY_REBOOTING_NODE
  143. * Node has ordered reboot and waits for the rebooted
  144. * node to join the cluster again.
  145. * JOINING_ASSIGNING_WL - JOINING_ASSIGNING_WORKLOAD
  146. * Node has ordered all applications to assign workload
  147. * to all its SUs which currently have no workload and
  148. * is now waiting for the applications to acknowledge.
  149. *
  150. * LEAVING_SP_FAILING_OVER - LEAVING_SPONTANEOUSLY_FAILING_OVER
  151. * Node has received an event telling that this node
  152. * has left the cluster and has ordered all service
  153. * groups to failover those of its SUs that were
  154. * hosted on current node.
  155. *
  156. * LEAVING_SP_WAIT_FOR_JOIN - LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN
  157. * Node is waiting for current node to join again.
  158. *
  159. * 1.3 Actions
  160. * ===========
  161. * A1 -
  162. * A2 -
  163. * A3 -
  164. * A4 -
  165. * A5 -
  166. * A6 - [foreach application in cluster]start application
  167. * A7 - [foreach application in cluster]assign workload to application
  168. * A8 - [foreach application in cluster]
  169. * [foreach SG in application ]failover node
  170. * A9 - [foreach application in cluster]
  171. * [foreach SG in application ]
  172. * [foreach SU in SG where the SU is hosted on current node]
  173. * [foreach comp in such an SU]indicate that the node has left the cluster
  174. * A10-
  175. * A11- [foreach SG in cluster]failover node
  176. * A12- reboot node
  177. * A13- restart SU
  178. * A14- failover SU
  179. * A15- failover node
  180. *
  181. * 1.4 Guards
  182. * ==========
  183. * C1 - All SG availability control state machines (ACSM) == IDLE
  184. * C2 -
  185. * C3 -
  186. * C4 - No applications are in ACSM state == STARTING_SGS
  187. * C5 - All applications have ACSM state == WORKLOAD_ASSIGNED
  188. * C6 - Specified number of SU restarts have been done.
  189. * C7 - Specified number of SU failover actions have been done.
  190. */
  191. #include <stdlib.h>
  192. #include <assert.h>
  193. #include <unistd.h>
  194. #include "amf.h"
  195. #include "util.h"
  196. #include "print.h"
  197. #include "main.h"
  198. /******************************************************************************
  199. * Internal (static) utility functions
  200. *****************************************************************************/
  201. static void node_acsm_enter_leaving_spontaneously(struct amf_node *node)
  202. {
  203. ENTER("'%s'", node->name.value);
  204. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
  205. node->nodeid = 0;
  206. }
  207. static void node_acsm_enter_failing_over (struct amf_node *node)
  208. {
  209. struct amf_application *app;
  210. struct amf_sg *sg;
  211. struct amf_su *su;
  212. struct amf_comp *component = NULL;
  213. ENTER("'%s'", node->name.value);
  214. node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER;
  215. /*
  216. * Indicate to each component object in the model that current
  217. * node has left the cluster
  218. */
  219. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  220. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  221. for (su = sg->su_head; su != NULL; su = su->next) {
  222. if (name_match(&node->name, &su->saAmfSUHostedByNode)) {
  223. for (component = su->comp_head; component != NULL;
  224. component = component->next) {
  225. amf_comp_node_left(component);
  226. }
  227. }
  228. }
  229. }
  230. }
  231. /*
  232. * Let all service groups with service units hosted on current node failover
  233. * its workload
  234. */
  235. for (app = amf_cluster->application_head; app != NULL; app =
  236. app->next) {
  237. for (sg = app->sg_head; sg != NULL; sg =
  238. sg->next) {
  239. amf_sg_failover_node_req(sg, node);
  240. }
  241. }
  242. }
  243. static void failover_all_sg_on_node (amf_node_t *node)
  244. {
  245. amf_application_t *app;
  246. amf_sg_t *sg;
  247. amf_su_t *su;
  248. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  249. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  250. for (su = sg->su_head; su != NULL; su = su->next) {
  251. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  252. amf_sg_failover_node_req (sg, node);
  253. break;
  254. }
  255. }
  256. }
  257. }
  258. }
  259. static void node_acsm_enter_failing_gracefully_failing_over (amf_node_t *node)
  260. {
  261. ENTER("");
  262. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER;
  263. failover_all_sg_on_node (node);
  264. }
  265. static int has_all_sg_on_node_failed_over (amf_node_t *node)
  266. {
  267. amf_application_t *app;
  268. amf_sg_t *sg;
  269. amf_su_t *su;
  270. int has_all_sg_on_node_failed_over = 1;
  271. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  272. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  273. for (su = sg->su_head; su != NULL; su = su->next) {
  274. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  275. if (sg->avail_state != SG_AC_Idle) {
  276. has_all_sg_on_node_failed_over = 0;
  277. goto out;
  278. }
  279. break;
  280. }
  281. }
  282. }
  283. }
  284. out:
  285. return has_all_sg_on_node_failed_over;
  286. }
  287. static void repair_node (amf_node_t *node)
  288. {
  289. ENTER("");
  290. char hostname[256];
  291. gethostname (hostname, 256);
  292. if (!strcmp (hostname, (const char*)node->saAmfNodeClmNode.value)) {
  293. /* TODO if(saAmfAutoRepair == SA_TRUE) */
  294. #ifdef DEBUG
  295. exit (0);
  296. #else
  297. system ("reboot");
  298. #endif
  299. }
  300. }
  301. static void enter_failing_gracefully_rebooting_node (amf_node_t *node)
  302. {
  303. ENTER("");
  304. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE;
  305. repair_node (node);
  306. }
  307. static void node_acsm_enter_idle (amf_node_t *node)
  308. {
  309. ENTER ("history_state=%d",node->history_state);
  310. node->acsm_state = node->history_state;
  311. }
  312. static void node_acsm_enter_joining_assigning_workload (struct amf_node *node,
  313. struct amf_application *app)
  314. {
  315. log_printf(LOG_NOTICE,
  316. "Node=%s: all applications started, assigning workload.",
  317. node->name.value);
  318. ENTER("");
  319. node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
  320. for (app = app->cluster->application_head; app != NULL;
  321. app = app->next) {
  322. amf_application_assign_workload (app, node);
  323. }
  324. }
  325. /******************************************************************************
  326. * Event methods
  327. *****************************************************************************/
  328. /**
  329. * This event indicates that a node has unexpectedly left the cluster. Node
  330. * leave event is obtained from amf_confchg_fn.
  331. *
  332. * @param node
  333. */
  334. void amf_node_leave (struct amf_node *node)
  335. {
  336. assert (node != NULL);
  337. ENTER("'%s', CLM node '%s'", node->name.value,
  338. node->saAmfNodeClmNode.value);
  339. switch (node->acsm_state) {
  340. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  341. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  342. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  343. node_acsm_enter_leaving_spontaneously(node);
  344. node_acsm_enter_failing_over (node);
  345. break;
  346. case NODE_ACSM_REPAIR_NEEDED:
  347. break;
  348. case NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE:
  349. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  350. node_acsm_enter_idle (node);
  351. break;
  352. default:
  353. log_printf (LOG_LEVEL_ERROR, "amf_node_leave called in state = %d"
  354. " (should have been deferred)", node->acsm_state);
  355. openais_exit_error (AIS_DONE_FATAL_ERR);
  356. break;
  357. }
  358. }
  359. /**
  360. * This function handles a detected error that by a pre-analysis executed
  361. * elsewhere has been decided to be recovered by a node fail over.
  362. * @param node
  363. */
  364. void amf_node_failover (struct amf_node *node)
  365. {
  366. assert (node != NULL);
  367. ENTER("'%s', CLM node '%s'", node->name.value,
  368. node->saAmfNodeClmNode.value);
  369. switch (node->acsm_state) {
  370. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  371. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  372. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  373. node_acsm_enter_failing_gracefully_failing_over (node);
  374. break;
  375. case NODE_ACSM_REPAIR_NEEDED:
  376. break;
  377. default:
  378. log_printf (LOG_LEVEL_ERROR, "amf_node_leave()called in state = %d"
  379. " (should have been deferred)", node->acsm_state);
  380. openais_exit_error (AIS_DONE_FATAL_ERR);
  381. break;
  382. }
  383. }
  /**
   * Node switchover event. Not implemented; the call has no effect.
   *
   * @param node unused
   */
  void amf_node_switchover (struct amf_node *node)
  {
      (void) node;
  }
  /**
   * Node failfast event. Not implemented; the call has no effect.
   *
   * @param node unused
   */
  void amf_node_failfast (struct amf_node *node)
  {
      (void) node;
  }
  398. /**
  399. * This event is a request to restart a component which has been escalated,
  400. * because the component has already been restarted the number of times
  401. * specified by the configuration.
  402. * This function evaluates which recovery measure shall now be
  403. * taken and initiates the action which result from the evaluation.
  404. * @param node
  405. * @param comp
  406. */
  407. void amf_node_comp_restart_req (struct amf_node *node, struct amf_comp *comp)
  408. {
  409. amf_su_t *su = comp->su;
  410. ENTER("");
  411. switch (node->acsm_state) {
  412. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  413. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_2;
  414. amf_node_comp_restart_req (node, comp);
  415. break;
  416. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  417. if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
  418. SaNameT dn;
  419. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_3;
  420. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  421. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  422. amf_comp_dn_make (comp, &dn);
  423. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  424. "action:\n\t\tSU failover", dn.value);
  425. amf_sg_failover_su_req (su->sg, su, node);
  426. } else {
  427. amf_su_restart (su);
  428. }
  429. break;
  430. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  431. if (su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  432. SaNameT dn;
  433. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  434. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  435. amf_comp_dn_make (comp, &dn);
  436. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  437. "action:\n\t\tSU failover", dn.value);
  438. amf_sg_failover_su_req (su->sg, su, node);
  439. return;
  440. } else {
  441. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  442. amf_node_failover (node);
  443. }
  444. break;
  445. default:
  446. dprintf("%d",node->acsm_state);
  447. assert (0);
  448. break;
  449. }
  450. }
  451. /**
  452. * This event is a request to failover the specified component.
  453. * This function evaluates which recovery measure shall actually be
  454. * taken considering the escalation policy and initiates the action
  455. * which result from the evaluation.
  456. * @param node
  457. * @param comp
  458. */
  459. void amf_node_comp_failover_req (amf_node_t *node, amf_comp_t *comp)
  460. {
  461. ENTER("");
  462. switch (node->acsm_state) {
  463. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  464. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  465. if (comp->su->saAmfSUFailover) {
  466. /* SU failover */
  467. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  468. }
  469. break;
  470. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  471. if (comp->su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  472. if (comp->su->saAmfSUFailover) {
  473. /* SU failover */
  474. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  475. }
  476. } else {
  477. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  478. amf_node_failover (node);
  479. }
  480. break;
  481. default:
  482. dprintf("%d",node->acsm_state);
  483. assert (0);
  484. break;
  485. }
  486. }
  487. /**
  488. * This event indicates that current node has joined and its cluster model has
  489. * been synchronized with the other nodes cluster models.
  490. *
  491. * @param node
  492. */
  493. void amf_node_sync_ready (struct amf_node *node)
  494. {
  495. struct amf_application *app;
  496. assert (node != NULL);
  497. log_printf(LOG_NOTICE, "Node=%s: sync ready, starting hosted SUs.",
  498. node->name.value);
  499. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  500. switch (node->acsm_state) {
  501. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  502. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  503. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  504. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  505. node->acsm_state = NODE_ACSM_JOINING_STARTING_APPLICATIONS;
  506. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  507. amf_application_start (app, node);
  508. }
  509. break;
  510. case NODE_ACSM_REPAIR_NEEDED:
  511. break;
  512. default:
  513. log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready() was called in "
  514. "state = %d (should have been deferred)",
  515. node->acsm_state);
  516. openais_exit_error (AIS_DONE_FATAL_ERR);
  517. break;
  518. }
  519. }
  520. /******************************************************************************
  521. * Event response methods
  522. *****************************************************************************/
  523. /**
  524. * This event indicates that an application has started. Started in this context
  525. * means that none of its contained service units is in an -ING state with other
  526. * words successfully instantiated, instantiation has failed or instantiation
  527. * was not possible (due to the node on which the SU was to be hosted is not
  528. * operational).
  529. *
  530. * @param node
  531. * @param application which has been started
  532. */
  533. void amf_node_application_started (struct amf_node *node,
  534. struct amf_application *app)
  535. {
  536. assert (node != NULL && app != NULL );
  537. ENTER ("Node=%s: application '%s' started", node->name.value,
  538. app->name.value);
  539. switch (node->acsm_state) {
  540. case NODE_ACSM_JOINING_STARTING_APPLICATIONS:
  541. if (amf_cluster_applications_started_with_no_starting_sgs(
  542. app->cluster)) {
  543. node_acsm_enter_joining_assigning_workload(node, app);
  544. }
  545. break;
  546. default:
  547. log_printf (LOG_LEVEL_ERROR, "amf_node_application_started()"
  548. "called in state = %d (unexpected !!)", node->acsm_state);
  549. openais_exit_error (AIS_DONE_FATAL_ERR);
  550. break;
  551. }
  552. }
  553. /**
  554. * This event indicates that an application has been assigned workload.
  555. *
  556. * @param node
  557. * @param app - Application which has been assigned workload
  558. */
  559. void amf_node_application_workload_assigned (struct amf_node *node,
  560. struct amf_application *app)
  561. {
  562. assert (node != NULL && app != NULL );
  563. ENTER ("Node=%s: application '%s' started", node->name.value,
  564. app->name.value);
  565. switch (node->acsm_state) {
  566. case NODE_ACSM_JOINING_ASSIGNING_WORKLOAD:
  567. if (amf_cluster_applications_assigned (amf_cluster)) {
  568. log_printf(LOG_NOTICE, "Node=%s: all workload assigned",
  569. node->name.value);
  570. node_acsm_enter_idle (node);
  571. }
  572. break;
  573. default:
  574. log_printf (LOG_LEVEL_ERROR, "amf_node_application_workload_assigned()"
  575. "called in state = %d (unexpected !!)", node->acsm_state);
  576. openais_exit_error (AIS_DONE_FATAL_ERR);
  577. break;
  578. }
  579. }
  580. /**
  581. * This event indicates that an SG has failed over its workload after a node
  582. * failure.
  583. *
  584. * @param node
  585. * @param sg_in SG which is now ready with its failover
  586. */
  587. void amf_node_sg_failed_over (struct amf_node *node, struct amf_sg *sg_in)
  588. {
  589. assert (node != NULL);
  590. ENTER ("Node=%s: SG '%s' started %d", node->name.value,
  591. sg_in->name.value,node->acsm_state);
  592. switch (node->acsm_state) {
  593. case NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER:
  594. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  595. node->acsm_state =
  596. NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
  597. }
  598. break;
  599. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  600. /* Accept reports of failed over sg that has completed. */
  601. break;
  602. case NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER:
  603. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  604. enter_failing_gracefully_rebooting_node (node);
  605. }
  606. break;
  607. default:
  608. log_printf (LOG_LEVEL_ERROR, "amf_node_sg_failed_over()"
  609. "called in state = %d (unexpected !!)", node->acsm_state);
  610. openais_exit_error (AIS_DONE_FATAL_ERR);
  611. break;
  612. }
  613. }
  614. /******************************************************************************
  615. * General methods
  616. *****************************************************************************/
  /**
   * Initialize the node class; sets up logging under the name "AMF".
   */
  void amf_node_init (void)
  {
      log_init ("AMF");
  }
  621. /**
  622. * Node constructor
  623. * @param cluster
  624. * @param name - RDN of node
  625. */
  626. struct amf_node *amf_node_new (struct amf_cluster *cluster, char *name) {
  627. struct amf_node *node = amf_calloc (1, sizeof (struct amf_node));
  628. setSaNameT (&node->name, name);
  629. node->saAmfNodeAdminState = SA_AMF_ADMIN_UNLOCKED;
  630. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  631. node->saAmfNodeAutoRepair = SA_TRUE;
  632. node->saAmfNodeSuFailOverProb = -1;
  633. node->saAmfNodeSuFailoverMax = ~0;
  634. node->cluster = cluster;
  635. node->next = cluster->node_head;
  636. cluster->node_head = node;
  637. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  638. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  639. return node;
  640. }
  641. void *amf_node_serialize (struct amf_node *node, int *len)
  642. {
  643. char *buf = NULL;
  644. int offset = 0, size = 0;
  645. TRACE8 ("%s", node->name.value);
  646. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->name);
  647. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->saAmfNodeClmNode);
  648. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  649. node->saAmfNodeSuFailOverProb);
  650. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  651. node->saAmfNodeSuFailoverMax);
  652. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  653. node->saAmfNodeAutoRepair);
  654. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  655. node->saAmfNodeRebootOnInstantiationFailure);
  656. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  657. node->saAmfNodeRebootOnTerminationFailure);
  658. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  659. node->saAmfNodeAdminState);
  660. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  661. node->saAmfNodeOperState);
  662. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  663. node->nodeid);
  664. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  665. node->acsm_state);
  666. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  667. node->history_state);
  668. *len = offset;
  669. return buf;
  670. }
  671. struct amf_node *amf_node_deserialize (struct amf_cluster *cluster, char *buf) {
  672. char *tmp = buf;
  673. struct amf_node *node = amf_node_new (cluster, "");
  674. tmp = amf_deserialize_SaNameT (tmp, &node->name);
  675. tmp = amf_deserialize_SaNameT (tmp, &node->saAmfNodeClmNode);
  676. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailOverProb);
  677. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailoverMax);
  678. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAutoRepair);
  679. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnInstantiationFailure);
  680. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnTerminationFailure);
  681. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAdminState);
  682. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeOperState);
  683. tmp = amf_deserialize_SaUint32T (tmp, &node->nodeid);
  684. tmp = amf_deserialize_SaUint32T (tmp, &node->acsm_state);
  685. tmp = amf_deserialize_SaUint32T (tmp, &node->history_state);
  686. return node;
  687. }
  688. struct amf_node *amf_node_find (SaNameT *name) {
  689. struct amf_node *node;
  690. assert (name != NULL && amf_cluster != NULL);
  691. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  692. if (name_match (&node->name, name)) {
  693. return node;
  694. }
  695. }
  696. dprintf ("node %s not found in configuration!", name->value);
  697. return NULL;
  698. }
  699. struct amf_node *amf_node_find_by_nodeid (unsigned int nodeid) {
  700. struct amf_node *node;
  701. assert (amf_cluster != NULL);
  702. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  703. if (node->nodeid == nodeid) {
  704. return node;
  705. }
  706. }
  707. dprintf ("node %u not found in configuration!", nodeid);
  708. return NULL;
  709. }
  710. struct amf_node *amf_node_find_by_hostname (const char *hostname) {
  711. struct amf_node *node;
  712. assert (hostname != NULL && amf_cluster != NULL);
  713. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  714. if (strcmp ((char*)node->saAmfNodeClmNode.value, hostname) == 0) {
  715. return node;
  716. }
  717. }
  718. dprintf ("node %s not found in configuration!", hostname);
  719. return NULL;
  720. }