amfnode.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801
  1. /** @file amfnode.c
  2. *
  3. * Copyright (c) 2006 Ericsson AB.
  4. * Author: Hans Feldt, Anders Eriksson, Lars Holm
  5. * - Constructors/destructors
  6. * - Serializers/deserializers
  7. *
  8. * All rights reserved.
  9. *
  10. *
  11. * This software licensed under BSD license, the text of which follows:
  12. *
  13. * Redistribution and use in source and binary forms, with or without
  14. * modification, are permitted provided that the following conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright notice,
  17. * this list of conditions and the following disclaimer.
  18. * - Redistributions in binary form must reproduce the above copyright notice,
  19. * this list of conditions and the following disclaimer in the documentation
  20. * and/or other materials provided with the distribution.
  21. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  22. * contributors may be used to endorse or promote products derived from this
  23. * software without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  26. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  29. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  30. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  31. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  32. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  33. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  35. * THE POSSIBILITY OF SUCH DAMAGE.
  36. *
  37. * AMF Node Class Implementation
  38. *
  39. * This file contains functions for handling AMF nodes. It can be
  40. * viewed as the implementation of the AMF Node class (called NODE)
  41. * as described in SAI-Overview-B.02.01. The SA Forum specification
  42. * SAI-AIS-AMF-B.02.01 has been used as specification of the behaviour
  43. * and is referred to as 'the spec' below.
  44. *
  45. * The functions in this file are responsible for:
  46. * - controlling the instantiation of the SUs hosted on current node and
  47. * controlling the assigning of workload to them when a node joins the
  48. * cluster (cluster start is controlled by the Cluster Class)
  49. * - controlling node level recovery and repair functions
  50. * - implementing error escalation level 2 and 3 (paragraph 3.12.2.2 and
  51. * 3.12.2.3 in the spec)
  52. * - handling run time attributes of the AMF NODE; cached
  53. * attributes are stored as variables and sent to the IMM service (future)
  54. * upon the changes described in the specification
  55. *
  56. * The node class contains the following state machines:
  57. * - administrative state machine (ADSM)
  58. * - operational state machine (OPSM)
  59. * - availability control state machine (ACSM)
  60. *
  61. * The administrative state machine will be implemented in the future.
  62. *
  63. * The operational state machine is primarily used to report status of the
  64. * node.
  65. *
  66. * The availability control state machine is used for control purposes.
  67. * ACSM contains three states of which two are composite.
  68. * Being a composite state means that the state contains substates.
  69. * ACSM states are:
  70. * - REPAIR_NEEDED
  71. * - IDLE (ESCALATION_LEVEL_0, ESCALATION_LEVEL_2 and ESCALATION_LEVEL_3)
  72. * - MANAGING_HOSTED_SERVICE_UNITS (
  73. * . FAILING_FAST (REBOOTING_NODE and ACTIVATING_STANDBY_NODE)
  74. * . FAILING_GRACEFULLY (SWITCHING_OVER, FAILING_OVER and REBOOTING_NODE)
  75. * . LEAVING_SPONTANEOUSLY (SWITCHING_OVER, FAILING_OVER and
  76. * WAITING_FOR_NODE_TO_JOIN)
  77. * . JOINING (STARTING_APPLICATIONS and ASSIGNING_WORKLOAD)
  78. *
  79. * REPAIR_NEEDED indicates the node needs a manual repair and this state will be
  80. * maintained until the administrative command REPAIRED is entered (implemented
  81. * in the future)
  82. *
  83. * IDLE is a composite state in which no actions are actually performed;
  84. * it is used only to remember the escalation level. Substate LEVEL_0 indicates
  85. * no escalation. LEVEL_2 indicates that so many component restarts have been
  86. * executed recently that a new component restart request will escalate
  87. * to a service unit restart action. Node will request a service unit restart
  88. * from SU.
  89. * LEVEL_3 will be entered if either too many service unit restarts have
  90. * been made or a component failover recovery action is requested. On level 3
  91. * the recovery action performed is service unit failover (paragraph 3.12.1.3).
  92. *
  93. * FAILING_FAST state executes a node re-boot and waits for the node to join
  94. * the cluster again. (not implemented)
  95. *
  96. * FAILING_GRACEFULLY state requests all SGs which have SUs hosted on current
  97. * node to switch or failover according to the procedures described in
  98. * paragraphs 3.12.1.3 before re-boot is executed. Then the confirmation is
  99. * awaited from all concerned SGs and finally a node re-boot is executed as
  100. * the repair action (see paragraph 2.12.1.4).
  101. *
  102. * LEAVING_SPONTANEOUSLY state handles the spontaneous leave of a node.
  103. *
  104. * JOINING state handles the start of a node in all cases except cluster start,
  105. * which is handled by the CLUSTER class.
  106. *
  107. * 1. Node Availability Control State Machine
  108. * ==========================================
  109. *
  110. * 1.1 State Transition Table
  111. *
  112. * State: Event: Action: New state:
  113. * ============================================================================
  114. * ESCALATION_LEVEL_X node_sync_ready A6 JOINING_STARTING_APPLS
  115. * ESCALATION_LEVEL_X node_leave A9,A8 LEAVING_SP_FAILING_OVER
  116. * ESCALATION_LEVEL_X failover A11 GRACEFULLY_FAILING_OVER
  117. * ESCALATION_LEVEL_2 comp_restart_req [!C6]A13 ESCALATION_LEVEL_2
  118. * ESCALATION_LEVEL_2 comp_restart_req [C6]A14 ESCALATION_LEVEL_3
  119. * ESCALATION_LEVEL_3 comp_restart_req [!C7]A14 ESCALATION_LEVEL_3
  120. * ESCALATION_LEVEL_3 comp_failover_req [!C7]A14 ESCALATION_LEVEL_3
  121. * ESCALATION_LEVEL_3 comp_restart_req [C7]A15 ESCALATION_LEVEL_3
  122. * ESCALATION_LEVEL_3 comp_failover_req [C7]A15 ESCALATION_LEVEL_3
  123. * JOINING_STARTING_APPLS appl_started [C4] A7 JOINING_ASSIGNING_WL
  124. * JOINING_ASSIGNING_WL appl_assigned [C5] ESCALATION_LEVEL_X
  125. * LEAVING_SP_FAILING_OVER sg_failed_over [C1] LEAVING_SP_WAIT_FOR_JOIN
  126. * LEAVING_SP_WAIT_FOR_JOIN node_sync_ready A6 JOINING_STARTING_APPLS
  127. * GRACEFULLY_FAILING_OVER sg_failed_over [C1] A12 GRACEFULLY_REBOOTING
  128. * GRACEFULLY_REBOOTING node_leave ESCALATION_LEVEL_X
  129. *
  130. * 1.2 State Description
  131. * =====================
  132. * ESCALATION_LEVEL_X - Node is synchronized and idle (X = 0,2 or 3).
  133. * JOINING_STARTING_APPLS - JOINING_STARTING_APPLICATIONS
  134. * Node has ordered all applications to start its SUs
  135. * hosted on current node and is now waiting for them
  136. * to acknowledge that they have started.
  137. * GRACEFULLY_FAILING_OVER - FAILING_GRACEFULLY_FAILING_OVER
  138. * Node has ordered all SGs in the cluster to
  139. * failover all SUs that are hosted on a specific
  140. * node and waits for the SGs to confirm the
  141. * failover is completed.
  142. * GRACEFULLY_REBOOTING - FAILING_GRACEFULLY_REBOOTING_NODE
  143. * Node has ordered reboot and waits for the rebooted
  144. * node to join the cluster again.
  145. * JOINING_ASSIGNING_WL - JOINING_ASSIGNING_WORKLOAD
  146. * Node has ordered all applications to assign workload
  147. * to all its SUs which currently have no workload and
  148. * is now waiting for the applications to acknowledge.
  149. *
  150. * LEAVING_SP_FAILING_OVER - LEAVING_SPONTANEOUSLY_FAILING_OVER
  151. * Node has received an event telling that this node
  152. * has left the cluster and has ordered all service
  153. * groups to failover those of its SUs that were
  154. * hosted on current node.
  155. *
  156. * LEAVING_SP_WAIT_FOR_JOIN - LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN
  157. * Node is waiting for current node to join again.
  158. *
  159. * 1.3 Actions
  160. * ===========
  161. * A1 -
  162. * A2 -
  163. * A3 -
  164. * A4 -
  165. * A5 -
  166. * A6 - [foreach application in cluster]start application
  167. * A7 - [foreach application in cluster]assign workload to application
  168. * A8 - [foreach application in cluster]
  169. * [foreach SG in application ]failover node
  170. * A9 - [foreach application in cluster]
  171. * [foreach SG in application ]
  172. * [foreach SU in SG where the SU is hosted on current node]
  173. * [foreach comp in such an SU]indicate that the node has left the cluster
  174. * A10-
  175. * A11- [foreach SG in cluster]failover node
  176. * A12- reboot node
  177. * A13- restart SU
  178. * A14- failover SU
  179. * A15- failover node
  180. *
  181. * 1.4 Guards
  182. * ==========
  183. * C1 - All SG availability control state machines (ACSM) == IDLE
  184. * C2 -
  185. * C3 -
  186. * C4 - No applications are in ACSM state == STARTING_SGS
  187. * C5 - All applications have ACSM state == WORKLOAD_ASSIGNED
  188. * C6 - Specified number of SU restarts have been done.
  189. * C7 - Specified number of SU failover actions have been done.
  190. */
  191. #include <stdlib.h>
  192. #include <assert.h>
  193. #include <unistd.h>
  194. #include "amf.h"
  195. #include "util.h"
  196. #include "print.h"
  197. #include "main.h"
  198. /******************************************************************************
  199. * Internal (static) utility functions
  200. *****************************************************************************/
  201. static void node_acsm_enter_leaving_spontaneously(struct amf_node *node)
  202. {
  203. ENTER("'%s'", node->name.value);
  204. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
  205. node->nodeid = 0;
  206. }
  207. static void node_acsm_enter_failing_over (struct amf_node *node)
  208. {
  209. struct amf_application *app;
  210. struct amf_sg *sg;
  211. struct amf_su *su;
  212. struct amf_comp *component = NULL;
  213. ENTER("'%s'", node->name.value);
  214. node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER;
  215. /*
  216. * Indicate to each component object in the model that current
  217. * node has left the cluster
  218. */
  219. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  220. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  221. for (su = sg->su_head; su != NULL; su = su->next) {
  222. if (name_match(&node->name, &su->saAmfSUHostedByNode)) {
  223. for (component = su->comp_head; component != NULL;
  224. component = component->next) {
  225. amf_comp_node_left(component);
  226. }
  227. }
  228. }
  229. }
  230. }
  231. /*
  232. * Let all service groups with service units hosted on current node failover
  233. * its workload
  234. */
  235. for (app = amf_cluster->application_head; app != NULL; app =
  236. app->next) {
  237. for (sg = app->sg_head; sg != NULL; sg =
  238. sg->next) {
  239. amf_sg_failover_node_req(sg, node);
  240. }
  241. }
  242. }
  243. static void failover_all_sg_on_node (amf_node_t *node)
  244. {
  245. amf_application_t *app;
  246. amf_sg_t *sg;
  247. amf_su_t *su;
  248. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  249. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  250. for (su = sg->su_head; su != NULL; su = su->next) {
  251. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  252. amf_sg_failover_node_req (sg, node);
  253. break;
  254. }
  255. }
  256. }
  257. }
  258. }
  259. static void node_acsm_enter_failing_gracefully_failing_over (amf_node_t *node)
  260. {
  261. ENTER("");
  262. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER;
  263. failover_all_sg_on_node (node);
  264. }
  265. static int has_all_sg_on_node_failed_over (amf_node_t *node)
  266. {
  267. amf_application_t *app;
  268. amf_sg_t *sg;
  269. amf_su_t *su;
  270. int has_all_sg_on_node_failed_over = 1;
  271. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  272. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  273. for (su = sg->su_head; su != NULL; su = su->next) {
  274. if (name_match(&su->saAmfSUHostedByNode, &node->name)) {
  275. if (sg->avail_state != SG_AC_Idle) {
  276. TRACE1("%s %s",sg->name.value, su->name.value);
  277. has_all_sg_on_node_failed_over = 0;
  278. goto out;
  279. }
  280. break;
  281. }
  282. }
  283. }
  284. }
  285. out:
  286. return has_all_sg_on_node_failed_over;
  287. }
  288. static void repair_node (amf_node_t *node)
  289. {
  290. ENTER("");
  291. char hostname[256];
  292. gethostname (hostname, 256);
  293. if (!strcmp (hostname, (const char*)node->saAmfNodeClmNode.value)) {
  294. /* TODO if(saAmfAutoRepair == SA_TRUE) */
  295. #ifdef DEBUG
  296. exit (0);
  297. #else
  298. system ("reboot");
  299. #endif
  300. }
  301. }
  302. static void enter_failing_gracefully_rebooting_node (amf_node_t *node)
  303. {
  304. ENTER("");
  305. node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE;
  306. repair_node (node);
  307. }
  308. static void node_acsm_enter_idle (amf_node_t *node)
  309. {
  310. ENTER ("history_state=%d",node->history_state);
  311. node->acsm_state = node->history_state;
  312. }
  313. static void node_acsm_enter_joining_assigning_workload (struct amf_node *node,
  314. struct amf_application *app)
  315. {
  316. log_printf(LOG_NOTICE,
  317. "Node=%s: all applications started, assigning workload.",
  318. node->name.value);
  319. ENTER("");
  320. node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
  321. for (app = app->cluster->application_head; app != NULL;
  322. app = app->next) {
  323. amf_application_assign_workload (app, node);
  324. }
  325. }
  326. /******************************************************************************
  327. * Event methods
  328. *****************************************************************************/
  329. /**
  330. * This event indicates that a node has unexpectedly left the cluster. Node
  331. * leave event is obtained from amf_confchg_fn.
  332. *
  333. * @param node
  334. */
  335. void amf_node_leave (struct amf_node *node)
  336. {
  337. assert (node != NULL);
  338. ENTER("'%s', CLM node '%s'", node->name.value,
  339. node->saAmfNodeClmNode.value);
  340. switch (node->acsm_state) {
  341. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  342. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  343. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  344. node_acsm_enter_leaving_spontaneously(node);
  345. node_acsm_enter_failing_over (node);
  346. break;
  347. case NODE_ACSM_REPAIR_NEEDED:
  348. break;
  349. case NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE:
  350. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  351. node_acsm_enter_idle (node);
  352. break;
  353. default:
  354. log_printf (LOG_LEVEL_ERROR, "amf_node_leave called in state = %d"
  355. " (should have been deferred)", node->acsm_state);
  356. openais_exit_error (AIS_DONE_FATAL_ERR);
  357. break;
  358. }
  359. }
  360. /**
  361. * This function handles a detected error that by a pre-analysis executed
  362. * elsewhere has been decided to be recovered by a node fail over.
  363. * @param node
  364. */
  365. void amf_node_failover (struct amf_node *node)
  366. {
  367. assert (node != NULL);
  368. ENTER("'%s', CLM node '%s'", node->name.value,
  369. node->saAmfNodeClmNode.value);
  370. switch (node->acsm_state) {
  371. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  372. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  373. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  374. node_acsm_enter_failing_gracefully_failing_over (node);
  375. break;
  376. case NODE_ACSM_REPAIR_NEEDED:
  377. break;
  378. default:
  379. log_printf (LOG_LEVEL_ERROR, "amf_node_leave()called in state = %d"
  380. " (should have been deferred)", node->acsm_state);
  381. openais_exit_error (AIS_DONE_FATAL_ERR);
  382. break;
  383. }
  384. }
  /**
   * Node switchover event. Currently a stub that performs no action.
   *
   * @param node node to switch over (unused)
   */
  void amf_node_switchover (struct amf_node *node)
  {
      (void) node;    /* not implemented */
  }
  /**
   * Node failfast event. Currently a stub that performs no action.
   *
   * @param node node to fail fast (unused)
   */
  void amf_node_failfast (struct amf_node *node)
  {
      (void) node;    /* not implemented */
  }
  399. /**
  400. * This event is a request to restart a component which has been escalated,
  401. * because the component has already been restarted the number of times
  402. * specified by the configuration.
  403. * This function evaluates which recovery measure shall now be
  404. * taken and initiates the action which result from the evaluation.
  405. * @param node
  406. * @param comp
  407. */
  408. void amf_node_comp_restart_req (struct amf_node *node, struct amf_comp *comp)
  409. {
  410. amf_su_t *su = comp->su;
  411. ENTER("");
  412. switch (node->acsm_state) {
  413. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  414. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_2;
  415. amf_node_comp_restart_req (node, comp);
  416. break;
  417. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  418. if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
  419. SaNameT dn;
  420. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_3;
  421. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  422. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  423. amf_comp_dn_make (comp, &dn);
  424. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  425. "action:\n\t\tSU failover", dn.value);
  426. amf_sg_failover_su_req (su->sg, su, node);
  427. } else {
  428. amf_su_restart (su);
  429. }
  430. break;
  431. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  432. if (su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  433. SaNameT dn;
  434. amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
  435. amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
  436. amf_comp_dn_make (comp, &dn);
  437. log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
  438. "action:\n\t\tSU failover", dn.value);
  439. amf_sg_failover_su_req (su->sg, su, node);
  440. return;
  441. } else {
  442. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  443. amf_node_failover (node);
  444. }
  445. break;
  446. default:
  447. dprintf("%d",node->acsm_state);
  448. assert (0);
  449. break;
  450. }
  451. }
  452. /**
  453. * This event is a request to failover the specified component.
  454. * This function evaluates which recovery measure shall actually be
  455. * taken considering the escalation policy and initiates the action
  456. * which result from the evaluation.
  457. * @param node
  458. * @param comp
  459. */
  460. void amf_node_comp_failover_req (amf_node_t *node, amf_comp_t *comp)
  461. {
  462. ENTER("");
  463. switch (node->acsm_state) {
  464. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  465. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  466. if (comp->su->saAmfSUFailover) {
  467. /* SU failover */
  468. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  469. }
  470. break;
  471. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  472. if (comp->su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
  473. if (comp->su->saAmfSUFailover) {
  474. /* SU failover */
  475. amf_sg_failover_su_req (comp->su->sg,comp->su, node);
  476. }
  477. } else {
  478. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  479. amf_node_failover (node);
  480. }
  481. break;
  482. default:
  483. dprintf("%d",node->acsm_state);
  484. assert (0);
  485. break;
  486. }
  487. }
  488. /**
  489. * This event indicates that current node has joined and its cluster model has
  490. * been synchronized with the other nodes cluster models.
  491. *
  492. * @param node
  493. */
  494. void amf_node_sync_ready (struct amf_node *node)
  495. {
  496. struct amf_application *app;
  497. assert (node != NULL);
  498. log_printf(LOG_NOTICE, "Node=%s: sync ready, starting hosted SUs.",
  499. node->name.value);
  500. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  501. switch (node->acsm_state) {
  502. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
  503. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
  504. case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
  505. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  506. node->acsm_state = NODE_ACSM_JOINING_STARTING_APPLICATIONS;
  507. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  508. amf_application_start (app, node);
  509. }
  510. break;
  511. case NODE_ACSM_REPAIR_NEEDED:
  512. break;
  513. default:
  514. log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready() was called in "
  515. "state = %d (should have been deferred)",
  516. node->acsm_state);
  517. openais_exit_error (AIS_DONE_FATAL_ERR);
  518. break;
  519. }
  520. }
  521. /******************************************************************************
  522. * Event response methods
  523. *****************************************************************************/
  524. /**
  525. * This event indicates that an application has started. Started in this context
  526. * means that none of its contained service units is in an -ING state with other
  527. * words successfully instantiated, instantiation has failed or instantiation
  528. * was not possible (due to the node on which the SU was to be hosted is not
  529. * operational).
  530. *
  531. * @param node
  532. * @param application which has been started
  533. */
  534. void amf_node_application_started (struct amf_node *node,
  535. struct amf_application *app)
  536. {
  537. assert (node != NULL && app != NULL );
  538. ENTER ("Node=%s: application '%s' started", node->name.value,
  539. app->name.value);
  540. switch (node->acsm_state) {
  541. case NODE_ACSM_JOINING_STARTING_APPLICATIONS:
  542. if (amf_cluster_applications_started_with_no_starting_sgs(
  543. app->cluster)) {
  544. node_acsm_enter_joining_assigning_workload(node, app);
  545. }
  546. break;
  547. default:
  548. log_printf (LOG_LEVEL_ERROR, "amf_node_application_started()"
  549. "called in state = %d (unexpected !!)", node->acsm_state);
  550. openais_exit_error (AIS_DONE_FATAL_ERR);
  551. break;
  552. }
  553. }
  554. /**
  555. * This event indicates that an application has been assigned workload.
  556. *
  557. * @param node
  558. * @param app - Application which has been assigned workload
  559. */
  560. void amf_node_application_workload_assigned (struct amf_node *node,
  561. struct amf_application *app)
  562. {
  563. assert (node != NULL && app != NULL );
  564. ENTER ("Node=%s: application '%s' started", node->name.value,
  565. app->name.value);
  566. switch (node->acsm_state) {
  567. case NODE_ACSM_JOINING_ASSIGNING_WORKLOAD:
  568. if (amf_cluster_applications_assigned (amf_cluster)) {
  569. log_printf(LOG_NOTICE, "Node=%s: all workload assigned",
  570. node->name.value);
  571. node_acsm_enter_idle (node);
  572. }
  573. break;
  574. default:
  575. log_printf (LOG_LEVEL_ERROR, "amf_node_application_workload_assigned()"
  576. "called in state = %d (unexpected !!)", node->acsm_state);
  577. openais_exit_error (AIS_DONE_FATAL_ERR);
  578. break;
  579. }
  580. }
  581. /**
  582. * This event indicates that an SG has failed over its workload after a node
  583. * failure.
  584. *
  585. * @param node
  586. * @param sg_in SG which is now ready with its failover
  587. */
  588. void amf_node_sg_failed_over (struct amf_node *node, struct amf_sg *sg_in)
  589. {
  590. assert (node != NULL);
  591. ENTER ("Node=%s: SG '%s' started %d", node->name.value,
  592. sg_in->name.value,node->acsm_state);
  593. switch (node->acsm_state) {
  594. case NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER:
  595. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  596. node->acsm_state =
  597. NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
  598. }
  599. break;
  600. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  601. /* Accept reports of failed over sg that has completed. */
  602. break;
  603. case NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER:
  604. if (has_all_sg_on_node_failed_over (node)) { /*C2*/
  605. enter_failing_gracefully_rebooting_node (node);
  606. }
  607. break;
  608. default:
  609. log_printf (LOG_LEVEL_ERROR, "amf_node_sg_failed_over()"
  610. "called in state = %d (unexpected !!)", node->acsm_state);
  611. openais_exit_error (AIS_DONE_FATAL_ERR);
  612. break;
  613. }
  614. }
  615. /******************************************************************************
  616. * General methods
  617. *****************************************************************************/
  /**
   * Module initialisation for the node class: initialises logging under
   * the tag "AMF".
   */
  void amf_node_init (void)
  {
      log_init ("AMF");
  }
  622. /**
  623. * Node constructor
  624. * @param cluster
  625. * @param name - RDN of node
  626. */
  627. struct amf_node *amf_node_new (struct amf_cluster *cluster, char *name) {
  628. struct amf_node *node = amf_calloc (1, sizeof (struct amf_node));
  629. setSaNameT (&node->name, name);
  630. node->saAmfNodeAdminState = SA_AMF_ADMIN_UNLOCKED;
  631. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  632. node->saAmfNodeAutoRepair = SA_TRUE;
  633. node->saAmfNodeSuFailOverProb = -1;
  634. node->saAmfNodeSuFailoverMax = ~0;
  635. node->cluster = cluster;
  636. node->next = cluster->node_head;
  637. cluster->node_head = node;
  638. node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  639. node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
  640. return node;
  641. }
  642. void *amf_node_serialize (struct amf_node *node, int *len)
  643. {
  644. char *buf = NULL;
  645. int offset = 0, size = 0;
  646. TRACE8 ("%s", node->name.value);
  647. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->name);
  648. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->saAmfNodeClmNode);
  649. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  650. node->saAmfNodeSuFailOverProb);
  651. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  652. node->saAmfNodeSuFailoverMax);
  653. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  654. node->saAmfNodeAutoRepair);
  655. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  656. node->saAmfNodeRebootOnInstantiationFailure);
  657. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  658. node->saAmfNodeRebootOnTerminationFailure);
  659. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  660. node->saAmfNodeAdminState);
  661. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  662. node->saAmfNodeOperState);
  663. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  664. node->nodeid);
  665. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  666. node->acsm_state);
  667. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  668. node->history_state);
  669. *len = offset;
  670. return buf;
  671. }
  672. struct amf_node *amf_node_deserialize (struct amf_cluster *cluster, char *buf) {
  673. char *tmp = buf;
  674. struct amf_node *node = amf_node_new (cluster, "");
  675. tmp = amf_deserialize_SaNameT (tmp, &node->name);
  676. tmp = amf_deserialize_SaNameT (tmp, &node->saAmfNodeClmNode);
  677. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailOverProb);
  678. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailoverMax);
  679. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAutoRepair);
  680. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnInstantiationFailure);
  681. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnTerminationFailure);
  682. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAdminState);
  683. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeOperState);
  684. tmp = amf_deserialize_SaUint32T (tmp, &node->nodeid);
  685. tmp = amf_deserialize_SaUint32T (tmp, &node->acsm_state);
  686. tmp = amf_deserialize_SaUint32T (tmp, &node->history_state);
  687. return node;
  688. }
  689. struct amf_node *amf_node_find (SaNameT *name) {
  690. struct amf_node *node;
  691. assert (name != NULL && amf_cluster != NULL);
  692. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  693. if (name_match (&node->name, name)) {
  694. return node;
  695. }
  696. }
  697. dprintf ("node %s not found in configuration!", name->value);
  698. return NULL;
  699. }
  700. struct amf_node *amf_node_find_by_nodeid (unsigned int nodeid) {
  701. struct amf_node *node;
  702. assert (amf_cluster != NULL);
  703. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  704. if (node->nodeid == nodeid) {
  705. return node;
  706. }
  707. }
  708. dprintf ("node %u not found in configuration!", nodeid);
  709. return NULL;
  710. }
  711. struct amf_node *amf_node_find_by_hostname (const char *hostname) {
  712. struct amf_node *node;
  713. assert (hostname != NULL && amf_cluster != NULL);
  714. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  715. if (strcmp ((char*)node->saAmfNodeClmNode.value, hostname) == 0) {
  716. return node;
  717. }
  718. }
  719. dprintf ("node %s not found in configuration!", hostname);
  720. return NULL;
  721. }