amfnode.c 19 KB


  1. /** @file amfnode.c
  2. *
  3. * Copyright (c) 2006 Ericsson AB.
  4. * Author: Hans Feldt, Anders Eriksson, Lars Holm
  5. * - Constructors/destructors
  6. * - Serializers/deserializers
  7. *
  8. * All rights reserved.
  9. *
  10. *
  11. * This software licensed under BSD license, the text of which follows:
  12. *
  13. * Redistribution and use in source and binary forms, with or without
  14. * modification, are permitted provided that the following conditions are met:
  15. *
  16. * - Redistributions of source code must retain the above copyright notice,
  17. * this list of conditions and the following disclaimer.
  18. * - Redistributions in binary form must reproduce the above copyright notice,
  19. * this list of conditions and the following disclaimer in the documentation
  20. * and/or other materials provided with the distribution.
  21. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  22. * contributors may be used to endorse or promote products derived from this
  23. * software without specific prior written permission.
  24. *
  25. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  26. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  29. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  30. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  31. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  32. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  33. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  34. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  35. * THE POSSIBILITY OF SUCH DAMAGE.
  36. *
  37. * AMF Node Class Implementation
  38. *
  39. * This file contains functions for handling AMF nodes. It can be
  40. * viewed as the implementation of the AMF Node class (called NODE)
  41. * as described in SAI-Overview-B.02.01. The SA Forum specification
  42. * SAI-AIS-AMF-B.02.01 has been used as specification of the behaviour
  43. * and is referred to as 'the spec' below.
  44. *
  45. * The functions in this file are responsible for:
  46. * - controlling the instantiation of the SUs hosted on current node and
  47. * controlling the assigning of workload to them when a node joins the
  48. * cluster (cluster start is controlled by the Cluster Class)
  49. * - controlling node level recovery and repair functions
  50. * - implementing error escalation level 2 and 3 (paragraphs 3.12.2.2 and
  51. * 3.12.2.3 in the spec)
  52. * - handling run time attributes of the AMF NODE; cached
  53. * attributes are stored as variables and sent to the IMM service (future)
  54. * upon the changes described in the specification
  55. *
  56. * The node class contains the following state machines:
  57. * - administrative state machine (ADSM)
  58. * - operational state machine (OPSM)
  59. * - availability control state machine (ACSM)
  60. *
  61. * The administrative state machine will be implemented in the future.
  62. *
  63. * The operational state machine is primarily used to report status of the
  64. * node.
  65. *
  66. * The availability control state machine is used for control purposes.
  67. * ACSM contains three states of which two are composite.
  68. * Being a composite state means that the state contains substates.
  69. * ACSM states are:
  70. * - REPAIR_NEEDED
  71. * - ESCALLATION_LEVEL (LEVEL_0, LEVEL_2 and LEVEL_3)
  72. * - MANAGING_HOSTED_SERVICE_UNITS (
  73. * . FAILING_FAST (REBOOTING_NODE and ACTIVATING_STANDBY_NODE)
  74. * . FAILING_GRACEFULLY (SWITCHING_OVER, FAILING_OVER and REBOOTING_NODE)
  75. * . LEAVING_SPONTANEOUSLY (FAILING_OVER and
  76. * WAITING_FOR_NODE_TO_JOIN)
  77. * . JOINING (STARTING_SERVICE_UNITS and ASSIGNING_STANDBY_WORKLOAD)
  78. *
  79. * REPAIR_NEEDED indicates the node needs a manual repair and this state will be
  80. * maintained until the administrative command REPAIRED is entered (implemented
  81. * in the future)
  82. *
  83. * ESCALLATION_LEVEL is a kind of idle state where no actions are performed;
  84. * it is used only to remember the escalation level. Substate LEVEL_0 indicates
  85. * no escalation. LEVEL_2 indicates that so many component restarts have been
  86. * executed recently that a new component restart request will escalate
  87. * to service unit restart action. Node will request a service unit restart
  88. * from SU.
  89. * LEVEL_3 will be entered if either too many service unit restarts have
  90. * been made or a component failover recovery action is requested. On level 3
  91. * the recovery action performed is service unit failover (paragraph 3.12.1.3).
  92. *
  93. * FAILING_FAST state executes a node re-boot and waits for the node to join
  94. * the cluster again.
  95. *
  96. * FAILING_GRACEFULLY state requests all SGs which have SUs hosted on current
  97. * node to switch over or fail over according to the procedures described in
  98. * paragraph 3.12.1.3 before the re-boot is executed. Confirmation is then
  99. * awaited from all concerned SGs and finally a node re-boot is executed as
  100. * the repair action (see paragraph 2.12.1.4).
  101. *
  102. * LEAVING_SPONTANEOUSLY state handles the spontaneous leave of a node.
  103. *
  104. * JOINING state handles the start of a node in all cases except cluster start,
  105. * which is handled by the CLUSTER class.
  106. *
  107. * 1. Node Availability Control State Machine
  108. * ==========================================
  109. *
  110. * 1.1 State Transition Table
  111. *
  112. * State: Event: Action: New state:
  113. * ============================================================================
  114. * ESCALATION_LEVEL_0 node_sync_ready A6 JOINING_STARTING_APPLS
  115. * ESCALATION_LEVEL_0 node_leave A9,A8 LEAVING_SP_FAILING_OVER
  116. * JOINING_STARTING_APPLS appl_started [C4] A7 JOINING_ASSIGNING_WL
  117. * JOINING_ASSIGNING_WL appl_assigned [C5] ESCALATION_LEVEL_0
  118. * LEAVING_SP_FAILING_OVER sg_failed_over [C1] LEAVING_SP_WAIT_FOR_JOIN
  119. * LEAVING_SP_WAIT_FOR_JOIN node_sync_ready A6 JOINING_STARTING_APPLS
  120. *
  121. * 1.2 State Description
  122. * =====================
  123. * ESCALATION_LEVEL_0 - Node is synchronized and idle.
  124. * JOINING_STARTING_APPLS - JOINING_STARTING_APPLICATIONS
  125. * Node has ordered all applications to start its SUs
  126. * hosted on current node and is now waiting for them
  127. * to acknowledge that they have started.
  128. *
  129. * JOINING_ASSIGNING_WL - JOINING_ASSIGNING_WORKLOAD
  130. * Node has ordered all applications to assign workload
  131. * to all its SUs which currently have no workload and
  132. * is now waiting for the applications to acknowledge.
  133. *
  134. * LEAVING_SP_FAILING_OVER - LEAVING_SPONTANEOUSLY_FAILING_OVER
  135. * Node has received an event telling that this node
  136. * has left the cluster and has ordered all service
  137. * groups to failover those of its SUs that were
  138. * hosted on current node.
  139. *
  140. * LEAVING_SP_WAIT_FOR_JOIN - LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN
  141. * Node is waiting for current node to join again.
  142. *
  143. * 1.3 Actions
  144. * ===========
  145. * A1 -
  146. * A2 -
  147. * A3 -
  148. * A4 -
  149. * A5 -
  150. * A6 - [foreach application in cluster]start application
  151. * A7 - [foreach application in cluster]assign workload to application
  152. * A8 - [foreach application in cluster]
  153. * [foreach SG in application ]failover node
  154. * A9 - [foreach application in cluster]
  155. * [foreach SG in application ]
  156. * [foreach SU in SG where the SU is hosted on current node]
  157. * [foreach comp in such an SU]indicate that the node has left the cluster
  158. *
  159. * 1.4 Guards
  160. * ==========
  161. * C1 - All SG availability control state machines (ACSM) == IDLE
  162. * C2 -
  163. * C3 -
  164. * C4 - No applications are in ACSM state == STARTING_SGS
  165. * C5 - All applications have ACSM state == WORKLOAD_ASSIGNED
  166. */
  167. #include <stdlib.h>
  168. #include <assert.h>
  169. #include "amf.h"
  170. #include "util.h"
  171. #include "print.h"
  172. #include "main.h"
  173. /******************************************************************************
  174. * Internal (static) utility functions
  175. *****************************************************************************/
  176. static void node_acsm_enter_leaving_spontaneously(struct amf_node *node)
  177. {
  178. ENTER("'%s'", node->name.value);
  179. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_DISABLED;
  180. node->nodeid = 0;
  181. }
  182. static void node_acsm_enter_failing_over (struct amf_node *node)
  183. {
  184. struct amf_application *app;
  185. struct amf_sg *sg;
  186. struct amf_su *su;
  187. struct amf_comp *component = NULL;
  188. ENTER("'%s'", node->name.value);
  189. node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER;
  190. /*
  191. * Indicate to each component object in the model that current
  192. * node has left the cluster
  193. */
  194. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  195. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  196. for (su = sg->su_head; su != NULL; su = su->next) {
  197. if (name_match(&node->name, &su->saAmfSUHostedByNode)) {
  198. for (component = su->comp_head; component != NULL;
  199. component = component->next) {
  200. amf_comp_node_left(component);
  201. }
  202. }
  203. }
  204. }
  205. }
  206. /*
  207. * Let all service groups with service units hosted on current node failover
  208. * its workload
  209. */
  210. for (app = amf_cluster->application_head; app != NULL; app =
  211. app->next) {
  212. for (sg = app->sg_head; sg != NULL; sg =
  213. sg->next) {
  214. amf_sg_failover_node_req(sg, node);
  215. }
  216. }
  217. }
  218. /******************************************************************************
  219. * Event methods
  220. *****************************************************************************/
  221. /**
  222. * This event indicates that a node has unexpectedly left the cluster. Node
  223. * leave event is obtained from amf_confchg_fn.
  224. *
  225. * @param node
  226. */
  227. void amf_node_leave (struct amf_node *node)
  228. {
  229. assert (node != NULL);
  230. ENTER("'%s', CLM node '%s'", node->name.value,
  231. node->saAmfNodeClmNode.value);
  232. switch (node->acsm_state) {
  233. case NODE_ACSM_ESCALLATION_LEVEL_0:
  234. case NODE_ACSM_ESCALLATION_LEVEL_2:
  235. case NODE_ACSM_ESCALLATION_LEVEL_3:
  236. node_acsm_enter_leaving_spontaneously(node);
  237. node_acsm_enter_failing_over (node);
  238. break;
  239. case NODE_ACSM_REPAIR_NEEDED:
  240. break;
  241. default:
  242. log_printf (LOG_LEVEL_ERROR, "amf_node_leave()called in state = %d"
  243. " (should have been deferred)", node->acsm_state);
  244. openais_exit_error (AIS_DONE_FATAL_ERR);
  245. break;
  246. }
  247. }
  /**
   * Placeholder for the node failover event; no behaviour is implemented,
   * the function returns immediately.
   *
   * @param node node the event concerns (currently unused)
   */
  void amf_node_failover (struct amf_node *node)
  {
      (void)node; /* not implemented yet */
  }
  /**
   * Placeholder for the node switchover event; no behaviour is implemented,
   * the function returns immediately.
   *
   * @param node node the event concerns (currently unused)
   */
  void amf_node_switchover (struct amf_node *node)
  {
      (void)node; /* not implemented yet */
  }
  /**
   * Placeholder for the node failfast event; no behaviour is implemented,
   * the function returns immediately.
   *
   * @param node node the event concerns (currently unused)
   */
  void amf_node_failfast (struct amf_node *node)
  {
      (void)node; /* not implemented yet */
  }
  /**
   * Placeholder for a component restart request directed at the node; no
   * behaviour is implemented, the function returns immediately.
   *
   * @param node node the request concerns (currently unused)
   * @param comp component the request concerns (currently unused)
   */
  void amf_node_comp_restart_req (
      struct amf_node *node, struct amf_comp *comp)
  {
      /* not implemented yet */
      (void)node;
      (void)comp;
  }
  /**
   * Placeholder for a component failover request directed at the node; no
   * behaviour is implemented, the function returns immediately.
   *
   * @param node node the request concerns (currently unused)
   * @param comp component the request concerns (currently unused)
   */
  void amf_node_comp_failover_req (
      struct amf_node *node, struct amf_comp *comp)
  {
      /* not implemented yet */
      (void)node;
      (void)comp;
  }
  287. /**
  288. * This event indicates that current node has joined and its cluster model has
  289. * been synchronized with the other nodes cluster models.
  290. *
  291. * @param node
  292. */
  293. void amf_node_sync_ready (struct amf_node *node)
  294. {
  295. struct amf_application *app;
  296. assert (node != NULL);
  297. log_printf(LOG_NOTICE, "Node=%s: sync ready, starting hosted SUs.",
  298. node->name.value);
  299. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  300. switch (node->acsm_state) {
  301. case NODE_ACSM_ESCALLATION_LEVEL_0:
  302. case NODE_ACSM_ESCALLATION_LEVEL_2:
  303. case NODE_ACSM_ESCALLATION_LEVEL_3:
  304. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  305. node->acsm_state = NODE_ACSM_JOINING_STARTING_APPLICATIONS;
  306. for (app = amf_cluster->application_head; app != NULL; app = app->next) {
  307. amf_application_start (app, node);
  308. }
  309. break;
  310. case NODE_ACSM_REPAIR_NEEDED:
  311. break;
  312. default:
  313. log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready() was called in "
  314. "state = %d (should have been deferred)",
  315. node->acsm_state);
  316. openais_exit_error (AIS_DONE_FATAL_ERR);
  317. break;
  318. }
  319. }
  320. /******************************************************************************
  321. * Event response methods
  322. *****************************************************************************/
  323. /**
  324. * This event indicates that an application has started. Started in this context
  325. * means that none of its contained service units is in an -ING state with other
  326. * words successfully instantiated, instantiation has failed or instantiation
  327. * was not possible (due to the node on which the SU was to be hosted is not
  328. * operational).
  329. *
  330. * @param node
  331. * @param application which has been started
  332. */
  333. void amf_node_application_started (struct amf_node *node,
  334. struct amf_application *app)
  335. {
  336. assert (node != NULL && app != NULL );
  337. ENTER ("Node=%s: application '%s' started", node->name.value,
  338. app->name.value);
  339. switch (node->acsm_state) {
  340. case NODE_ACSM_JOINING_STARTING_APPLICATIONS:
  341. if (amf_cluster_applications_started_with_no_starting_sgs(
  342. app->cluster)) {
  343. log_printf(LOG_NOTICE,
  344. "Node=%s: all applications started, assigning workload.",
  345. node->name.value);
  346. node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
  347. for (app = app->cluster->application_head; app != NULL;
  348. app = app->next) {
  349. amf_application_assign_workload (app, node);
  350. }
  351. }
  352. break;
  353. default:
  354. log_printf (LOG_LEVEL_ERROR, "amf_node_application_started()"
  355. "called in state = %d (unexpected !!)", node->acsm_state);
  356. openais_exit_error (AIS_DONE_FATAL_ERR);
  357. break;
  358. }
  359. }
  360. /**
  361. * This event indicates that an application has been assigned workload.
  362. *
  363. * @param node
  364. * @param application which has been assigned workload
  365. */
  366. void amf_node_application_workload_assigned (struct amf_node *node,
  367. struct amf_application *app)
  368. {
  369. assert (node != NULL && app != NULL );
  370. ENTER ("Node=%s: application '%s' started", node->name.value,
  371. app->name.value);
  372. switch (node->acsm_state) {
  373. case NODE_ACSM_JOINING_ASSIGNING_WORKLOAD:
  374. if (amf_cluster_applications_assigned (amf_cluster)) {
  375. log_printf(LOG_NOTICE, "Node=%s: all workload assigned",
  376. node->name.value);
  377. /*
  378. * TODO: new state should be set via history
  379. */
  380. node->acsm_state = NODE_ACSM_ESCALLATION_LEVEL_0;
  381. }
  382. break;
  383. default:
  384. log_printf (LOG_LEVEL_ERROR, "amf_node_application_workload_assigned()"
  385. "called in state = %d (unexpected !!)", node->acsm_state);
  386. openais_exit_error (AIS_DONE_FATAL_ERR);
  387. break;
  388. }
  389. }
  390. /**
  391. * This event indicates that an SG has failed over its workload after a node
  392. * failure.
  393. *
  394. * @param node
  395. * @param sg_in SG which is now ready with its failover
  396. */
  397. void amf_node_sg_failed_over (struct amf_node *node, struct amf_sg *sg_in)
  398. {
  399. struct amf_sg *sg;
  400. struct amf_application *app = 0;
  401. int all_sg_has_failed_over = 1;
  402. assert (node != NULL);
  403. ENTER ("Node=%s: SG '%s' started", node->name.value,
  404. sg_in->name.value);
  405. switch (node->acsm_state) {
  406. case NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER:
  407. for (app = amf_cluster->application_head; app != NULL;
  408. app = app->next) {
  409. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  410. if (sg->avail_state != SG_AC_Idle) {
  411. all_sg_has_failed_over = 0;
  412. goto end;
  413. }
  414. }
  415. }
  416. break;
  417. case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
  418. /* Accept reports of failed over sg that has completed. */
  419. break;
  420. default:
  421. log_printf (LOG_LEVEL_ERROR, "amf_node_sg_failed_over()"
  422. "called in state = %d (unexpected !!)", node->acsm_state);
  423. openais_exit_error (AIS_DONE_FATAL_ERR);
  424. break;
  425. }
  426. end:
  427. if (all_sg_has_failed_over) {
  428. node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
  429. }
  430. }
  431. /******************************************************************************
  432. * General methods
  433. *****************************************************************************/
  /**
   * One-time initialisation of the node class: calls log_init with the
   * tag "AMF".
   */
  void amf_node_init (void)
  {
      log_init ("AMF");
  }
  438. /**
  439. * Node constructor
  440. * @param loc
  441. * @param cluster
  442. * @param node
  443. */
  444. struct amf_node *amf_node_new (struct amf_cluster *cluster, char *name) {
  445. struct amf_node *node = amf_calloc (1, sizeof (struct amf_node));
  446. setSaNameT (&node->name, name);
  447. node->saAmfNodeAdminState = SA_AMF_ADMIN_UNLOCKED;
  448. node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
  449. node->saAmfNodeAutoRepair = SA_TRUE;
  450. node->saAmfNodeSuFailOverProb = -1;
  451. node->saAmfNodeSuFailoverMax = ~0;
  452. node->cluster = cluster;
  453. node->next = cluster->node_head;
  454. cluster->node_head = node;
  455. node->acsm_state = NODE_ACSM_ESCALLATION_LEVEL_0;
  456. return node;
  457. }
  458. void *amf_node_serialize (struct amf_node *node, int *len)
  459. {
  460. char *buf = NULL;
  461. int offset = 0, size = 0;
  462. TRACE8 ("%s", node->name.value);
  463. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->name);
  464. buf = amf_serialize_SaNameT (buf, &size, &offset, &node->saAmfNodeClmNode);
  465. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  466. node->saAmfNodeSuFailOverProb);
  467. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  468. node->saAmfNodeSuFailoverMax);
  469. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  470. node->saAmfNodeAutoRepair);
  471. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  472. node->saAmfNodeRebootOnInstantiationFailure);
  473. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  474. node->saAmfNodeRebootOnTerminationFailure);
  475. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  476. node->saAmfNodeAdminState);
  477. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  478. node->saAmfNodeOperState);
  479. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  480. node->nodeid);
  481. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  482. node->acsm_state);
  483. *len = offset;
  484. return buf;
  485. }
  486. struct amf_node *amf_node_deserialize (struct amf_cluster *cluster, char *buf) {
  487. char *tmp = buf;
  488. struct amf_node *node = amf_node_new (cluster, "");
  489. tmp = amf_deserialize_SaNameT (tmp, &node->name);
  490. tmp = amf_deserialize_SaNameT (tmp, &node->saAmfNodeClmNode);
  491. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailOverProb);
  492. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeSuFailoverMax);
  493. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAutoRepair);
  494. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnInstantiationFailure);
  495. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeRebootOnTerminationFailure);
  496. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeAdminState);
  497. tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeOperState);
  498. tmp = amf_deserialize_SaUint32T (tmp, &node->nodeid);
  499. tmp = amf_deserialize_SaUint32T (tmp, &node->acsm_state);
  500. return node;
  501. }
  502. struct amf_node *amf_node_find (SaNameT *name) {
  503. struct amf_node *node;
  504. assert (name != NULL && amf_cluster != NULL);
  505. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  506. if (name_match (&node->name, name)) {
  507. return node;
  508. }
  509. }
  510. dprintf ("node %s not found in configuration!", name->value);
  511. return NULL;
  512. }
  513. struct amf_node *amf_node_find_by_nodeid (unsigned int nodeid) {
  514. struct amf_node *node;
  515. assert (amf_cluster != NULL);
  516. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  517. if (node->nodeid == nodeid) {
  518. return node;
  519. }
  520. }
  521. dprintf ("node %u not found in configuration!", nodeid);
  522. return NULL;
  523. }
  524. struct amf_node *amf_node_find_by_hostname (const char *hostname) {
  525. struct amf_node *node;
  526. assert (hostname != NULL && amf_cluster != NULL);
  527. for (node = amf_cluster->node_head; node != NULL; node = node->next) {
  528. if (strcmp ((char*)node->saAmfNodeClmNode.value, hostname) == 0) {
  529. return node;
  530. }
  531. }
  532. dprintf ("node %s not found in configuration!", hostname);
  533. return NULL;
  534. }