amfcluster.c 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. /** @file amfcluster.c
  2. *
  3. * Copyright (c) 2006 Ericsson AB.
  4. * Author: Hans Feldt, Anders Eriksson, Lars Holm
  5. * - Refactoring of code into several AMF files
  6. * - Constructors/destructors
  7. * - Serializers/deserializers
  8. *
  9. * All rights reserved.
  10. *
  11. *
  12. * This software licensed under BSD license, the text of which follows:
  13. *
  14. * Redistribution and use in source and binary forms, with or without
  15. * modification, are permitted provided that the following conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright notice,
  18. * this list of conditions and the following disclaimer.
  19. * - Redistributions in binary form must reproduce the above copyright notice,
  20. * this list of conditions and the following disclaimer in the documentation
  21. * and/or other materials provided with the distribution.
  22. * - Neither the name of the MontaVista Software, Inc. nor the names of its
  23. * contributors may be used to endorse or promote products derived from this
  24. * software without specific prior written permission.
  25. *
  26. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  27. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  28. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  29. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  30. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  31. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  32. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  33. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  34. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  35. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  36. * THE POSSIBILITY OF SUCH DAMAGE.
  37. *
  38. * AMF Cluster Class Implementation
  39. *
  40. * This file contains functions for handling the AMF cluster. It can be
  41. * viewed as the implementation of the AMF Cluster class
  42. * as described in SAI-Overview-B.02.01. The SA Forum specification
  43. * SAI-AIS-AMF-B.02.01 has been used as specification of the behaviour
  44. * and is referred to as 'the spec' below.
  45. *
  46. * The functions in this file are responsible for:
  47. * - to start the cluster initially
  48. * - to handle the administrative operation support for the cluster (FUTURE)
  49. *
  50. * The cluster class contains the following state machines:
  51. * - administrative state machine (ADSM)
  52. * - availability control state machine (ACSM)
  53. *
  54. * The administrative state machine will be implemented in the future.
  55. *
  56. * ACSM handles initial start of the cluster. In the future it will also handle
  57. * administrative commands on the cluster as described in paragraph 7.4 of the
  58. * spec. ACSM includes two stable states (UNINSTANTIATED and STARTED) and a
  59. * number of states to control the transition between the stable states.
  60. *
  61. * The cluster is in state UNINSTANTIATED when the cluster starts. (In the
  62. * future this state will also be assumed after the LOCK_INSTANTIATION
  63. * administrative command.)
  64. *
 * State STARTED is assumed when the cluster has been initially started and
 * will in the future be re-assumed after the administrative command RESTART
 * has been executed.
  68. *
  69. * 1. Cluster Availability Control State Machine
  70. * =============================================
  71. *
  72. * 1.1 State Transition Table
  73. *
  74. * State: Event: Action: New state:
  75. * ===========================================================================
  76. * UNINSTANTIATED sync_ready [C1] A2,A1 STARTING_APPS
  77. * STARTING_APPS sync_ready A2,A1 STARTING_APPS
  78. * STARTING_APPS app_started [C3] A7,A3 ASSIGNING_WORKLOAD
  79. * STARTING_APPS local_timer_expired A8 STARTING_APPS
  80. * STARTING_APPS time_out A7,A8 WAITING_OVERTIME_1
  81. * WAITING_OVERTIME_1 sync_ready A4 WAITING_OVERTIME_1
  82. * WAITING_OVERTIME_1 time_out [C2] A7 ASSIGNING_WORKLOAD
  83. * WAITING_OVERTIME_1 time_out A7 WAITING_OVERTIME_2
  84. * WAITING_OVERTIME_1 app_started [C2] A3 ASSIGNING_WORKLOAD
  85. * WAITING_OVERTIME_2 sync_ready A4 WAITING_OVERTIME_2
  86. * WAITING_OVERTIME_2 app_started [C2] A3 ASSIGNING_WORKLOAD
  87. * ASSIGNING_WORKLOAD sync_ready A4 ASSIGNING_WORKLOAD
  88. * ASSIGNING_WORKLOAD app_assigned [C4] A6 STARTED
  89. * STARTED sync_ready A5 STARTED
  90. *
  91. * 1.2 State Description
  92. * =====================
  93. * UNINSTANTIATED - No SUs within any SG in any Application is instantiated.
  94. * STARTING_APPLICATIONS - All applications have been requested to start
  95. * their contained SGs, which in its turn has requested
  96. * their contained SUs to instantiate all their
  97. * components. The cluster startup timer is running.
 * WAITING_OVERTIME_1 - The cluster startup timer has expired but not all
 *    applications have yet responded that they have been
 *    started. The time-out message is broadcast again to
 *    make sure there are no other broadcast messages pending.
 *    (This assures first of all that there is no pending
 *    'component instantiate' message.)
  104. * WAITING_OVERTIME_2 - The cluster startup timer has expired but all
  105. * applications have yet not responded that they have been
  106. * started. Cluster will wait infinitely for the
  107. * applications to respond. It is correct to do so even when
  108. * the startup timer has expired, because the applications
  109. * will report they are started as soon as there is no
  110. * attempt to instantiate any of its components pending,
  111. * because attempts to instantiate a component can not go on
  112. * forever, see saAmfCompInstantiateTimeout,
  113. * saAmfCompNumMaxInstantiateWithoutDelay and
  114. * saAmfCompNumMaxInstantiateWithDelay.
 * ASSIGNING_WORKLOAD - All applications have been requested to assign their
 *    specified workload to their service units according to
 *    the redundancy model specified by their SGs.
 * STARTED - A best effort has been made to instantiate the components of all
 *    applications and assign the specified workload as close as possible
 *    to what is described in the configuration.
  121. *
  122. * 1.3 Actions
  123. * ===========
  124. * A1 - [foreach application in cluster]/start application
  125. * A2 - start cluster startup timer
  126. * A3 - [foreach application in cluster]/assign workload to application
  127. * A4 - defer sync_ready event
  128. * A5 - forward sync_ready to appropriate node object
  129. * A6 - recall deferred event
  130. * A7 - stop node local instance of cluster startup timer
  131. * A8 - multicast 'cluster startup timer time-out' event (time_out)
  132. *
  133. * 1.4 Guards
  134. * ==========
  135. * C1 - Administrative state == UNLOCKED
  136. * C2 - No SU has presence state == INSTANTIATING
  137. * C3 - All SGs are fully instantiated
  138. * C4 - No Application has Availability Control state == ASSIGNING_WORKLOAD
  139. */
  140. #include <stdlib.h>
  141. #include <errno.h>
  142. #include <assert.h>
  143. #include "logsys.h"
  144. #include "amf.h"
  145. #include "util.h"
  146. #include "main.h"
  147. #include "service.h"
  148. LOGSYS_DECLARE_SUBSYS ("AMF", LOG_INFO);
  149. typedef struct cluster_event {
  150. amf_cluster_event_type_t event_type;
  151. amf_cluster_t *cluster;
  152. amf_node_t *node;
  153. } cluster_event_t;
  154. /******************************************************************************
  155. * Internal (static) utility functions
  156. *****************************************************************************/
  157. static void cluster_defer_event (amf_cluster_event_type_t event_type,
  158. struct amf_cluster *cluster, struct amf_node * node)
  159. {
  160. cluster_event_t sync_ready_event = {event_type, cluster, node};
  161. amf_fifo_put (event_type, &cluster->deferred_events,
  162. sizeof (cluster_event_t),
  163. &sync_ready_event);
  164. }
  165. static void cluster_recall_deferred_events (amf_cluster_t *cluster)
  166. {
  167. cluster_event_t cluster_event;
  168. if (amf_fifo_get (&cluster->deferred_events, &cluster_event)) {
  169. switch (cluster_event.event_type) {
  170. case CLUSTER_SYNC_READY_EV:
  171. log_printf (LOG_NOTICE,
  172. "Recall CLUSTER_SYNC_READY_EV");
  173. amf_node_sync_ready (cluster_event.node);
  174. break;
  175. default:
  176. assert (0);
  177. break;
  178. }
  179. }
  180. }
  181. static void timer_function_cluster_recall_deferred_events (void *data)
  182. {
  183. amf_cluster_t *cluster = (amf_cluster_t*)data;
  184. ENTER ("");
  185. cluster_recall_deferred_events (cluster);
  186. }
  187. /**
  188. * Determine if all applications are started so that all
  189. * SUs is in SA_AMF_PRESENCE_INSTANTIATED presense state
  190. * @param cluster
  191. *
  192. * @return 1; All applications are started
  193. */
  194. static int cluster_applications_started_instantiated (struct amf_cluster *cluster)
  195. {
  196. int all_started = 1;
  197. struct amf_application *app;
  198. struct amf_sg *sg;
  199. struct amf_su *su;
  200. for (app = cluster->application_head; app != NULL; app = app->next) {
  201. for (sg = app->sg_head; sg != NULL; sg = sg->next) {
  202. for (su = sg->su_head; su != NULL; su = su->next) {
  203. if (su->saAmfSUPresenceState != SA_AMF_PRESENCE_INSTANTIATED) {
  204. all_started = 0;
  205. goto done;
  206. }
  207. }
  208. }
  209. }
  210. done:
  211. return all_started;
  212. }
  213. /**
  214. * Determine if any SGs are in the process of instantiating their SUs.
  215. * @param cluster
  216. *
  217. * @return 1; At least one SG is in the process of instantiating.
  218. */
  219. static int cluster_applications_are_starting_sgs(struct amf_cluster *cluster)
  220. {
  221. amf_application_t *application;
  222. amf_sg_t *sg;
  223. amf_su_t *su;
  224. int is_starting_sgs = 0;
  225. for (application = cluster->application_head; application != NULL;
  226. application = application->next) {
  227. for (sg = application->sg_head; sg != NULL; sg = sg->next) {
  228. for (su = sg->su_head; su != NULL; su = su->next) {
  229. if (su->saAmfSUPresenceState ==
  230. SA_AMF_PRESENCE_INSTANTIATING) {
  231. is_starting_sgs = 1;
  232. break;
  233. }
  234. }
  235. }
  236. }
  237. return is_starting_sgs;
  238. }
  239. static void amf_cluster_assign_workload (struct amf_cluster *cluster)
  240. {
  241. struct amf_application *app;
  242. ENTER ("");
  243. for (app = cluster->application_head; app != NULL; app = app->next) {
  244. amf_application_assign_workload (app, NULL);
  245. }
  246. }
  247. static void acsm_cluster_enter_assigning_workload (struct amf_cluster *cluster)
  248. {
  249. log_printf(LOG_NOTICE,
  250. "Cluster: all applications started, assigning workload.");
  251. cluster->acsm_state = CLUSTER_AC_ASSIGNING_WORKLOAD;
  252. amf_cluster_assign_workload (cluster);
  253. }
  254. static void timer_function_cluster_assign_workload_tmo (void *cluster)
  255. {
  256. ((struct amf_cluster*)cluster)->timeout_handle = 0;
  257. ENTER ("");
  258. amf_msg_mcast (MESSAGE_REQ_EXEC_AMF_CLUSTER_START_TMO, &this_amf_node->name,
  259. sizeof(SaNameT));
  260. }
  261. static inline void stop_cluster_startup_timer (struct amf_cluster *cluster)
  262. {
  263. if (cluster->timeout_handle) {
  264. dprintf ("Stop cluster startup timer");
  265. poll_timer_delete (aisexec_poll_handle,
  266. cluster->timeout_handle);
  267. cluster->timeout_handle = 0;
  268. }
  269. }
  270. static void start_cluster_startup_timer (struct amf_cluster *cluster)
  271. {
  272. if (cluster->timeout_handle == 0) {
  273. poll_timer_add (aisexec_poll_handle,
  274. cluster->saAmfClusterStartupTimeout,
  275. cluster,
  276. timer_function_cluster_assign_workload_tmo,
  277. &cluster->timeout_handle);
  278. }
  279. }
  280. static inline void cluster_enter_starting_applications (
  281. struct amf_cluster *cluster)
  282. {
  283. ENTER ("");
  284. start_cluster_startup_timer (cluster);
  285. amf_cluster->acsm_state = CLUSTER_AC_STARTING_APPLICATIONS;
  286. amf_cluster_start_applications (cluster);
  287. }
  288. static void acsm_cluster_enter_started (amf_cluster_t *cluster)
  289. {
  290. ENTER ("");
  291. amf_cluster->acsm_state = CLUSTER_AC_STARTED;
  292. amf_call_function_asynchronous (
  293. timer_function_cluster_recall_deferred_events, cluster);
  294. }
  295. /******************************************************************************
  296. * Event methods
  297. *****************************************************************************/
  298. void amf_cluster_start_tmo_event (int is_sync_masterm,
  299. struct amf_cluster *cluster, SaNameT *sourceNodeName)
  300. {
  301. ENTER ("acsm_state = %d", amf_cluster->acsm_state);
  302. stop_cluster_startup_timer (cluster);
  303. switch (cluster->acsm_state) {
  304. case CLUSTER_AC_WAITING_OVER_TIME_1:
  305. if (cluster_applications_are_starting_sgs (cluster)) {
  306. dprintf ("Cluster startup timeout,"
  307. "start waiting over time");
  308. amf_cluster->acsm_state =
  309. CLUSTER_AC_WAITING_OVER_TIME_2;
  310. } else {
  311. dprintf ("Cluster startup timeout,"
  312. " assigning workload");
  313. acsm_cluster_enter_assigning_workload (cluster);
  314. }
  315. break;
  316. case CLUSTER_AC_STARTING_APPLICATIONS:
  317. cluster->acsm_state = CLUSTER_AC_WAITING_OVER_TIME_1;
  318. if (name_match (&this_amf_node->name, sourceNodeName)) {
  319. timer_function_cluster_assign_workload_tmo (cluster);
  320. }
  321. break;
  322. case CLUSTER_AC_ASSIGNING_WORKLOAD:
  323. /* ignore cluster startup timer expiration */
  324. case CLUSTER_AC_STARTED:
  325. /* ignore cluster startup timer expiration */
  326. case CLUSTER_AC_WAITING_OVER_TIME_2:
  327. /* ignore cluster startup timer expiration */
  328. break;
  329. default:
  330. log_printf(LOG_LEVEL_ERROR, "Cluster timout expired"
  331. " in wrong cluster"
  332. " state = %d", cluster->acsm_state);
  333. assert(0);
  334. break;
  335. }
  336. }
  337. /**
  338. * Start all applications in the cluster and start
  339. * the cluster startup timeout.
  340. * @param cluster
  341. * @param app
  342. */
  343. void amf_cluster_start_applications(struct amf_cluster *cluster)
  344. {
  345. struct amf_application *app;
  346. for (app = cluster->application_head; app != NULL; app = app->next) {
  347. amf_application_start (app, NULL);
  348. }
  349. }
  350. /**
  351. * A new node has joined the cluster and is now synchronized with the nodes that
  352. * was part of the cluster before.
  353. * @param cluster
  354. * @param node
  355. */
  356. void amf_cluster_sync_ready (struct amf_cluster *cluster, struct amf_node *node)
  357. {
  358. ENTER ("");
  359. switch (amf_cluster->acsm_state) {
  360. case CLUSTER_AC_UNINSTANTIATED:
  361. if (amf_cluster->saAmfClusterAdminState ==
  362. SA_AMF_ADMIN_UNLOCKED) {
  363. cluster_enter_starting_applications (cluster);
  364. }
  365. break;
  366. case CLUSTER_AC_STARTING_APPLICATIONS:
  367. cluster_enter_starting_applications(cluster);
  368. break;
  369. case CLUSTER_AC_ASSIGNING_WORKLOAD:
  370. /*
  371. * Defer assigning workload to those syncronized nodes to
  372. * CLUSTER_AC_STARTED state.
  373. */
  374. cluster_defer_event (CLUSTER_SYNC_READY_EV, cluster,
  375. node);
  376. break;
  377. case CLUSTER_AC_WAITING_OVER_TIME_2:
  378. /*
  379. * Defer assigning workload to those syncronized nodes to
  380. * CLUSTER_AC_STARTED state.
  381. */
  382. cluster_defer_event (CLUSTER_SYNC_READY_EV, cluster,
  383. node);
  384. break;
  385. case CLUSTER_AC_STARTED:
  386. TRACE1 ("Node sync ready sent from cluster in "
  387. "CLUSTER_AC_STARTED state");
  388. amf_node_sync_ready (node);
  389. break;
  390. default:
  391. log_printf(LOG_LEVEL_ERROR, "Cluster sync ready event"
  392. " received in wrong cluster"
  393. " state = %d", cluster->acsm_state);
  394. assert (0);
  395. break;
  396. }
  397. }
  398. /******************************************************************************
  399. * Event response methods
  400. *****************************************************************************/
  401. /**
  402. * An application indicates it has been started or the application indicates it
  403. * was not even possible to try to start because the required nodes were not
  404. * available.
  405. * @param cluster
  406. * @param application
  407. */
  408. void amf_cluster_application_started (
  409. struct amf_cluster *cluster, struct amf_application *application)
  410. {
  411. ENTER ("application '%s' started %d", application->name.value,
  412. cluster->acsm_state);
  413. switch (cluster->acsm_state) {
  414. case CLUSTER_AC_STARTING_APPLICATIONS:
  415. if (cluster_applications_started_instantiated (cluster)) {
  416. stop_cluster_startup_timer (cluster);
  417. acsm_cluster_enter_assigning_workload (cluster);
  418. }
  419. break;
  420. case CLUSTER_AC_WAITING_OVER_TIME_1:
  421. case CLUSTER_AC_WAITING_OVER_TIME_2:
  422. if (amf_cluster_applications_started_with_no_starting_sgs (cluster)) {
  423. acsm_cluster_enter_assigning_workload (cluster);
  424. }
  425. break;
  426. default: {
  427. log_printf (LOG_ERR,"Error invalid cluster availability state %d",
  428. cluster->acsm_state);
  429. openais_exit_error(cluster->acsm_state);
  430. break;
  431. }
  432. }
  433. }
  434. /**
  435. * An application indicates it has assigned workload to all its contained SUs.
  436. * @param cluster
  437. */
  438. void amf_cluster_application_workload_assigned (
  439. struct amf_cluster *cluster, struct amf_application *app)
  440. {
  441. ENTER ("");
  442. switch (cluster->acsm_state) {
  443. case CLUSTER_AC_ASSIGNING_WORKLOAD:
  444. log_printf (LOG_NOTICE, "Cluster: application %s assigned.",
  445. app->name.value);
  446. if (amf_cluster_applications_assigned (cluster)) {
  447. acsm_cluster_enter_started (cluster);
  448. }
  449. break;
  450. default:
  451. assert(0);
  452. break;
  453. }
  454. }
  455. /******************************************************************************
  456. * General methods
  457. *****************************************************************************/
  458. struct amf_cluster *amf_cluster_new (void)
  459. {
  460. struct amf_cluster *cluster = amf_calloc (1,
  461. sizeof (struct amf_cluster));
  462. cluster->saAmfClusterStartupTimeout = -1;
  463. cluster->saAmfClusterAdminState = SA_AMF_ADMIN_UNLOCKED;
  464. cluster->deferred_events = 0;
  465. cluster->acsm_state = CLUSTER_AC_UNINSTANTIATED;
  466. return cluster;
  467. }
  468. void *amf_cluster_serialize (struct amf_cluster *cluster, int *len)
  469. {
  470. char *buf = NULL;
  471. int offset = 0, size = 0;
  472. TRACE8 ("%s", cluster->name.value);
  473. buf = amf_serialize_SaNameT (buf, &size, &offset, &cluster->name);
  474. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  475. cluster->saAmfClusterStartupTimeout);
  476. buf = amf_serialize_SaNameT (buf, &size, &offset,
  477. &cluster->saAmfClusterClmCluster);
  478. buf = amf_serialize_SaUint32T (buf, &size, &offset,
  479. cluster->saAmfClusterAdminState);
  480. buf = amf_serialize_SaUint32T (buf, &size, &offset, cluster->acsm_state);
  481. *len = offset;
  482. return buf;
  483. }
  484. struct amf_cluster *amf_cluster_deserialize (char *buf)
  485. {
  486. char *tmp = buf;
  487. struct amf_cluster *cluster = amf_cluster_new ();
  488. tmp = amf_deserialize_SaNameT (tmp, &cluster->name);
  489. tmp = amf_deserialize_SaUint32T (tmp, &cluster->saAmfClusterStartupTimeout);
  490. tmp = amf_deserialize_SaNameT (tmp, &cluster->saAmfClusterClmCluster);
  491. tmp = amf_deserialize_SaUint32T (tmp, &cluster->saAmfClusterAdminState);
  492. tmp = amf_deserialize_SaUint32T (tmp, &cluster->acsm_state);
  493. return cluster;
  494. }
  495. /**
  496. * Determine if any SGs are in the process of instantiating their SUs.
  497. * @param cluster
  498. *
  499. * @return 1; At least one SG is in the process of instantiating.
  500. */
  501. int amf_cluster_applications_started_with_no_starting_sgs (
  502. struct amf_cluster *cluster)
  503. {
  504. return !cluster_applications_are_starting_sgs (cluster);
  505. }
  506. /**
  507. * Determine if all Applications have been assigned workload.
  508. * @param cluster
  509. *
  510. * @return 1; All Applications have been assigned workload.
  511. */
  512. int amf_cluster_applications_assigned (struct amf_cluster *cluster)
  513. {
  514. struct amf_application *app = 0;
  515. int is_all_application_assigned = 1;
  516. for (app = cluster->application_head; app != NULL; app = app->next) {
  517. if (app->acsm_state != APP_AC_WORKLOAD_ASSIGNED) {
  518. is_all_application_assigned = 0;
  519. break;
  520. }
  521. }
  522. return is_all_application_assigned;
  523. }