Просмотр исходного кода

This patch contains:

-  AMF handles a component report of injurious health.

- AMF handles saAmfHealthcheckConfirm() SA_AIS_ERR_FAILED_OPERATION
so that if a recent recovery is still ongoing, AMF does nothing, but if
no immediate recovery is in progress, AMF invokes the recovery action
specified by the component when the health check was started. If
that individual recommendation was SA_AMF_NO_RECOMMENDATION,
then AMF uses the configured recovery action for the component
(saAmfCompRecoveryOnError). If this recommendation is also
SA_AMF_NO_RECOMMENDATION, then AMF performs a component restart or a
component/SU failover, depending on the values of
saAmfCompDisableRestart and saAmfSUFailover.

- Handling of cleanup of a component and health check response hardened.


- Time supervision of the clc-cli CLEANUP command, and checking of its return value.


- Handle the 'recommended recovery' specified by a component in an error
report. The recovery actions that can be chosen and are currently
implemented are component restart and node failover.

- The attribute saAmfCompDisableRestart is now recognized, which means
that if the component specifies 'Component restart' and restart is
disabled,
then the SU in which the component is contained shall fail over.

- The attribute saAmfSUFailover will not be recognized. SU will always
  fail
over as a single entity.

- A component can report an error on another component than itself.


- Implementation of 'Instantiation Level' according to chapter 3.9.2 in the
AMF specification.
- Implementation of the escalation levels, component restart, SU
restart, SU fail over and Node fail over.



git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@1321 fd59a12c-fef9-0310-b244-a6a79926bd2f
Lon Hohberger 19 лет назад
Родитель
Сommit
8f87e5f413
12 измененных файлов с 1467 добавлено и 277 удалено
  1. 57 12
      exec/amf.c
  2. 53 26
      exec/amf.h
  3. 4 1
      exec/amfapp.c
  4. 20 11
      exec/amfcluster.c
  5. 270 40
      exec/amfcomp.c
  6. 230 48
      exec/amfnode.c
  7. 298 20
      exec/amfsg.c
  8. 1 1
      exec/amfsi.c
  9. 529 115
      exec/amfsu.c
  10. 2 0
      exec/amfutil.c
  11. 1 1
      lib/amf.c
  12. 2 2
      test/testamf1.c

+ 57 - 12
exec/amf.c

@@ -222,6 +222,8 @@ static void message_handler_req_exec_amf_sync_request (
 	void *message, unsigned int nodeid);
 static void message_handler_req_exec_amf_comp_instantiate_tmo(
 	void *message, unsigned int nodeid);
+static void message_handler_req_exec_amf_comp_cleanup_tmo(
+	void *message, unsigned int nodeid);
 static void amf_dump_fn (void);
 static void amf_sync_init (void);
 static int amf_sync_process (void);
@@ -361,6 +363,9 @@ static struct openais_exec_handler amf_exec_service[] = {
 	{
 		.exec_handler_fn = message_handler_req_exec_amf_comp_instantiate_tmo,
 	},
+	{
+		.exec_handler_fn = message_handler_req_exec_amf_comp_cleanup_tmo,
+	},
 };
 
 /*
@@ -444,7 +449,9 @@ struct req_exec_amf_response {
 	mar_req_header_t header;
 	SaUint32T interface;
 	SaNameT dn;
+	SaAmfHealthcheckKeyT healtcheck_key;
 	SaAisErrorT error;
+	SaAmfRecommendedRecoveryT recommendedRecovery;
 };
 
 struct req_exec_amf_sync_data {
@@ -1505,14 +1512,16 @@ static void message_handler_req_exec_amf_comp_error_report (
 {
 	struct req_exec_amf_comp_error_report *req_exec = message;
 	struct amf_comp *comp;
+	amf_comp_t *reporting_comp;
 
 	if (scsm.state != NORMAL_OPERATION) {
 		return;
 	}
 
 	comp = amf_comp_find (amf_cluster, &req_exec->erroneousComponent);
+	reporting_comp = amf_comp_find (amf_cluster, &req_exec->reportingComponent);
 	assert (comp != NULL);
-	amf_comp_error_report (comp, req_exec->recommendedRecovery);
+	amf_comp_error_report (comp, reporting_comp,req_exec->recommendedRecovery);
 }
 
 static void message_handler_req_exec_amf_comp_instantiate(
@@ -1546,12 +1555,27 @@ static void message_handler_req_exec_amf_comp_instantiate_tmo(
 	amf_comp_instantiate_tmo_event (component);
 }
 
+static void message_handler_req_exec_amf_comp_cleanup_tmo(
+	void *message, unsigned int nodeid)
+{
+	struct req_exec_amf_comp_cleanup_tmo *req_exec = message;
+	struct amf_comp *component;
+
+	component = amf_comp_find (amf_cluster, &req_exec->compName);
+	if (component == NULL) {
+		log_printf (LOG_ERR, "Error: '%s' not found", req_exec->compName.value);
+		return;
+
+	}
+	amf_comp_cleanup_tmo_event (component);
+}
+
 static void message_handler_req_exec_amf_clc_cleanup_completed (
 	void *message, unsigned int nodeid)
 {
 	struct req_exec_amf_clc_cleanup_completed *req_exec = message;
-	struct amf_comp *comp;
-
+	amf_comp_t *comp;
+	ENTER ("");
 	if (scsm.state != NORMAL_OPERATION) {
 		return;
 	}
@@ -1561,8 +1585,13 @@ static void message_handler_req_exec_amf_clc_cleanup_completed (
 		log_printf (LOG_ERR, "Error: '%s' not found", req_exec->compName.value);
 		return;
 	}
+	
+	if (req_exec->cleanup_exit_code != 0) {
+		amf_comp_cleanup_failed_completed (comp);
+	} else {
+		amf_comp_cleanup_completed (comp);
 
-	amf_comp_cleanup_completed (comp);
+	}
 }
 
 static void message_handler_req_exec_amf_healthcheck_tmo (
@@ -1586,7 +1615,7 @@ static void message_handler_req_exec_amf_healthcheck_tmo (
 
 	healthcheck = amf_comp_find_healthcheck (comp, &req_exec->safHealthcheckKey);
 
-	amf_comp_healthcheck_tmo (comp, healthcheck);
+	amf_comp_healthcheck_tmo (comp, req_exec->recommendedRecovery);
 }
 
 static void message_handler_req_exec_amf_response (
@@ -1604,7 +1633,8 @@ static void message_handler_req_exec_amf_response (
 	TRACE1 ("AmfResponse: %s", req_exec->dn.value);
 
 	comp = amf_comp_response_2 (
-		req_exec->interface, &req_exec->dn, req_exec->error, &retval);
+		req_exec->interface, &req_exec->dn, &req_exec->healtcheck_key,
+		req_exec->error, &retval, req_exec->recommendedRecovery);
 	assert (comp != NULL);
 
 	if (amf_su_is_local (comp->su)) {
@@ -1775,10 +1805,15 @@ static void message_handler_req_exec_amf_sync_data (
 static void message_handler_req_exec_amf_cluster_start_tmo (
 	void *message, unsigned int nodeid)
 {
+	struct req_exec_amf_cluster_start_tmo *req;
+	req = (struct req_exec_amf_cluster_start_tmo *)message;
+	
 	if (scsm.state != NORMAL_OPERATION) {
 		return;
 	}
-	amf_cluster_start_tmo_event (nodeid == scsm.sync_master, amf_cluster);
+	TRACE1("%s", req->sourceNodeName.value);
+	amf_cluster_start_tmo_event (nodeid == scsm.sync_master, amf_cluster,
+		&req->sourceNodeName);
 }
 
 static void message_handler_req_exec_amf_sync_request (
@@ -2116,16 +2151,22 @@ static void message_handler_req_lib_amf_protectiongrouptrackstop (
 #endif
 }
 
+
 static void message_handler_req_lib_amf_componenterrorreport (
 	void *conn,
 	void *msg)
 {
 	struct req_lib_amf_componenterrorreport *req_lib = msg;
 	struct amf_comp *comp;
+	amf_comp_t *reporting_comp;
+	SaNameT reporting_comp_name;
 
 	assert (scsm.state == NORMAL_OPERATION);
 
 	comp = amf_comp_find (amf_cluster, &req_lib->erroneousComponent);
+	reporting_comp = amf_comp_find_from_conn_info (conn);
+	assert (reporting_comp);
+
 	if (comp != NULL) {
 		struct req_exec_amf_comp_error_report req_exec;
 		struct iovec iovec;
@@ -2135,8 +2176,8 @@ static void message_handler_req_lib_amf_componenterrorreport (
 		req_exec.header.size = sizeof (struct req_exec_amf_comp_error_report);
 		req_exec.header.id = SERVICE_ID_MAKE (AMF_SERVICE,
 			MESSAGE_REQ_EXEC_AMF_COMPONENT_ERROR_REPORT);
-
-		memcpy (&req_exec.reportingComponent, &req_lib->reportingComponent,
+		amf_comp_dn_make(reporting_comp, &reporting_comp_name);
+		memcpy (&req_exec.reportingComponent,  &reporting_comp_name,
 			sizeof (SaNameT));
 		memcpy (&req_exec.erroneousComponent, &req_lib->erroneousComponent,
 			sizeof (SaNameT));
@@ -2225,13 +2266,14 @@ static void message_handler_req_lib_amf_response (void *conn, void *msg)
 	SaAisErrorT retval;
 	SaUint32T interface;
 	SaNameT dn;
-
+	SaAmfHealthcheckKeyT healthcheck_key;
+	SaAmfRecommendedRecoveryT recommendedRecovery;
 	/*
 	* This is an optimisation to avoid multicast of healthchecks while keeping
 	* a nice design. We multicast and make lib responses from this file.
 	*/
-	multicast = amf_comp_response_1 (
-		req_lib->invocation, req_lib->error, &retval, &interface, &dn);
+	multicast = amf_comp_response_1 (req_lib->invocation, req_lib->error, 
+		&retval, &interface, &dn, &healthcheck_key, &recommendedRecovery);
 
 	if (multicast) {
 		struct req_exec_amf_response req_exec;
@@ -2247,6 +2289,9 @@ static void message_handler_req_lib_amf_response (void *conn, void *msg)
 			MESSAGE_REQ_EXEC_AMF_RESPONSE);
 		req_exec.interface = interface;
 		memcpy (&req_exec.dn, &dn, sizeof (SaNameT));
+		memcpy (&req_exec.healtcheck_key, &healthcheck_key, 
+			sizeof(SaAmfHealthcheckKeyT));
+		req_exec.recommendedRecovery = recommendedRecovery;
 		req_exec.error = req_lib->error;
 		iovec.iov_base = (char *)&req_exec;
 		iovec.iov_len = sizeof (req_exec);

+ 53 - 26
exec/amf.h

@@ -107,14 +107,6 @@ typedef enum {
 	USR_AMF_HA_STATE_REMOVED = SA_AMF_HA_QUIESCING + 1
 } UsrAmfHaState;
 
-/*                                                              
- * Node Error Escallation State
- */
-typedef enum {
-	NODE_EESM_IDLE,
-	NODE_EESM_ESCALLATION_LEVEL_2,
-	NODE_EESM_ESCALLATION_LEVEL_3
-} amf_node_eesm_state_t;
 
 typedef enum {
 	APP_AC_UNINSTANTIATED = 1,
@@ -126,9 +118,9 @@ typedef enum {
 
 typedef enum {
 	NODE_ACSM_REPAIR_NEEDED = 1,
-	NODE_ACSM_ESCALLATION_LEVEL_0,
-	NODE_ACSM_ESCALLATION_LEVEL_2,
-	NODE_ACSM_ESCALLATION_LEVEL_3,
+	NODE_ACSM_IDLE_ESCALLATION_LEVEL_0,
+	NODE_ACSM_IDLE_ESCALLATION_LEVEL_2,
+	NODE_ACSM_IDLE_ESCALLATION_LEVEL_3,
 	NODE_ACSM_FAILING_FAST_REBOOTING_NODE,
 	NODE_ACSM_FAILING_FAST_ACTIVATING_STANDBY_NODE,
 	NODE_ACSM_FAILING_GRACEFULLY_SWITCHING_OVER,
@@ -169,10 +161,14 @@ typedef enum amf_sg_event_type {
 	SG_ASSIGN_SI_EV
 } amf_sg_event_type_t;
 
+typedef enum amf_su_event_type {
+	SU_COMP_ERROR_SUSPECTED_EV = 1
+} amf_su_event_type_t;
+
 typedef enum {
-	SU_RC_ESCALATION_LEVEL_0 = 0,
-	SU_RC_ESCALATION_LEVEL_1,
-	SU_RC_ESCALATION_LEVEL_2,
+	SU_RC_IDLE_ESCALATION_LEVEL_0 = 0,
+	SU_RC_IDLE_ESCALATION_LEVEL_1,
+	SU_RC_IDLE_ESCALATION_LEVEL_2,
 	SU_RC_RESTART_COMP_DEACTIVATING,
 	SU_RC_RESTART_COMP_RESTARTING,
 	SU_RC_RESTART_COMP_SETTING,
@@ -213,7 +209,8 @@ struct amf_healthcheck;
 typedef enum {
 	CLUSTER_AC_UNINSTANTIATED = 1,
 	CLUSTER_AC_STARTING_APPLICATIONS,
-	CLUSTER_AC_WAITING_OVER_TIME,
+	CLUSTER_AC_WAITING_OVER_TIME_1,
+	CLUSTER_AC_WAITING_OVER_TIME_2,
 	CLUSTER_AC_ASSIGNING_WORKLOAD,
 	CLUSTER_AC_STARTED,
 	CLUSTER_AC_TERMINATING_APPLICATIONS,
@@ -281,6 +278,8 @@ typedef struct amf_node {
 	unsigned int nodeid;
 	struct amf_node *next;
 	amf_node_acsm_state_t acsm_state;
+	amf_node_acsm_state_t history_state;
+
 } amf_node_t;
 
 typedef struct amf_application {
@@ -381,7 +380,9 @@ typedef struct amf_su {
 	su_restart_control_state_t escalation_level_history_state;
 	SaStringT clccli_path;
 	SaUint32T              su_failover_cnt;	/* missing in SAF specs? */
+	SaUint32T              current_comp_instantiation_level;
 	struct amf_su         *next;
+	amf_fifo_t            *deferred_events;
 } amf_su_t;
 
 typedef struct amf_comp {
@@ -446,6 +447,7 @@ typedef struct amf_comp {
 	enum clc_component_types comptype;
 	struct amf_healthcheck *healthcheck_head;
 	poll_timer_handle instantiate_timeout_handle;
+	poll_timer_handle cleanup_timeout_handle;
 	/*
 	 * Flag that indicates of this component has a suspected error
 	 */
@@ -597,18 +599,21 @@ enum amf_message_req_types {
 	MESSAGE_REQ_EXEC_AMF_SYNC_DATA = 7,
 	MESSAGE_REQ_EXEC_AMF_CLUSTER_START_TMO = 8,
 	MESSAGE_REQ_EXEC_AMF_SYNC_REQUEST = 9,
-	MESSAGE_REQ_EXEC_AMF_COMPONENT_INSTANTIATE_TMO = 10
+	MESSAGE_REQ_EXEC_AMF_COMPONENT_INSTANTIATE_TMO = 10,
+	MESSAGE_REQ_EXEC_AMF_COMPONENT_CLEANUP_TMO = 11
 };
 
 struct req_exec_amf_clc_cleanup_completed {
 	mar_req_header_t header;
 	SaNameT compName;
+	int cleanup_exit_code;
 };
 
 struct req_exec_amf_healthcheck_tmo {
 	mar_req_header_t header;
 	SaNameT compName;
 	SaAmfHealthcheckKeyT safHealthcheckKey;
+	SaAmfRecommendedRecoveryT recommendedRecovery;
 };
 
 struct req_exec_amf_comp_instantiate {
@@ -621,9 +626,14 @@ struct req_exec_amf_comp_instantiate_tmo {
 	SaNameT compName;
 };
 
+struct req_exec_amf_comp_cleanup_tmo {
+	mar_req_header_t header;
+	SaNameT compName;
+};
 
 struct req_exec_amf_cluster_start_tmo {
 	mar_req_header_t header;
+	SaNameT sourceNodeName;
 };
 
 /*===========================================================================*/
@@ -730,7 +740,7 @@ extern int amf_cluster_applications_started_with_no_starting_sgs (
 
 /* Event methods */
 extern void amf_cluster_start_tmo_event (int is_sync_master, 
-	struct amf_cluster *cluster);
+	struct amf_cluster *cluster, SaNameT *sourceNodeName);
 extern void amf_cluster_sync_ready (struct amf_cluster *cluster, 
 	struct amf_node *node);
 /**
@@ -847,10 +857,15 @@ extern int amf_su_get_saAmfSUNumCurrActiveSIs (struct amf_su *su);
 extern int amf_su_get_saAmfSUNumCurrStandbySIs (struct amf_su *su);
 extern SaAmfReadinessStateT amf_su_get_saAmfSUReadinessState (
 	struct amf_su *su);
-extern int amf_su_presence_state_all_comps_in_su_are_set (struct amf_su *su,
+extern int amf_su_are_all_comps_in_su (struct amf_su *su,
 	SaAmfPresenceStateT state);
+
 /* Event methods */
-extern void amf_su_instantiate (struct amf_su *su);
+/**
+ * 
+ * @param su
+ * @param comp
+ */
 extern amf_si_assignment_t *amf_su_assign_si (
 	struct amf_su *su, struct amf_si *si, SaAmfHAStateT ha_state);
 extern void amf_su_restart_req (struct amf_su *su);
@@ -859,8 +874,7 @@ extern void amf_su_restart_req (struct amf_su *su);
  * Request termination of all component in an SU
  * @param su
  */
-extern void amf_su_terminate (struct amf_su *su);
-
+void amf_su_terminate (struct amf_su *su);
 extern struct amf_node *amf_su_get_node (struct amf_su *su);
 extern void amf_su_escalation_level_reset (struct amf_su *su);
 extern void amf_su_remove_assignment (struct amf_su *su);
@@ -872,7 +886,10 @@ extern void amf_su_comp_error_suspected (
 	struct amf_su *su,
 	struct amf_comp *comp,
 	SaAmfRecommendedRecoveryT recommended_recovery);
-
+extern void amf_su_restart (struct amf_su *su);
+void amf_su_operational_state_set (struct amf_su *su,
+	SaAmfOperationalStateT oper_state);
+extern int amf_su_instantiate (struct amf_su *su);
 /*===========================================================================*/
 /* amfcomp.c */
 
@@ -894,6 +911,7 @@ extern struct amf_csi_assignment *amf_comp_get_next_csi_assignment (
 	struct amf_comp *component, const struct amf_csi_assignment *csi_assignment);
 extern SaAmfReadinessStateT amf_comp_get_saAmfCompReadinessState (
 	struct amf_comp *comp);
+struct amf_comp *amf_comp_find_from_conn_info (void *conn);
 
 /* Event methods */
 extern void amf_comp_instantiate (struct amf_comp *comp);
@@ -901,6 +919,7 @@ extern void amf_comp_terminate (struct amf_comp *comp);
 extern void amf_comp_node_left (struct amf_comp *comp);
 extern void amf_comp_instantiate_event(struct amf_comp *comp);
 extern void amf_comp_instantiate_tmo_event (struct amf_comp *comp);
+extern void amf_comp_cleanup_tmo_event (struct amf_comp *comp);
 
 /**
  * Request the component to assume a HA state
@@ -920,8 +939,9 @@ extern void amf_comp_readiness_state_set (
 extern struct amf_healthcheck *amf_comp_find_healthcheck (
 	struct amf_comp *comp, SaAmfHealthcheckKeyT *key);
 extern void amf_comp_healthcheck_tmo (
-	struct amf_comp *comp, struct amf_healthcheck *healthcheck);
+	struct amf_comp *comp, SaAmfRecommendedRecoveryT recommendedRecovery);
 extern void amf_comp_cleanup_completed (struct amf_comp *comp);
+extern void amf_comp_cleanup_failed_completed (amf_comp_t *comp);
 
 /**
  * Count number of active CSI assignments
@@ -954,12 +974,16 @@ extern SaAisErrorT amf_comp_healthcheck_stop (
 extern SaAisErrorT amf_comp_register (struct amf_comp *comp);
 extern void amf_comp_unregister (struct amf_comp *comp);
 extern void amf_comp_error_report (
-	struct amf_comp *comp, SaAmfRecommendedRecoveryT recommendedRecovery);
+	struct amf_comp *comp, amf_comp_t *report_comp,
+	SaAmfRecommendedRecoveryT recommendedRecovery);
 extern int amf_comp_response_1 (
 	SaInvocationT invocation, SaAisErrorT error, SaAisErrorT *retval,
-	SaUint32T *interface, SaNameT *dn);
+	SaUint32T *interface, SaNameT *dn, SaAmfHealthcheckKeyT *healtcheck_key,
+	SaAmfRecommendedRecoveryT *recommendedRecovery);
 extern struct amf_comp *amf_comp_response_2 (
-	SaUint32T interface, SaNameT *dn, SaAisErrorT error, SaAisErrorT *retval);
+	SaUint32T interface, SaNameT *dn, SaAmfHealthcheckKeyT *healthcheckKey,
+	SaAisErrorT error, SaAisErrorT *retval, 
+	SaAmfRecommendedRecoveryT recommendedRecovery);
 extern SaAisErrorT amf_comp_hastate_get (
 	struct amf_comp *comp, SaNameT *csi_name, SaAmfHAStateT *ha_state);
 extern SaAisErrorT amf_comp_healthcheck_confirm (
@@ -975,6 +999,9 @@ extern struct amf_healthcheck *amf_healthcheck_deserialize (
 
 extern void amf_comp_csi_remove (amf_comp_t *component,
 	amf_csi_assignment_t *csi_assignment);
+extern void amf_comp_error_suspected_clear (amf_comp_t *comp);
+extern void amf_comp_error_suspected_set (amf_comp_t *comp);
+extern int amf_comp_is_error_suspected (amf_comp_t *comp);
 
 /*===========================================================================*/
 /* amfsi.c */

+ 4 - 1
exec/amfapp.c

@@ -254,7 +254,7 @@ static void application_enter_starting_sgs (struct amf_application *app,
 	int su_to_instantiate = 0;
 	app->node_to_start = node;
 	app->acsm_state = APP_AC_STARTING_SGS;
-
+	ENTER ("%s",app->name.value);
 	for (sg = app->sg_head; sg != NULL; sg = sg->next) {
 		su_to_instantiate += amf_sg_start (sg, node);
 	}
@@ -275,6 +275,7 @@ static void application_enter_assigning_workload (amf_application_t *app)
 {
 	amf_sg_t *sg = 0;
 	int posible_to_assign_si = 0;
+	ENTER ("%s",app->name.value);
 	app->acsm_state = APP_AC_ASSIGNING_WORKLOAD;
 	for (sg = app->sg_head; sg != NULL; sg = sg->next) {
 		if (amf_sg_assign_si_req (sg, 0)) {
@@ -289,12 +290,14 @@ static void application_enter_assigning_workload (amf_application_t *app)
 
 static void application_enter_workload_assigned (amf_application_t *app)
 {
+	ENTER ("%s", app->name.value);
 	if (all_sg_assigned (app)){
 		app->acsm_state = APP_AC_WORKLOAD_ASSIGNED;
 		if (app->node_to_start == NULL){
 			amf_cluster_application_workload_assigned (
 				app->cluster, app);
 		} else {
+			TRACE1("%s",app->node_to_start->name.value);
 			amf_node_application_workload_assigned(
 				app->node_to_start, app);
 		}

+ 20 - 11
exec/amfcluster.c

@@ -249,12 +249,12 @@ static void acsm_cluster_enter_assigning_workload (struct amf_cluster *cluster)
 
 static void timer_function_cluster_assign_workload_tmo (void *cluster)
 {
-	struct req_exec_amf_cluster_start_tmo req;
-	((struct amf_cluster*)cluster)->timeout_handle = 0;;
+	((struct amf_cluster*)cluster)->timeout_handle = 0;
 
 	ENTER ("");
 
-	amf_msg_mcast (MESSAGE_REQ_EXEC_AMF_CLUSTER_START_TMO, &req, sizeof(req));
+	amf_msg_mcast (MESSAGE_REQ_EXEC_AMF_CLUSTER_START_TMO, &this_amf_node->name, 
+		sizeof(SaNameT));
 }
 
 static inline void stop_cluster_startup_timer (struct amf_cluster *cluster)
@@ -307,30 +307,37 @@ struct amf_cluster *cluster)
 }
 
 void amf_cluster_start_tmo_event (int is_sync_masterm, 
-	struct amf_cluster *cluster)
+	struct amf_cluster *cluster, SaNameT *sourceNodeName)
 {
 	ENTER ("acsm_state = %d", amf_cluster->acsm_state);
 
 	stop_cluster_startup_timer (cluster);
 
 	switch (cluster->acsm_state) {
-		case CLUSTER_AC_STARTING_APPLICATIONS:
+		case CLUSTER_AC_WAITING_OVER_TIME_1:
 			if (cluster_applications_are_starting_sgs (cluster)) {
 				dprintf ("Cluster startup timeout," 
 					"start waiting over time");
 				amf_cluster->acsm_state = 
-					CLUSTER_AC_WAITING_OVER_TIME; 
+					CLUSTER_AC_WAITING_OVER_TIME_2; 
 			} else {
 				dprintf ("Cluster startup timeout,"
 					" assigning workload");
 				acsm_cluster_enter_assigning_workload (cluster);
 			}
+			break;
+		case CLUSTER_AC_STARTING_APPLICATIONS:
+			cluster->acsm_state = CLUSTER_AC_WAITING_OVER_TIME_1;
+			if (name_match (&this_amf_node->name, sourceNodeName)) {
+				timer_function_cluster_assign_workload_tmo (cluster);
+			}
+			
 			break;
 		case CLUSTER_AC_ASSIGNING_WORKLOAD:
 			/* ignore cluster startup timer expiration */
 		case CLUSTER_AC_STARTED:
 			/* ignore cluster startup timer expiration */
-		case CLUSTER_AC_WAITING_OVER_TIME:
+		case CLUSTER_AC_WAITING_OVER_TIME_2:
 			/* ignore cluster startup timer expiration */
 			break;
 		default:
@@ -368,7 +375,7 @@ void amf_cluster_sync_ready (struct amf_cluster *cluster, struct amf_node *node)
 				SA_AMF_ADMIN_UNLOCKED) {
 				cluster_enter_starting_applications (cluster);
 			}
-			break;
+			break;    
 		case CLUSTER_AC_STARTING_APPLICATIONS:
 			cluster_enter_starting_applications(cluster);
 			break;
@@ -384,7 +391,7 @@ void amf_cluster_sync_ready (struct amf_cluster *cluster, struct amf_node *node)
 				"cluster state: %u\n", amf_cluster->acsm_state);
 			assert (0);
 			break;
-		case CLUSTER_AC_WAITING_OVER_TIME:
+		case CLUSTER_AC_WAITING_OVER_TIME_2:
 			/*
 			 * Defer assigning workload to those syncronized nodes to
 			 * CLUSTER_AC_STARTED state.
@@ -414,7 +421,8 @@ void amf_cluster_init (void)
 void amf_cluster_application_started (
 	struct amf_cluster *cluster, struct amf_application *application)
 {
-	ENTER ("application '%s' started", application->name.value);
+	ENTER ("application '%s' started %d", application->name.value, 
+		cluster->acsm_state);
 	switch (cluster->acsm_state) {
 		case CLUSTER_AC_STARTING_APPLICATIONS:
 			if (cluster_applications_started_instantiated (cluster)) {
@@ -422,7 +430,8 @@ void amf_cluster_application_started (
 				acsm_cluster_enter_assigning_workload (cluster);
 			}
 			break;
-		case CLUSTER_AC_WAITING_OVER_TIME:
+		case CLUSTER_AC_WAITING_OVER_TIME_1:
+		case CLUSTER_AC_WAITING_OVER_TIME_2:
 			if (amf_cluster_applications_started_with_no_starting_sgs (cluster)) {
 				acsm_cluster_enter_assigning_workload (cluster);
 			}

+ 270 - 40
exec/amfcomp.c

@@ -156,6 +156,7 @@ struct clc_command_run_data {
 	struct amf_comp *comp;
 	enum clc_command_run_operation_type type;
 	void (*completion_callback) (void *context);
+	int exit_code;
 };
 
 struct clc_interface {
@@ -190,6 +191,8 @@ static void lib_csi_set_request (
 	struct amf_comp *comp,
 	struct amf_csi_assignment *csi_assignment);
 
+static void comp_recover_action (amf_comp_t *comp, 
+	SaAmfRecommendedRecoveryT recommendedRecovery);
 
 /*
  * Life cycle functions
@@ -315,8 +318,10 @@ static void report_error_suspected (
 	struct amf_comp *comp,
 	SaAmfRecommendedRecoveryT recommended_recovery)
 {
-	comp->error_suspected = 1;
-	amf_su_comp_error_suspected (comp->su, comp, recommended_recovery);
+	ENTER ("%s, recommended_recovery = %d",
+		comp->name.value, recommended_recovery);
+	amf_comp_error_suspected_set (comp);
+	comp_recover_action (comp, recommended_recovery);
 }
 
 
@@ -327,6 +332,8 @@ static void *clc_command_run (void *context)
 {
 	struct clc_command_run_data *clc_command_run_data =
 		(struct clc_command_run_data *)context;
+	clc_command_run_data->exit_code = 0;
+
 	pid_t pid;
 	int res;
 	char **argv = NULL;
@@ -356,10 +363,10 @@ static void *clc_command_run (void *context)
 			fprintf (stderr, "Error: CLC_CLI (%d) failed with exit status:"
 				" %d - %s\n", pid, WEXITSTATUS(status),
 				strerror (WEXITSTATUS(status)));
-			/*                                                              
-             * Healthcheck timout will expire laterfore the component
-             * and this will lead to Intantiation failed for the component.
-			 */
+			/*
+             * Store the exit code from the script in the return data.
+             */
+			clc_command_run_data->exit_code = WEXITSTATUS(status);
 		}
 		if (WIFSIGNALED (status) != 0) {
 			fprintf (stderr, "Error: CLC_CLI (%d) failed with exit status:"
@@ -493,14 +500,48 @@ static void amf_comp_instantiate_tmo (void *component)
 		&compName, sizeof (SaNameT));
 }
 
+static void amf_comp_cleanup_tmo (void *component)
+{
+	SaNameT compName;
+	amf_comp_dn_make (component, &compName);
+
+	amf_msg_mcast (MESSAGE_REQ_EXEC_AMF_COMPONENT_CLEANUP_TMO,
+		&compName, sizeof (SaNameT));
+}
+
 static void start_component_instantiate_timer (struct amf_comp *component)
 {
 	ENTER("%s",component->name.value);
-	poll_timer_add (aisexec_poll_handle, 
-		component->saAmfCompInstantiateTimeout,
-		component,
-		amf_comp_instantiate_tmo,
-		&component->instantiate_timeout_handle);
+	if (component->instantiate_timeout_handle == 0) {
+		poll_timer_add (aisexec_poll_handle, 
+			component->saAmfCompInstantiateTimeout,
+			component,
+			amf_comp_instantiate_tmo,
+			&component->instantiate_timeout_handle);
+	}
+}
+
+static void start_component_cleanup_timer (struct amf_comp *component)
+{
+	ENTER("%s",component->name.value);
+	if (component->cleanup_timeout_handle == 0) {
+		poll_timer_add (aisexec_poll_handle, 
+			component->saAmfCompCleanupTimeout,
+			component,
+			amf_comp_cleanup_tmo,
+			&component->cleanup_timeout_handle);
+	}
+}
+
+void stop_component_cleanup_timer (struct amf_comp *component)
+{
+	ENTER("%s",component->name.value);
+
+	if (component->cleanup_timeout_handle != 0) {
+		poll_timer_delete (aisexec_poll_handle, 
+			component->cleanup_timeout_handle);
+		component->cleanup_timeout_handle  = 0;
+	}
 }
 
 /*
@@ -528,9 +569,6 @@ static int clc_cli_instantiate (struct amf_comp *comp)
 		log_printf (LOG_LEVEL_ERROR, "pthread_create failed: %d", res);
 	}
 	start_component_instantiate_timer (comp);
-//	clc_command_run_data->completion_callback (clc_command_run_data);
-
-// TODO error code from pthread_create
 	return (res);
 }
 
@@ -615,6 +653,11 @@ static void mcast_cleanup_completion_event (void *context)
 	iovec.iov_base = (char *)&req;
 	iovec.iov_len = sizeof (req);
 
+	/*
+     * Exit code from the invoked cleanup script.
+     */
+	req.cleanup_exit_code = clc_command_run_data->exit_code;
+
 	assert (totempg_groups_mcast_joined (openais_group_handle,
 		&iovec, 1, TOTEMPG_AGREED) == 0);
 }
@@ -635,7 +678,7 @@ static int clc_cli_cleanup (struct amf_comp *comp)
 	clc_command_run_data->comp = comp;
 	clc_command_run_data->type = CLC_COMMAND_RUN_OPERATION_TYPE_CLEANUP;
 	clc_command_run_data->completion_callback = mcast_cleanup_completion_event;
-
+	start_component_cleanup_timer (comp);
 	pthread_attr_init (&thread_attr);
 	pthread_attr_setdetachstate (&thread_attr, PTHREAD_CREATE_DETACHED);
 	res = pthread_create (&thread, &thread_attr, clc_command_run,
@@ -660,8 +703,8 @@ static int clc_terminate (struct amf_comp *comp)
 
 	dprintf ("clc terminate for comp %s\n", getSaNameT (&comp->name));
 	assert (0);
-	operational_state_comp_set (comp, SA_AMF_OPERATIONAL_DISABLED);
 	comp_presence_state_set (comp, SA_AMF_PRESENCE_TERMINATING);
+	operational_state_comp_set (comp, SA_AMF_OPERATIONAL_DISABLED);
 
 	res = clc_interfaces[comp->comptype]->terminate (comp);
 	return (0);
@@ -739,9 +782,10 @@ struct amf_comp *amf_comp_new(struct amf_su *su, char *name)
 
 	comp->saAmfCompOperState = SA_AMF_OPERATIONAL_DISABLED;
 	comp->saAmfCompPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
-	comp->error_suspected = 0;
+	amf_comp_error_suspected_clear (comp);
 	setSaNameT (&comp->name, name);
-
+	comp->instantiate_timeout_handle = 0;
+	comp->cleanup_timeout_handle = 0;
 	return comp;
 }
 
@@ -778,6 +822,28 @@ void amf_comp_delete (struct amf_comp *comp)
 	free (comp);
 }
 
+struct amf_comp *amf_comp_find_from_conn_info (void *conn)
+{
+	struct amf_application *app;
+	struct amf_sg *sg;
+	struct amf_su *su;
+	struct amf_comp *comp;
+
+	for (app = amf_cluster->application_head; app != NULL; app = app->next) {
+		for (sg = app->sg_head; sg != NULL; sg = sg->next) {
+			for (su = sg->su_head; su != NULL; su = su->next) {
+				for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+					if (comp->conn == conn) {
+						goto end;
+					}
+				}
+			}
+		}
+	}
+end:
+	return comp;
+}
+
 struct amf_comp *amf_comp_find (struct amf_cluster *cluster, SaNameT *name)
 {
 	struct amf_application *app;
@@ -1038,6 +1104,7 @@ static void mcast_healthcheck_tmo_event (
 	amf_comp_dn_make (healthcheck->comp, &req_exec.compName);
 	memcpy (&req_exec.safHealthcheckKey,
 		&healthcheck->safHealthcheckKey, sizeof (SaAmfHealthcheckKeyT));
+	req_exec.recommendedRecovery = healthcheck->recommendedRecovery;
 	iovec.iov_base = (char *)&req_exec;
 	iovec.iov_len = sizeof (req_exec);
 
@@ -1217,6 +1284,8 @@ SaAisErrorT amf_comp_register (struct amf_comp *comp)
             /* ignore due to instantitate timeout a while ago  */
 			break;
 		default:
+			log_printf(LOG_LEVEL_ERROR,"comp->saAmfCompPresenceState = %d",
+				comp->saAmfCompPresenceState);
 			assert (0);
 			break;
 		
@@ -1225,19 +1294,21 @@ SaAisErrorT amf_comp_register (struct amf_comp *comp)
 	return SA_AIS_OK;
 }
 
-void amf_comp_error_report (struct amf_comp *comp, SaAmfRecommendedRecoveryT recommendedRecovery)
+void amf_comp_error_report (struct amf_comp *comp, amf_comp_t* reporting_comp, 
+	SaAmfRecommendedRecoveryT recommendedRecovery)
 {
 	struct res_lib_amf_componenterrorreport res_lib;
-	TRACE2("Exec comp error report '%s'", comp->name.value);
-	
-	if (amf_su_is_local (comp->su)) {
+	TRACE2("Exec comp error report on comp'%s' from %s", comp->name.value, 
+		reporting_comp->name.value );
+	 
+	if (amf_su_is_local (reporting_comp->su)) {
 		res_lib.header.size = sizeof (struct res_lib_amf_componenterrorreport);
 		res_lib.header.id = MESSAGE_RES_AMF_COMPONENTERRORREPORT;
 		res_lib.header.error = SA_AIS_OK;
-		openais_conn_send_response (comp->conn, &res_lib, sizeof (res_lib));
+		openais_conn_send_response (reporting_comp->conn, &res_lib, sizeof (res_lib));
 	}
 
-	/* report to SU and let it handle the problem */
+    /* Report to SU and let it handle the problem */
 	report_error_suspected (comp, recommendedRecovery);
 }
 
@@ -1247,12 +1318,12 @@ void amf_comp_error_report (struct amf_comp *comp, SaAmfRecommendedRecoveryT rec
  * @param healthcheck
  */
 void amf_comp_healthcheck_tmo (
-	struct amf_comp *comp, struct amf_healthcheck *healthcheck)
+	struct amf_comp *comp, SaAmfRecommendedRecoveryT recommendedRecovery)
 {
 	TRACE2("Exec healthcheck tmo for '%s'", comp->name.value);
 
 	/* report to SU and let it handle the problem */
-	report_error_suspected (comp, healthcheck->recommendedRecovery);
+	report_error_suspected (comp, recommendedRecovery);
 }
 
 static void clear_ha_state (
@@ -1262,6 +1333,110 @@ static void clear_ha_state (
 	csi_assignment->saAmfCSICompHAState = 0;
 }
 
+
+/**
+ * Start the recovery action for a component on which an error was reported.
+ *
+ * The recommendation delivered with the report is used first. If it is
+ * SA_AMF_NO_RECOMMENDATION, the recovery configured for the component
+ * (saAmfCompRecoveryOnError) is used instead. A component restart is turned
+ * into a failover request when saAmfCompDisableRestart is set.
+ * Node switchover, node failfast and cluster reset are accepted but no
+ * action is taken for them here.
+ *
+ * @param comp component the error was reported on
+ * @param recommendedRecovery recovery recommended together with the report
+ */
+static void comp_recover_action (amf_comp_t *comp,
+	SaAmfRecommendedRecoveryT recommendedRecovery)
+{
+	ENTER ("%s %d %d", comp->name.value,recommendedRecovery,
+		comp->saAmfCompRecoveryOnError);
+
+	amf_node_t *node = amf_node_find (&comp->su->saAmfSUHostedByNode);
+
+	switch (recommendedRecovery) {
+		case SA_AMF_NO_RECOMMENDATION: {
+			/*
+			 * If the recommendation was SA_AMF_NO_RECOMMENDATION,
+			 * then use the configured recovery action for the component
+			 */
+			switch (comp->saAmfCompRecoveryOnError) {
+				case SA_AMF_NO_RECOMMENDATION:
+				case SA_AMF_COMPONENT_RESTART:
+					/*
+					 * Both values are handled the same way. They share one
+					 * body so that the recovery action is requested exactly
+					 * once (a missing break used to request it twice for
+					 * SA_AMF_NO_RECOMMENDATION).
+					 */
+					if (comp->saAmfCompDisableRestart) {
+						/* Comp or SU failover */
+						amf_node_comp_failover_req (node, comp);
+					} else {
+						/* Component restart */
+						amf_su_comp_error_suspected (comp->su, comp,
+							recommendedRecovery);
+					}
+					break;
+				case SA_AMF_COMPONENT_FAILOVER:
+					/* SU failover */
+					amf_node_comp_failover_req (node, comp);
+					break;
+				case SA_AMF_NODE_SWITCHOVER:
+					break;
+				case SA_AMF_NODE_FAILOVER:
+					/* Node failover */
+					amf_node_failover (node);
+					break;
+				case SA_AMF_NODE_FAILFAST:
+					break;
+				case SA_AMF_CLUSTER_RESET:
+					break;
+				case SA_AMF_APPLICATION_RESTART:
+				default:
+					/* report the configured value that was not handled */
+					dprintf("saAmfCompRecoveryOnError=%d",
+						comp->saAmfCompRecoveryOnError);
+					assert (0);
+					break;
+			}
+			break;
+		}
+		case SA_AMF_COMPONENT_RESTART:
+			if (comp->saAmfCompDisableRestart == SA_TRUE) {
+				amf_node_comp_failover_req (node, comp);
+			} else {
+				amf_su_comp_error_suspected (comp->su, comp, recommendedRecovery);
+			}
+			break;
+		case SA_AMF_COMPONENT_FAILOVER:
+			amf_node_comp_failover_req (node, comp);
+			break;
+		case SA_AMF_NODE_SWITCHOVER:
+			break;
+		case SA_AMF_NODE_FAILOVER:
+			/* Node failover */
+			amf_node_failover (node);
+			break;
+		case SA_AMF_NODE_FAILFAST:
+			break;
+		case SA_AMF_CLUSTER_RESET:
+			break;
+		case SA_AMF_APPLICATION_RESTART:
+		default:
+			assert (0);
+			break;
+
+	}
+}
+
+/**
+ * Event method to be called when a cleanup completed event is received
+ * with failure.
+ *
+ * Stops the cleanup supervision timer, clears the error suspected flag and
+ * marks the component as operationally disabled with presence state
+ * TERMINATION_FAILED.
+ * @param comp component whose cleanup failed
+ */
+void amf_comp_cleanup_failed_completed (amf_comp_t *comp)
+{
+	ENTER ("'%s'", comp->name.value);
+
+	stop_component_cleanup_timer (comp);
+	amf_comp_error_suspected_clear (comp);
+	amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
+	comp_presence_state_set (comp, SA_AMF_PRESENCE_TERMINATION_FAILED);
+}
+
 /**
  * Event method to be called when a cleanup completed event is received
  * @param comp
@@ -1269,12 +1444,12 @@ static void clear_ha_state (
 void amf_comp_cleanup_completed (struct amf_comp *comp)
 {
 	TRACE2("Exec CLC cleanup completed for '%s'", comp->name.value);
+	stop_component_cleanup_timer (comp);
 
 	/* Set all CSI's confirmed HA state to unknown  */
 	amf_comp_foreach_csi_assignment (comp, clear_ha_state);
 
-	/* clear error suspected flag, component is terminated now */
-	comp->error_suspected = 0;
+	amf_comp_error_suspected_clear (comp);
 	
 	if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING) {
 		amf_comp_instantiate (comp);
@@ -1430,6 +1605,16 @@ void amf_comp_instantiate (struct amf_comp *comp)
 	}
 }
 
+/**
+ * Event method to be called when the cleanup of a component has timed out.
+ *
+ * Clears the error suspected flag and marks the component as operationally
+ * disabled with presence state TERMINATION_FAILED.
+ * NOTE(review): unlike amf_comp_cleanup_failed_completed() the cleanup timer
+ * is not stopped here, presumably because it has already expired - confirm.
+ * @param comp component whose cleanup timed out
+ */
+void amf_comp_cleanup_tmo_event (struct amf_comp *comp)
+{
+	ENTER ("Comp cleanup timeout after %d ms '%s' '%s'", 
+		comp->saAmfCompCleanupTimeout, comp->su->name.value,
+		comp->name.value);
+	amf_comp_error_suspected_clear(comp);	
+	amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
+	comp_presence_state_set (comp, SA_AMF_PRESENCE_TERMINATION_FAILED);
+}
+
 void amf_comp_instantiate_tmo_event (struct amf_comp *comp)
 {
 	ENTER ("Comp instantiate timeout after %d ms '%s' '%s'", 
@@ -1515,7 +1700,8 @@ void amf_comp_readiness_state_set (struct amf_comp *comp,
  */
 int amf_comp_response_1 (
 	SaInvocationT invocation, SaAisErrorT error, SaAisErrorT *retval,
-	SaUint32T *interface, SaNameT *dn)
+	SaUint32T *interface, SaNameT *dn, SaAmfHealthcheckKeyT *healtcheck_key,
+	SaAmfRecommendedRecoveryT *recommendedRecovery)
 {
 	int res;
 	void *data;
@@ -1531,10 +1717,21 @@ int amf_comp_response_1 (
 	switch (*interface) {
 		case AMF_RESPONSE_HEALTHCHECKCALLBACK: {
 				struct amf_healthcheck *healthcheck = data;
-				SaNameT name;
-				TRACE7 ("Healthcheck response from '%s': %d",
-					amf_comp_dn_make (healthcheck->comp, &name), error);
-					
+
+				amf_comp_dn_make (healthcheck->comp, dn);
+				TRACE7 ("Healthcheck response from '%s': %d",dn->value, error);
+				/*
+                 * Healthcheck with erroneous response
+                 * and no recovery action is in progress.
+				 */
+				memcpy(healtcheck_key, &healthcheck->safHealthcheckKey,
+					sizeof (SaAmfHealthcheckKeyT));
+				*recommendedRecovery = healthcheck->recommendedRecovery;
+
+				if (error != SA_AIS_OK && 
+					!amf_comp_is_error_suspected (healthcheck->comp)) {
+					return 1; 
+				}
 				if (is_not_instantiating_or_instantiated_or_restarting(
 					healthcheck->comp)) {
 					log_printf (LOG_ERR, "HealthcheckResponse: ignored for key = %s, "
@@ -1602,8 +1799,9 @@ int amf_comp_response_1 (
  * 
  * @return component to which the response should be sent
  */
-struct amf_comp *amf_comp_response_2 (
-	SaUint32T interface, SaNameT *dn, SaAisErrorT error, SaAisErrorT *retval)
+struct amf_comp *amf_comp_response_2 (SaUint32T interface, SaNameT *dn, 
+	SaAmfHealthcheckKeyT *healthcheck_key, SaAisErrorT error, 
+	SaAisErrorT *retval, SaAmfRecommendedRecoveryT recommendedRecovery)
 {
 	struct amf_csi_assignment *csi_assignment;
 	struct amf_comp *comp = NULL;
@@ -1649,6 +1847,20 @@ struct amf_comp *amf_comp_response_2 (
 				}
 				break;
 			}
+		case AMF_RESPONSE_HEALTHCHECKCALLBACK: {
+			/*
+			 * A failed healthcheck response: look up the component and the
+			 * healthcheck by name/key, store the recommended recovery on the
+			 * healthcheck and start the recovery action.
+			 */
+			dprintf("AMF_RESPONSE_HEALTHCHECKCALLBACK for %s", dn->value);
+			comp = amf_comp_find (amf_cluster, dn);
+			assert (comp);
+
+			amf_healthcheck_t *healthcheck = amf_comp_find_healthcheck (
+				comp, healthcheck_key);
+			/* the lookup result is dereferenced below, so verify it */
+			assert (healthcheck);
+			healthcheck->recommendedRecovery = recommendedRecovery;
+			comp_recover_action (comp, healthcheck->recommendedRecovery);
+
+			break;
+		}
+
 #if 0
 		case AMF_RESPONSE_COMPONENTTERMINATECALLBACK: {
 				struct component_terminate_callback_data *callback_data = data;
@@ -1682,7 +1894,7 @@ void amf_comp_hastate_set (
 	assert (component != NULL && csi_assignment != NULL);
 
 
-	if (!component->error_suspected) {
+	if (!amf_comp_is_error_suspected (component)) {
 		lib_csi_set_request(component, csi_assignment);
 	} else {
 		if (csi_assignment->requested_ha_state == SA_AMF_HA_QUIESCED) {
@@ -1708,7 +1920,7 @@ void amf_comp_terminate (struct amf_comp *comp)
 
 	if (amf_su_is_local (comp->su)) {
 		amf_comp_healthcheck_stop (comp, NULL);
-		if (comp->error_suspected) {
+		if (amf_comp_is_error_suspected(comp)) {
 			clc_interfaces[comp->comptype]->cleanup (comp);
 		} else {
 			clc_interfaces[comp->comptype]->terminate (comp);
@@ -1799,7 +2011,11 @@ SaAisErrorT amf_comp_healthcheck_confirm (
 				&healthcheck->timer_handle_period);
 		} else if (healthcheckResult == SA_AIS_ERR_FAILED_OPERATION) {
 			/* send to cluster */
-			mcast_healthcheck_tmo_event (healthcheck);
+			if (!comp->error_suspected) {
+				poll_timer_delete (aisexec_poll_handle,
+				healthcheck->timer_handle_period);
+				mcast_healthcheck_tmo_event (healthcheck);
+			}
 		} else {
 			error = SA_AIS_ERR_INVALID_PARAM;
 		}
@@ -1893,7 +2109,7 @@ void amf_comp_node_left (struct amf_comp *component)
 	struct amf_csi_assignment *csi_assignment;
 
 	ENTER("saAmfCompPresenceState = %d", component->saAmfCompPresenceState);
-	component->error_suspected = 0;
+	amf_comp_error_suspected_clear (component);
 	if (component->saAmfCompPresenceState == SA_AMF_PRESENCE_INSTANTIATING ||
 		component->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING ||
 		component->saAmfCompPresenceState == SA_AMF_PRESENCE_TERMINATING) {
@@ -1903,7 +2119,7 @@ void amf_comp_node_left (struct amf_comp *component)
 
 	component->saAmfCompPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
 
-	if (amf_su_presence_state_all_comps_in_su_are_set (component->su,
+	if (amf_su_are_all_comps_in_su (component->su,
 		SA_AMF_PRESENCE_UNINSTANTIATED)) {
 		component->su->saAmfSUPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
 	}
@@ -2258,4 +2474,18 @@ void amf_comp_csi_remove (amf_comp_t *component,
 		&res_lib, sizeof (struct res_lib_amf_csiremovecallback));
 }
 
+/**
+ * Clear the error suspected flag of a component.
+ * @param comp
+ */
+void amf_comp_error_suspected_clear (amf_comp_t *comp)
+{
+	comp->error_suspected = 0;
+}
+
+/**
+ * Set the error suspected flag of a component.
+ * @param comp
+ */
+void amf_comp_error_suspected_set (amf_comp_t *comp)
+{
+	comp->error_suspected = 1;
+}
+
+/**
+ * Tell whether the error suspected flag of a component is set.
+ * @param comp
+ * @return 1 if the flag is set, otherwise 0
+ */
+int amf_comp_is_error_suspected (amf_comp_t *comp)
+{
+	if (comp->error_suspected) {
+		return 1;
+	}
+	return 0;
+}
 

+ 230 - 48
exec/amfnode.c

@@ -167,7 +167,7 @@
 
 #include <stdlib.h>
 #include <assert.h>
-
+#include <unistd.h>
 #include "amf.h"
 #include "util.h"
 #include "print.h"
@@ -223,6 +223,108 @@ static void node_acsm_enter_failing_over (struct amf_node *node)
 	}
 }
 
+/**
+ * Request a node failover from every SG, in every application of the
+ * cluster, that has at least one SU hosted by the given node. Each such SG
+ * gets exactly one request.
+ * @param node
+ */
+static void failover_all_sg_on_node (amf_node_t *node)
+{
+	amf_application_t *application;
+	amf_sg_t *group;
+	amf_su_t *unit;
+
+	for (application = amf_cluster->application_head; application != NULL;
+		application = application->next) {
+		for (group = application->sg_head; group != NULL;
+			group = group->next) {
+			/* look for the first SU of this SG that is hosted by the node */
+			unit = group->su_head;
+			while (unit != NULL &&
+				!name_match(&unit->saAmfSUHostedByNode, &node->name)) {
+				unit = unit->next;
+			}
+			if (unit != NULL) {
+				amf_sg_failover_node_req (group, node);
+			}
+		}
+	}
+}
+
+/**
+ * Enter state FAILING_GRACEFULLY_FAILING_OVER and request a node failover
+ * from every SG that has an SU hosted by the node.
+ * @param node
+ */
+static void node_acsm_enter_failing_gracefully_failing_over (amf_node_t *node)
+{
+	ENTER("");
+	node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER;
+	failover_all_sg_on_node (node);
+}
+
+/**
+ * Check whether every SG that has at least one SU hosted by the given node
+ * is in availability state SG_AC_Idle.
+ * @param node
+ * @return 1 if all such SGs are idle, otherwise 0
+ */
+static int has_all_sg_on_node_failed_over (amf_node_t *node) 
+{
+	amf_application_t *application;
+	amf_sg_t *group;
+	amf_su_t *unit;
+
+	for (application = amf_cluster->application_head; application != NULL;
+		application = application->next) {
+		for (group = application->sg_head; group != NULL;
+			group = group->next) {
+			/* look for the first SU of this SG that is hosted by the node */
+			unit = group->su_head;
+			while (unit != NULL &&
+				!name_match(&unit->saAmfSUHostedByNode, &node->name)) {
+				unit = unit->next;
+			}
+			if (unit != NULL && group->avail_state != SG_AC_Idle) {
+				/* an SG with SUs on the node is still busy */
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+/**
+ * Repair a node by rebooting it.
+ *
+ * The reboot is only done when the local host name equals the CLM node name
+ * of the given AMF node, i.e. when this process runs on the node to repair.
+ * When compiled with DEBUG the process exits instead of rebooting the host.
+ * @param node node to repair
+ */
+static void repair_node (amf_node_t *node)
+{
+	char hostname[256];
+
+	ENTER("");
+
+	if (gethostname (hostname, sizeof (hostname)) != 0) {
+		/* without a valid host name the comparison below is meaningless */
+		log_printf (LOG_LEVEL_ERROR,
+			"repair_node: could not get host name, node not repaired");
+		return;
+	}
+	/* a truncated name is not guaranteed to be null terminated */
+	hostname[sizeof (hostname) - 1] = '\0';
+
+	if (strcmp (hostname, (const char*)node->saAmfNodeClmNode.value) != 0) {
+		return;
+	}
+
+	/* TODO if(saAmfAutoRepair == SA_TRUE) */
+#ifdef DEBUG
+	exit (0);
+#else
+	if (system ("reboot") != 0) {
+		log_printf (LOG_LEVEL_ERROR,
+			"repair_node: reboot command failed");
+	}
+#endif	
+}
+
+/**
+ * Enter state FAILING_GRACEFULLY_REBOOTING_NODE and start the repair of the
+ * node.
+ * @param node
+ */
+static void enter_failing_gracefully_rebooting_node (amf_node_t *node)
+{
+	ENTER("");
+	node->acsm_state = NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE;
+	repair_node (node);
+}
+
+/**
+ * Return to the state saved in node->history_state.
+ * NOTE(review): history_state is expected to hold one of the idle
+ * escalation levels - confirm against all writers of history_state.
+ * @param node
+ */
+static void node_acsm_enter_idle (amf_node_t *node)
+{
+	ENTER ("history_state=%d",node->history_state);
+	node->acsm_state =  node->history_state;
+}
+
+/**
+ * Enter state JOINING_ASSIGNING_WORKLOAD and request workload assignment for
+ * the node from every application in the cluster.
+ * @param node node that is joining
+ * @param app an application of the cluster; only used to reach the cluster's
+ *            list of applications
+ */
+static void node_acsm_enter_joining_assigning_workload (struct amf_node *node, 
+	struct amf_application *app)
+{
+	struct amf_application *current;
+
+	log_printf(LOG_NOTICE,
+		"Node=%s: all applications started, assigning workload.",
+		node->name.value);
+
+	ENTER("");
+	node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
+
+	current = app->cluster->application_head;
+	while (current != NULL) {
+		amf_application_assign_workload (current, node);
+		current = current->next;
+	}
+}
+
 /******************************************************************************
  * Event methods
  *****************************************************************************/
@@ -241,29 +343,51 @@ void amf_node_leave (struct amf_node *node)
 
 
 	switch (node->acsm_state) {
-		case NODE_ACSM_ESCALLATION_LEVEL_0:
-		case NODE_ACSM_ESCALLATION_LEVEL_2:
-		case NODE_ACSM_ESCALLATION_LEVEL_3:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
 			node_acsm_enter_leaving_spontaneously(node);    
 			node_acsm_enter_failing_over (node);
 			break;
 		case NODE_ACSM_REPAIR_NEEDED:
 			break;
+		case NODE_ACSM_FAILING_GRACEFULLY_REBOOTING_NODE:
+			node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED; 
+			node_acsm_enter_idle (node);
+			break;
 		default:
-			log_printf (LOG_LEVEL_ERROR, "amf_node_leave()called in state = %d"
+			log_printf (LOG_LEVEL_ERROR, "amf_node_leave called in state = %d"
 				" (should have been deferred)", node->acsm_state);
 			openais_exit_error (AIS_DONE_FATAL_ERR);
 			break;
 
 	}
 }
+
 /**
  * 
  * @param node
  */
 void amf_node_failover (struct amf_node *node)
 {
+	assert (node != NULL);
+	ENTER("'%s', CLM node '%s'", node->name.value,
+		node->saAmfNodeClmNode.value);
 
+	/*
+	 * A failover is started from any of the idle escalation levels and is
+	 * ignored when the node already needs repair. Any other state is
+	 * treated as a fatal error.
+	 */
+	switch (node->acsm_state) {
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
+			node_acsm_enter_failing_gracefully_failing_over (node);
+			break;
+		case NODE_ACSM_REPAIR_NEEDED:
+			break;
+		default:
+			/* name this function in the log, not amf_node_leave */
+			log_printf (LOG_LEVEL_ERROR, "amf_node_failover called in state = %d"
+				" (should have been deferred)", node->acsm_state);
+			openais_exit_error (AIS_DONE_FATAL_ERR);
+			break;
+	}
 }
 
 /**
@@ -289,21 +413,94 @@ void amf_node_failfast (struct amf_node *node)
  * @param node
  * @param comp
  */
-void amf_node_comp_restart_req (
-	struct amf_node *node, struct amf_comp *comp)
+void amf_node_comp_restart_req (struct amf_node *node, struct amf_comp *comp)
 {
-
+	/*
+	 * Handles a component restart request according to the escalation level
+	 * the node is in: SU restart at level 2, SU failover at level 3 and
+	 * finally node failover.
+	 */
+	amf_su_t *su = comp->su;
+	ENTER("");
+	switch (node->acsm_state) {
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
+			/* escalate to level 2 and handle the request there */
+			node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_2;
+			amf_node_comp_restart_req (node, comp);
+			break;
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
+			/* restart the SU until the SG's restart limit is reached */
+			if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
+				SaNameT dn;
+				node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_3;
+				amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
+				amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
+				amf_comp_dn_make (comp, &dn);
+
+				log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
+					"action:\n\t\tSU failover", dn.value);
+
+				amf_sg_failover_su_req (su->sg, su, node);
+			} else {
+				amf_su_restart (su);
+			}
+			break;
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
+			/* fail over the SU until the node's failover limit is reached */
+			if (su->su_failover_cnt <  node->saAmfNodeSuFailoverMax) {
+				SaNameT dn;
+				amf_comp_operational_state_set (comp, SA_AMF_OPERATIONAL_DISABLED);
+				amf_su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
+				amf_comp_dn_make (comp, &dn);
+
+				log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
+					"action:\n\t\tSU failover", dn.value);
+
+				amf_sg_failover_su_req (su->sg, su, node);
+				return;
+			} else {
+				/* state to return to when the node becomes idle again */
+				node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
+				amf_node_failover (node);
+			}
+			break;
+		default:
+			dprintf("%d",node->acsm_state);
+			assert (0);
+			break;
+	}                       	
 }
 
 /**
  * 
  * @param node
- * @param comp
  */
-void amf_node_comp_failover_req (
-	struct amf_node *node, struct amf_comp *comp)
+void amf_node_comp_failover_req (amf_node_t *node, amf_comp_t *comp)
 {
-
+	/*
+	 * Handles a component failover request. Only SU failover is implemented;
+	 * a request for a component whose SU has saAmfSUFailover unset hits
+	 * assert (0).
+	 */
+	ENTER("");
+	switch (node->acsm_state) {
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
+			if (comp->su->saAmfSUFailover) {
+				/* SU failover */
+				amf_sg_failover_su_req (comp->su->sg,comp->su, node);
+				
+			} else {
+				/* TODO: component failover */
+				assert (0);
+			}
+			break;
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
+			/* fail over the SU until the node's failover limit is reached */
+			if (comp->su->su_failover_cnt < node->saAmfNodeSuFailoverMax) {
+				if (comp->su->saAmfSUFailover) {
+					/* SU failover */
+					amf_sg_failover_su_req (comp->su->sg,comp->su, node);
+					
+				} else {
+					/* TODO: component failover */
+					assert (0);
+				}
+			} else {
+				/* state to return to when the node becomes idle again */
+				node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
+				amf_node_failover (node);
+			}
+			break;
+		default:
+			dprintf("%d",node->acsm_state);
+			assert (0);
+			break;
+	}
 }
 
 /**
@@ -323,9 +520,9 @@ void amf_node_sync_ready (struct amf_node *node)
 	node->saAmfNodeOperState = SA_AMF_OPERATIONAL_ENABLED;
 
 	switch (node->acsm_state) {
-		case NODE_ACSM_ESCALLATION_LEVEL_0:
-		case NODE_ACSM_ESCALLATION_LEVEL_2:
-		case NODE_ACSM_ESCALLATION_LEVEL_3:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_0:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_2:
+		case NODE_ACSM_IDLE_ESCALLATION_LEVEL_3:
 		case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
 			node->acsm_state = NODE_ACSM_JOINING_STARTING_APPLICATIONS;
 			for (app = amf_cluster->application_head; app != NULL; app = app->next) {
@@ -369,15 +566,8 @@ void amf_node_application_started (struct amf_node *node,
 		case NODE_ACSM_JOINING_STARTING_APPLICATIONS:
 			if (amf_cluster_applications_started_with_no_starting_sgs(
 				app->cluster)) {
-				log_printf(LOG_NOTICE,
-					"Node=%s: all applications started, assigning workload.",
-					node->name.value);
 
-				node->acsm_state = NODE_ACSM_JOINING_ASSIGNING_WORKLOAD;
-				for (app = app->cluster->application_head; app != NULL; 
-					app = app->next) {
-					amf_application_assign_workload (app, node);
-				}
+				node_acsm_enter_joining_assigning_workload(node, app);
 			}
 			break;
 		default:
@@ -408,10 +598,7 @@ void amf_node_application_workload_assigned (struct amf_node *node,
 			if (amf_cluster_applications_assigned (amf_cluster)) {
 				log_printf(LOG_NOTICE, "Node=%s: all workload assigned", 
 					node->name.value);
-				/*
-				 * TODO: new state should be set via history
-				 */
-				node->acsm_state = NODE_ACSM_ESCALLATION_LEVEL_0;
+				node_acsm_enter_idle (node);
 			}
 			break;
 		default:
@@ -431,40 +618,32 @@ void amf_node_application_workload_assigned (struct amf_node *node,
  */
 void amf_node_sg_failed_over (struct amf_node *node, struct amf_sg *sg_in)
 {
-	struct amf_sg *sg;
-	struct amf_application *app = 0;
-	int all_sg_has_failed_over = 1;
-
 	assert (node != NULL);
-	ENTER ("Node=%s: SG '%s' started", node->name.value,
-		sg_in->name.value);
+	ENTER ("Node=%s: SG '%s' started %d", node->name.value,
+		sg_in->name.value,node->acsm_state);
 
 	switch (node->acsm_state) {
 		case NODE_ACSM_LEAVING_SPONTANEOUSLY_FAILING_OVER:
-			for (app = amf_cluster->application_head; app != NULL;
-				app = app->next) {
-				for (sg = app->sg_head; sg != NULL; sg = sg->next) {
-					if (sg->avail_state != SG_AC_Idle) {
-						all_sg_has_failed_over = 0;
-						goto end;
-					}
-				}
+			if (has_all_sg_on_node_failed_over (node)) { /*C2*/
+				node->acsm_state = 
+					NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
 			}
-
 			break;
 		case NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN:
 			/* Accept reports of failed over sg that has completed. */
 			break;
+		case NODE_ACSM_FAILING_GRACEFULLY_FAILING_OVER:
+			if (has_all_sg_on_node_failed_over (node)) { /*C2*/
+				enter_failing_gracefully_rebooting_node (node);
+			}
+			break;
 		default:
 			log_printf (LOG_LEVEL_ERROR, "amf_node_sg_failed_over()"
 				"called in state = %d (unexpected !!)", node->acsm_state);
 			openais_exit_error (AIS_DONE_FATAL_ERR);
 			break;
 	}
-	end:
-	if (all_sg_has_failed_over) {
-		node->acsm_state = NODE_ACSM_LEAVING_SPONTANEOUSLY_WAITING_FOR_NODE_TO_JOIN;
-	}
+
 }
 
 /******************************************************************************
@@ -494,8 +673,8 @@ struct amf_node *amf_node_new (struct amf_cluster *cluster, char *name) {
 	node->cluster = cluster;
 	node->next = cluster->node_head;
 	cluster->node_head = node;
-	node->acsm_state = NODE_ACSM_ESCALLATION_LEVEL_0; 
-
+	node->acsm_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0; 
+	node->history_state = NODE_ACSM_IDLE_ESCALLATION_LEVEL_0;
 	return node;
 }
 
@@ -526,6 +705,8 @@ void *amf_node_serialize (struct amf_node *node, int *len)
 		node->nodeid);
 	buf = amf_serialize_SaUint32T (buf, &size, &offset,
 		node->acsm_state);
+	buf = amf_serialize_SaUint32T (buf, &size, &offset,
+		node->history_state);
 
 	*len = offset;
 
@@ -547,6 +728,7 @@ struct amf_node *amf_node_deserialize (struct amf_cluster *cluster, char *buf) {
 	tmp = amf_deserialize_SaUint32T (tmp, &node->saAmfNodeOperState);
 	tmp = amf_deserialize_SaUint32T (tmp, &node->nodeid);
 	tmp = amf_deserialize_SaUint32T (tmp, &node->acsm_state);
+	tmp = amf_deserialize_SaUint32T (tmp, &node->history_state);
 
 	return node;
 }

+ 298 - 20
exec/amfsg.c

@@ -337,8 +337,8 @@ static int is_any_si_in_scope_assigned_standby (struct amf_sg *sg)
 	struct amf_si_assignment *si_assignment;
 
 	/*
-	 * Check if there is any si in the scope which has no active assignment
-	 * and at least one standby assignment.
+     * Check if there is any si in the scope which has no
+     * active assignment and at least one standby assignment.
 	 */
 	while (*sis != NULL) {
 		si_assignment = (*sis)->assigned_sis;
@@ -605,6 +605,7 @@ static void acsm_enter_repairing_su (struct amf_sg *sg)
 {
 	struct amf_su **sus= sg->recovery_scope.sus;
 	int is_any_su_instantiated = 0;
+	const int PERFORMS_INSTANTIATING = 1;
 
 	ENTER("'%s'",sg->name.value);
 	sg->avail_state = SG_AC_ReparingSu;
@@ -619,14 +620,15 @@ static void acsm_enter_repairing_su (struct amf_sg *sg)
 				amf_node_find(&((*sus)->saAmfSUHostedByNode));
 			if (node == NULL) {
 				log_printf (LOG_LEVEL_ERROR, 
-					"no node to hosted on su found"
-					"amf_si_depedency failed\n");
+					"Su to recover not hosted on any node\n");
 				openais_exit_error (AIS_DONE_FATAL_ERR);
 			}
 			if (node->saAmfNodeOperState == SA_AMF_OPERATIONAL_ENABLED) {
 				/* node is synchronized */
-				is_any_su_instantiated = 1;
-				amf_su_instantiate ((*sus));
+				
+				if (amf_su_instantiate ((*sus)) == PERFORMS_INSTANTIATING) {
+					is_any_su_instantiated = 1;
+				}
 			}
 		}
 		sus++;
@@ -648,7 +650,7 @@ static inline void remove_all_suspected_sus (amf_sg_t *sg)
 		for (component = su->comp_head; component != NULL; 
 			component = component->next) {
 
-				component->error_suspected = 0;
+				amf_comp_error_suspected_clear (component);
 		}
 	}
 }
@@ -828,7 +830,18 @@ static void acsm_enter_assigning_standby_to_spare (amf_sg_t *sg)
 	if (is_spare_sus (sg)) {
 		assume_standby_si_assignment_for_spare_sus (sg);
 	} else {
-		acsm_enter_repairing_su (sg);
+		switch (sg->recovery_scope.event_type) {
+			case SG_FAILOVER_NODE_EV:
+				acsm_enter_idle (sg);
+				break;
+			case SG_FAILOVER_SU_EV:
+				acsm_enter_repairing_su (sg);
+				break;
+			default:
+				dprintf("event_type %d",sg->recovery_scope.event_type);
+				assert (0);
+				break;
+		}
 	}
 }
 
@@ -1449,7 +1462,6 @@ static int amf_si_get_saAmfSINumReqActiveAssignments(struct amf_si *si)
 	return number_of_req_active_assignments;
 }
 
-
 static int amf_si_get_saAmfSINumReqStandbyAssignments(struct amf_si *si) 
 {
 	struct amf_si_assignment *si_assignment = si->assigned_sis;
@@ -1486,8 +1498,7 @@ static int sg_assign_nm_active (struct amf_sg *sg, int su_active_assign)
 	while (su != NULL && su_left_to_assign > 0) {
 		if (amf_su_get_saAmfSUReadinessState (su) !=
 			SA_AMF_READINESS_IN_SERVICE ||
-			amf_su_get_saAmfSUNumCurrActiveSIs (su) == 
-			assign_to_su ||
+			amf_su_get_saAmfSUNumCurrActiveSIs (su) ==  assign_to_su ||
 			amf_su_get_saAmfSUNumCurrStandbySIs (su) > 0) {
 
 			su = su->next;
@@ -1501,7 +1512,6 @@ static int sg_assign_nm_active (struct amf_sg *sg, int su_active_assign)
 			assign_to_su = sg->saAmfSGMaxActiveSIsperSUs;
 		}
 		while (si != NULL) {
-
 			if (name_match (&si->saAmfSIProtectedbySG, &sg->name) &&
 				assigned < assign_to_su && 
 				amf_si_get_saAmfSINumReqActiveAssignments(si) == 0) {
@@ -1609,6 +1619,45 @@ static int su_inservice_count_get (struct amf_sg *sg)
 	return(answer);
 }
 
+/**
+ * Count the SI assignments with HA state ACTIVE that belong to SUs of the
+ * SG whose readiness state is OUT_OF_SERVICE.
+ * @param sg
+ * @return number of such SI assignments
+ */
+static int su_active_out_of_service_count_get (amf_sg_t *sg)
+{
+	int count = 0;
+	amf_su_t *unit = sg->su_head;
+
+	while (unit != NULL) {
+		amf_si_assignment_t *assignment;
+
+		for (assignment = amf_su_get_next_si_assignment (unit, NULL);
+			assignment != NULL;
+			assignment = amf_su_get_next_si_assignment (unit, assignment)) {
+			if (assignment->saAmfSISUHAState != SA_AMF_HA_ACTIVE) {
+				continue;
+			}
+			if (amf_su_get_saAmfSUReadinessState (unit) ==
+				SA_AMF_READINESS_OUT_OF_SERVICE) {
+				count++;
+			}
+		}
+		unit = unit->next;
+	}
+	return count;
+}
+
+
+/**
+ * Count the SI assignments with HA state STANDBY that belong to SUs of the
+ * SG whose readiness state is OUT_OF_SERVICE.
+ * @param sg
+ * @return number of such SI assignments
+ */
+static int su_standby_out_of_service_count_get (amf_sg_t *sg)
+{
+	int standby_count = 0;
+	amf_su_t *unit = sg->su_head;
+
+	while (unit != NULL) {
+		amf_si_assignment_t *assignment;
+
+		for (assignment = amf_su_get_next_si_assignment (unit, NULL);
+			assignment != NULL;
+			assignment = amf_su_get_next_si_assignment (unit, assignment)) {
+			if (assignment->saAmfSISUHAState != SA_AMF_HA_STANDBY) {
+				continue;
+			}
+			if (amf_su_get_saAmfSUReadinessState (unit) ==
+				SA_AMF_READINESS_OUT_OF_SERVICE) {
+				standby_count++;
+			}
+		}
+		unit = unit->next;
+	}
+	return standby_count;
+}
+
 /**
  * TODO: dependency_level not used, hard coded
  * @param sg
@@ -1623,7 +1672,8 @@ static int assign_si (struct amf_sg *sg, int dependency_level)
 	int su_standby_assign;
 	int su_spare_assign;
 	int assigned = 0;
-
+	int active_out_of_service = 0;
+	int standby_out_of_service = 0;
 	ENTER ("'%s'", sg->name.value);
 
 	/**
@@ -1636,7 +1686,8 @@ static int assign_si (struct amf_sg *sg, int dependency_level)
 	 * Calculate number of SUs to assign to active or standby state
 	 */
 	inservice_count = su_inservice_count_get (sg);
-
+	active_out_of_service = su_active_out_of_service_count_get(sg);
+	standby_out_of_service = su_standby_out_of_service_count_get(sg);
 	if (sg->saAmfSGNumPrefActiveSUs > 0) {
 
 		active_sus_needed = div_round (
@@ -1664,19 +1715,20 @@ static int assign_si (struct amf_sg *sg, int dependency_level)
 	/* Determine number of active and standby service units
 	 * to assign based upon reduction procedure
 	 */
-	if ((inservice_count < active_sus_needed)) {
+	if ((inservice_count < active_sus_needed - active_out_of_service)) {
 		dprintf ("assignment VI - partial assignment with SIs drop outs\n");
 
 		su_active_assign = inservice_count;
 		su_standby_assign = 0;
 		su_spare_assign = 0;
 	} else
-		if ((inservice_count < active_sus_needed + standby_sus_needed)) {
+		if ((inservice_count < active_sus_needed - active_out_of_service + 
+			 standby_sus_needed)) {
 		dprintf ("assignment V - partial assignment with reduction of"
 			" standby units\n");
 
 		su_active_assign = active_sus_needed;
-		su_standby_assign = inservice_count - active_sus_needed;
+		su_standby_assign = inservice_count - active_sus_needed - active_out_of_service;
 		su_spare_assign = 0;
 	} else
 		if ((inservice_count < sg->saAmfSGNumPrefActiveSUs + standby_sus_needed)) {
@@ -1751,6 +1803,7 @@ static int assign_si (struct amf_sg *sg, int dependency_level)
 	return assigned;
 }
 
+#ifdef COMPILE_OUT
 static void remove_si_in_scope (amf_sg_t *sg, amf_si_t *si)
 {
 	int i; 
@@ -1769,8 +1822,9 @@ static void remove_si_in_scope (amf_sg_t *sg, amf_si_t *si)
 
 	sg->recovery_scope.sis = new_sis;
 }
+#endif
 
-
+#ifdef COMPILE_OUT
 static void remove_sis_for_term_failed_su_from_scope (amf_sg_t *sg, 
 	amf_su_t *su)
 {
@@ -1799,6 +1853,195 @@ static void remove_sis_for_term_failed_su_from_scope (amf_sg_t *sg,
 		}
 	}
 }
+#endif
+
+/**
+ * sg_su_state_changed_to_instantiated
+ *
+ * Handles the event that the presence state of an SU in the SG changed to
+ * INSTANTIATED. Only expected in SG states InstantiatingServiceUnits and
+ * ReparingSu; any other state hits assert (0).
+ * @param sg
+ * @param su
+ */
+static void sg_su_state_changed_to_instantiated (struct amf_sg *sg, struct amf_su *su)
+{
+	ENTER("%s %s",sg->name.value, su->name.value);
+	switch (sg->avail_state) {
+		case SG_AC_InstantiatingServiceUnits:
+			/* go idle once no SU is reported as instantiating any more */
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_INSTANTIATING)) {
+				acsm_enter_idle (sg);
+			}
+			break;
+		case SG_AC_ReparingSu:
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_INSTANTIATING)) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
+					su->sg->avail_state = SG_AC_AssigningStandBy;
+					/* go idle directly when assign_si () returns 0 */
+					if (assign_si (sg, 0) == 0) {
+						acsm_enter_idle (sg);
+					}
+				} else {
+					dprintf ("avail-state: %u", sg->avail_state);
+					assert (0);
+				}
+			}
+			break;
+		default:
+			dprintf ("avail-state: %u", sg->avail_state);
+			assert (0);
+			break;
+	}
+}
+
+/**
+ * amf_sg_su_state_changed_to_uninstantiated
+ *
+ * Handles the event that the presence state of an SU in the SG changed to
+ * UNINSTANTIATED. Only expected in SG states TerminatingSuspected and
+ * ReparingSu; any other state hits assert (0).
+ * @param sg
+ * @param su
+ */
+static void amf_sg_su_state_changed_to_uninstantiated (amf_sg_t *sg, 
+	amf_su_t *su)
+{
+	ENTER("%s %s",sg->name.value, su->name.value);
+	switch (sg->avail_state) {
+		case SG_AC_TerminatingSuspected:
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_TERMINATING)) {
+				if (all_su_in_scope_has_either_two_presence_state (sg,
+					SA_AMF_PRESENCE_UNINSTANTIATED,
+					SA_AMF_PRESENCE_TERMINATION_FAILED)) {
+					
+					delete_si_assignments_in_scope (sg);
+					
+					/* the suspected flags are cleared on both paths */
+					if (is_any_si_in_scope_assigned_standby (sg)) {
+						remove_all_suspected_sus (sg);
+						acsm_enter_removing_standby_assignments (sg);
+					} else { /*is_no_si_in_scope_assigned_standby*/
+						remove_all_suspected_sus (sg);
+						acsm_enter_assigning_standby_to_spare (sg);
+					}
+				}
+			}
+			break;
+		case SG_AC_ReparingSu:
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_TERMINATING)) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
+					su->sg->avail_state = SG_AC_AssigningStandBy;
+					/* go idle directly when assign_si () returns 0 */
+					if (assign_si (sg, 0) == 0) {
+						acsm_enter_idle (sg);
+					}
+				}
+			}
+			break;
+		default:
+			log_printf (LOG_ERR, "sg avail_state = %d", sg->avail_state);
+			assert (0);
+			break;
+	}
+}
+
+/**
+ * Check whether the SG uses the N+M redundancy model and any component of
+ * the SU has a CSI assignment with HA state ACTIVE.
+ * @param sg
+ * @param su
+ * @return 1 if both conditions hold, otherwise 0
+ */
+static int npm_and_comp_in_active_ha_state (
+	amf_sg_t *sg, amf_su_t *su)
+{
+	amf_comp_t *comp;
+
+	if (sg->saAmfSGRedundancyModel != SA_AMF_NPM_REDUNDANCY_MODEL) {
+		return 0;
+	}
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		amf_csi_assignment_t *assignment =
+			amf_comp_get_next_csi_assignment(comp, NULL);
+
+		while (assignment != NULL) {
+			if (assignment->saAmfCSICompHAState == SA_AMF_HA_ACTIVE) {
+				return 1;
+			}
+			assignment = amf_comp_get_next_csi_assignment(comp, assignment);
+		}
+	}
+	return 0;
+}
+
+/**
+ * amf_sg_su_state_changed_to_termination_failed
+ *
+ * Handles the event that the presence state of an SU in the SG changed to
+ * TERMINATION_FAILED. The SG availability state is not examined here.
+ * @param sg
+ * @param su
+ */
+static void amf_sg_su_state_changed_to_termination_failed (amf_sg_t *sg,
+	amf_su_t *su)
+{
+	ENTER("%s %s",sg->name.value, su->name.value);
+
+	if (!no_su_has_presence_state(sg, sg->node_to_start,
+		SA_AMF_PRESENCE_INSTANTIATING)) {
+		return;
+	}
+
+	if (npm_and_comp_in_active_ha_state (sg, su)) {
+		acsm_enter_idle (sg);
+		return;
+	}
+
+	if (!all_su_in_scope_has_either_two_presence_state (sg,
+		SA_AMF_PRESENCE_UNINSTANTIATED,
+		SA_AMF_PRESENCE_TERMINATION_FAILED)) {
+		return;
+	}
+
+	delete_si_assignments_in_scope (sg);
+
+	if (is_any_si_in_scope_assigned_standby (sg)) {
+		remove_all_suspected_sus (sg);
+		acsm_enter_removing_standby_assignments (sg);
+	} else {
+		/* no SI in scope is assigned standby */
+		remove_all_suspected_sus (sg);
+		acsm_enter_assigning_standby_to_spare (sg);
+	}
+}
+/**
+ * amf_sg_su_state_changed_to_instantiation_failed
+ *
+ * Handles the event that the presence state of an SU in the SG changed to
+ * INSTANTIATION_FAILED. In SG states other than InstantiatingServiceUnits
+ * and ReparingSu the event is only traced.
+ * @param sg
+ * @param su
+ */
+static void amf_sg_su_state_changed_to_instantiation_failed (amf_sg_t *sg,  
+	amf_su_t *su)
+{
+	ENTER("%s %s",sg->name.value, su->name.value);
+	switch (sg->avail_state) {
+		case SG_AC_InstantiatingServiceUnits:
+			/* go idle once no SU is reported as instantiating any more */
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_INSTANTIATING)) {
+				acsm_enter_idle (sg);
+			}
+			break;
+		case SG_AC_ReparingSu:
+			if (no_su_has_presence_state(sg, sg->node_to_start, 
+				SA_AMF_PRESENCE_INSTANTIATING)) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
+					su->sg->avail_state = SG_AC_AssigningStandBy;
+					/* go idle directly when assign_si () returns 0 */
+					if (assign_si (sg, 0) == 0) {
+						acsm_enter_idle (sg);
+					}
+				}
+			}
+			break;
+		default:
+			/* TODO: Insert the assert (0) until solving defers in SU   */
+			dprintf("sg->avail_state = %d", sg->avail_state);
+			break;
+	}
+}
 
 int amf_sg_assign_si_req (struct amf_sg *sg, int dependency_level)
 {
@@ -1949,7 +2192,8 @@ int amf_sg_start (struct amf_sg *sg, struct amf_node *node)
 	return instantiated_sus;
 }
 
-void amf_sg_su_state_changed (struct amf_sg *sg, 
+#ifdef COMPILE_OUT
+void amf_sg_su_state_changed_2 (struct amf_sg *sg, 
 	struct amf_su *su, SaAmfStateT type, int state)
 {
 	ENTER ("'%s' SU '%s' state %s",
@@ -2055,7 +2299,7 @@ void amf_sg_su_state_changed (struct amf_sg *sg,
 					}
 				}
 			} else {
-                /* TODO: Insert the assert (0) until solving defers in SU   */
+				/* TODO: Insert the assert (0) until solving defers in SU   */
 				dprintf("sg->avail_state = %d, su instantiation state = %d",
 					sg->avail_state, state);
 			}
@@ -2066,6 +2310,39 @@ void amf_sg_su_state_changed (struct amf_sg *sg,
 		}
 	}
 }
+#endif
+
+/**
+ * Dispatches an SU state change to the handler for the new presence
+ * state. State changes of any other type are ignored.
+ * @param sg
+ * @param su
+ * @param type kind of state that changed
+ * @param state the new state value
+ */
+void amf_sg_su_state_changed (struct amf_sg *sg, struct amf_su *su,
+	SaAmfStateT type, int state)
+{
+	ENTER ("'%s' SU '%s' state %s",
+		sg->name.value, su->name.value, amf_presence_state(state));
+
+	if (type != SA_AMF_PRESENCE_STATE) {
+		return;
+	}
+
+	switch (state) {
+		case SA_AMF_PRESENCE_INSTANTIATING:
+			/* nop */
+			break;
+		case SA_AMF_PRESENCE_INSTANTIATED:
+			sg_su_state_changed_to_instantiated(sg, su);
+			break;
+		case SA_AMF_PRESENCE_INSTANTIATION_FAILED:
+			amf_sg_su_state_changed_to_instantiation_failed(sg, su);
+			break;
+		case SA_AMF_PRESENCE_UNINSTANTIATED:
+			amf_sg_su_state_changed_to_uninstantiated(sg, su);
+			break;
+		case SA_AMF_PRESENCE_TERMINATION_FAILED:
+			amf_sg_su_state_changed_to_termination_failed(sg, su);
+			break;
+		default :
+			/* Any other presence state is treated as a programming error. */
+			dprintf("sg->avail_state = %d, su instantiation state = %d",
+				sg->avail_state, state);
+			assert (0);
+			break;
+	}
+}
 
 void amf_sg_init (void)
 {
@@ -2080,6 +2357,7 @@ void amf_sg_failover_su_req (struct amf_sg *sg, struct amf_su *su,
 
 	switch (sg->avail_state) {
 		case SG_AC_Idle:
+			su->su_failover_cnt += 1;
 			set_scope_for_failover_su (sg, su);
 			if (has_any_su_in_scope_active_workload (sg)) {
 				acsm_enter_deactivating_dependent_workload (sg);

+ 1 - 1
exec/amfsi.c

@@ -311,7 +311,7 @@ void amf_si_comp_set_ha_state_done (
 	 */
 	if (csi_assignment->si_assignment->requested_ha_state ==
 		csi_assignment->si_assignment->saAmfSISUHAState) {
-
+		TRACE1("'%s', '%s'", si->name.value, csi_assignment->csi->name.value);
 		csi_assignment->si_assignment->assumed_callback_fn (
 			csi_assignment->si_assignment, 0);
 		csi_assignment->si_assignment->assumed_callback_fn = NULL;

+ 529 - 115
exec/amfsu.c

@@ -128,6 +128,79 @@
 #include "print.h"
 #include "main.h"
 
+static int terminate_all_components_in_level (struct amf_su *su, 
+	SaUint32T current_instantiation_level);
+static int are_all_comps_in_level_uninst_or_term_failed (struct amf_su *su);
+static int are_all_comps_in_level_instantiated (struct amf_su *su);
+static int instantiate_all_components_in_level (struct amf_su *su, 
+	SaUint32T current_instantiation_level);
+static SaUint32T su_lowest_comp_instantiation_level_set (struct amf_su *su);
+
+/*
+ * One deferred SU event. su_event_set() fills it in, su_defer_event()
+ * queues a copy on su->deferred_events and su_recall_deferred_events()
+ * reads it back and replays it.
+ */
+typedef struct su_event {
+	amf_su_event_type_t event_type; /* selects the handler used on replay */
+	amf_su_t *su; /* SU handed to the handler on replay */
+	amf_comp_t *comp; /* component handed to the handler on replay */
+	SaAmfRecommendedRecoveryT recommended_recovery; /* handed to the handler */
+} su_event_t;
+
+/**
+ * Fills in all fields of an SU event.
+ * @param su
+ * @param comp
+ * @param recommended_recovery
+ * @param su_event event to fill in
+ * @param event_type
+ */
+static void su_event_set(struct amf_su *su, struct amf_comp *comp,
+	SaAmfRecommendedRecoveryT recommended_recovery,
+	su_event_t *su_event, amf_su_event_type_t event_type)
+{
+	*su_event = (su_event_t) {
+		.event_type = event_type,
+		.su = su,
+		.comp = comp,
+		.recommended_recovery = recommended_recovery
+	};
+}
+
+/**
+ * Queues an event on the SU's deferred_events fifo so that it can be
+ * replayed later by su_recall_deferred_events().
+ * @param su
+ * @param comp
+ * @param recommended_recovery
+ * @param su_event_type
+ */
+static void su_defer_event (amf_su_t *su, amf_comp_t *comp,
+	SaAmfRecommendedRecoveryT recommended_recovery,
+	amf_su_event_type_t su_event_type)
+{
+	su_event_t deferred;
+
+	su_event_set(su, comp, recommended_recovery, &deferred, su_event_type);
+	ENTER("event_type = %d", deferred.event_type);
+
+	/* The fifo is handed the size of the event and a pointer to it. */
+	amf_fifo_put (su_event_type, &su->deferred_events,
+		sizeof deferred, &deferred);
+}
+
+/**
+ * Fetches at most one event from the SU's deferred_events fifo and
+ * replays it. Event types other than SU_COMP_ERROR_SUSPECTED_EV are only
+ * logged.
+ * @param su
+ */
+static void su_recall_deferred_events (amf_su_t *su)
+{
+	su_event_t deferred;
+
+	ENTER ("%s", su->name.value);
+
+	if (!amf_fifo_get (&su->deferred_events, &deferred)) {
+		return;
+	}
+
+	if (deferred.event_type == SU_COMP_ERROR_SUSPECTED_EV) {
+		amf_su_comp_error_suspected (deferred.su, deferred.comp,
+			deferred.recommended_recovery);
+	} else {
+		dprintf("event_type = %d", deferred.event_type);
+	}
+}
+
+/**
+ * @param comp
+ * @param su
+ * @return 1 if the component's restart count has reached the limit
+ *         saAmfSGCompRestartMax of the SU's SG, otherwise 0
+ */
+static int has_component_restarted_max_times (amf_comp_t *comp, amf_su_t *su)
+{
+	if (comp->saAmfCompRestartCount >= su->sg->saAmfSGCompRestartMax) {
+		return 1;
+	}
+	return 0;
+}
+
+#ifdef COMPILE_OUT
+/*
+ * Built only when COMPILE_OUT is defined.
+ * Returns non-zero if the SU's restart count has reached the limit
+ * saAmfSGSuRestartMax of its SG.
+ */
+static int has_su_restarted_max_times (amf_su_t *su)
+{
+	return su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax;
+}
+#endif
+
 /**
  * This function only logs since the readiness state is runtime
  * calculated.
@@ -162,13 +235,15 @@ static void su_presence_state_set (struct amf_su *su,
 	log_printf (LOG_NOTICE, "Setting SU '%s' presence state: %s\n",
 		su->name.value, amf_presence_state (presence_state));
 
-	if (su->restart_control_state != SU_RC_RESTART_SU_SETTING) {
-		amf_sg_su_state_changed (
-			su->sg, su, SA_AMF_PRESENCE_STATE, presence_state);
+
+	if (su->restart_control_state != SU_RC_RESTART_SU_SETTING &&
+		su->restart_control_state != SU_RC_RESTART_COMP_RESTARTING) {
+		amf_sg_su_state_changed (su->sg, su, SA_AMF_PRESENCE_STATE, 
+			presence_state);
 	}
 }
 
-static void su_operational_state_set (struct amf_su *su,
+void amf_su_operational_state_set (struct amf_su *su,
 	SaAmfOperationalStateT oper_state)
 {
 	struct amf_comp* comp;
@@ -211,28 +286,6 @@ static void comp_assign_csi (struct amf_comp *comp, struct amf_csi *csi,
 	csi_assignment->si_assignment = si_assignment;
 }
 
-static void su_restart (struct amf_su *su)
-{
-	struct amf_comp *comp;
-	SaNameT dn;
-
-	ENTER ("'%s'", su->name.value);
-
-	amf_su_dn_make (su, &dn);
-	log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
-		"action: SU restart", dn.value);
-
-	su->restart_control_state = SU_RC_RESTART_SU_DEACTIVATING;
-	su->restart_control_state = SU_RC_RESTART_SU_INSTANTIATING;
-	su->escalation_level_history_state =
-		SU_RC_ESCALATION_LEVEL_2;
-
-	su->saAmfSURestartCount += 1;
-
-	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
-		amf_comp_restart (comp);
-	}
-}
 
 static void comp_restart (struct amf_comp *comp)
 {
@@ -245,7 +298,7 @@ static void comp_restart (struct amf_comp *comp)
 
 	comp->su->restart_control_state = SU_RC_RESTART_COMP_DEACTIVATING;
 	comp->su->restart_control_state = SU_RC_RESTART_COMP_RESTARTING;
-	comp->su->escalation_level_history_state = SU_RC_ESCALATION_LEVEL_1;
+	comp->su->escalation_level_history_state = SU_RC_IDLE_ESCALATION_LEVEL_1;
 	amf_comp_restart (comp);
 }
 
@@ -256,9 +309,7 @@ static void si_ha_state_assumed_cbfn (
 	struct amf_comp *comp;
 	struct amf_csi_assignment *csi_assignment;
 	int all_confirmed = 1;
-
 	ENTER ("");
-
 	tmp_si_assignment = amf_su_get_next_si_assignment(si_assignment->su, NULL);
 
 	while (tmp_si_assignment != NULL) {
@@ -294,6 +345,8 @@ static void si_ha_state_assumed_cbfn (
 		}
 		si_assignment->su->restart_control_state =
 			si_assignment->su->escalation_level_history_state;
+		su_recall_deferred_events (si_assignment->su);
+
 	}
 }
 
@@ -312,25 +365,112 @@ static void reassign_sis(struct amf_su *su)
 	}
 }
 
-static void su_comp_presence_state_changed (
-	struct amf_su *su, struct amf_comp *comp, int state)
+
+/**
+ * @param su
+ * @return 1 if at least one component of the SU has presence state
+ *         INSTANTIATING, otherwise 0
+ */
+static int is_any_component_instantiating (amf_su_t *su)
 {
-	ENTER ("'%s', '%s'", su->name.value, comp->name.value);
+	amf_comp_t *comp;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_INSTANTIATING) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * @param su
+ * @return 1 if at least one component of the SU has presence state
+ *         TERMINATING, otherwise 0
+ */
+static int is_any_component_terminating (amf_su_t *su)
+{
+	amf_comp_t *comp;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_TERMINATING) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * @param su
+ * @return 1 if at least one component of the SU has presence state
+ *         RESTARTING, otherwise 0
+ */
+static int is_any_component_restarting (amf_su_t *su)
+{
+	amf_comp_t *comp;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/**
+ * @param su
+ * @return 1 if at least one component of the SU has presence state
+ *         INSTANTIATION_FAILED, otherwise 0
+ */
+static int is_any_comp_instantiation_failed (amf_su_t *su)
+{
+	amf_comp_t *comp = su->comp_head;
+
+	while (comp != NULL) {
+		if (comp->saAmfCompPresenceState ==
+			SA_AMF_PRESENCE_INSTANTIATION_FAILED) {
+			return 1;
+		}
+		comp = comp->next;
+	}
+	return 0;
+}
+
+/**
+ * "Worst" is the numerically largest saAmfCompPresenceState value found
+ * among the components of the SU.
+ * @param su
+ * @return the largest presence state value, 0 if the SU has no components
+ */
+static SaAmfPresenceStateT get_worst_comps_presence_state_in_su (amf_su_t *su)
+{
+	SaAmfPresenceStateT worst = 0;
+	amf_comp_t *comp = su->comp_head;
+
+	while (comp != NULL) {
+		if (comp->saAmfCompPresenceState > worst) {
+			worst = comp->saAmfCompPresenceState;
+		}
+		comp = comp->next;
+	}
+	return worst;
+}
+
+static void su_comp_presence_state_changed (struct amf_su *su, 
+	struct amf_comp *comp, int state)
+{
+	ENTER ("'%s', '%s' %d %d", su->name.value, comp->name.value, state,
+		su->restart_control_state);
 
 	switch (state) {
 		case SA_AMF_PRESENCE_INSTANTIATED:
 			switch (su->restart_control_state) {
-				case SU_RC_ESCALATION_LEVEL_1:
-				case SU_RC_ESCALATION_LEVEL_2:
-					/* 
-					 * TODO: send to node
-					*/
-				case SU_RC_ESCALATION_LEVEL_0:
-					if (amf_su_presence_state_all_comps_in_su_are_set (
-						comp->su, SA_AMF_PRESENCE_INSTANTIATED)) {
-
-						su_presence_state_set (
-							comp->su, SA_AMF_PRESENCE_INSTANTIATED);
+				case SU_RC_IDLE_ESCALATION_LEVEL_1:
+				case SU_RC_IDLE_ESCALATION_LEVEL_2:
+				case SU_RC_IDLE_ESCALATION_LEVEL_0:
+					if (!is_any_component_instantiating (su)) {
+						if (are_all_comps_in_level_instantiated (su)) {
+							if (instantiate_all_components_in_level (su, 
+								++comp->su->current_comp_instantiation_level)) {
+                                /* All levels of instantiation are done */
+								su_presence_state_set (comp->su, 
+									SA_AMF_PRESENCE_INSTANTIATED);
+							}
+						} else {
+							if (is_any_comp_instantiation_failed (su)) {
+								su_presence_state_set (comp->su, 
+									SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+							} else {
+								assert (0);
+							}
+						}
 					}
 					break;
 				case SU_RC_RESTART_COMP_RESTARTING:
@@ -338,13 +478,21 @@ static void su_comp_presence_state_changed (
 					reassign_sis (comp->su);
 					break;
 				case SU_RC_RESTART_SU_INSTANTIATING:
-					if (amf_su_presence_state_all_comps_in_su_are_set (
-						comp->su, SA_AMF_PRESENCE_INSTANTIATED)) {
-
-						su->restart_control_state = SU_RC_RESTART_SU_SETTING;
-						su_presence_state_set (
-							comp->su, SA_AMF_PRESENCE_INSTANTIATED);
-						reassign_sis (comp->su);
+					if (!is_any_component_restarting(su)) {
+						if (amf_su_are_all_comps_in_su (
+							comp->su, SA_AMF_PRESENCE_INSTANTIATED)) {
+							su->restart_control_state = SU_RC_RESTART_SU_SETTING;
+							su_presence_state_set (comp->su, 
+								SA_AMF_PRESENCE_INSTANTIATED);
+							reassign_sis (comp->su);
+						} else {
+							if (is_any_comp_instantiation_failed (su)) {
+								su_presence_state_set (comp->su, 
+									SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+							} else {
+								assert (0);
+							}
+						}
 					}
 					break;
 				default:
@@ -354,10 +502,14 @@ static void su_comp_presence_state_changed (
 			}
 			break;
 		case SA_AMF_PRESENCE_UNINSTANTIATED:
-			if (amf_su_presence_state_all_comps_in_su_are_set (
-				su, SA_AMF_PRESENCE_UNINSTANTIATED)) {
-
-				su_presence_state_set (comp->su,SA_AMF_PRESENCE_UNINSTANTIATED);
+			if (!is_any_component_terminating (su)) {
+				if (are_all_comps_in_level_uninst_or_term_failed (su)) {
+					if (terminate_all_components_in_level (su,
+						--su->current_comp_instantiation_level)) {
+						su_presence_state_set (su,
+							get_worst_comps_presence_state_in_su (su));
+					}
+				} 
 			} 
 			break;
 		case SA_AMF_PRESENCE_INSTANTIATING:
@@ -368,8 +520,82 @@ static void su_comp_presence_state_changed (
 		case SA_AMF_PRESENCE_TERMINATING:
 			break;
 		case SA_AMF_PRESENCE_INSTANTIATION_FAILED:
-			su_presence_state_set (
-				comp->su, SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+			switch (su->restart_control_state) {
+				case SU_RC_IDLE_ESCALATION_LEVEL_0:
+				case SU_RC_IDLE_ESCALATION_LEVEL_1:
+				case SU_RC_IDLE_ESCALATION_LEVEL_2:
+					if (!is_any_component_instantiating (su)) {
+						su_presence_state_set (comp->su,
+							SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+					}
+					break;
+				case SU_RC_RESTART_COMP_RESTARTING:
+					su->restart_control_state = 
+						su->escalation_level_history_state;
+
+					su_presence_state_set (comp->su, 
+						SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+					break;
+				case SU_RC_RESTART_SU_INSTANTIATING:
+					if (!is_any_component_instantiating (su)) {
+						su->restart_control_state = 
+							su->escalation_level_history_state;
+
+						su_presence_state_set (comp->su, 
+							SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+					}
+					break;
+				default:
+					assert (0);
+					break;
+			}
+#ifdef COMPILE_OUT
+			su_presence_state_set (comp->su, 
+				SA_AMF_PRESENCE_INSTANTIATION_FAILED);
+#endif
+			break;
+		case SA_AMF_PRESENCE_TERMINATION_FAILED:
+			switch (su->restart_control_state) {
+				case SU_RC_IDLE_ESCALATION_LEVEL_0:
+				case SU_RC_IDLE_ESCALATION_LEVEL_1:
+				case SU_RC_IDLE_ESCALATION_LEVEL_2:
+					if (!is_any_component_terminating (su)) {
+						if (are_all_comps_in_level_uninst_or_term_failed (su)) {
+							if (terminate_all_components_in_level (su,
+								--su->current_comp_instantiation_level)) {
+								su_presence_state_set (su,
+									get_worst_comps_presence_state_in_su (su));
+							}
+						} 
+					} 
+					break;
+				case SU_RC_RESTART_COMP_RESTARTING:
+					su->restart_control_state = 
+						su->escalation_level_history_state;
+
+					su_presence_state_set (comp->su, 
+						SA_AMF_PRESENCE_TERMINATION_FAILED);
+
+					break;
+				case SU_RC_RESTART_SU_INSTANTIATING:
+                    /*
+                     * TODO Reconsider SU restart control concerning
+                     * TERMINATING and INSTANTIATION
+                     */
+				case SU_RC_RESTART_SU_TERMINATING:
+					if (!is_any_component_terminating (su)) {
+						su->restart_control_state = 
+							su->escalation_level_history_state;
+
+						su_presence_state_set (comp->su, 
+							SA_AMF_PRESENCE_TERMINATION_FAILED);
+					}
+					break;
+				default:
+					assert (0);
+					break;
+			}
+
 			break;
 		default:
 			assert (0);
@@ -380,7 +606,7 @@ static void su_comp_presence_state_changed (
 static void su_comp_op_state_changed (
 	struct amf_su *su, struct amf_comp *comp, int state)
 {
-	ENTER ("'%s', '%s'", su->name.value, comp->name.value);
+	ENTER ("'%s', '%s' %d", su->name.value, comp->name.value, state);
 
 	switch (state) {
 		case SA_AMF_OPERATIONAL_ENABLED:
@@ -397,45 +623,147 @@ static void su_comp_op_state_changed (
 					}
 				}
 				if (all_set) {
-					su_operational_state_set (comp->su, SA_AMF_OPERATIONAL_ENABLED);
+					amf_su_operational_state_set (comp->su, 
+						SA_AMF_OPERATIONAL_ENABLED);
 				} else {
-					su_operational_state_set (comp->su, SA_AMF_OPERATIONAL_DISABLED);
+					amf_su_operational_state_set (comp->su, 
+						SA_AMF_OPERATIONAL_DISABLED);
 				}
 				break;
 			}
 		case SA_AMF_OPERATIONAL_DISABLED:
+			amf_su_operational_state_set (comp->su, SA_AMF_OPERATIONAL_DISABLED);
 			break;
 		default:
 			assert (0);
 			break;
 	}
+	return;
 }
 
-int amf_su_presence_state_all_comps_in_su_are_set (struct amf_su *su,
-	SaAmfPresenceStateT state)
+/**
+ * Requests instantiation of every component in the SU whose
+ * saAmfCompInstantiationLevel equals the requested level.
+ * @param su
+ * @param current_instantiation_level level whose components are started
+ * @return 1 if the SU has no component on that level (nothing was
+ *         started), 0 if at least one instantiation was requested
+ */
+static int instantiate_all_components_in_level (struct amf_su *su, 
+	SaUint32T current_instantiation_level)
 {
-	int all_set = 1;
-	struct amf_comp *comp;
+	amf_comp_t *comp;
+	int all_components_instantiated = 1;
 
 	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
-		if (comp->saAmfCompPresenceState != state) {
-			all_set = 0;
-			break;
+		/*
+		 * Select on the level passed in by the caller, as
+		 * terminate_all_components_in_level() does, instead of
+		 * re-reading su->current_comp_instantiation_level.
+		 */
+		if (current_instantiation_level ==
+			comp->saAmfCompInstantiationLevel) {
+			all_components_instantiated = 0;
+			amf_comp_instantiate (comp);
+		}
+	}
+	return all_components_instantiated;
+}
+
+/**
+ * Looks only at the components on the SU's current instantiation level.
+ * @param su
+ * @return 1 if all of them have presence state INSTANTIATED (also when
+ *         there are none), otherwise 0
+ */
+static int are_all_comps_in_level_instantiated (struct amf_su *su)
+{
+	const SaUint32T current_level = su->current_comp_instantiation_level;
+	amf_comp_t *comp;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompInstantiationLevel == current_level &&
+			comp->saAmfCompPresenceState != SA_AMF_PRESENCE_INSTANTIATED) {
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+
+/**
+ * Looks only at the components on the SU's current instantiation level.
+ * @param su
+ * @return 1 if all of them have presence state UNINSTANTIATED or
+ *         TERMINATION_FAILED (also when there are none), otherwise 0
+ */
+static int are_all_comps_in_level_uninst_or_term_failed(
+	struct amf_su *su)
+{
+	const SaUint32T current_level = su->current_comp_instantiation_level;
+	amf_comp_t *comp;
+	int all = 1;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompInstantiationLevel == current_level &&
+			comp->saAmfCompPresenceState != SA_AMF_PRESENCE_UNINSTANTIATED &&
+			comp->saAmfCompPresenceState != SA_AMF_PRESENCE_TERMINATION_FAILED) {
+			all = 0;
+			break;
 		}
 	}
 
-	return all_set;
+	return all;
+}
+
+/**
+ * @param su
+ * @param state presence state to compare against
+ * @return 1 if every component of the SU has the given presence state
+ *         (also when the SU has no components), otherwise 0
+ */
+int amf_su_are_all_comps_in_su (struct amf_su *su,
+	SaAmfPresenceStateT state)
+{
+	amf_comp_t *comp;
+
+	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
+		if (comp->saAmfCompPresenceState != state) {
+			return 0;
+		}
+	}
+	return 1;
 }
 
-void amf_su_instantiate (struct amf_su *su)
+/**
+ * Restarts every component of the SU. Logs the recovery action, sets the
+ * restart control state to SU_RC_RESTART_SU_INSTANTIATING, records
+ * SU_RC_IDLE_ESCALATION_LEVEL_2 as the escalation level history state and
+ * increments saAmfSURestartCount.
+ * @param su
+ */
+void amf_su_restart (struct amf_su *su)
 {
 	struct amf_comp *comp;
+	SaNameT dn;
 
 	ENTER ("'%s'", su->name.value);
 
+	amf_su_dn_make (su, &dn);
+	log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
+		"action: SU restart", dn.value);
+
+	/*
+	 * NOTE(review): the DEACTIVATING value is overwritten by the next
+	 * statement before anything can read it - presumably a placeholder
+	 * for a deactivation step; confirm.
+	 */
+	su->restart_control_state = SU_RC_RESTART_SU_DEACTIVATING;
+	su->restart_control_state = SU_RC_RESTART_SU_INSTANTIATING;
+	su->escalation_level_history_state = SU_RC_IDLE_ESCALATION_LEVEL_2;
+
+	su->saAmfSURestartCount += 1;
+
 	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
-		amf_comp_instantiate (comp);
+		amf_comp_restart (comp);
+	}
+}
+
+/**
+ * Starts instantiation of an SU that is UNINSTANTIATED by instantiating
+ * the components on its lowest instantiation level. Nothing is started
+ * for any other presence state.
+ * @param su
+ * @return 1 if the SU was UNINSTANTIATED, RESTARTING or INSTANTIATING,
+ *         0 for the other presence states
+ */
+int amf_su_instantiate (struct amf_su *su)
+{
+	ENTER ("'%s %d'", su->name.value, su->saAmfSUPresenceState);
+
+	int performs_instantiating = 1;
+
+	switch (su->saAmfSUPresenceState) {
+		case SA_AMF_PRESENCE_UNINSTANTIATED:
+			/*
+			 * su_lowest_comp_instantiation_level_set() also stores the
+			 * level in su->current_comp_instantiation_level.
+			 */
+			instantiate_all_components_in_level(su, 
+				su_lowest_comp_instantiation_level_set (su));
+			break;
+		case SA_AMF_PRESENCE_RESTARTING:
+		case SA_AMF_PRESENCE_INSTANTIATING:
+			/* Nothing is started here, but 1 is still returned. */
+			break;
+		case SA_AMF_PRESENCE_INSTANTIATED:
+		case SA_AMF_PRESENCE_TERMINATING:
+		case SA_AMF_PRESENCE_INSTANTIATION_FAILED:
+		case SA_AMF_PRESENCE_TERMINATION_FAILED:
+			performs_instantiating = 0;
+			break;
+		default:
+			assert (0);
+			break;
+		
 	}
+	return performs_instantiating;
 }
 
 amf_si_assignment_t *amf_su_assign_si (struct amf_su *su, struct amf_si *si,
@@ -534,6 +862,30 @@ int amf_su_is_local (struct amf_su *su)
 	}
 }
 
+
+/**
+ * Puts the component's SU into SU_RC_IDLE_ESCALATION_LEVEL_1 and restarts
+ * the component. If the component has already used up its restarts, the
+ * SU is instead moved to SU_RC_IDLE_ESCALATION_LEVEL_2 and the error is
+ * reported again through amf_su_comp_error_suspected().
+ * @param component
+ * @param recommended_recovery
+ */
+static void su_rc_enter_idle_escalation_level_1 (amf_comp_t *component,
+	SaAmfRecommendedRecoveryT recommended_recovery)
+{
+	amf_su_t *su;
+
+	ENTER("");
+
+	su = component->su;
+	su->restart_control_state = SU_RC_IDLE_ESCALATION_LEVEL_1;
+
+	if (!has_component_restarted_max_times (component, su)) {
+		comp_restart (component);
+		return;
+	}
+
+	su->restart_control_state = SU_RC_IDLE_ESCALATION_LEVEL_2;
+	amf_su_comp_error_suspected (su, component, recommended_recovery);
+}
+
+/**
+ * Puts the component's SU into SU_RC_IDLE_ESCALATION_LEVEL_2 and sends a
+ * component restart request to the node found through the SU's
+ * saAmfSUHostedByNode.
+ * @param component
+ * @param recommended_recovery not used by this function
+ */
+static void su_rc_enter_idle_escalation_level_2 (amf_comp_t *component,
+	SaAmfRecommendedRecoveryT recommended_recovery)
+{
+	amf_su_t *su;
+
+	ENTER("");
+
+	su = component->su;
+	su->restart_control_state = SU_RC_IDLE_ESCALATION_LEVEL_2;
+	amf_node_comp_restart_req (
+		amf_node_find (&su->saAmfSUHostedByNode), component);
+}
+
+
 /**
  * Called by a component to report a suspected error on a component
  * @param su
@@ -545,57 +897,61 @@ void amf_su_comp_error_suspected (
 	struct amf_comp *comp,
 	SaAmfRecommendedRecoveryT recommended_recovery)
 {
-	ENTER ("Comp '%s', SU '%s'", comp->name.value, su->name.value);
+	ENTER ("Comp '%s', SU '%s' %d", comp->name.value, su->name.value,
+		su->restart_control_state);
 
 	switch (su->restart_control_state) {
-		case SU_RC_ESCALATION_LEVEL_0:
- 
-			if (comp->saAmfCompRestartCount >= su->sg->saAmfSGCompRestartMax) {
-				su->restart_control_state = SU_RC_ESCALATION_LEVEL_1;
-				amf_su_comp_error_suspected (su, comp, recommended_recovery);
+		case SU_RC_IDLE_ESCALATION_LEVEL_0:
+				su_rc_enter_idle_escalation_level_1 (comp,
+					recommended_recovery);
+			break;
+
+		case SU_RC_IDLE_ESCALATION_LEVEL_1:
+			if (has_component_restarted_max_times (comp, su)) {
+				su_rc_enter_idle_escalation_level_2 (comp,
+					recommended_recovery);
 			} else {
 				comp_restart (comp);
 			}
 			break;
+		case SU_RC_IDLE_ESCALATION_LEVEL_2: {
+				amf_node_t *node = amf_node_find (&comp->su->saAmfSUHostedByNode);
+				amf_node_comp_restart_req (node, comp); 
 
-		case SU_RC_ESCALATION_LEVEL_1:
-			if (comp->saAmfCompRestartCount >= su->sg->saAmfSGCompRestartMax) {
+#ifdef COMPILE_OUT
 				if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
-					su->restart_control_state = SU_RC_ESCALATION_LEVEL_2;
-					amf_su_comp_error_suspected (su, comp, recommended_recovery);
+
+					/*
+					 * TODO: delegate to node
+					*/
+					SaNameT dn;
+					amf_comp_operational_state_set (comp, 
+						SA_AMF_OPERATIONAL_DISABLED);
+					amf_su_operational_state_set (su, 
+						SA_AMF_OPERATIONAL_DISABLED);
+					
+					amf_comp_dn_make (comp, &dn);
+					log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
+						"action:\n\t\tSU failover", dn.value);
+					amf_sg_failover_su_req (comp->su->sg, comp->su, this_amf_node);
+					return;
 				} else {
 					su_restart (comp->su);
 				}
-			} else {
-				comp_restart (comp);
-			}
-			break;
-		case SU_RC_ESCALATION_LEVEL_2:
-			if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
-
-				/*                                                              
-				 * TODO: delegate to node
-				*/
-				SaNameT dn;
-				su_operational_state_set (su, SA_AMF_OPERATIONAL_DISABLED);
-				amf_comp_operational_state_set (
-					comp, SA_AMF_OPERATIONAL_DISABLED);
-				amf_comp_dn_make (comp, &dn);
-				log_printf (LOG_NOTICE, "Error detected for '%s', recovery "
-					"action:\n\t\tSU failover", dn.value);
-				amf_sg_failover_su_req (comp->su->sg, comp->su, this_amf_node);
-				return;
-			} else {
-				su_restart (comp->su);
+#endif
+				break;
 			}
+		case SU_RC_RESTART_SU_SETTING:
+		case SU_RC_RESTART_COMP_RESTARTING:
+		case SU_RC_RESTART_COMP_SETTING:
+			/* TODO: Complete the implementation of SU defer event */
+			su_defer_event (su, comp, recommended_recovery,
+				SU_COMP_ERROR_SUSPECTED_EV); 
 			break;
-
 		default:
-			dprintf ("TODO Restarting probably, on monday");
-			dprintf ("restart_control_state = %d",
-				su->restart_control_state);
+			dprintf ("restart_control_state = %d",su->restart_control_state);
 			break;
- 	}
+	}
 }
 
 void amf_su_init (void)
@@ -603,19 +959,75 @@ void amf_su_init (void)
 	log_init ("AMF");
 }
 
-void amf_su_terminate (struct amf_su *su)
+/**
+ * @param su
+ * @return the highest saAmfCompInstantiationLevel among the components of
+ *         the SU, 0 if the SU has no components
+ */
+static int get_instantiation_max_level (amf_su_t *su)
 {
-	struct amf_comp *comp;
+	amf_comp_t *cur = su->comp_head;
+	int highest = 0;
+
+	while (cur != NULL) {
+		if (cur->saAmfCompInstantiationLevel > highest) {
+			highest = cur->saAmfCompInstantiationLevel;
+		}
+		cur = cur->next;
+	}
+	return highest;
+}
+
 
-	ENTER ("'%s'", su->name.value);
 
+/**
+ * Marks as error suspected and terminates every component in the SU whose
+ * saAmfCompInstantiationLevel equals the requested level.
+ * @param su
+ * @param current_instantiation_level level whose components are terminated
+ * @return 1 if the SU has no component on that level (nothing was
+ *         terminated), 0 if at least one termination was requested
+ */
+static int terminate_all_components_in_level (struct amf_su *su, 
+	SaUint32T current_instantiation_level)
+{
+	amf_comp_t *comp;
+	int all_components_in_level = 1;
 	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
 		/* 
-		 * Terminate all components in SU abruptly
-		*/
-		comp->error_suspected = 1;
-		amf_comp_terminate (comp);
+         * Terminate all components in instantiation level in SU
+         * abruptly.
+         */
+		if (comp->saAmfCompInstantiationLevel == current_instantiation_level) {
+			amf_comp_error_suspected_set (comp);
+			amf_comp_terminate (comp);
+			all_components_in_level = 0;
+		}
 	}
+	return all_components_in_level;
+}
+
+
+
+/**
+ * su_lowest_comp_instantiation_level_set
+ *
+ * Finds the lowest saAmfCompInstantiationLevel among the components of
+ * the SU and stores it in su->current_comp_instantiation_level.
+ * @param su
+ * @return the level that was stored; 0 if the SU has no components
+ */
+static SaUint32T su_lowest_comp_instantiation_level_set (struct amf_su *su)
+{
+	amf_comp_t *component = su->comp_head;
+	int comp_instantiation_level;
+
+	if (component == NULL) {
+		/*
+		 * An SU without components has no level to read. Use 0, the
+		 * value amf_su_new() gives the field, instead of dereferencing
+		 * a NULL list head.
+		 */
+		su->current_comp_instantiation_level = 0;
+		return 0;
+	}
+
+	comp_instantiation_level = component->saAmfCompInstantiationLevel;
+	for (; component != NULL; component = component->next) {
+		TRACE1("component->saAmfCompInstantiationLevel=%d",
+			component->saAmfCompInstantiationLevel);
+
+		if (component->saAmfCompInstantiationLevel <
+			comp_instantiation_level) {
+			comp_instantiation_level =
+				component->saAmfCompInstantiationLevel;
+		}
 	}
+	su->current_comp_instantiation_level = comp_instantiation_level;
+	return comp_instantiation_level;
+}
+
+
+/**
+ * Starts termination of the SU at its highest component instantiation
+ * level: the level is stored in su->current_comp_instantiation_level and
+ * all components on that level are terminated.
+ * @param su
+ */
+void amf_su_terminate (struct amf_su *su)
+{
+	SaUint32T top_level;
+
+	ENTER ("'%s'", su->name.value);
+
+	top_level = get_instantiation_max_level (su);
+	su->current_comp_instantiation_level = top_level;
+	terminate_all_components_in_level (su, top_level);
 }
 
 char *amf_su_dn_make (struct amf_su *su, SaNameT *name)
@@ -739,7 +1151,6 @@ SaAmfReadinessStateT amf_su_get_saAmfSUReadinessState (struct amf_su *su)
 	if ((su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED) &&
 		((su->saAmfSUPresenceState == SA_AMF_PRESENCE_INSTANTIATED) ||
 		(su->saAmfSUPresenceState == SA_AMF_PRESENCE_RESTARTING))) {
-
 		return SA_AMF_READINESS_IN_SERVICE;
 	} else if (su->saAmfSUOperState == SA_AMF_OPERATIONAL_ENABLED) {
 		return SA_AMF_READINESS_STOPPING;
@@ -784,8 +1195,8 @@ struct amf_su *amf_su_new (struct amf_sg *sg, char *name)
 	su->saAmfSUAdminState = SA_AMF_ADMIN_UNLOCKED;
 	su->saAmfSUOperState = SA_AMF_OPERATIONAL_DISABLED;
 	su->saAmfSUPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
-	su->restart_control_state = SU_RC_ESCALATION_LEVEL_0;
-
+	su->restart_control_state = SU_RC_IDLE_ESCALATION_LEVEL_0;
+	su->current_comp_instantiation_level = 0;
 	setSaNameT (&su->name, name);
 
 	return su;
@@ -838,6 +1249,8 @@ void *amf_su_serialize (struct amf_su *su, int *len)
 		buf, &size, &offset, su->clccli_path);
 	buf = amf_serialize_SaUint32T (
 		buf, &size, &offset, su->su_failover_cnt);
+	buf = amf_serialize_SaUint32T (
+		buf, &size, &offset, su->current_comp_instantiation_level);
 
 	*len = offset;
 
@@ -864,6 +1277,7 @@ struct amf_su *amf_su_deserialize (struct amf_sg *sg, char *buf)
 	tmp = amf_deserialize_SaUint32T (tmp, &su->escalation_level_history_state);
 	tmp = amf_deserialize_SaStringT (tmp, &su->clccli_path);
 	tmp = amf_deserialize_SaUint32T (tmp, &su->su_failover_cnt);
+	tmp = amf_deserialize_SaUint32T (tmp, &su->current_comp_instantiation_level);
 
 	return su;
 }

+ 2 - 0
exec/amfutil.c

@@ -174,6 +174,8 @@ static int init_recovery_on_error (struct amf_comp *comp, char *loc)
 		comp->saAmfCompRecoveryOnError = SA_AMF_APPLICATION_RESTART;
 	} else if (strcmp (loc, "cluster_reset") == 0) {
 		comp->saAmfCompRecoveryOnError = SA_AMF_CLUSTER_RESET;
+	} else if (strcmp (loc, "no_recomondation") == 0) {
+		comp->saAmfCompRecoveryOnError = SA_AMF_NO_RECOMMENDATION;
 	} else {
 		return -1;
 	}

+ 1 - 1
lib/amf.c

@@ -967,7 +967,7 @@ saAmfComponentErrorReport (
 	memcpy (&req_lib_amf_componenterrorreport.erroneousComponent, erroneousComponent,
 		sizeof (SaNameT));
 	req_lib_amf_componenterrorreport.errorDetectionTime = errorDetectionTime;
-
+	req_lib_amf_componenterrorreport.recommendedRecovery = recommendedRecovery;
     DPRINT (("start error report\n"));
 	error = saSendReceiveReply (amfInstance->response_fd,
 		&req_lib_amf_componenterrorreport,

+ 2 - 2
test/testamf1.c

@@ -188,7 +188,7 @@ void CSISetCallback (
 
 	switch (haState) {
 	case SA_AMF_HA_ACTIVE:
-		printf ("%d: Component '%s' requested to enter hastate SA_AMF_ACTIVE"
+		printf ("PID %d: Component '%s' requested to enter hastate SA_AMF_ACTIVE"
 				" for \n\tCSI '%s'\n",
 			(int)getpid(), compName->value, csiDescriptor->csiName.value);
 		response (handle, invocation, SA_AIS_OK);
@@ -222,7 +222,7 @@ void CSISetCallback (
 		break;  
          
 	case SA_AMF_HA_STANDBY:
-		printf ("%d: Component '%s' requested to enter hastate SA_AMF_STANDBY "
+		printf ("PID %d: Component '%s' requested to enter hastate SA_AMF_STANDBY "
 				"for \n\tCSI '%s'\n",
 			(int)getpid(), compName->value, csiDescriptor->csiName.value);
 		response (handle, invocation, SA_AIS_OK);