فهرست منبع

This patch contains several corrections concerning SU and node
failover, and some hardening of the healthcheck handling.


git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@1291 fd59a12c-fef9-0310-b244-a6a79926bd2f

Lon Hohberger 19 سال پیش
والد
کامیت
9c64bb9878
9 فایلهای تغییر یافته به همراه 257 افزوده شده و 85 حذف شده
  1. 1 6
      exec/amf.c
  2. 2 2
      exec/amf.h
  3. 63 14
      exec/amfapp.c
  4. 0 1
      exec/amfcluster.c
  5. 72 17
      exec/amfcomp.c
  6. 3 2
      exec/amfnode.c
  7. 108 39
      exec/amfsg.c
  8. 6 2
      exec/amfsu.c
  9. 2 2
      exec/util.c

+ 1 - 6
exec/amf.c

@@ -2300,12 +2300,7 @@ send_response:
 	res_lib.header.id = MESSAGE_RES_AMF_RESPONSE;
 	res_lib.header.size = sizeof (struct res_lib_amf_response);
 	res_lib.header.error = retval;
-
-//	ENTER ("");
-
-	if (openais_conn_send_response (conn, &res_lib, sizeof (res_lib)) != 0) {
-		openais_exit_error (AIS_DONE_FATAL_ERR);
-	}
+	openais_conn_send_response (conn, &res_lib, sizeof (res_lib));
 end:
 	return;
 }

+ 2 - 2
exec/amf.h

@@ -746,7 +746,7 @@ extern void amf_cluster_assign_workload (struct amf_cluster *cluster);
 
 /* Response event methods */
 extern void amf_cluster_application_started (
-	struct amf_cluster *cluster, struct amf_application *app);
+	amf_cluster_t *cluster, amf_application_t *app);
 extern void amf_cluster_application_workload_assigned (
 	struct amf_cluster *cluster, struct amf_application *app);
 
@@ -800,7 +800,7 @@ extern struct amf_sg *amf_sg_deserialize (
  * @param node - !NULL start all SUs in the SG for the specified
  *             node.
  */
-extern void amf_sg_start (struct amf_sg *sg, struct amf_node *node);
+extern int amf_sg_start (struct amf_sg *sg, struct amf_node *node);
 
 /**
  * Assign SIs on a certain dependency level to SUs

+ 63 - 14
exec/amfapp.c

@@ -137,11 +137,17 @@ typedef struct application_event {
 	amf_node_t	*node;
 } application_event_t;
 
+static	int is_cluster_start(amf_node_t *node_to_start)
+{
+	return node_to_start == NULL;
+}
+
 static void application_defer_event (
 	amf_application_event_type_t event_type, amf_application_t *app, 
 	amf_node_t *node) 
 {
 	application_event_t app_event = {event_type, app, node};
+	ENTER("");
 	amf_fifo_put (event_type, &app->deferred_events, 
 		sizeof (application_event_t), &app_event);
 }
@@ -215,15 +221,53 @@ static int all_sg_assigned (struct amf_application *app)
 	return all_sg_assigned;
 }
 
+static void start_all_sg_for_cluster (amf_application_t *app)
+{
+	amf_sg_t *sg;
+	int su_to_instantiate = 0;
+	for (sg = app->sg_head; sg != NULL; sg = sg->next) {
+		su_to_instantiate += amf_sg_start (sg, NULL);
+	}
+
+	if (su_to_instantiate == 0) {
+		amf_cluster_application_started (app->cluster, app);
+	}
+}
+
+static void timer_function_cluster_application_started (void* app)
+{
+	ENTER("");
+	amf_application_t *application = (amf_application_t*)app;
+	amf_cluster_application_started (application->cluster, application);
+}
+
+static void timer_function_node_application_started (void* app)
+{
+	ENTER("");
+	amf_application_t *application = (amf_application_t*)app;
+	amf_node_application_started (application->node_to_start, application);
+}
 static void application_enter_starting_sgs (struct amf_application *app, 
 	struct amf_node *node)
 {
 	amf_sg_t *sg = 0;
+	int su_to_instantiate = 0;
 	app->node_to_start = node;
 	app->acsm_state = APP_AC_STARTING_SGS;
 
 	for (sg = app->sg_head; sg != NULL; sg = sg->next) {
-		amf_sg_start (sg, node);
+		su_to_instantiate += amf_sg_start (sg, node);
+	}
+
+	if (su_to_instantiate == 0) {
+		app->acsm_state = APP_AC_STARTED;
+		if (is_cluster_start (app->node_to_start)) {
+			amf_call_function_asynchronous (
+				timer_function_cluster_application_started, app);
+		} else {
+			amf_call_function_asynchronous (
+				timer_function_node_application_started, app);
+		}   
 	}
 }
 
@@ -270,8 +314,6 @@ static void application_enter_workload_assigned (amf_application_t *app)
 void amf_application_start (
 	struct amf_application *app, struct amf_node *node)
 {
-	struct amf_sg *sg;
-
 	ENTER ("'%s'", app->name.value);
 	assert (app != NULL);
 	switch (app->acsm_state) {
@@ -279,21 +321,20 @@ void amf_application_start (
 			application_enter_starting_sgs (app, node);
 			break;
 		case APP_AC_STARTING_SGS:
-			if (app->node_to_start == node) {
-				for (sg = app->sg_head; sg != NULL; sg = sg->next) {
-					amf_sg_start (sg, node);
-				}
-			} else {
+			if (is_cluster_start (app->node_to_start)) {
+				start_all_sg_for_cluster (app);
+			} else { /*is_not_cluster_start*/
 				application_defer_event (APPLICATION_START_EV, app , node);
 			}
 			break;
 		case APP_AC_STARTED:
-			/* TODO: Recall deferred events */
-			app->node_to_start = node;
-			app->acsm_state = APP_AC_STARTING_SGS;
-			for (sg = app->sg_head; sg != NULL; sg = sg->next) {
-				amf_sg_start (sg, node);
+			if (is_cluster_start (app->node_to_start)) {
+				app->acsm_state = APP_AC_STARTING_SGS;
+				start_all_sg_for_cluster (app);
+			} else { /*is_not_cluster_start*/
+				application_defer_event (APPLICATION_START_EV, app , node);
 			}
+
 			break;
 		case APP_AC_ASSIGNING_WORKLOAD:
 			log_printf (LOG_LEVEL_ERROR, "Request to start application"
@@ -321,8 +362,14 @@ void amf_application_assign_workload (struct amf_application *app,
 
 	assert (app != NULL);
 	app->node_to_start = node;
+	ENTER("app->acsm_state = %d",app->acsm_state);
 
 	switch (app->acsm_state) {
+		case APP_AC_STARTING_SGS:
+			if (is_cluster_start (node)) {
+				application_enter_assigning_workload (app);
+			}
+			break;
 		case APP_AC_WORKLOAD_ASSIGNED:
 			application_enter_assigning_workload (app);
 			break;
@@ -348,6 +395,7 @@ void amf_application_assign_workload (struct amf_application *app,
 			/*
 			 * Calling object has violated the contract !
 			 */
+			dprintf ("acsm_state = %d",app->acsm_state);
 			assert (0);
 			break;
 	}
@@ -359,7 +407,7 @@ void amf_application_assign_workload (struct amf_application *app,
 void amf_application_sg_started (struct amf_application *app, struct amf_sg *sg,
 		struct amf_node *node)
 {
-	ENTER ("'%s'", app->name.value);
+	ENTER ("'%s %s'", app->name.value, sg->name.value);
 	
 	assert (app != NULL);
 
@@ -417,6 +465,7 @@ struct amf_application *amf_application_new (struct amf_cluster *cluster) {
 	app->next = cluster->application_head;
 	cluster->application_head = app;
 	app->acsm_state = APP_AC_UNINSTANTIATED;
+	app->node_to_start = NULL;
 	return app;
 }
 

+ 0 - 1
exec/amfcluster.c

@@ -407,7 +407,6 @@ void amf_cluster_application_started (
 	struct amf_cluster *cluster, struct amf_application *application)
 {
 	ENTER ("application '%s' started", application->name.value);
-
 	switch (cluster->acsm_state) {
 		case CLUSTER_AC_STARTING_APPLICATIONS:
 			if (cluster_applications_started_instantiated (cluster)) {

+ 72 - 17
exec/amfcomp.c

@@ -1,8 +1,8 @@
 /** @file amfcomp.c
  * 
- * Copyright (c) 2002-2006 MontaVista Software, Inc.
- * Copyright (c) 2006 Sun Microsystems, Inc.
  * Copyright (c) 2006 Ericsson AB.
+ * Copyright (c) 2002-2006 MontaVista Software, Inc.
+ * Copyright (c) 2006 Sun Microsystems, Inc. Copyright (c) 2006
  *
  * All rights reserved.
  *
@@ -190,6 +190,7 @@ static void lib_csi_set_request (
 	struct amf_comp *comp,
 	struct amf_csi_assignment *csi_assignment);
 
+
 /*
  * Life cycle functions
  */
@@ -233,6 +234,13 @@ struct invocation {
 static struct invocation *invocation_entries = 0;
 static int invocation_entries_size = 0;
 
+static int is_not_instantiating_or_instantiated_or_restarting (amf_comp_t *comp)
+{
+	return (!(comp->saAmfCompPresenceState == SA_AMF_PRESENCE_INSTANTIATING ||
+			  comp->saAmfCompPresenceState == SA_AMF_PRESENCE_INSTANTIATED ||
+			  comp->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING));
+}
+
 static int invocation_create (
 	int interface, 
 	void *data)
@@ -308,8 +316,7 @@ static void report_error_suspected (
 	SaAmfRecommendedRecoveryT recommended_recovery)
 {
 	comp->error_suspected = 1;
-	amf_su_comp_error_suspected (
-		comp->su, comp, recommended_recovery);
+	amf_su_comp_error_suspected (comp->su, comp, recommended_recovery);
 }
 
 
@@ -350,9 +357,9 @@ static void *clc_command_run (void *context)
 				" %d - %s\n", pid, WEXITSTATUS(status),
 				strerror (WEXITSTATUS(status)));
 			/*                                                              
-			 * TODO: remove this and handle properly later...
+             * Healthcheck timout will expire laterfore the component
+             * and this will lead to Intantiation failed for the component.
 			 */
-			openais_exit_error (AIS_DONE_FATAL_ERR);
 		}
 		if (WIFSIGNALED (status) != 0) {
 			fprintf (stderr, "Error: CLC_CLI (%d) failed with exit status:"
@@ -360,7 +367,12 @@ static void *clc_command_run (void *context)
 			/*                                                              
 			 * TODO: remove this and handle properly later...
 			 */
-			openais_exit_error (AIS_DONE_FATAL_ERR);
+
+			/*                                                              
+			 * Healthcheck timout will expire laterfore the component
+			 * and this will lead to Intantiation failed for the component.
+			 */
+
 		}
 		xprintf ("process (%d) finished with %x\n", pid, status);
 		if (clc_command_run_data->completion_callback) {
@@ -483,6 +495,7 @@ static void amf_comp_instantiate_tmo (void *component)
 
 static void start_component_instantiate_timer (struct amf_comp *component)
 {
+	ENTER("%s",component->name.value);
 	poll_timer_add (aisexec_poll_handle, 
 		component->saAmfCompInstantiateTimeout,
 		component,
@@ -726,6 +739,7 @@ struct amf_comp *amf_comp_new(struct amf_su *su, char *name)
 
 	comp->saAmfCompOperState = SA_AMF_OPERATIONAL_DISABLED;
 	comp->saAmfCompPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
+	comp->error_suspected = 0;
 	setSaNameT (&comp->name, name);
 
 	return comp;
@@ -1009,6 +1023,14 @@ static void mcast_healthcheck_tmo_event (
 {
 	struct req_exec_amf_healthcheck_tmo req_exec;
 	struct iovec iovec;
+	if (healthcheck->active == 0) {
+		log_printf (LOG_ERR, "Healthcheck timeout: ignored key = %s, "
+							 "due to wrong state = %d, comp = %s",
+			healthcheck->safHealthcheckKey.key, 
+			healthcheck->comp->saAmfCompPresenceState, 
+			healthcheck->comp->name.value);
+		goto out;
+	}
 	req_exec.header.size = sizeof (struct req_exec_amf_healthcheck_tmo);
 	req_exec.header.id = SERVICE_ID_MAKE (AMF_SERVICE,
 		MESSAGE_REQ_EXEC_AMF_HEALTHCHECK_TMO);
@@ -1021,6 +1043,8 @@ static void mcast_healthcheck_tmo_event (
 
 	assert (totempg_groups_mcast_joined (openais_group_handle,
 		&iovec, 1, TOTEMPG_AGREED) == 0);
+out:
+	return;
 }
 
 /**
@@ -1166,7 +1190,9 @@ static void lib_csi_set_request (
 
 static void stop_component_instantiate_timer (struct amf_comp *component)
 {
-   if (component->instantiate_timeout_handle) {
+	ENTER("%s",component->name.value);
+
+	if (component->instantiate_timeout_handle) {
 		dprintf ("Stop component instantiate timer");
 		poll_timer_delete (aisexec_poll_handle, 
 			component->instantiate_timeout_handle);
@@ -1249,7 +1275,7 @@ void amf_comp_cleanup_completed (struct amf_comp *comp)
 
 	/* clear error suspected flag, component is terminated now */
 	comp->error_suspected = 0;
-
+	
 	if (comp->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING) {
 		amf_comp_instantiate (comp);
 	} else {
@@ -1275,6 +1301,15 @@ SaAisErrorT amf_comp_healthcheck_start (
 {
 	struct amf_healthcheck *healthcheck;
 	SaAisErrorT error = SA_AIS_OK;
+	
+	if (is_not_instantiating_or_instantiated_or_restarting (comp)) {
+		log_printf (LOG_ERR, "Healthcheckstart: ignored key = %s, "
+							 "due to wrong state = %d, comp = %s",
+			healthcheckKey->key, comp->saAmfCompPresenceState, comp->name.value);
+		error = SA_AIS_OK;
+		goto error_exit;	
+	}
+		
 
 	healthcheck = amf_comp_find_healthcheck (comp, healthcheckKey);
 	if (healthcheck == 0) {
@@ -1501,6 +1536,17 @@ int amf_comp_response_1 (
 				SaNameT name;
 				TRACE7 ("Healthcheck response from '%s': %d",
 					amf_comp_dn_make (healthcheck->comp, &name), error);
+					
+				if (is_not_instantiating_or_instantiated_or_restarting(
+					healthcheck->comp)) {
+					log_printf (LOG_ERR, "HealthcheckResponse: ignored for key = %s, "
+										 "due to wrong state = %d comp = %s",
+						healthcheck->safHealthcheckKey.key, 
+						healthcheck->comp->saAmfCompPresenceState,
+						healthcheck->comp->name.value);
+					*retval = SA_AIS_OK;
+					return 0;  /* do not multicast event */
+				}
 
 				if (healthcheck->invocationType == SA_AMF_HEALTHCHECK_AMF_INVOKED) {
 				/* the response was on time, delete supervision timer */
@@ -1634,7 +1680,7 @@ void amf_comp_hastate_set (
 	struct amf_csi_assignment *csi_assignment)
 {
 	ENTER ("'%s'", csi_assignment->csi->name.value);
-
+	
 	assert (component != NULL && csi_assignment != NULL);
 
 
@@ -1644,6 +1690,8 @@ void amf_comp_hastate_set (
 		if (csi_assignment->requested_ha_state == SA_AMF_HA_QUIESCED) {
 			csi_assignment->saAmfCSICompHAState = csi_assignment->requested_ha_state;
 		} else {
+			dprintf ("csi_assignment->requested_ha_state = %d", 
+				component->error_suspected);
 			assert (0);
 		}
 	}
@@ -1730,6 +1778,13 @@ SaAisErrorT amf_comp_healthcheck_confirm (
 	SaAisErrorT error = SA_AIS_OK;
 
 	healthcheck = amf_comp_find_healthcheck (comp, healthcheckKey);
+	if (is_not_instantiating_or_instantiated_or_restarting(comp)) {
+		log_printf (LOG_ERR, "HealthcheckConfirm: ignored for key = %s, "
+							 "due to wrong state = %d, comp = %s",
+			healthcheckKey->key, comp->saAmfCompPresenceState, comp->name.value);
+		error = SA_AIS_OK;
+		goto out;
+	}
 	if (healthcheck == NULL) {
 		log_printf (LOG_ERR, "Healthcheckstop: Healthcheck '%s' not found",
 			healthcheckKey->key);
@@ -1753,7 +1808,7 @@ SaAisErrorT amf_comp_healthcheck_confirm (
 	} else {
 		error = SA_AIS_ERR_INVALID_PARAM;
 	}
-
+out:
 	return error;
 }
 
@@ -1839,22 +1894,24 @@ void amf_comp_node_left (struct amf_comp *component)
 	int change_pending = 0;
 	struct amf_csi_assignment *csi_assignment;
 
-	ENTER("");
+	ENTER("saAmfCompPresenceState = %d", component->saAmfCompPresenceState);
+	component->error_suspected = 0;
 	if (component->saAmfCompPresenceState == SA_AMF_PRESENCE_INSTANTIATING ||
 		component->saAmfCompPresenceState == SA_AMF_PRESENCE_RESTARTING ||
 		component->saAmfCompPresenceState == SA_AMF_PRESENCE_TERMINATING) {
 		change_pending = 1;
+
 	}
 
 	component->saAmfCompPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
 
 	if (amf_su_presence_state_all_comps_in_su_are_set (component->su,
-		SA_AMF_PRESENCE_UNINSTANTIATED) != 0) {
+		SA_AMF_PRESENCE_UNINSTANTIATED)) {
 		component->su->saAmfSUPresenceState = SA_AMF_PRESENCE_UNINSTANTIATED;
 	}
 
 	if (change_pending) {
-		change_pending =0;
+		change_pending = 0;
 		amf_su_comp_state_changed ( component->su,
 			component,
 			SA_AMF_PRESENCE_STATE,
@@ -1868,7 +1925,7 @@ void amf_comp_node_left (struct amf_comp *component)
 	component->saAmfCompOperState = SA_AMF_OPERATIONAL_DISABLED;
 	if (change_pending) {
 		change_pending =0;
-		amf_su_comp_state_changed ( component->su,
+		amf_su_comp_state_changed (component->su,
 			component,
 			SA_AMF_OP_STATE,
 			SA_AMF_OPERATIONAL_DISABLED);
@@ -1884,8 +1941,6 @@ void amf_comp_node_left (struct amf_comp *component)
 		csi_assignment = amf_comp_get_next_csi_assignment (
 			component, csi_assignment);
 	}
-
-
 }
 
 /**

+ 3 - 2
exec/amfnode.c

@@ -335,8 +335,9 @@ void amf_node_sync_ready (struct amf_node *node)
 		case NODE_ACSM_REPAIR_NEEDED:
 			break;
 		default:
-			log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready()called in state"
-				" = %d (should have been deferred)", node->acsm_state);
+			log_printf (LOG_LEVEL_ERROR, "amf_node_sync_ready() was called in "
+										 "state = %d (should have been deferred)",
+				node->acsm_state);
 			openais_exit_error (AIS_DONE_FATAL_ERR);
 			break;
 

+ 108 - 39
exec/amfsg.c

@@ -157,6 +157,8 @@ static void standby_su_activated_cbfn (
 
 static void dependent_si_deactivated_cbfn (
 	struct amf_si_assignment *si_assignment, int result);
+static void acsm_enter_removing_standby_assignments (amf_sg_t *sg);
+static void acsm_enter_assigning_standby_to_spare (amf_sg_t *sg);
 
 static const char *sg_event_type_text[] = {
 	"Unknown",
@@ -177,6 +179,11 @@ typedef struct sg_event {
 	amf_node_t *node;
 } sg_event_t;
 
+static	int is_cluster_start(amf_node_t *node_to_start) 
+{
+	return node_to_start == NULL;
+}
+
 static void sg_set_event (amf_sg_event_type_t sg_event_type,
 	amf_sg_t *sg, amf_su_t *su, amf_comp_t *comp, amf_node_t * node,
 	sg_event_t *sg_event)
@@ -191,6 +198,7 @@ static void sg_set_event (amf_sg_event_type_t sg_event_type,
 static void sg_defer_event (amf_sg_event_type_t event_type,
 	sg_event_t *sg_event)
 {
+	ENTER("Defered event = %d", event_type);
 	amf_fifo_put (event_type, &sg_event->sg->deferred_events,
 		sizeof (sg_event_t),
 		sg_event);
@@ -200,7 +208,7 @@ static void sg_recall_deferred_events (amf_sg_t *sg)
 {
 	sg_event_t sg_event;
 
-	ENTER ("SG: %s", sg->name.value);
+	ENTER ("%s", sg->name.value);
 	if (amf_fifo_get (&sg->deferred_events, &sg_event)) {
 		switch (sg_event.event_type) {
 			case SG_FAILOVER_SU_EV:
@@ -216,6 +224,7 @@ static void sg_recall_deferred_events (amf_sg_t *sg)
 			case SG_START_EV:
 			case SG_AUTO_ADJUST_EV:
 			default:
+				dprintf("event_type = %d", sg_event.event_type);
 				break;
 		}
 	}
@@ -322,7 +331,7 @@ static int has_any_su_in_scope_active_workload (struct amf_sg *sg)
 	return(*sus == NULL);
 }
 
-static int is_any_si_in_scope_assigned_stanby (struct amf_sg *sg)
+static int is_any_si_in_scope_assigned_standby (struct amf_sg *sg)
 {
 	struct amf_si **sis= sg->recovery_scope.sis;
 	struct amf_si_assignment *si_assignment;
@@ -364,7 +373,7 @@ static int is_any_si_in_scope_assigned_stanby (struct amf_sg *sg)
 static void acsm_enter_terminating_suspected (struct amf_sg *sg)
 {
 	struct amf_su **sus= sg->recovery_scope.sus;
-
+	ENTER("%s",sg->name.value);
 	sg->avail_state = SG_AC_TerminatingSuspected;
 	/* 
 	* Terminate suspected SU(s)
@@ -427,7 +436,7 @@ static void dependent_si_deactivated_cbfn2 (struct amf_sg *sg)
 		acsm_enter_terminating_suspected (sg);
 	} else {
 		delete_si_assignments_in_scope(sg);         
-		acsm_enter_activating_standby (sg);
+		acsm_enter_removing_standby_assignments (sg);
 	}
 }
 
@@ -588,7 +597,7 @@ static void acsm_enter_activating_standby (struct amf_sg *sg)
 
 	if (is_no_standby_activated) {
 
-		acsm_enter_activating_standby (sg);
+		acsm_enter_assigning_standby_to_spare (sg);
 	}
 }
 
@@ -719,7 +728,7 @@ static void assign_si_assumed_cbfn (
 					si_assignment->saAmfSISUHAState = SA_AMF_HA_STANDBY;
 				}
 			}
-				break;
+			break;
 		default:
 			dprintf ("%d, %d, %d", sg->avail_state, si_assignment_cnt,
 				confirmed_assignments);
@@ -1355,6 +1364,7 @@ static int no_su_has_presence_state (
 	return no_su_has_presence_state;
 }
 
+#if COMPILE_OUT
 static int all_su_in_scope_has_presence_state (
 	struct amf_sg *sg, SaAmfPresenceStateT state)
 {
@@ -1368,16 +1378,44 @@ static int all_su_in_scope_has_presence_state (
 	}
 	return(*sus == NULL);
 }
-
-static int all_su_in_scope_has_either_presence_state (
+#endif
+static int all_su_in_scope_has_either_two_presence_state (
 	amf_sg_t *sg, 
 	SaAmfPresenceStateT state1, 
 	SaAmfPresenceStateT state2)
 {
-	return all_su_in_scope_has_presence_state (sg, state1) ||
-		all_su_in_scope_has_presence_state (sg, state2); 
+	struct amf_su **sus = sg->recovery_scope.sus;
+
+	while (*sus != NULL) {
+		if (!((*sus)->saAmfSUPresenceState == state1 || 
+			(*sus)->saAmfSUPresenceState == state2)) {
+			break;
+		} 
+		sus++;
+	}
+	return (*sus == NULL);
 }
 
+
+static int all_su_in_scope_has_either_of_three_presence_state (amf_sg_t *sg, 
+	SaAmfPresenceStateT state1, SaAmfPresenceStateT state2, 
+	SaAmfPresenceStateT state3)
+{
+	struct amf_su **sus = sg->recovery_scope.sus;
+
+	while (*sus != NULL) {
+		if (!((*sus)->saAmfSUPresenceState ==  state1  || 
+			(*sus)->saAmfSUPresenceState   ==  state2  || 
+			(*sus)->saAmfSUPresenceState   ==  state3)) {
+			break;
+		} 
+		sus++;
+	}
+	return (*sus == NULL);
+}
+
+
+
 /**
  * Get number of SIs protected by the specified SG.
  * @param sg
@@ -1830,6 +1868,7 @@ void amf_sg_failover_node_req (struct amf_sg *sg, struct amf_node *node)
 		case SG_AC_AssigningAutoAdjust:
 		case SG_AC_AssigningStandBy:
 		case SG_AC_WaitingAfterOperationFailed:
+		case SG_AC_RemovingStandbyAssignments:
 			sg_set_event (SG_FAILOVER_NODE_EV, sg, 0, 0, node, &sg_event); 
 			sg_defer_event (SG_FAILOVER_NODE_EV, &sg_event); 
 			break;
@@ -1840,17 +1879,19 @@ void amf_sg_failover_node_req (struct amf_sg *sg, struct amf_node *node)
 	}
 }
 
-void amf_sg_start (struct amf_sg *sg, struct amf_node *node)
+int amf_sg_start (struct amf_sg *sg, struct amf_node *node)
 {
-	sg_event_t sg_event;
 
 	sg->recovery_scope.event_type = SG_START_EV;
+	ENTER ("'%s'", sg->name.value);
+	int instantiated_sus = 0;
+
 	switch (sg->avail_state) {
+		case SG_AC_InstantiatingServiceUnits:
 		case SG_AC_Idle: { 
 
 				amf_su_t *su;
 				sg_avail_control_state_t old_avail_state = sg->avail_state;
-				int instantiated_sus = 0;
 
 				ENTER ("'%s'", sg->name.value);
 
@@ -1863,16 +1904,12 @@ void amf_sg_start (struct amf_sg *sg, struct amf_node *node)
 					(instantiated_sus < sg->saAmfSGNumPrefInserviceSUs);
 					su = su->next) {
 
-					if (node == NULL) {
-
-						/*
-						 *  Cluster start
-						 */
+					if (is_cluster_start (node)) {
 
 						amf_su_instantiate (su);
 						instantiated_sus++;
 
-					} else {
+					} else { /*is_not_cluster_start*/
 
 						/*
 						 * Node start, match if SU is hosted on the
@@ -1892,10 +1929,6 @@ void amf_sg_start (struct amf_sg *sg, struct amf_node *node)
 				}
 				break;
 			}
-		case SG_AC_InstantiatingServiceUnits:
-			sg_set_event (SG_START_EV, sg, 0, 0, node, &sg_event);
-			sg_defer_event (SG_START_EV, &sg_event);
-			break;
 		case SG_AC_DeactivatingDependantWorkload:
 		case SG_AC_TerminatingSuspected:
 		case SG_AC_ActivatingStandby:
@@ -1908,10 +1941,12 @@ void amf_sg_start (struct amf_sg *sg, struct amf_node *node)
 		case SG_AC_AssigningAutoAdjust:
 		case SG_AC_AssigningStandBy:
 		case SG_AC_WaitingAfterOperationFailed:
+		case SG_AC_RemovingStandbyAssignments:
 		default:
 			assert (0);
 			break;
 	}
+	return instantiated_sus;
 }
 
 void amf_sg_su_state_changed (struct amf_sg *sg, 
@@ -1928,8 +1963,11 @@ void amf_sg_su_state_changed (struct amf_sg *sg,
 					acsm_enter_idle (sg);
 				}
 			} else if (sg->avail_state == SG_AC_ReparingSu) {
-				if (all_su_in_scope_has_presence_state(su->sg,
-					SA_AMF_PRESENCE_INSTANTIATED)) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
 					su->sg->avail_state = SG_AC_AssigningStandBy;
 					if (assign_si (sg, 0) == 0) {
 						acsm_enter_idle (sg);
@@ -1945,37 +1983,49 @@ void amf_sg_su_state_changed (struct amf_sg *sg,
 			}
 		} else if (state == SA_AMF_PRESENCE_UNINSTANTIATED) {
 			if (sg->avail_state == SG_AC_TerminatingSuspected) {
-				if (all_su_in_scope_has_either_presence_state (sg,
+				if (all_su_in_scope_has_either_two_presence_state (sg,
 					SA_AMF_PRESENCE_UNINSTANTIATED,
 					SA_AMF_PRESENCE_TERMINATION_FAILED)) {
-					
+
 					delete_si_assignments_in_scope (sg);
 
-					if (is_any_si_in_scope_assigned_stanby (sg)) {
+					if (is_any_si_in_scope_assigned_standby (sg)) {
 						remove_all_suspected_sus (sg);
 						acsm_enter_removing_standby_assignments (sg);
-					} else {
+					} else { /*is_no_si_in_scope_assigned_standby*/
 						remove_all_suspected_sus (sg);
 						acsm_enter_assigning_standby_to_spare (sg);
 					}
 				}
-			} else {
-				assert (0);
+			} else if (sg->avail_state == SG_AC_ReparingSu) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
+					su->sg->avail_state = SG_AC_AssigningStandBy;
+					if (assign_si (sg, 0) == 0) {
+						acsm_enter_idle (sg);
+					}
+				} else {
+					dprintf("%d",sg->avail_state);
+					assert (0);
+				}
 			}
 		} else if (state == SA_AMF_PRESENCE_TERMINATION_FAILED) {
 
-			if (all_su_in_scope_has_either_presence_state (sg,
+			if (all_su_in_scope_has_either_two_presence_state (sg,
 				SA_AMF_PRESENCE_UNINSTANTIATED,
 				SA_AMF_PRESENCE_TERMINATION_FAILED) && 
-				is_any_si_in_scope_assigned_stanby (sg)) {
+				is_any_si_in_scope_assigned_standby (sg)) {
 				remove_all_suspected_sus (sg);
 
 				acsm_enter_removing_standby_assignments (sg);
 
-			} else if (all_su_in_scope_has_either_presence_state (sg,
+			} else if (all_su_in_scope_has_either_two_presence_state (sg,
 				SA_AMF_PRESENCE_UNINSTANTIATED,
 				SA_AMF_PRESENCE_TERMINATION_FAILED) && 
-				!is_any_si_in_scope_assigned_stanby (sg)){
+				!is_any_si_in_scope_assigned_standby (sg)) {
 
 				remove_all_suspected_sus (sg);
 				acsm_enter_assigning_standby_to_spare (sg);
@@ -1990,15 +2040,30 @@ void amf_sg_su_state_changed (struct amf_sg *sg,
 			if (sg->avail_state == SG_AC_InstantiatingServiceUnits) {
 				if (no_su_has_presence_state(sg, sg->node_to_start, 
 					SA_AMF_PRESENCE_INSTANTIATING)) {
-
 					acsm_enter_idle (sg);
 				}
+
+			} else if (sg->avail_state == SG_AC_ReparingSu) {
+				if (all_su_in_scope_has_either_of_three_presence_state(
+					su->sg,
+					SA_AMF_PRESENCE_INSTANTIATED,
+					SA_AMF_PRESENCE_INSTANTIATION_FAILED,
+					SA_AMF_PRESENCE_UNINSTANTIATED)) {
+					su->sg->avail_state = SG_AC_AssigningStandBy;
+					if (assign_si (sg, 0) == 0) {
+						acsm_enter_idle (sg);
+					}
+				}
+			} else {
+                /* TODO: Insert the assert (0) until solving defers in SU   */
+				dprintf("sg->avail_state = %d, su instantiation state = %d",
+					sg->avail_state, state);
 			}
 		} else {
+			dprintf("sg->avail_state = %d, su instantiation state = %d",
+					sg->avail_state, state);
 			assert (0);
 		}
-	} else {
-		assert (0);
 	}
 }
 
@@ -2010,7 +2075,7 @@ void amf_sg_init (void)
 void amf_sg_failover_su_req (struct amf_sg *sg, struct amf_su *su, 
 	struct amf_node *node)
 {
-	ENTER ("");
+	ENTER ("%s", su->name.value);
 	sg_event_t sg_event;
 
 	switch (sg->avail_state) {
@@ -2035,6 +2100,7 @@ void amf_sg_failover_su_req (struct amf_sg *sg, struct amf_su *su,
 		case SG_AC_AssigningAutoAdjust:
 		case SG_AC_AssigningStandBy:
 		case SG_AC_WaitingAfterOperationFailed:
+		case SG_AC_RemovingStandbyAssignments:
 			sg_set_event (SG_FAILOVER_SU_EV, sg, su, 0, 0, &sg_event); 
 			sg_defer_event (SG_FAILOVER_SU_EV, &sg_event); 
 			break;
@@ -2139,6 +2205,8 @@ void *amf_sg_serialize (struct amf_sg *sg, int *len)
 		buf, &size, &offset, sg->clccli_path);
 	buf = amf_serialize_SaUint32T (
 		buf, &size, &offset, sg->avail_state);
+	buf = amf_serialize_SaUint32T (
+		buf, &size, &offset, sg->recovery_scope.event_type);
 
 	*len = offset;
 
@@ -2171,6 +2239,7 @@ struct amf_sg *amf_sg_deserialize (struct amf_application *app, char *buf)
 	tmp = amf_deserialize_SaUint32T (tmp, &sg->saAmfSGNumCurrInstantiatedSpareSUs);
 	tmp = amf_deserialize_SaStringT (tmp, &sg->clccli_path);
 	tmp = amf_deserialize_SaUint32T (tmp, &sg->avail_state);
+	tmp = amf_deserialize_SaUint32T (tmp, &sg->recovery_scope.event_type);
 
 	return sg;
 }

+ 6 - 2
exec/amfsu.c

@@ -320,6 +320,7 @@ static void su_comp_presence_state_changed (
 	switch (state) {
 		case SA_AMF_PRESENCE_INSTANTIATED:
 			switch (su->restart_control_state) {
+				case SU_RC_ESCALATION_LEVEL_1:
 				case SU_RC_ESCALATION_LEVEL_2:
 					/* 
 					 * TODO: send to node
@@ -419,6 +420,7 @@ int amf_su_presence_state_all_comps_in_su_are_set (struct amf_su *su,
 	for (comp = su->comp_head; comp != NULL; comp = comp->next) {
 		if (comp->saAmfCompPresenceState != state) {
 			all_set = 0;
+			break;
 		}
 	}
 
@@ -568,7 +570,6 @@ void amf_su_comp_error_suspected (
 				comp_restart (comp);
 			}
 			break;
-
 		case SU_RC_ESCALATION_LEVEL_2:
 			if (su->saAmfSURestartCount >= su->sg->saAmfSGSuRestartMax) {
 
@@ -590,8 +591,11 @@ void amf_su_comp_error_suspected (
 			break;
 
 		default:
+			dprintf ("TODO Restarting probably, on monday");
+			dprintf ("restart_control_state = %d",
+				su->restart_control_state);
 			break;
-	}
+ 	}
 }
 
 void amf_su_init (void)

+ 2 - 2
exec/util.c

@@ -86,8 +86,8 @@ SaTimeT clust_time_now(void)
 void _openais_exit_error (
 	enum e_ais_done err, const char *file, unsigned int line)
 {
-	log_printf (LOG_LEVEL_ERROR, "AIS Executive exiting with status %d at %s:%u.\n",
-		err, file, line);
+	log_printf (LOG_LEVEL_ERROR, "AIS Executive exiting "
+								 "with status %d at %s:%u.\n", err, file, line);
 	log_flush();
 	exit (EXIT_FAILURE);
 }