Просмотр исходного кода

defect 968
Improve worst-case failure detection time using active healthchecking


git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@836 fd59a12c-fef9-0310-b244-a6a79926bd2f

Steven Dake 20 лет назад
Родитель
Сommit
067c60c866
3 измененных файлов с 109 добавлено и 1 удалено
  1. 5 0
      exec/totem.h
  2. 15 0
      exec/totemconfig.c
  3. 89 1
      exec/totemsrp.c

+ 5 - 0
exec/totem.h

@@ -120,6 +120,11 @@ struct totem_config {
 	unsigned int net_mtu;
 	unsigned int net_mtu;
 
 
 	unsigned int threads;
 	unsigned int threads;
+	
+	int heartbeat_failures_allowed;
+	
+	/*In Milliseconds*/
+	int max_network_delay;
 };
 };
 
 
 enum totem_configuration_type {
 enum totem_configuration_type {

+ 15 - 0
exec/totemconfig.c

@@ -64,6 +64,7 @@
 #define FAIL_TO_RECV_CONST			10
 #define FAIL_TO_RECV_CONST			10
 #define	SEQNO_UNCHANGED_CONST			30
 #define	SEQNO_UNCHANGED_CONST			30
 #define MINIMUM_TIMEOUT				(int)(1000/HZ)*3
 #define MINIMUM_TIMEOUT				(int)(1000/HZ)*3
+#define MAX_NETWORK_DELAY			50 /*In milliseconds*/
 
 
 static char error_string_response[512];
 static char error_string_response[512];
 
 
@@ -254,6 +255,10 @@ extern int totem_config_read (
 				totem_config->fail_to_recv_const = atoi(loc);
 				totem_config->fail_to_recv_const = atoi(loc);
 			} else if ((loc = strstr_rs (line, "seqno_unchanged_const:"))) {
 			} else if ((loc = strstr_rs (line, "seqno_unchanged_const:"))) {
 				totem_config->seqno_unchanged_const = atoi(loc);
 				totem_config->seqno_unchanged_const = atoi(loc);
+			} else if ((loc = strstr_rs (line, "heartbeat_failures_allowed:"))) {
+				totem_config->heartbeat_failures_allowed = atoi(loc);
+			} else if ((loc = strstr_rs (line, "max_network_delay:"))) {
+				totem_config->max_network_delay = atoi(loc);
 			} else if ((loc = strstr_rs (line, "}"))) {
 			} else if ((loc = strstr_rs (line, "}"))) {
 				parse = MAIN_HEAD;
 				parse = MAIN_HEAD;
 			} else {
 			} else {
@@ -347,6 +352,16 @@ int totem_config_validate (
 		}
 		}
 	}
 	}
 
 
+	if (totem_config->max_network_delay == 0) {
+		totem_config->max_network_delay = MAX_NETWORK_DELAY;
+	}
+
+	if (totem_config->max_network_delay < MINIMUM_TIMEOUT) {
+		sprintf (local_error_reason, "The max_network_delay parameter (%d ms) may not be less then (%d ms).",
+			totem_config->max_network_delay, MINIMUM_TIMEOUT);
+		goto parse_error;
+	}
+
 	if (totem_config->token_timeout < MINIMUM_TIMEOUT) {
 	if (totem_config->token_timeout < MINIMUM_TIMEOUT) {
 		sprintf (local_error_reason, "The token timeout parameter (%d ms) may not be less then (%d ms).",
 		sprintf (local_error_reason, "The token timeout parameter (%d ms) may not be less then (%d ms).",
 			totem_config->token_timeout, MINIMUM_TIMEOUT);
 			totem_config->token_timeout, MINIMUM_TIMEOUT);

+ 89 - 1
exec/totemsrp.c

@@ -304,6 +304,8 @@ struct totemsrp_instance {
 	int my_retrans_flg_count;
 	int my_retrans_flg_count;
 
 
 	unsigned int my_high_ring_delivered;
 	unsigned int my_high_ring_delivered;
+	
+	int heartbeat_timeout;
 
 
 	/*
 	/*
 	 * Queues used to order, deliver, and recover messages
 	 * Queues used to order, deliver, and recover messages
@@ -350,6 +352,8 @@ struct totemsrp_instance {
 
 
 	poll_timer_handle memb_timer_state_commit_timeout;
 	poll_timer_handle memb_timer_state_commit_timeout;
 
 
+	poll_timer_handle timer_heartbeat_timeout;
+
 	/*
 	/*
 	 * Function and data used to log messages
 	 * Function and data used to log messages
 	 */
 	 */
@@ -424,6 +428,8 @@ struct totemsrp_instance {
 	totemrrp_handle totemrrp_handle;
 	totemrrp_handle totemrrp_handle;
 
 
 	struct totem_config *totem_config;
 	struct totem_config *totem_config;
+
+	int use_heartbeat;
 };
 };
 
 
 struct message_handlers {
 struct message_handlers {
@@ -500,6 +506,7 @@ static void memb_commit_token_endian_convert (struct memb_commit_token *in, stru
 static void memb_join_endian_convert (struct memb_join *in, struct memb_join *out);
 static void memb_join_endian_convert (struct memb_join *in, struct memb_join *out);
 static void mcast_endian_convert (struct mcast *in, struct mcast *out);
 static void mcast_endian_convert (struct mcast *in, struct mcast *out);
 static void timer_function_orf_token_timeout (void *data);
 static void timer_function_orf_token_timeout (void *data);
+static void timer_function_heartbeat_timeout (void *data);
 static void timer_function_token_retransmit_timeout (void *data);
 static void timer_function_token_retransmit_timeout (void *data);
 static void timer_function_token_hold_retransmit_timeout (void *data);
 static void timer_function_token_hold_retransmit_timeout (void *data);
 static void timer_function_merge_detect_timeout (void *data);
 static void timer_function_merge_detect_timeout (void *data);
@@ -647,6 +654,10 @@ int totemsrp_initialize (
 		"seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu);
 		"seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu);
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
 		"send threads (%d threads)\n", totem_config->threads);
 		"send threads (%d threads)\n", totem_config->threads);
+	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+		"heartbeat_failures_allowed (%d)\n", totem_config->heartbeat_failures_allowed);
+	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+		"max_network_delay (%d ms)\n", totem_config->max_network_delay);
 
 
 
 
 	queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX,
 	queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX,
@@ -663,7 +674,36 @@ int totemsrp_initialize (
 	instance->totemsrp_deliver_fn = deliver_fn;
 	instance->totemsrp_deliver_fn = deliver_fn;
 
 
 	instance->totemsrp_confchg_fn = confchg_fn;
 	instance->totemsrp_confchg_fn = confchg_fn;
+	instance->use_heartbeat = 1;
 
 
+	if ( totem_config->heartbeat_failures_allowed == 0 ) {
+		instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+			"HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0\n");
+		instance->use_heartbeat = 0;
+	}
+
+	if (instance->use_heartbeat) {
+		instance->heartbeat_timeout 
+			= (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout 
+				+ totem_config->max_network_delay;
+
+		if (instance->heartbeat_timeout >= totem_config->token_timeout) {
+			instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+				"total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)\n", 
+				instance->heartbeat_timeout,
+				totem_config->token_timeout);
+			instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+				"heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay\n");
+			instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+				"heartbeat timeout should be less than the token timeout. HeartBeat is Diabled !!\n");
+			instance->use_heartbeat = 0;
+		}
+		else {
+			instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+                		"total heartbeat_timeout (%d ms)\n", instance->heartbeat_timeout);
+		}
+	}
+	
 	totemrrp_initialize (
 	totemrrp_initialize (
 		poll_handle,
 		poll_handle,
 		&instance->totemrrp_handle,
 		&instance->totemrrp_handle,
@@ -1042,10 +1082,24 @@ static void reset_token_timeout (struct totemsrp_instance *instance) {
 		&instance->timer_orf_token_timeout);
 		&instance->timer_orf_token_timeout);
 }
 }
 
 
+static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
+        poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout);
+        poll_timer_add (instance->totemsrp_poll_handle,
+                instance->heartbeat_timeout,
+                (void *)instance,
+                timer_function_heartbeat_timeout,
+                &instance->timer_heartbeat_timeout);
+}
+
+
 static void cancel_token_timeout (struct totemsrp_instance *instance) {
 static void cancel_token_timeout (struct totemsrp_instance *instance) {
 	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
 	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
 }
 }
 
 
+static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) {
+	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout);
+}
+
 static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance)
 static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance)
 {
 {
 	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout);
 	poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout);
@@ -1125,6 +1179,14 @@ static void timer_function_orf_token_timeout (void *data)
 	}
 	}
 }
 }
 
 
+static void timer_function_heartbeat_timeout (void *data)
+{
+	struct totemsrp_instance *instance = (struct totemsrp_instance *)data;
+	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+		"HeartBeat Timer expired Invoking token loss mechanism in state %d \n", instance->memb_state);
+	timer_function_orf_token_timeout(data);
+}
+
 static void memb_timer_function_state_gather (void *data)
 static void memb_timer_function_state_gather (void *data)
 {
 {
 	struct totemsrp_instance *instance = (struct totemsrp_instance *)data;
 	struct totemsrp_instance *instance = (struct totemsrp_instance *)data;
@@ -2624,7 +2686,7 @@ if (random () % 100 < 10) {
 	forward_token = 1;
 	forward_token = 1;
 	if (instance->my_ring_id.rep.s_addr == instance->my_id.sin_addr.s_addr) {
 	if (instance->my_ring_id.rep.s_addr == instance->my_id.sin_addr.s_addr) {
 		if (instance->my_token_held) {
 		if (instance->my_token_held) {
-			forward_token = 0;
+			forward_token = 0;			
 		}
 		}
 	}
 	}
 
 
@@ -2652,6 +2714,14 @@ if (random () % 100 < 10) {
 		if (memcmp (&token->ring_id, &instance->my_ring_id,
 		if (memcmp (&token->ring_id, &instance->my_ring_id,
 			sizeof (struct memb_ring_id)) != 0) {
 			sizeof (struct memb_ring_id)) != 0) {
 
 
+			if ((forward_token)
+				&& instance->use_heartbeat) {
+				reset_heartbeat_timeout(instance);
+			} 
+			else {
+				cancel_heartbeat_timeout(instance);
+			}
+
 			return (0); /* discard token */
 			return (0); /* discard token */
 		}
 		}
 
 
@@ -2667,6 +2737,15 @@ if (random () % 100 < 10) {
 			 * token timeout, and will cause a reconfiguration to occur.
 			 * token timeout, and will cause a reconfiguration to occur.
 			 */
 			 */
 			reset_token_timeout (instance);
 			reset_token_timeout (instance);
+
+			if ((forward_token)
+				&& instance->use_heartbeat) {
+				reset_heartbeat_timeout(instance);
+			}
+			else {
+				cancel_heartbeat_timeout(instance);
+			}
+
 			return (0); /* discard token */
 			return (0); /* discard token */
 		}		
 		}		
 		transmits_allowed = TRANSMITS_ALLOWED;
 		transmits_allowed = TRANSMITS_ALLOWED;
@@ -2802,6 +2881,15 @@ printf ("I held %0.4f ms\n", ((float)tv_diff.tv_usec) / 100.0);
 		}
 		}
 		break;
 		break;
 	}
 	}
+
+	if ((forward_token)
+		&& instance->use_heartbeat) {
+		reset_heartbeat_timeout(instance);
+	}
+	else {
+		cancel_heartbeat_timeout(instance);
+	}
+
 	return (0);
 	return (0);
 }
 }