Jelajahi Sumber

Add token_warning configuration option

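The token_warning option makes corosync log a warning when the token
has not been received for a configurable percentage of the token timeout.
It also adds a statistic reporting the time since the token was last received.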

Signed-off-by: Chris Walker <cwalker@cray.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
Chris Walker 7 tahun lalu
induk
melakukan
3f7d2cf6aa

+ 1 - 0
exec/coroparse.c

@@ -597,6 +597,7 @@ static int main_config_parser_cb(const char *path,
 			    (strcmp(path, "totem.token") == 0) ||
 			    (strcmp(path, "totem.token_coefficient") == 0) ||
 			    (strcmp(path, "totem.token_retransmit") == 0) ||
+			    (strcmp(path, "totem.token_warning") == 0) ||
 			    (strcmp(path, "totem.hold") == 0) ||
 			    (strcmp(path, "totem.token_retransmits_before_loss_const") == 0) ||
 			    (strcmp(path, "totem.join") == 0) ||

+ 3 - 0
exec/main.c

@@ -546,6 +546,9 @@ static void corosync_totem_stats_updater (void *data)
 		stats->srp->avg_backlog_calc = (total_backlog_calc / token_count);
 	}
 
+	stats->srp->time_since_token_last_received = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+		stats->srp->token[stats->srp->latest_token].rx;
+
 	stats_trigger_trackers();
 
 	api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL,

+ 1 - 0
exec/stats.c

@@ -96,6 +96,7 @@ struct cs_stats_conv cs_srp_stats[] = {
 	{ STAT_SRP, "recovery_token_lost",    offsetof(totemsrp_stats_t, recovery_token_lost),    ICMAP_VALUETYPE_UINT64},
 	{ STAT_SRP, "consensus_timeouts",     offsetof(totemsrp_stats_t, consensus_timeouts),     ICMAP_VALUETYPE_UINT64},
 	{ STAT_SRP, "rx_msg_dropped",         offsetof(totemsrp_stats_t, rx_msg_dropped),         ICMAP_VALUETYPE_UINT64},
+	{ STAT_SRP, "time_since_token_last_received", offsetof(totemsrp_stats_t, time_since_token_last_received), ICMAP_VALUETYPE_UINT64},
 	{ STAT_SRP, "continuous_gather",      offsetof(totemsrp_stats_t, continuous_gather),      ICMAP_VALUETYPE_UINT32},
 	{ STAT_SRP, "continuous_sendmsg_failures", offsetof(totemsrp_stats_t, continuous_sendmsg_failures), ICMAP_VALUETYPE_UINT32},
 	{ STAT_SRP, "firewall_enabled_or_nic_failure", offsetof(totemsrp_stats_t, firewall_enabled_or_nic_failure), ICMAP_VALUETYPE_UINT8},

+ 24 - 0
exec/totemconfig.c

@@ -66,6 +66,7 @@
 
 #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST	4
 #define TOKEN_TIMEOUT				1000
+#define TOKEN_WARNING				75
 #define TOKEN_COEFFICIENT			650
 #define JOIN_TIMEOUT				50
 #define MERGE_TIMEOUT				200
@@ -96,6 +97,8 @@ static void *totem_get_param_by_name(struct totem_config *totem_config, const ch
 {
 	if (strcmp(param_name, "totem.token") == 0)
 		return &totem_config->token_timeout;
+	if (strcmp(param_name, "totem.token_warning") == 0)
+		return &totem_config->token_warning;
 	if (strcmp(param_name, "totem.token_retransmit") == 0)
 		return &totem_config->token_retransmit_timeout;
 	if (strcmp(param_name, "totem.hold") == 0)
@@ -246,6 +249,8 @@ static void totem_volatile_config_read (struct totem_config *totem_config, const
 
 	totem_volatile_config_set_uint32_value(totem_config, "totem.token", deleted_key, TOKEN_TIMEOUT, 0);
 
+	totem_volatile_config_set_uint32_value(totem_config, "totem.token_warning", deleted_key, TOKEN_WARNING, 1);
+
 	if (totem_config->interfaces[0].member_count > 2) {
 		u32 = TOKEN_COEFFICIENT;
 		icmap_get_uint32("totem.token_coefficient", &u32);
@@ -323,6 +328,13 @@ static int totem_volatile_config_validate (
 		goto parse_error;
 	}
 
+	if (totem_config->token_warning > 100 || totem_config->token_warning < 0) {
+		snprintf (local_error_reason, sizeof(local_error_reason),
+			"The token warning parameter (%d%%) must be between 0 (disabled) and 100.",
+			totem_config->token_warning);
+		goto parse_error;
+	}
+
 	if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) {
 		snprintf (local_error_reason, sizeof(local_error_reason),
 			"The token retransmit timeout parameter (%d ms) may not be less than (%d ms).",
@@ -1986,6 +1998,18 @@ static void debug_dump_totem_config(const struct totem_config *totem_config)
 
 	log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)",
 	    totem_config->token_timeout, totem_config->token_retransmit_timeout);
+	if (totem_config->token_warning) {
+		uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+		log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)",
+		    token_warning_ms, totem_config->token_warning);
+		if (token_warning_ms < totem_config->token_retransmit_timeout)
+			log_printf (LOGSYS_LEVEL_DEBUG,
+				"The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+				"which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+				token_warning_ms, totem_config->token_retransmit_timeout);
+
+	} else
+		log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled");
 	log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)",
 	    totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
 	log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",

+ 59 - 0
exec/totemsrp.c

@@ -401,6 +401,8 @@ struct totemsrp_instance {
 
 	qb_loop_timer_handle timer_orf_token_timeout;
 
+	qb_loop_timer_handle timer_orf_token_warning;
+
 	qb_loop_timer_handle timer_orf_token_retransmit_timeout;
 
 	qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout;
@@ -653,6 +655,7 @@ static void memb_merge_detect_endian_convert (
 	struct memb_merge_detect *out);
 static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in);
 static void timer_function_orf_token_timeout (void *data);
+static void timer_function_orf_token_warning (void *data);
 static void timer_function_pause_timeout (void *data);
 static void timer_function_heartbeat_timeout (void *data);
 static void timer_function_token_retransmit_timeout (void *data);
@@ -883,6 +886,20 @@ int totemsrp_initialize (
 	log_printf (instance->totemsrp_log_level_debug,
 		"Token Timeout (%d ms) retransmit timeout (%d ms)",
 		totem_config->token_timeout, totem_config->token_retransmit_timeout);
+	if (totem_config->token_warning) {
+		uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+		log_printf(instance->totemsrp_log_level_debug,
+			"Token warning every %d ms (%d%% of Token Timeout)",
+			token_warning_ms, totem_config->token_warning);
+		if (token_warning_ms < totem_config->token_retransmit_timeout)
+			log_printf (LOGSYS_LEVEL_DEBUG,
+				"The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+				"which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+				token_warning_ms, totem_config->token_retransmit_timeout);
+	} else {
+		log_printf(instance->totemsrp_log_level_debug,
+			"Token warnings disabled");
+	}
 	log_printf (instance->totemsrp_log_level_debug,
 		"token hold (%d ms) retransmits before loss (%d retrans)",
 		totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
@@ -1566,6 +1583,21 @@ static void reset_pause_timeout (struct totemsrp_instance *instance)
 	}
 }
 
+/*
+ * (Re)arm the token warning timer.
+ *
+ * Deletes the current warning timer and adds a new one that expires after
+ * token_warning percent of token_timeout, both read from totem_config.
+ * A failure to add the timer is logged and otherwise ignored.
+ */
+static void reset_token_warning (struct totemsrp_instance *instance) {
+	int32_t res;
+	uint64_t warning_ms;
+
+	/*
+	 * Widen before multiplying: both config fields are unsigned int, so
+	 * the percentage product could wrap for large token timeouts.
+	 */
+	warning_ms = (uint64_t)instance->totem_config->token_warning *
+		instance->totem_config->token_timeout / 100;
+
+	qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning);
+	res = qb_loop_timer_add (instance->totemsrp_poll_handle,
+		QB_LOOP_MED,
+		warning_ms * QB_TIME_NS_IN_MSEC,
+		(void *)instance,
+		timer_function_orf_token_warning,
+		&instance->timer_orf_token_warning);
+	if (res != 0) {
+		log_printf(instance->totemsrp_log_level_error, "reset_token_warning - qb_loop_timer_add error : %d", res);
+	}
+}
+
 static void reset_token_timeout (struct totemsrp_instance *instance) {
 	int32_t res;
 
@@ -1579,6 +1611,9 @@ static void reset_token_timeout (struct totemsrp_instance *instance) {
 	if (res != 0) {
 		log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - qb_loop_timer_add error : %d", res);
 	}
+
+	if (instance->totem_config->token_warning)
+		reset_token_warning(instance);
 }
 
 static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1597,8 +1632,15 @@ static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
 }
 
 
+/*
+ * Delete the token warning timer (timer_orf_token_warning) of the given
+ * instance via qb_loop_timer_del.
+ */
+static void cancel_token_warning (struct totemsrp_instance *instance)
+{
+	qb_loop_timer_del (instance->totemsrp_poll_handle,
+		instance->timer_orf_token_warning);
+}
+
 static void cancel_token_timeout (struct totemsrp_instance *instance) {
 	qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
+
+        if (instance->totem_config->token_warning)
+                cancel_token_warning(instance);
 }
 
 static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1680,6 +1722,23 @@ static void memb_recovery_state_token_loss (struct totemsrp_instance *instance)
 	instance->stats.recovery_token_lost++;
 }
 
+/*
+ * Callback for the token warning timer.
+ *
+ * data is the struct totemsrp_instance the timer was added for.  While
+ * token_warning is non-zero, logs at notice level the difference between
+ * the current time (in ms) and the rx value of the latest token recorded
+ * in the instance stats, then re-arms the warning timer.
+ */
+static void timer_function_orf_token_warning (void *data)
+{
+	struct totemsrp_instance *instance = data;
+	uint64_t tv_diff;
+
+	/* need to protect against the case where token_warning is set to 0 dynamically */
+	if (instance->totem_config->token_warning) {
+		tv_diff = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+			instance->stats.token[instance->stats.latest_token].rx;
+		/*
+		 * Print the full 64-bit difference; narrowing it to unsigned int
+		 * and formatting with %d shows large values truncated or negative.
+		 */
+		log_printf (instance->totemsrp_log_level_notice,
+			"Token has not been received in %llu ms ", (unsigned long long) tv_diff);
+		reset_token_warning(instance);
+	} else {
+		cancel_token_warning(instance);
+	}
+}
+
 static void timer_function_orf_token_timeout (void *data)
 {
 	struct totemsrp_instance *instance = data;

+ 2 - 0
include/corosync/totem/totem.h

@@ -173,6 +173,8 @@ struct totem_config {
 	 */
 	unsigned int token_timeout;
 
+	unsigned int token_warning;
+
 	unsigned int token_retransmit_timeout;
 
 	unsigned int token_hold_timeout;

+ 1 - 0
include/corosync/totem/totemstats.h

@@ -77,6 +77,7 @@ typedef struct {
 	uint64_t rx_msg_dropped;
 	uint32_t continuous_gather;
 	uint32_t continuous_sendmsg_failures;
+	uint64_t time_since_token_last_received; // relative time
 
 	uint8_t  firewall_enabled_or_nic_failure;
 	uint32_t mtt_rx_token;

+ 8 - 0
man/corosync.conf.5

@@ -320,6 +320,14 @@ key.
 
 The default is 1000 milliseconds.
 
+.TP
+token_warning
+Specifies the interval between warnings that the token has not been received.  The
+value is a percentage of the token timeout and can be set to 0 to disable
+warnings.
+
+The default is 75%.
+
 .TP
 token_coefficient
 This value is used only when