Переглянути джерело

rrp: Higher threshold in passive mode for mcast

There were too many false positives with passive mode rrp when a high
number of messages was received.

The patch adds a new configurable variable, rrp_problem_count_mcast_threshold,
which defaults to 10 times rrp_problem_count_threshold and is used as the
threshold for multicast packets in passive mode. The variable is unused in
active mode.

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Reviewed by: Steven Dake <sdake@redhat.com>
(cherry picked from commit 752239eaa1edd68695a6e40bcde60471f34a02fd)
Jan Friesse 14 роки тому
батько
коміт
4e32c3112a
5 змінених файлів з 28 додано та 2 видалено
  1. 11 0
      exec/totemconfig.c
  2. 4 2
      exec/totemrrp.c
  3. 3 0
      exec/totemsrp.c
  4. 2 0
      include/corosync/totem/totem.h
  5. 8 0
      man/corosync.conf.5

+ 11 - 0
exec/totemconfig.c

@@ -213,6 +213,8 @@ static void totem_volatile_config_read (
 
 
 	objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold);
 	objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_threshold", &totem_config->rrp_problem_count_threshold);
 
 
+	objdb_get_int (objdb,object_totem_handle, "rrp_problem_count_mcast_threshold", &totem_config->rrp_problem_count_mcast_threshold);
+
 	objdb_get_int (objdb,object_totem_handle, "rrp_autorecovery_check_timeout", &totem_config->rrp_autorecovery_check_timeout);
 	objdb_get_int (objdb,object_totem_handle, "rrp_autorecovery_check_timeout", &totem_config->rrp_autorecovery_check_timeout);
 
 
 	objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed);
 	objdb_get_int (objdb,object_totem_handle, "heartbeat_failures_allowed", &totem_config->heartbeat_failures_allowed);
@@ -667,12 +669,21 @@ int totem_config_validate (
 	if (totem_config->rrp_problem_count_threshold == 0) {
 	if (totem_config->rrp_problem_count_threshold == 0) {
 		totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT;
 		totem_config->rrp_problem_count_threshold = RRP_PROBLEM_COUNT_THRESHOLD_DEFAULT;
 	}
 	}
+	if (totem_config->rrp_problem_count_mcast_threshold == 0) {
+		totem_config->rrp_problem_count_mcast_threshold = totem_config->rrp_problem_count_threshold * 10;
+	}
 	if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
 	if (totem_config->rrp_problem_count_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
 		snprintf (local_error_reason, sizeof(local_error_reason),
 		snprintf (local_error_reason, sizeof(local_error_reason),
 			"The RRP problem count threshold (%d problem count) may not be less then (%d problem count).",
 			"The RRP problem count threshold (%d problem count) may not be less then (%d problem count).",
 			totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
 			totem_config->rrp_problem_count_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
 		goto parse_error;
 		goto parse_error;
 	}
 	}
+	if (totem_config->rrp_problem_count_mcast_threshold < RRP_PROBLEM_COUNT_THRESHOLD_MIN) {
+		snprintf (local_error_reason, sizeof(local_error_reason),
+			"The RRP multicast problem count threshold (%d problem count) may not be less then (%d problem count).",
+			totem_config->rrp_problem_count_mcast_threshold, RRP_PROBLEM_COUNT_THRESHOLD_MIN);
+		goto parse_error;
+	}
 	if (totem_config->rrp_token_expired_timeout == 0) {
 	if (totem_config->rrp_token_expired_timeout == 0) {
 		totem_config->rrp_token_expired_timeout =
 		totem_config->rrp_token_expired_timeout =
 			totem_config->token_retransmit_timeout;
 			totem_config->token_retransmit_timeout;

+ 4 - 2
exec/totemrrp.c

@@ -890,14 +890,17 @@ static void passive_monitor (
 	unsigned int max;
 	unsigned int max;
 	unsigned int i;
 	unsigned int i;
 	unsigned int min_all, min_active;
 	unsigned int min_all, min_active;
+	unsigned int threshold;
 
 
 	/*
 	/*
 	 * Monitor for failures
 	 * Monitor for failures
 	 */
 	 */
 	if (is_token_recv_count) {
 	if (is_token_recv_count) {
 		recv_count = passive_instance->token_recv_count;
 		recv_count = passive_instance->token_recv_count;
+		threshold = rrp_instance->totem_config->rrp_problem_count_threshold;
 	} else {
 	} else {
 		recv_count = passive_instance->mcast_recv_count;
 		recv_count = passive_instance->mcast_recv_count;
+		threshold = rrp_instance->totem_config->rrp_problem_count_mcast_threshold;
 	}
 	}
 
 
 	recv_count[iface_no] += 1;
 	recv_count[iface_no] += 1;
@@ -959,8 +962,7 @@ static void passive_monitor (
 
 
 	for (i = 0; i < rrp_instance->interface_count; i++) {
 	for (i = 0; i < rrp_instance->interface_count; i++) {
 		if ((passive_instance->faulty[i] == 0) &&
 		if ((passive_instance->faulty[i] == 0) &&
-			(max - recv_count[i] >
-			rrp_instance->totem_config->rrp_problem_count_threshold)) {
+		    (max - recv_count[i] > threshold)) {
 			passive_instance->faulty[i] = 1;
 			passive_instance->faulty[i] = 1;
 			poll_timer_add (rrp_instance->poll_handle,
 			poll_timer_add (rrp_instance->poll_handle,
 				rrp_instance->totem_config->rrp_autorecovery_check_timeout,
 				rrp_instance->totem_config->rrp_autorecovery_check_timeout,

+ 3 - 0
exec/totemsrp.c

@@ -857,6 +857,9 @@ int totemsrp_initialize (
 	log_printf (instance->totemsrp_log_level_debug,
 	log_printf (instance->totemsrp_log_level_debug,
 		"RRP threshold (%d problem count)\n",
 		"RRP threshold (%d problem count)\n",
 		totem_config->rrp_problem_count_threshold);
 		totem_config->rrp_problem_count_threshold);
+	log_printf (instance->totemsrp_log_level_debug,
+		"RRP multicast threshold (%d problem count)\n",
+		totem_config->rrp_problem_count_mcast_threshold);
 	log_printf (instance->totemsrp_log_level_debug,
 	log_printf (instance->totemsrp_log_level_debug,
 		"RRP automatic recovery check timeout (%d ms)\n",
 		"RRP automatic recovery check timeout (%d ms)\n",
 		totem_config->rrp_autorecovery_check_timeout);
 		totem_config->rrp_autorecovery_check_timeout);

+ 2 - 0
include/corosync/totem/totem.h

@@ -143,6 +143,8 @@ struct totem_config {
 
 
 	unsigned int rrp_problem_count_threshold;
 	unsigned int rrp_problem_count_threshold;
 
 
+	unsigned int rrp_problem_count_mcast_threshold;
+
 	unsigned int rrp_autorecovery_check_timeout;
 	unsigned int rrp_autorecovery_check_timeout;
 
 
 	char rrp_mode[TOTEM_RRP_MODE_BYTES];
 	char rrp_mode[TOTEM_RRP_MODE_BYTES];

+ 8 - 0
man/corosync.conf.5

@@ -471,6 +471,14 @@ may occur.
 
 
 The default is 10 problem counts.
 The default is 10 problem counts.
 
 
+.TP
+rrp_problem_count_mcast_threshold
+This specifies the number of times a problem is detected with multicast before
+setting the link faulty for passive rrp mode. This variable is unused in active
+rrp mode.
+
+The default is 10 times rrp_problem_count_threshold.
+
 .TP
 .TP
 rrp_token_expired_timeout
 rrp_token_expired_timeout
 This specifies the time in milliseconds to increment the problem counter for
 This specifies the time in milliseconds to increment the problem counter for