فهرست منبع

defect 205
implement totem single ring protocol flow control algorithm


git-svn-id: http://svn.fedorahosted.org/svn/corosync/trunk@966 fd59a12c-fef9-0310-b244-a6a79926bd2f

Steven Dake 20 سال پیش
والد
کامیت
74952a56a3
6 فایلهای تغییر یافته به همراه 162 افزوده شده و 30 حذف شده
  1. 0 2
      exec/mainconfig.c
  2. 6 3
      exec/totem.h
  3. 25 4
      exec/totemconfig.c
  4. 93 14
      exec/totemsrp.c
  5. 4 0
      include/queue.h
  6. 34 7
      man/openais.conf.5

+ 0 - 2
exec/mainconfig.c

@@ -164,8 +164,6 @@ extern int openais_main_config_read (
 			continue;
 		}
 			
-		line_number += 1;
-
 		switch (parse) {
 		case MAIN_HEAD:
 			if (logging_parsed == 0 && strstr_rs (line, "logging{")) {

+ 6 - 3
exec/totem.h

@@ -125,10 +125,13 @@ struct totem_config {
 
 	unsigned int threads;
 	
-	int heartbeat_failures_allowed;
+	unsigned int heartbeat_failures_allowed;
 	
-	/*In Milliseconds*/
-	int max_network_delay;
+	unsigned int max_network_delay;
+
+	unsigned int window_size;
+
+	unsigned int max_messages;
 };
 
 enum totem_configuration_type {

+ 25 - 4
exec/totemconfig.c

@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2002-2005 MontaVista Software, Inc.
+ * Copyright (c) 2006 RedHat, Inc.
  *
  * All rights reserved.
  *
@@ -68,7 +69,9 @@
 #define FAIL_TO_RECV_CONST			50
 #define	SEQNO_UNCHANGED_CONST			30
 #define MINIMUM_TIMEOUT				(int)(1000/HZ)*3
-#define MAX_NETWORK_DELAY			50 /*In milliseconds*/
+#define MAX_NETWORK_DELAY			50
+#define WINDOW_SIZE				50
+#define MAX_MESSAGES				17
 
 static char error_string_response[512];
 
@@ -186,8 +189,6 @@ extern int totem_config_read (
 			continue;
 		}
 			
-		line_number += 1;
-
 		switch (parse) {
 		case MAIN_HEAD:
 			if (totem_parsed == 0 && strstr_rs (line, "network{")) {
@@ -265,7 +266,12 @@ extern int totem_config_read (
 				totem_config->heartbeat_failures_allowed = atoi(loc);
 			} else if ((loc = strstr_rs (line, "max_network_delay:"))) {
 				totem_config->max_network_delay = atoi(loc);
-			} else if ((loc = strstr_rs (line, "}"))) {
+			} else if ((loc = strstr_rs (line, "window_size:"))) {
+				totem_config->window_size = atoi(loc);
+			} else if ((loc = strstr_rs (line, "max_messages:"))) {
+				totem_config->max_messages = atoi(loc);
+			} else
+			if ((loc = strstr_rs (line, "}"))) {
 				parse = MAIN_HEAD;
 			} else {
 				goto parse_error;
@@ -375,6 +381,14 @@ int totem_config_validate (
 		goto parse_error;
 	}
 
+	if (totem_config->window_size == 0) {
+		totem_config->window_size = WINDOW_SIZE;
+	}
+
+	if (totem_config->max_messages == 0) {
+		totem_config->max_messages = MAX_MESSAGES;
+	}
+
 	if (totem_config->token_timeout < MINIMUM_TIMEOUT) {
 		sprintf (local_error_reason, "The token timeout parameter (%d ms) may not be less then (%d ms).",
 			totem_config->token_timeout, MINIMUM_TIMEOUT);
@@ -456,6 +470,13 @@ int totem_config_validate (
 	if (totem_config->net_mtu == 0) {
 		totem_config->net_mtu = 1500;
 	}
+
+	if ((256000 / totem_config->net_mtu) < totem_config->max_messages) {
+		sprintf (local_error_reason, "The max_messages parameter (%d messages) may not be greater then (%d messages).",
+			totem_config->max_messages, 256000/totem_config->net_mtu);
+		goto parse_error;
+	}
+
 	if (totem_config->threads > SEND_THREADS_MAX) {
 		totem_config->threads = SEND_THREADS_MAX;
 	}

+ 93 - 14
exec/totemsrp.c

@@ -1,5 +1,6 @@
 /*
- * Copyright (c) 2003-2005 MontaVista Software, Inc.
+ * Copyright (c) 2003-2006 MontaVista Software, Inc.
+ * Copyright (c) 2006 RedHat, Inc.
  *
  * All rights reserved.
  *
@@ -81,10 +82,10 @@
 
 #include "crypto.h"
 
-#define LOCALHOST_IP					inet_addr("127.0.0.1")
+#define LOCALHOST_IP				inet_addr("127.0.0.1")
 #define QUEUE_RTR_ITEMS_SIZE_MAX		256 /* allow 512 retransmit items */
-#define RETRANS_MESSAGE_QUEUE_SIZE_MAX	500 /* allow 500 messages to be queued */
-#define RECEIVED_MESSAGE_QUEUE_SIZE_MAX	500 /* allow 500 messages to be queued */
+#define RETRANS_MESSAGE_QUEUE_SIZE_MAX		500 /* allow 500 messages to be queued */
+#define RECEIVED_MESSAGE_QUEUE_SIZE_MAX		500 /* allow 500 messages to be queued */
 #define MAXIOVS							5	
 #define RETRANSMIT_ENTRIES_MAX			30
 #define MISSING_MCAST_WINDOW			128
@@ -117,8 +118,8 @@
 #define ENDIAN_LOCAL					 0xff22
 
 enum message_type {
-	MESSAGE_TYPE_ORF_TOKEN = 0,			/* Ordering, Reliability, Flow (ORF) control Token */
-	MESSAGE_TYPE_MCAST = 1,				/* ring ordered multicast message */
+	MESSAGE_TYPE_ORF_TOKEN = 0,		/* Ordering, Reliability, Flow (ORF) control Token */
+	MESSAGE_TYPE_MCAST = 1,			/* ring ordered multicast message */
 	MESSAGE_TYPE_MEMB_MERGE_DETECT = 2,	/* merge rings if there are available rings */
 	MESSAGE_TYPE_MEMB_JOIN = 3, 		/* membership join message */
 	MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4,	/* membership commit token */
@@ -186,7 +187,8 @@ struct orf_token {
 	unsigned int aru;
 	struct totem_ip_address aru_addr;
 	struct memb_ring_id ring_id; 
-	short int fcc;
+	unsigned int backlog;
+	unsigned int fcc;
 	int retrans_flg;
 	int rtr_list_entries;
 	struct rtr_item rtr_list[0];
@@ -446,7 +448,11 @@ struct totemsrp_instance {
 
 	struct totem_config *totem_config;
 
-	int use_heartbeat;
+	unsigned int use_heartbeat;
+
+	unsigned int my_trc;
+
+	unsigned int my_pbl;
 };
 
 struct message_handlers {
@@ -676,8 +682,14 @@ int totemsrp_initialize (
 		totem_config->downcheck_timeout, totem_config->fail_to_recv_const);
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
 		"seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu);
+
+	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
+		"window size per rotation (%d messages) maximum messages per rotation (%d messages)\n",
+		totem_config->window_size, totem_config->max_messages);
+
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
 		"send threads (%d threads)\n", totem_config->threads);
+
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
 		"heartbeat_failures_allowed (%d)\n", totem_config->heartbeat_failures_allowed);
 	instance->totemsrp_log_printf (instance->totemsrp_log_level_notice,
@@ -1497,6 +1509,11 @@ static void memb_state_commit_enter (
 
 	instance->memb_state = MEMB_STATE_COMMIT;
 
+	/*
+	 * reset all flow control variables since we are starting a new ring
+	 */
+	instance->my_trc = 0;
+	instance->my_pbl = 0;
 	return;
 }
 
@@ -1980,7 +1997,7 @@ static int orf_token_mcast (
 		sort_queue = &instance->regular_sort_queue;
 	}
 
-	for (instance->fcc_mcast_current = 0; instance->fcc_mcast_current < fcc_mcasts_allowed; instance->fcc_mcast_current++) {
+	for (instance->fcc_mcast_current = 0; instance->fcc_mcast_current <= fcc_mcasts_allowed; instance->fcc_mcast_current++) {
 		if (queue_is_empty (mcast_queue)) {
 			break;
 		}
@@ -2329,6 +2346,7 @@ static int orf_token_send_initial (struct totemsrp_instance *instance)
 
 	memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id));
 	orf_token.fcc = 0;
+	orf_token.backlog = 0;
 
 	orf_token.rtr_list_entries = 0;
 
@@ -2673,6 +2691,61 @@ static void token_callbacks_execute (
 	}
 }
 
+/*
+ * Flow control functions
+ */
+/*
+ * Report the number of entries in use in the queue that matches the
+ * current membership state: new_message_queue while operational,
+ * retrans_message_queue while in recovery, and zero in any other state.
+ */
+static unsigned int backlog_get (struct totemsrp_instance *instance)
+{
+	switch (instance->memb_state) {
+	case MEMB_STATE_OPERATIONAL:
+		return (queue_used (&instance->new_message_queue));
+	case MEMB_STATE_RECOVERY:
+		return (queue_used (&instance->retrans_message_queue));
+	default:
+		return (0);
+	}
+}
+
+/*
+ * Work out how many messages may be multicast on this token visit.
+ *
+ * The result starts at max_messages and is then lowered to:
+ *   - what is left of window_size after subtracting the token's fcc, and
+ *   - window_size * my_pbl / (token->backlog + current backlog - my_pbl),
+ *     when that divisor and the resulting quotient are both non-zero.
+ *
+ * Returns the number of transmits allowed.
+ */
+static int fcc_calculate (
+	struct totemsrp_instance *instance,
+	struct orf_token *token)
+{
+	unsigned int window_size = instance->totem_config->window_size;
+	unsigned int transmits_allowed = instance->totem_config->max_messages;
+	unsigned int window_left;
+	unsigned int backlog_total;
+	unsigned int backlog_calc;
+
+	/*
+	 * Both operands are unsigned: if token->fcc is larger than
+	 * window_size a plain subtraction would wrap to a huge value and
+	 * the window limit would never apply, so clamp to zero instead
+	 */
+	window_left = (token->fcc < window_size) ? window_size - token->fcc : 0;
+	if (transmits_allowed > window_left) {
+		transmits_allowed = window_left;
+	}
+
+	/*
+	 * Only do the backlog calculation if there is a backlog, otherwise
+	 * we would divide by zero
+	 */
+	backlog_total = token->backlog + backlog_get (instance) - instance->my_pbl;
+	if (backlog_total != 0) {
+		backlog_calc = (window_size * instance->my_pbl) / backlog_total;
+		if (backlog_calc > 0 && transmits_allowed > backlog_calc) {
+			transmits_allowed = backlog_calc;
+		}
+	}
+
+	return (transmits_allowed);
+}
+
+/*
+ * Update the token's flow control fields after this visit.
+ *
+ * The token's fcc and backlog fields are adjusted by the difference
+ * between this visit's values (msgs_transmitted and the current backlog)
+ * and the values remembered in my_trc and my_pbl; the remembered values
+ * are then replaced with this visit's values.
+ */
+static void fcc_token_update (
+	struct totemsrp_instance *instance,
+	struct orf_token *token,
+	unsigned int msgs_transmitted)
+{
+	unsigned int current_backlog = backlog_get (instance);
+
+	token->fcc = token->fcc + msgs_transmitted - instance->my_trc;
+	token->backlog = token->backlog + current_backlog - instance->my_pbl;
+
+	instance->my_trc = msgs_transmitted;
+	instance->my_pbl = current_backlog;
+}
+
 /*
  * Message Handlers
  */
@@ -2690,9 +2763,10 @@ static int message_handler_orf_token (
 	char token_storage[1500];
 	char token_convert[1500];
 	struct orf_token *token = NULL;
-	int transmits_allowed;
 	int forward_token;
-	int mcasted;
+	unsigned int transmits_allowed;
+	unsigned int mcasted_retransmit;
+	unsigned int mcasted_regular;
 	unsigned int last_aru;
 	unsigned int low_water;
 
@@ -2829,13 +2903,17 @@ static int message_handler_orf_token (
 
 			return (0); /* discard token */
 		}		
-		transmits_allowed = TRANSMITS_ALLOWED;
-		mcasted = orf_token_rtr (instance, token, &transmits_allowed);
+
+		transmits_allowed = fcc_calculate (instance, token);
+		mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed);
 
 		if (sq_lt_compare (instance->last_released + MISSING_MCAST_WINDOW, token->seq + TRANSMITS_ALLOWED)) {
 			transmits_allowed = 0;
 		}
-		mcasted = orf_token_mcast (instance, token, transmits_allowed, system_from);
+		mcasted_regular = orf_token_mcast (instance, token, transmits_allowed, system_from);
+		fcc_token_update (instance, token, mcasted_retransmit +
+			mcasted_regular);
+			
 		if (sq_lt_compare (instance->my_aru, token->aru) ||
 			totemip_equal(&instance->my_id, &token->aru_addr) ||
 			totemip_zero_check(&token->aru_addr)) {
@@ -3409,6 +3487,7 @@ static void orf_token_endian_convert (struct orf_token *in, struct orf_token *ou
 	totemip_copy_endian_convert(&out->aru_addr, &in->aru_addr);
 	out->ring_id.seq = swab64 (in->ring_id.seq);
 	out->fcc = swab32 (in->fcc);
+	out->backlog = swab32 (in->backlog);
 	out->retrans_flg = swab32 (in->retrans_flg);
 	out->rtr_list_entries = swab32 (in->rtr_list_entries);
 	for (i = 0; i < out->rtr_list_entries; i++) {

+ 4 - 0
include/queue.h

@@ -168,4 +168,8 @@ static inline void queue_avail (struct queue *queue, int *avail)
 	assert (*avail >= 0);
 }
 
+/*
+ * Return the value of the queue's used counter.
+ */
+static inline int queue_used (struct queue *queue)
+{
+	int used = queue->used;
+
+	return (used);
+}
+
 #endif /* QUEUE_H_DEFINED */

+ 34 - 7
man/openais.conf.5

@@ -1,5 +1,6 @@
 .\"/*
 .\" * Copyright (c) 2005 MontaVista Software, Inc.
+.\" * Copyright (c) 2006 RedHat, Inc.
 .\" *
 .\" * All rights reserved.
 .\" *
@@ -31,7 +32,7 @@
 .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 .\" * THE POSSIBILITY OF SUCH DAMAGE.
 .\" */
-.TH OPENAIS_CONF 5 2005-06-08 "openais Man Page" "Openais Programmer's Manual"
+.TH OPENAIS_CONF 5 2006-03-28 "openais Man Page" "Openais Programmer's Manual"
 .SH NAME
 openais.conf - openais executive configuration file
 
@@ -164,7 +165,7 @@ mtu size set from 1500 to whatever frame size is specified here.
 
 Please note while some NICs or switches claim large frame support, they support
 9000 MTU as the maximum frame size including the IP header.  Setting the netmtu
-and host MTUs to 9000 will cause totem to use the full 9000 bytes of frame room.
+and host MTUs to 9000 will cause totem to use the full 9000 bytes of the frame.
 Then Linux will add a 18 byte header moving the full frame size to 9018.  As a
 result some hardware will not operate properly with this size of data.  A netmtu 
 of 8982 seems to work for the few large frame devices that have been tested.
@@ -208,7 +209,7 @@ token
 This timeout specifies in milliseconds until a token loss is declared after not
 receiving a token.  This is the time spent detecting a failure of a processor
 in the current configuration.  Reforming a new configuration takes about 50
-milliseconds and is independent of this timeout.
+milliseconds in addition to this timeout.
 
 .TP
 token_retransmit
@@ -277,17 +278,39 @@ before declaring heartbeat failure e.g 3. Also if this value is not set or is 0
 heartbeat mechanism is not engaged in the system and token rotation is the method
 of failure detection
 
-default value = 0.
+The default is 0 (disabled).
 
 .TP
 max_network_delay
 [HeartBeating mechanism]
-This constant specifies in Milliseconds the approximate delay that your network takes
+This constant specifies in milliseconds the approximate delay that your network takes
 to transport one packet from one machine to another. This value is to be set by system
 engineers and please dont change if not sure as this effects the failure detection
 mechanism using heartbeat.
 
-default value = 50 milliseconds.
+The default is 50 milliseconds.
+
+.TP
+window_size
+This constant specifies the maximum number of messages that may be sent on one
+token rotation.  If all processors perform equally well, this value could be
+large (300), which would introduce higher latency from origination to delivery
+for very large rings.  To reduce latency in large rings (16+), the defaults are
+a safe compromise.  If 1 or more slow processor(s) are present among fast
+processors, window_size should be no larger than 256000 / netmtu to avoid
+overflow of the kernel receive buffers.  The user is notified of this by
+the display of a retransmit list in the notification logs.  There is no loss
+of data, but performance is reduced when these errors occur.
+
+The default is 50 messages.
+
+.TP
+max_messages
+This constant specifies the maximum number of messages that may be sent by one
+processor on receipt of the token.  The max_messages parameter is limited to
+256000 / netmtu to prevent overflow of the kernel transmit buffers.
+
+The default is 17 messages.
 
 .PP
 Within the 
@@ -303,7 +326,9 @@ options multiple times in the top level directive.
 .TP
 logfile
 If the logoutput: file directive is set, this option specifies where the
-log file is written to.  
+log file is written to.
+
+The default is syslog.
 
 .TP
 debug
@@ -311,6 +336,8 @@ This specifies whether debug output is logged.  This is generally a bad idea,
 unless there is some specific bug or problem that must be found in the
 executive.  Set the value to on to debug, off to turn of debugging.
 
+The default is off.
+
 .TP
 timestamp
 This specifies that a timestamp is placed on all log messages.