Ver código fonte

qnetd: Improve dead peer detection

Previously dead peer detection timer was scheduled every dpd_interval,
added dpd_interval to all of the clients timestamp and if timestamp was
larger than client hearbeat interval * 1.2 then check if client sent
some message. If so, flag was reset.

This method was source of number of problems so instead different method
is now used.

Every single client has its own timer with timeout based on
(configurable) dpd_interval_coefficient and multiplied with
client heartbeat timeout. When message is received from client timer is
rescheduled. When timer callback is called (= client doesn't sent
message during timeout) then client is disconnected.

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Jan Friesse 5 anos atrás
pai
commit
a8b7513df9

+ 4 - 3
man/corosync-qnetd.8

@@ -31,7 +31,7 @@
 .\" * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 .\" * THE POSSIBILITY OF SUCH DAMAGE.
 .\" */
-.TH COROSYNC-QNETD 8 2020-09-22
+.TH COROSYNC-QNETD 8 2020-11-18
 .SH NAME
 corosync-qnetd \- QNet daemon
 .SH SYNOPSIS
@@ -216,8 +216,9 @@ Maximum heartbeat timeout accepted by server in ms. (120000)
 .B dpd_enabled
 Dead peer detection enabled. (on)
 .TP
-.B dpd_interval
-How often the DPD algorithm detects dead peers in ms. (1000)
+.B dpd_interval_coefficient
+Value is multiplied with heartbeat interval sent by qdevice client and used as a timeout
+for dead peer detection. (1.5)
 .TP
 .B lock_file
 Lock file location. (/var/run/corosync-qnetd/corosync-qnetd.pid)

+ 1 - 1
qdevices/Makefile.am

@@ -64,7 +64,7 @@ corosync_qnetd_SOURCES	= corosync-qnetd.c \
                           qnetd-client-msg-received.c qnetd-client-msg-received.h \
                           qnetd-log-debug.c qnetd-log-debug.h \
                           qnetd-client-algo-timer.c qnetd-client-algo-timer.h \
-                          qnetd-dpd-timer.c qnetd-dpd-timer.h \
+                          qnetd-client-dpd-timer.c qnetd-client-dpd-timer.h \
                           qnetd-ipc.c qnetd-ipc.h unix-socket-ipc.c unix-socket-ipc.h \
                           dynar-simple-lex.c dynar-simple-lex.h dynar-str.c dynar-str.h \
                           unix-socket-client.c unix-socket-client.h \

+ 3 - 0
qdevices/corosync-qnetd.c

@@ -237,6 +237,9 @@ cli_parse_long_opt(struct qnetd_advanced_settings *advanced_settings, const char
 		case -2:
 			errx(EXIT_FAILURE, "Invalid value '%s' for option '%s'", val, opt);
 			break;
+		case -3:
+			warnx("Option '%s' is deprecated and has no effect anymore", opt);
+			break;
 		}
 	}
 

+ 3 - 2
qdevices/qnet-config.h

@@ -71,8 +71,9 @@ extern "C" {
 #define QNETD_MIN_HEARTBEAT_INTERVAL			1
 
 #define QNETD_DEFAULT_DPD_ENABLED			1
-#define QNETD_DEFAULT_DPD_INTERVAL			(1*1000)
-#define QNETD_MIN_DPD_INTERVAL				1
+#define QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT		1.5
+#define QNETD_MIN_DPD_INTERVAL_COEFFICIENT		1
+#define QNETD_MAX_DPD_INTERVAL_COEFFICIENT		1000
 
 #define QNETD_DEFAULT_LOCK_FILE				LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.pid"
 #define QNETD_DEFAULT_LOCAL_SOCKET_FILE			LOCALSTATEDIR "/run/corosync-qnetd/corosync-qnetd.sock"

+ 8 - 3
qdevices/qnetd-advanced-settings.c

@@ -63,7 +63,7 @@ qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings)
 	settings->heartbeat_interval_min = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MIN;
 	settings->heartbeat_interval_max = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
 	settings->dpd_enabled = QNETD_DEFAULT_DPD_ENABLED;
-	settings->dpd_interval = QNETD_DEFAULT_DPD_INTERVAL;
+	settings->dpd_interval_coefficient = QNETD_DEFAULT_DPD_INTERVAL_COEFFICIENT;
 	if ((settings->lock_file = strdup(QNETD_DEFAULT_LOCK_FILE)) == NULL) {
 		return (-1);
 	}
@@ -94,12 +94,14 @@ qnetd_advanced_settings_destroy(struct qnetd_advanced_settings *settings)
  * 0 - No error
  * -1 - Unknown option
  * -2 - Incorrect value
+ * -3 - Deprecated value
  */
 int
 qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
     const char *option, const char *value)
 {
 	long long int tmpll;
+	double tmpdbl;
 
 	if (strcasecmp(option, "listen_backlog") == 0) {
 		if (utils_strtonum(value, QNETD_MIN_LISTEN_BACKLOG, INT_MAX, &tmpll) == -1) {
@@ -156,11 +158,14 @@ qnetd_advanced_settings_set(struct qnetd_advanced_settings *settings,
 
 		settings->dpd_enabled = (uint8_t)tmpll;
 	} else if (strcasecmp(option, "dpd_interval") == 0) {
-		if (utils_strtonum(value, QNETD_MIN_DPD_INTERVAL, UINT32_MAX, &tmpll) == -1) {
+		return (-3);
+	} else if (strcasecmp(option, "dpd_interval_coefficient") == 0) {
+		if (utils_strtod(value, QNETD_MIN_DPD_INTERVAL_COEFFICIENT,
+		    QNETD_MAX_DPD_INTERVAL_COEFFICIENT, &tmpdbl) == -1) {
 			return (-2);
 		}
 
-		settings->dpd_interval = (uint32_t)tmpll;
+		settings->dpd_interval_coefficient = tmpdbl;
 	} else if (strcasecmp(option, "lock_file") == 0) {
 		free(settings->lock_file);
 

+ 1 - 1
qdevices/qnetd-advanced-settings.h

@@ -49,7 +49,6 @@ struct qnetd_advanced_settings {
 	uint32_t heartbeat_interval_min;
 	uint32_t heartbeat_interval_max;
 	uint8_t dpd_enabled;
-	uint32_t dpd_interval;
 	char *lock_file;
 	char *local_socket_file;
 	int local_socket_backlog;
@@ -57,6 +56,7 @@ struct qnetd_advanced_settings {
 	size_t ipc_max_send_size;
 	size_t ipc_max_receive_size;
 	enum tlv_keep_active_partition_tie_breaker keep_active_partition_tie_breaker;
+	double dpd_interval_coefficient;
 };
 
 extern int		qnetd_advanced_settings_init(struct qnetd_advanced_settings *settings);

+ 50 - 33
qdevices/qnetd-dpd-timer.c → qdevices/qnetd-client-dpd-timer.c

@@ -33,53 +33,42 @@
  */
 
 #include "log.h"
-#include "qnetd-dpd-timer.h"
+#include "qnetd-client-dpd-timer.h"
 
 static int
 qnetd_dpd_timer_cb(void *data1, void *data2)
 {
-	struct qnetd_instance *instance;
 	struct qnetd_client *client;
 
-	instance = (struct qnetd_instance *)data1;
+	client = (struct qnetd_client *)data1;
 
-	TAILQ_FOREACH(client, &instance->clients, entries) {
-		if (!client->init_received) {
-			continue;
-		}
+	log(LOG_WARNING, "Client %s doesn't sent any message during "
+	    "%" PRIu32 "ms. Disconnecting",
+	    client->addr_str,
+	    timer_list_entry_get_interval(client->dpd_timer));
 
-		client->dpd_time_since_last_check += instance->advanced_settings->dpd_interval;
+	client->schedule_disconnect = 1;
+	/*
+	 * Timer gets removed by timer-list because of returning 0
+	 */
+	client->dpd_timer = NULL;
 
-		if (client->dpd_time_since_last_check > client->heartbeat_interval * 1.2) {
-			if (!client->dpd_msg_received_since_last_check) {
-				log(LOG_WARNING, "Client %s doesn't sent any message during "
-				    "%"PRIu32"ms. Disconnecting",
-				    client->addr_str, client->dpd_time_since_last_check);
-
-				client->schedule_disconnect = 1;
-			} else {
-				client->dpd_time_since_last_check = 0;
-				client->dpd_msg_received_since_last_check = 0;
-			}
-		}
-	}
-
-	return (-1);
+	return (0);
 }
 
 int
-qnetd_dpd_timer_init(struct qnetd_instance *instance)
+qnetd_client_dpd_timer_init(struct qnetd_instance *instance, struct qnetd_client *client)
 {
 
 	if (!instance->advanced_settings->dpd_enabled) {
 		return (0);
 	}
 
-	instance->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
-	    instance->advanced_settings->dpd_interval,
-	    qnetd_dpd_timer_cb, (void *)instance, NULL);
-	if (instance->dpd_timer == NULL) {
-		log(LOG_ERR, "Can't initialize dpd timer");
+	client->dpd_timer = timer_list_add(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
+	    (PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval),
+	    qnetd_dpd_timer_cb, (void *)client, NULL);
+	if (client->dpd_timer == NULL) {
+		log(LOG_ERR, "Can't initialize dpd timer for client %s", client->addr_str);
 
 		return (-1);
 	}
@@ -88,11 +77,39 @@ qnetd_dpd_timer_init(struct qnetd_instance *instance)
 }
 
 void
-qnetd_dpd_timer_destroy(struct qnetd_instance *instance)
+qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance, struct qnetd_client *client)
+{
+
+	if (client->dpd_timer != NULL) {
+		timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
+		    client->dpd_timer);
+
+		client->dpd_timer = NULL;
+	}
+}
+
+void
+qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance, struct qnetd_client *client)
+{
+
+	if (client->dpd_timer != NULL) {
+		timer_list_entry_reschedule(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
+		    client->dpd_timer);
+	}
+}
+
+int
+qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance, struct qnetd_client *client)
 {
+	int res;
 
-	if (instance->dpd_timer != NULL) {
-		timer_list_entry_delete(pr_poll_loop_get_timer_list(&instance->main_poll_loop), instance->dpd_timer);
-		instance->dpd_timer = NULL;
+	if (client->dpd_timer == NULL) {
+		return (0);
 	}
+
+	res = timer_list_entry_set_interval(pr_poll_loop_get_timer_list(&instance->main_poll_loop),
+	    client->dpd_timer,
+	    (PRUint32)(instance->advanced_settings->dpd_interval_coefficient * client->heartbeat_interval));
+
+	return (res);
 }

+ 14 - 6
qdevices/qnetd-dpd-timer.h → qdevices/qnetd-client-dpd-timer.h

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Red Hat, Inc.
+ * Copyright (c) 2015-2020 Red Hat, Inc.
  *
  * All rights reserved.
  *
@@ -32,8 +32,8 @@
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#ifndef _QNETD_DPD_TIMER_H_
-#define _QNETD_DPD_TIMER_H_
+#ifndef _QNETD_CLIENT_DPD_TIMER_H_
+#define _QNETD_CLIENT_DPD_TIMER_H_
 
 #include "qnetd-instance.h"
 
@@ -41,12 +41,20 @@
 extern "C" {
 #endif
 
-extern int		qnetd_dpd_timer_init(struct qnetd_instance *instance);
+extern int		qnetd_client_dpd_timer_init(struct qnetd_instance *instance,
+    struct qnetd_client *client);
 
-extern void		qnetd_dpd_timer_destroy(struct qnetd_instance *instance);
+extern void		qnetd_client_dpd_timer_destroy(struct qnetd_instance *instance,
+     struct qnetd_client *client);
+
+extern void		qnetd_client_dpd_timer_reschedule(struct qnetd_instance *instance,
+    struct qnetd_client *client);
+
+extern int		qnetd_client_dpd_timer_update_interval(struct qnetd_instance *instance,
+    struct qnetd_client *client);
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* _QNETD_DPD_TIMER_H_ */
+#endif /* _QNETD_CLIENT_DPD_TIMER_H_ */

+ 15 - 1
qdevices/qnetd-client-msg-received.c

@@ -40,6 +40,7 @@
 #include "qnetd-instance.h"
 #include "qnetd-log-debug.h"
 #include "qnetd-client-send.h"
+#include "qnetd-client-dpd-timer.h"
 #include "msg.h"
 #include "nss-sock.h"
 
@@ -368,6 +369,10 @@ qnetd_client_msg_received_init(struct qnetd_instance *instance, struct qnetd_cli
 			reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
 		} else {
 			client->heartbeat_interval = msg->heartbeat_interval;
+
+			if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
+				reply_error_code = TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL;
+			}
 		}
 	}
 
@@ -569,6 +574,15 @@ qnetd_client_msg_received_set_option(struct qnetd_instance *instance, struct qne
 		}
 
 		client->heartbeat_interval = msg->heartbeat_interval;
+
+		if (qnetd_client_dpd_timer_update_interval(instance, client) != 0) {
+			if (qnetd_client_send_err(client, msg->seq_number_set, msg->seq_number,
+			    TLV_REPLY_ERROR_CODE_INVALID_HEARTBEAT_INTERVAL) != 0) {
+				return (-1);
+			}
+
+			return (0);
+		}
 	}
 
 	if (msg->keep_active_partition_tie_breaker_set) {
@@ -1148,7 +1162,7 @@ qnetd_client_msg_received(struct qnetd_instance *instance, struct qnetd_client *
 	int ret_val;
 	int msg_processed;
 
-	client->dpd_msg_received_since_last_check = 1;
+	qnetd_client_dpd_timer_reschedule(instance, client);
 
 	msg_decoded_init(&msg);
 

+ 20 - 1
qdevices/qnetd-client-net.c

@@ -38,6 +38,7 @@
 #include "msgio.h"
 #include "msg.h"
 #include "nss-sock.h"
+#include "qnetd-client-dpd-timer.h"
 #include "qnetd-client-net.h"
 #include "qnetd-client-send.h"
 #include "qnetd-client-msg-received.h"
@@ -322,11 +323,29 @@ qnetd_client_net_accept(struct qnetd_instance *instance)
 	    instance, client) == -1) {
 		log(LOG_ERR, "Can't add client to main poll loop");
 		res_err = -2;
-		goto exit_close;
+		goto exit_client_list_del_close;
+	}
+
+	if (qnetd_client_dpd_timer_init(instance, client) == -1) {
+		res_err = -2;
+		goto exit_client_nspr_list_del_close;
 	}
 
 	return (0);
 
+exit_client_nspr_list_del_close:
+	if (pr_poll_loop_del_prfd(&instance->main_poll_loop, client_socket) == -1) {
+		log(LOG_ERR, "pr_poll_loop_del_prfd for client socket failed");
+	}
+
+exit_client_list_del_close:
+	qnetd_client_list_del(&instance->clients, client);
+	/*
+	 * client_addr_str is passed to qnetd_client_list_add and becomes part of client struct.
+	 * qnetd_client_list_del calls qnetd_client_destroy which frees this memory
+	 */
+	client_addr_str = NULL;
+
 exit_close:
 	free(client_addr_str);
 	PR_Close(client_socket);

+ 4 - 0
qdevices/qnetd-client.c

@@ -56,6 +56,10 @@ qnetd_client_init(struct qnetd_client *client, PRFileDesc *sock, PRNetAddr *addr
 	node_list_init(&client->last_membership_node_list);
 	node_list_init(&client->last_quorum_node_list);
 	client->main_timer_list = main_timer_list;
+	/*
+	 * Set max heartbeat interval before client sends init msg
+	 */
+	client->heartbeat_interval = QNETD_DEFAULT_HEARTBEAT_INTERVAL_MAX;
 }
 
 void

+ 1 - 2
qdevices/qnetd-client.h

@@ -83,8 +83,7 @@ struct qnetd_client {
 	struct timer_list_entry *algo_timer;
 	uint32_t algo_timer_vote_info_msq_seq_number;
 	int schedule_disconnect;
-	uint32_t dpd_time_since_last_check;
-	uint32_t dpd_msg_received_since_last_check;
+	struct timer_list_entry *dpd_timer;
 	enum tlv_vote last_sent_vote;
 	enum tlv_vote last_sent_ack_nack_vote;
 	enum tlv_heuristics last_membership_heuristics; /* Passed in membership node list */

+ 3 - 7
qdevices/qnetd-instance.c

@@ -37,9 +37,9 @@
 #include <pk11func.h>
 #include "qnetd-instance.h"
 #include "qnetd-client.h"
+#include "qnetd-client-dpd-timer.h"
 #include "qnetd-algorithm.h"
 #include "qnetd-log-debug.h"
-#include "qnetd-dpd-timer.h"
 #include "qnetd-client-algo-timer.h"
 
 int
@@ -62,10 +62,6 @@ qnetd_instance_init(struct qnetd_instance *instance,
 
 	pr_poll_loop_init(&instance->main_poll_loop);
 
-	if (qnetd_dpd_timer_init(instance) != 0) {
-		return (0);
-	}
-
 	return (0);
 }
 
@@ -75,8 +71,6 @@ qnetd_instance_destroy(struct qnetd_instance *instance)
 	struct qnetd_client *client;
 	struct qnetd_client *client_next;
 
-	qnetd_dpd_timer_destroy(instance);
-
 	client = TAILQ_FIRST(&instance->clients);
 	while (client != NULL) {
 		client_next = TAILQ_NEXT(client, entries);
@@ -105,6 +99,8 @@ qnetd_instance_client_disconnect(struct qnetd_instance *instance, struct qnetd_c
 		qnetd_algorithm_client_disconnect(client, server_going_down);
 	}
 
+	qnetd_client_dpd_timer_destroy(instance, client);
+
 	PR_Close(client->socket);
 	if (client->cluster != NULL) {
 		qnetd_cluster_list_del_client(&instance->clusters, client->cluster, client);

+ 0 - 1
qdevices/qnetd-instance.h

@@ -67,7 +67,6 @@ struct qnetd_instance {
 	int tls_client_cert_required;
 	const char *host_addr;
 	uint16_t host_port;
-	struct timer_list_entry *dpd_timer;		/* Dead peer detection timer */
 	struct unix_socket_ipc local_ipc;
 	const struct qnetd_advanced_settings *advanced_settings;
 	struct pr_poll_loop main_poll_loop;