Răsfoiți Sursa

qdevice-lms: improvements to LMS algorithm

Use the new timers to get a better response from LMS when the network
splits. This also closes a race where the remote side could go inquorate
before we confirmed the vote.

Add client-side (qdevice-net) code to cope with a detached qnetd if we
are quorate and have wait_for_all enabled. That situation will now
keep quorum.

Signed-off-by: Christine Caulfield <ccaulfie@redhat.com>
Christine Caulfield 10 ani în urmă
părinte
comite
6cfe96eb03
2 a modificat fișierele cu 74 adăugiri și 15 ștergeri
  1. 52 10
      qdevices/qdevice-net-algo-lms.c
  2. 22 5
      qdevices/qnetd-algo-lms.c

+ 52 - 10
qdevices/qdevice-net-algo-lms.c

@@ -41,19 +41,42 @@
 #include "qdevice-net-send.h"
 #include "qdevice-net-send.h"
 #include "qdevice-net-cast-vote-timer.h"
 #include "qdevice-net-cast-vote-timer.h"
 
 
+struct algo_lms_instance_data {
+	uint32_t quorate;
+	uint8_t have_wfa;
+	enum tlv_vote vote;
+};
+
 int
 int
 qdevice_net_algo_lms_init(struct qdevice_net_instance *instance)
 qdevice_net_algo_lms_init(struct qdevice_net_instance *instance)
 {
 {
-
-
+	struct algo_lms_instance_data *data;
+	int res;
+
+	data = malloc(sizeof(struct algo_lms_instance_data));
+	if (!data) {
+		return (-1);
+	}
+	instance->algorithm_data = data;
+
+	data->quorate = 0;
+	data->vote = TLV_VOTE_ASK_LATER;
+	res = cmap_get_uint8(instance->qdevice_instance_ptr->cmap_handle, "quorum.wait_for_all", &data->have_wfa);
+	if (res != CS_OK) {
+		qdevice_log(LOG_DEBUG, "algo-lms: Can't get WFA res = %d", res);
+		data->have_wfa = 0;
+	}
+
+	qdevice_log(LOG_DEBUG, "algo-lms: initialised. WFA = %d", data->have_wfa);
 	return (0);
 	return (0);
 }
 }
 
 
+
 int
 int
-qdevice_net_algo_lms_connected(struct qdevice_net_instance *instance, int *send_config_node_list,
-    int *send_membership_node_list, int *send_quorum_node_list, enum tlv_vote *vote)
+qdevice_net_algo_lms_connected(struct qdevice_net_instance *instance,
+    int *send_config_node_list, int *send_membership_node_list, int *send_quorum_node_list,
+    enum tlv_vote *vote)
 {
 {
-
 	return (0);
 	return (0);
 }
 }
 
 
@@ -71,7 +94,6 @@ qdevice_net_algo_lms_votequorum_node_list_notify(struct qdevice_net_instance *in
     const struct tlv_ring_id *ring_id, uint32_t node_list_entries, uint32_t node_list[],
     const struct tlv_ring_id *ring_id, uint32_t node_list_entries, uint32_t node_list[],
     int *send_node_list, enum tlv_vote *vote)
     int *send_node_list, enum tlv_vote *vote)
 {
 {
-
 	return (0);
 	return (0);
 }
 }
 
 
@@ -80,6 +102,10 @@ qdevice_net_algo_lms_votequorum_quorum_notify(struct qdevice_net_instance *insta
     uint32_t quorate, uint32_t node_list_entries, votequorum_node_t node_list[], int *send_node_list,
     uint32_t quorate, uint32_t node_list_entries, votequorum_node_t node_list[], int *send_node_list,
     enum tlv_vote *vote)
     enum tlv_vote *vote)
 {
 {
+	struct algo_lms_instance_data *data = instance->algorithm_data;
+
+	data->quorate = quorate;
+	qdevice_log(LOG_DEBUG, "algo-lms: quorum_notify. quorate = %d", data->quorate);
 
 
 	return (0);
 	return (0);
 }
 }
@@ -96,7 +122,6 @@ int
 qdevice_net_algo_lms_membership_node_list_reply_received(struct qdevice_net_instance *instance,
 qdevice_net_algo_lms_membership_node_list_reply_received(struct qdevice_net_instance *instance,
     uint32_t seq_number, const struct tlv_ring_id *ring_id, enum tlv_vote *vote)
     uint32_t seq_number, const struct tlv_ring_id *ring_id, enum tlv_vote *vote)
 {
 {
-
 	return (0);
 	return (0);
 }
 }
 
 
@@ -104,7 +129,6 @@ int
 qdevice_net_algo_lms_quorum_node_list_reply_received(struct qdevice_net_instance *instance,
 qdevice_net_algo_lms_quorum_node_list_reply_received(struct qdevice_net_instance *instance,
     uint32_t seq_number, enum tlv_vote *vote)
     uint32_t seq_number, enum tlv_vote *vote)
 {
 {
-
 	return (0);
 	return (0);
 }
 }
 
 
@@ -120,7 +144,6 @@ int
 qdevice_net_algo_lms_vote_info_received(struct qdevice_net_instance *instance,
 qdevice_net_algo_lms_vote_info_received(struct qdevice_net_instance *instance,
     uint32_t seq_number, enum tlv_vote *vote)
     uint32_t seq_number, enum tlv_vote *vote)
 {
 {
-
 	return (0);
 	return (0);
 }
 }
 
 
@@ -135,6 +158,17 @@ qdevice_net_algo_lms_echo_reply_received(struct qdevice_net_instance *instance,
 int
 int
 qdevice_net_algo_lms_echo_reply_not_received(struct qdevice_net_instance *instance)
 qdevice_net_algo_lms_echo_reply_not_received(struct qdevice_net_instance *instance)
 {
 {
+	struct algo_lms_instance_data *data = instance->algorithm_data;
+
+	qdevice_log(LOG_DEBUG, "algo-lms: echo_not_recvd. quorate = %d, WFA = %d", data->quorate, data->have_wfa);
+
+	/* qnetd server is disconnected, if we were already quorate AND WFA is enabled
+	   then we can continue to provide our vote.
+	   Otherwise ... no
+	*/
+	if (data->quorate && data->have_wfa) {
+		return (0);
+	}
 
 
 	return (-1);
 	return (-1);
 }
 }
@@ -143,14 +177,22 @@ int
 qdevice_net_algo_lms_disconnected(struct qdevice_net_instance *instance,
 qdevice_net_algo_lms_disconnected(struct qdevice_net_instance *instance,
     enum qdevice_net_disconnect_reason disconnect_reason, int *try_reconnect, enum tlv_vote *vote)
     enum qdevice_net_disconnect_reason disconnect_reason, int *try_reconnect, enum tlv_vote *vote)
 {
 {
+	struct algo_lms_instance_data *data = instance->algorithm_data;
 
 
+	qdevice_log(LOG_DEBUG, "algo-lms: disconnected. quorate = %d, WFA = %d", data->quorate, data->have_wfa);
+	qdevice_log(LOG_DEBUG, "algo-lms: disconnected. reason = %d, WFA = %d", disconnect_reason, data->have_wfa);
+
+	if (!data->quorate || !data->have_wfa) {
+		*vote = TLV_VOTE_NACK;
+	}
+	*try_reconnect = 1;
 	return (0);
 	return (0);
 }
 }
 
 
 void
 void
 qdevice_net_algo_lms_destroy(struct qdevice_net_instance *instance)
 qdevice_net_algo_lms_destroy(struct qdevice_net_instance *instance)
 {
 {
-
+	free(instance->algorithm_data);
 }
 }
 
 
 static struct qdevice_net_algorithm qdevice_net_algo_lms = {
 static struct qdevice_net_algorithm qdevice_net_algo_lms = {

+ 22 - 5
qdevices/qnetd-algo-lms.c

@@ -56,6 +56,7 @@
 #include "qnetd-log.h"
 #include "qnetd-log.h"
 #include "qnetd-cluster-list.h"
 #include "qnetd-cluster-list.h"
 #include "qnetd-algo-utils.h"
 #include "qnetd-algo-utils.h"
+#include "qnetd-client-algo-timer.h"
 
 
 struct qnetd_algo_lms_info {
 struct qnetd_algo_lms_info {
 	int num_config_nodes;
 	int num_config_nodes;
@@ -73,10 +74,15 @@ static enum tlv_reply_error_code do_lms_algorithm(struct qnetd_client *client, c
 	int num_partitions;
 	int num_partitions;
 	int joint_leader;
 	int joint_leader;
 
 
+	/* We are running the algorithm, don't do it again unless we say so */
+	qnetd_client_algo_timer_abort(client);
+
 	if (qnetd_algo_all_ring_ids_match(client, ring_id) == -1) {
 	if (qnetd_algo_all_ring_ids_match(client, ring_id) == -1) {
 		qnetd_log(LOG_DEBUG, "algo-lms: nodeid %d: ring ID %d/%ld not unique in this membership, waiting",
 		qnetd_log(LOG_DEBUG, "algo-lms: nodeid %d: ring ID %d/%ld not unique in this membership, waiting",
 			  client->node_id, ring_id->node_id, ring_id->seq);
 			  client->node_id, ring_id->node_id, ring_id->seq);
-		*result_vote = info->last_result = TLV_VOTE_ASK_LATER;
+
+		qnetd_client_algo_timer_schedule(client);
+		*result_vote = info->last_result = TLV_VOTE_WAIT_FOR_REPLY;
 		return (TLV_REPLY_ERROR_CODE_NO_ERROR);
 		return (TLV_REPLY_ERROR_CODE_NO_ERROR);
 	}
 	}
 
 
@@ -89,7 +95,9 @@ static enum tlv_reply_error_code do_lms_algorithm(struct qnetd_client *client, c
 	/* This can happen if we are first on the block */
 	/* This can happen if we are first on the block */
 	if (num_partitions == 0) {
 	if (num_partitions == 0) {
 		qnetd_log(LOG_DEBUG, "algo-lms: No partitions found");
 		qnetd_log(LOG_DEBUG, "algo-lms: No partitions found");
-		*result_vote = info->last_result = TLV_VOTE_ASK_LATER;
+
+		qnetd_client_algo_timer_schedule(client);
+		*result_vote = info->last_result = TLV_VOTE_WAIT_FOR_REPLY;
 		return (TLV_REPLY_ERROR_CODE_NO_ERROR);
 		return (TLV_REPLY_ERROR_CODE_NO_ERROR);
 	}
 	}
 
 
@@ -315,7 +323,7 @@ qnetd_algo_lms_client_disconnect(struct qnetd_client *client, int server_going_d
 
 
 /*
 /*
  * Called after client sent ask for vote message. This is usually happening after server
  * Called after client sent ask for vote message. This is usually happening after server
- * replied TLV_VOTE_ASK_LATER.
+ * replied TLV_VOTE_WAIT_FOR_REPLY.
  */
  */
 enum tlv_reply_error_code
 enum tlv_reply_error_code
 qnetd_algo_lms_ask_for_vote_received(struct qnetd_client *client, uint32_t msg_seq_num,
 qnetd_algo_lms_ask_for_vote_received(struct qnetd_client *client, uint32_t msg_seq_num,
@@ -331,7 +339,6 @@ qnetd_algo_lms_ask_for_vote_received(struct qnetd_client *client, uint32_t msg_s
 enum tlv_reply_error_code
 enum tlv_reply_error_code
 qnetd_algo_lms_vote_info_reply_received(struct qnetd_client *client, uint32_t msg_seq_num)
 qnetd_algo_lms_vote_info_reply_received(struct qnetd_client *client, uint32_t msg_seq_num)
 {
 {
-
 	qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id %"PRIx32") "
 	qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id %"PRIx32") "
 	    "replied back to vote info message", client, client->cluster_name, client->node_id);
 	    "replied back to vote info message", client, client->cluster_name, client->node_id);
 
 
@@ -342,8 +349,18 @@ enum tlv_reply_error_code
 qnetd_algo_lms_timer_callback(struct qnetd_client *client, int *reschedule_timer,
 qnetd_algo_lms_timer_callback(struct qnetd_client *client, int *reschedule_timer,
     int *send_vote, enum tlv_vote *result_vote)
     int *send_vote, enum tlv_vote *result_vote)
 {
 {
+	enum tlv_reply_error_code ret;
 
 
-	return (TLV_REPLY_ERROR_CODE_NO_ERROR);
+	qnetd_log(LOG_DEBUG, "algo-lms: Client %p (cluster %s, node_id %"PRIx32") "
+	    "Timer callback", client, client->cluster_name, client->node_id);
+
+	ret = do_lms_algorithm(client, &client->last_ring_id, result_vote);
+
+	if (ret == TLV_REPLY_ERROR_CODE_NO_ERROR &&
+	    (*result_vote == TLV_VOTE_ACK || *result_vote == TLV_VOTE_NACK)) {
+		*send_vote = 1;
+	}
+	return ret;
 }
 }
 
 
 static struct qnetd_algorithm qnetd_algo_lms = {
 static struct qnetd_algorithm qnetd_algo_lms = {