Răsfoiți Sursa

Detect big scheduling pauses

Add poll timer scheduler to be called 3 times per token timeout.
If poll timer was not called for more then 0.8 * token timeout, it means
corosync process was not scheduled and ether token_timeout should be
increased or load should be reduced (useful for VM, where host is
overcommitted so VM is not scheduled as expected).

Signed-off-by: Jan Friesse <jfriesse@redhat.com>
Reviewed-by: Fabio M. Di Nitto <fdinitto@redhat.com>
Jan Friesse 13 ani în urmă
părinte
comite
8429d01389
1 a modificat fișierele cu 50 adăugiri și 0 ștergeri
  1. 50 0
      exec/main.c

+ 50 - 0
exec/main.c

@@ -786,6 +786,51 @@ void message_source_set (
 	source->conn = conn;
 }
 
+struct scheduler_pause_timeout_data {
+	struct totem_config *totem_config;
+	qb_loop_timer_handle handle;
+	unsigned long long tv_prev;
+	unsigned long long max_tv_diff;
+};
+
+static void timer_function_scheduler_timeout (void *data)
+{
+	struct scheduler_pause_timeout_data *timeout_data = (struct scheduler_pause_timeout_data *)data;
+	unsigned long long tv_current;
+	unsigned long long tv_diff;
+
+	tv_current = qb_util_nano_current_get ();
+
+	if (timeout_data->tv_prev == 0) {
+		/*
+		 * Initial call -> just pretent everything is ok
+		 */
+		timeout_data->tv_prev = tv_current;
+		timeout_data->max_tv_diff = 0;
+	}
+
+	tv_diff = tv_current - timeout_data->tv_prev;
+	timeout_data->tv_prev = tv_current;
+
+	if (tv_diff > timeout_data->max_tv_diff) {
+		log_printf (LOGSYS_LEVEL_WARNING, "Corosync main process was not scheduled for %0.4f ms "
+		    "(threshold is %0.4f ms). Consider token timeout increase.",
+		    (float)tv_diff / QB_TIME_NS_IN_MSEC, (float)timeout_data->max_tv_diff / QB_TIME_NS_IN_MSEC);
+	}
+
+	/*
+	 * Set next threshold, because token_timeout can change
+	 */
+	timeout_data->max_tv_diff = timeout_data->totem_config->token_timeout * QB_TIME_NS_IN_MSEC * 0.8;
+	qb_loop_timer_add (corosync_poll_handle,
+		QB_LOOP_MED,
+		timeout_data->totem_config->token_timeout / 3,
+		timeout_data,
+		timer_function_scheduler_timeout,
+		&timeout_data->handle);
+}
+
+
 static void corosync_setscheduler (void)
 {
 #if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX) && defined(HAVE_SCHED_SETSCHEDULER)
@@ -1038,6 +1083,7 @@ int main (int argc, char **argv, char **envp)
 	char corosync_lib_dir[PATH_MAX];
 	enum e_corosync_done flock_err;
 	uint64_t totem_config_warnings;
+	struct scheduler_pause_timeout_data scheduler_pause_timeout_data;
 
 	/* default configuration
 	 */
@@ -1194,6 +1240,10 @@ int main (int argc, char **argv, char **envp)
 
 	corosync_poll_handle = qb_loop_create ();
 
+	memset(&scheduler_pause_timeout_data, 0, sizeof(scheduler_pause_timeout_data));
+	scheduler_pause_timeout_data.totem_config = &totem_config;
+	timer_function_scheduler_timeout (&scheduler_pause_timeout_data);
+
 	qb_loop_signal_add(corosync_poll_handle, QB_LOOP_LOW,
 		SIGUSR2, NULL, sig_diag_handler, NULL);
 	qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH,