Просмотр исходного кода

feat: Better prometheus support

jamesread 1 неделя назад
Родитель
Сommit
d1787a3823

+ 48 - 4
docs/modules/ROOT/pages/advanced_configuration/prometheus.adoc

@@ -25,16 +25,60 @@ This will give you metrics available at http://yourserver:1337/metrics. The page
 [source]
 ----
 # HELP olivetin_actions_requested_count The actions requested count
-# TYPE olivetin_actions_requested_count gauge
+# TYPE olivetin_actions_requested_count counter
 olivetin_actions_requested_count 0
+# HELP olivetin_action_executions_total Total number of finished action executions grouped by result.
+# TYPE olivetin_action_executions_total counter
+olivetin_action_executions_total{result="success"} 0
+olivetin_action_executions_total{result="failed"} 0
+olivetin_action_executions_total{result="blocked"} 0
+olivetin_action_executions_total{result="timeout"} 0
+olivetin_action_executions_total{result="error"} 0
+# HELP olivetin_action_execution_duration_seconds Action execution duration in seconds from start to finish.
+# TYPE olivetin_action_execution_duration_seconds histogram
+olivetin_action_execution_duration_seconds_bucket{le="0.1"} 0
 # HELP olivetin_config_action_count The number of actions in the config file
 # TYPE olivetin_config_action_count gauge
 olivetin_config_action_count 18
 # HELP olivetin_config_reloaded_count The number of times the config has been reloaded
 # TYPE olivetin_config_reloaded_count counter
 olivetin_config_reloaded_count 1
-# HELP olivetin_sv_count The number entries in the sv map
-# TYPE olivetin_sv_count gauge
-olivetin_sv_count 49
+----
+
+=== Failed job monitoring
+
+Finished action executions are counted in `olivetin_action_executions_total` with a `result` label:
+
+[cols="1,2"]
+|===
+| `success` | Command finished with exit code 0
+| `failed` | Command ran but exited with a non-zero code
+| `timeout` | Command exceeded its configured timeout
+| `blocked` | Execution was blocked (ACL, rate limit, concurrency, or queue limit)
+| `error` | Execution failed before the command ran (for example, invalid arguments)
+|===
+
+`olivetin_action_execution_duration_seconds` records how long each finished execution took.
+
+Example Prometheus alert rules:
+
+[source,yaml]
+----
+groups:
+  - name: olivetin
+    rules:
+      - alert: OliveTinActionFailed
+        expr: increase(olivetin_action_executions_total{result="failed"}[15m]) > 0
+        labels:
+          severity: warning
+        annotations:
+          summary: OliveTin action failed with non-zero exit code
+
+      - alert: OliveTinActionTimedOut
+        expr: increase(olivetin_action_executions_total{result="timeout"}[15m]) > 0
+        labels:
+          severity: warning
+        annotations:
+          summary: OliveTin action timed out
 ----
 

+ 2 - 0
integration-tests/tests/prometheus/prometheus.mjs

@@ -8,6 +8,8 @@ import {
 
 let metrics = [
   {'name': 'olivetin_actions_requested_count', 'type': 'counter', 'desc': 'The actions requested count'},
+  {'name': 'olivetin_action_executions_total', 'type': 'counter', 'desc': 'Total number of finished action executions grouped by result\\.'},
+  {'name': 'olivetin_action_execution_duration_seconds', 'type': 'histogram', 'desc': 'Action execution duration in seconds from start to finish\\.'},
   {'name': 'olivetin_config_action_count', 'type': 'gauge', 'desc': 'The number of actions in the config file'},
   {'name': 'olivetin_config_reloaded_count', 'type': 'counter', 'desc': 'The number of times the config has been reloaded'},
 ]

+ 2 - 9
service/internal/executor/executor.go

@@ -11,8 +11,6 @@ import (
 	"github.com/google/uuid"
 	log "github.com/sirupsen/logrus"
 
-	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/client_golang/prometheus/promauto"
 	"gopkg.in/yaml.v3"
 
 	"bytes"
@@ -40,13 +38,6 @@ func isValidTrackingID(id string) bool {
 	return id != "" && len(id) <= MaxTrackingIDLength && validTrackingIDPattern.MatchString(id)
 }
 
-var (
-	metricActionsRequested = promauto.NewCounter(prometheus.CounterOpts{
-		Name: "olivetin_actions_requested_count",
-		Help: "The actions requested count",
-	})
-)
-
 type ActionBinding struct {
 	ID           string
 	Action       *config.Action
@@ -705,6 +696,8 @@ func (e *Executor) finishExecChain(req *ExecutionRequest) {
 		entry.ExecutionFinished = true
 	})
 
+	recordExecutionMetrics(req.logEntry)
+
 	notifyListenersFinished(req)
 	e.drainGroupQueue()
 }

+ 81 - 0
service/internal/executor/prometheus.go

@@ -0,0 +1,81 @@
+package executor
+
+import (
+	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/promauto"
+)
+
+// Values of the "result" label on olivetin_action_executions_total.
+const (
+	executionResultSuccess = "success"
+	executionResultFailed  = "failed"
+	executionResultBlocked = "blocked"
+	executionResultTimeout = "timeout"
+	executionResultError   = "error"
+)
+
+var (
+	// metricActionsRequested is registered as olivetin_actions_requested_count.
+	metricActionsRequested = promauto.NewCounter(prometheus.CounterOpts{
+		Name: "olivetin_actions_requested_count",
+		Help: "The actions requested count",
+	})
+
+	// metricActionExecutionsTotal counts finished executions, with one series
+	// per value of the "result" label.
+	metricActionExecutionsTotal = newActionExecutionsCounter()
+
+	// metricActionExecutionDuration is a histogram of execution durations in
+	// seconds, with bucket upper bounds ranging from 0.1s to 600s.
+	metricActionExecutionDuration = promauto.NewHistogram(prometheus.HistogramOpts{
+		Name:    "olivetin_action_execution_duration_seconds",
+		Help:    "Action execution duration in seconds from start to finish.",
+		Buckets: []float64{0.1, 0.5, 1, 2, 5, 10, 30, 60, 120, 300, 600},
+	})
+)
+
+// newActionExecutionsCounter registers olivetin_action_executions_total and
+// creates the series for every known result value up front.
+//
+// A CounterVec only exposes label combinations that have been touched, so
+// without this step the metric would be missing from the scrape output until
+// the first execution finishes, and a series would first appear already at 1.
+// Queries such as increase(...) need an earlier sample at 0 to see that first
+// increment.
+func newActionExecutionsCounter() *prometheus.CounterVec {
+	vec := promauto.NewCounterVec(prometheus.CounterOpts{
+		Name: "olivetin_action_executions_total",
+		Help: "Total number of finished action executions grouped by result.",
+	}, []string{"result"})
+
+	for _, result := range []string{
+		executionResultSuccess,
+		executionResultFailed,
+		executionResultBlocked,
+		executionResultTimeout,
+		executionResultError,
+	} {
+		// WithLabelValues creates the child counter at 0 if it does not exist.
+		vec.WithLabelValues(result)
+	}
+
+	return vec
+}
+
+// executionResultLabel returns the "result" label value for entry. A blocked
+// entry is always reported as blocked; every other entry is classified by
+// finishedExecutionResultLabel.
+func executionResultLabel(entry *InternalLogEntry) string {
+	if !entry.Blocked {
+		return finishedExecutionResultLabel(entry)
+	}
+
+	return executionResultBlocked
+}
+
+// finishedExecutionResultLabel classifies an entry that was not blocked.
+// The checks run in priority order: a timeout wins over everything, then a
+// zero exit code means success, then a pre-execution error is reported as
+// error, and any remaining non-zero exit code is a failure.
+func finishedExecutionResultLabel(entry *InternalLogEntry) string {
+	switch {
+	case entry.TimedOut:
+		return executionResultTimeout
+	case entry.ExitCode == 0:
+		return executionResultSuccess
+	case isPreExecutionError(entry):
+		return executionResultError
+	}
+
+	return executionResultFailed
+}
+
+// isPreExecutionError reports whether entry never started executing or still
+// carries the DefaultExitCodeNotExecuted exit code.
+func isPreExecutionError(entry *InternalLogEntry) bool {
+	if !entry.ExecutionStarted {
+		return true
+	}
+
+	return entry.ExitCode == DefaultExitCodeNotExecuted
+}
+
+// recordExecutionMetrics updates the execution counter and the duration
+// histogram for entry. Nil entries and entries flagged as Queued are ignored.
+func recordExecutionMetrics(entry *InternalLogEntry) {
+	if entry == nil {
+		return
+	}
+
+	if entry.Queued {
+		return
+	}
+
+	result := executionResultLabel(entry)
+	metricActionExecutionsTotal.WithLabelValues(result).Inc()
+
+	recordExecutionDuration(entry)
+}
+
+// recordExecutionDuration observes the time between DatetimeStarted and
+// DatetimeFinished, in seconds. Nothing is observed when either timestamp is
+// zero or when the finish time precedes the start time.
+func recordExecutionDuration(entry *InternalLogEntry) {
+	started, finished := entry.DatetimeStarted, entry.DatetimeFinished
+	if started.IsZero() || finished.IsZero() {
+		return
+	}
+
+	if elapsed := finished.Sub(started).Seconds(); elapsed >= 0 {
+		metricActionExecutionDuration.Observe(elapsed)
+	}
+}

+ 76 - 0
service/internal/executor/prometheus_test.go

@@ -0,0 +1,76 @@
+package executor
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestExecutionResultLabel(t *testing.T) {
+	tests := []struct {
+		name  string
+		entry *InternalLogEntry
+		want  string
+	}{
+		{
+			name: "success",
+			entry: &InternalLogEntry{
+				ExecutionStarted:  true,
+				ExecutionFinished: true,
+				ExitCode:          0,
+			},
+			want: executionResultSuccess,
+		},
+		{
+			name: "failed nonzero exit",
+			entry: &InternalLogEntry{
+				ExecutionStarted:  true,
+				ExecutionFinished: true,
+				ExitCode:          1,
+			},
+			want: executionResultFailed,
+		},
+		{
+			name: "blocked",
+			entry: &InternalLogEntry{
+				Blocked:           true,
+				ExecutionFinished: true,
+				ExitCode:          0,
+			},
+			want: executionResultBlocked,
+		},
+		{
+			name: "timeout",
+			entry: &InternalLogEntry{
+				ExecutionStarted:  true,
+				ExecutionFinished: true,
+				TimedOut:          true,
+				ExitCode:          -1,
+			},
+			want: executionResultTimeout,
+		},
+		{
+			name: "error before execution",
+			entry: &InternalLogEntry{
+				ExecutionFinished: true,
+				ExitCode:          DefaultExitCodeNotExecuted,
+			},
+			want: executionResultError,
+		},
+		{
+			name: "error never started",
+			entry: &InternalLogEntry{
+				ExecutionStarted:  false,
+				ExecutionFinished: true,
+				ExitCode:          2,
+			},
+			want: executionResultError,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			assert.Equal(t, tt.want, executionResultLabel(tt.entry))
+		})
+	}
+}