From 0f275bb11ad1389925d16f15d894eb2dabb0a06b Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Tue, 28 Apr 2026 14:58:21 +0200 Subject: [PATCH] Add per-rule duration histogram for failure classification Signed-off-by: Dejan Zele Pejchev --- internal/executor/categorizer/classifier.go | 7 +- .../executor/categorizer/classifier_test.go | 65 +++++++++++++++++++ internal/executor/metrics/metrics.go | 27 ++++++++ internal/executor/metrics/metrics_test.go | 49 ++++++++++++++ 4 files changed, 147 insertions(+), 1 deletion(-) diff --git a/internal/executor/categorizer/classifier.go b/internal/executor/categorizer/classifier.go index e50bb25a8af..988c8e02166 100644 --- a/internal/executor/categorizer/classifier.go +++ b/internal/executor/categorizer/classifier.go @@ -3,10 +3,12 @@ package categorizer import ( "fmt" "regexp" + "time" v1 "k8s.io/api/core/v1" "github.com/armadaproject/armada/internal/common/errormatch" + "github.com/armadaproject/armada/internal/executor/metrics" ) // maxCategoryNameLen is the maximum length for category and subcategory strings. @@ -157,7 +159,10 @@ func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult { for _, cat := range c.categories { for _, r := range cat.rules { - if ruleMatches(r, containers, podReason) { + start := time.Now() + matched := ruleMatches(r, containers, podReason) + metrics.RecordRuleEvaluationDuration(cat.name, r.subcategory, time.Since(start)) + if matched { return ClassifyResult{Category: cat.name, Subcategory: r.subcategory} } } diff --git a/internal/executor/categorizer/classifier_test.go b/internal/executor/categorizer/classifier_test.go index d222f12b9cb..c71f7d7920d 100644 --- a/internal/executor/categorizer/classifier_test.go +++ b/internal/executor/categorizer/classifier_test.go @@ -4,6 +4,8 @@ import ( "strings" "testing" + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" v1 "k8s.io/api/core/v1" @@ -408,6 +410,69 @@ func TestNewClassifier_ValidationErrors(t *testing.T) { } } +func TestClassify_RecordsRuleEvaluationDuration(t *testing.T) { + classifier, err := NewClassifier(ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infra", Rules: []CategoryRule{ + {OnConditions: []string{errormatch.ConditionOOMKilled}, Subcategory: "oom"}, + }}, + {Name: "user_error", Rules: []CategoryRule{ + {OnExitCodes: &errormatch.ExitCodeMatcher{Operator: errormatch.ExitCodeOperatorIn, Values: []int32{42}}, Subcategory: "bad_exit"}, + {OnExitCodes: &errormatch.ExitCodeMatcher{Operator: errormatch.ExitCodeOperatorIn, Values: []int32{74}}, Subcategory: "other_exit"}, + }}, + }}) + require.NoError(t, err) + + before := ruleHistogramCounts(t) + pod := podWithTerminatedContainer(42, "Error", "") + result := classifier.Classify(pod) + require.Equal(t, "user_error", result.Category) + require.Equal(t, "bad_exit", result.Subcategory) + after := ruleHistogramCounts(t) + + // First rule (infra/oom) does not match -> observed. + assert.Equal(t, uint64(1), after[labelKey{"infra", "oom"}]-before[labelKey{"infra", "oom"}], + "non-matching rule before the match should be observed") + // Second rule (user_error/bad_exit) matches -> observed. + assert.Equal(t, uint64(1), after[labelKey{"user_error", "bad_exit"}]-before[labelKey{"user_error", "bad_exit"}], + "matching rule should be observed") + // Third rule (user_error/other_exit) is after the match -> not reached, not observed. + assert.Equal(t, uint64(0), after[labelKey{"user_error", "other_exit"}]-before[labelKey{"user_error", "other_exit"}], + "rule after the match should not be observed") +} + +type labelKey struct{ category, subcategory string } + +// ruleHistogramCounts gathers the rule-evaluation histogram from the default +// registry and returns a map of (category, subcategory) -> sample count. +func ruleHistogramCounts(t *testing.T) map[labelKey]uint64 { + t.Helper() + families, err := prometheus.DefaultGatherer.Gather() + require.NoError(t, err) + counts := map[labelKey]uint64{} + for _, mf := range families { + if mf.GetName() != "armada_executor_job_failure_rule_evaluation_duration_seconds" { + continue + } + for _, m := range mf.GetMetric() { + counts[labelKeyFromMetric(m)] = m.GetHistogram().GetSampleCount() + } + } + return counts +} + +func labelKeyFromMetric(m *dto.Metric) labelKey { + var k labelKey + for _, lp := range m.GetLabel() { + switch lp.GetName() { + case "failure_category": + k.category = lp.GetValue() + case "failure_subcategory": + k.subcategory = lp.GetValue() + } + } + return k +} + func podWithTerminatedContainer(exitCode int32, reason, message string) *v1.Pod { return &v1.Pod{ Status: v1.PodStatus{ diff --git a/internal/executor/metrics/metrics.go b/internal/executor/metrics/metrics.go index c0483201865..3f4c0a5078d 100644 --- a/internal/executor/metrics/metrics.go +++ b/internal/executor/metrics/metrics.go @@ -1,6 +1,8 @@ package metrics import ( + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" ) @@ -20,6 +22,21 @@ var jobFailureCategoryTotal = promauto.NewCounterVec( []string{failureCategoryLabel, failureSubcategoryLabel}, ) +var jobFailureRuleEvaluationDurationSeconds = promauto.NewHistogramVec( + prometheus.HistogramOpts{ + Name: ArmadaExecutorMetricsPrefix + "job_failure_rule_evaluation_duration_seconds", + Help: "Duration of evaluating a single classification rule against a pod, " + + "labeled by the rule's category and subcategory. Observed for every " + + "rule evaluation regardless of whether it matched.", + Buckets: []float64{ + 0.00001, 0.0001, 0.0005, + 0.001, 0.005, 0.01, 0.05, + 0.1, 0.25, + }, + }, + []string{failureCategoryLabel, failureSubcategoryLabel}, +) + // RecordJobFailure increments the per-category failure counter. Should be // called from paths that commit to emitting a JobFailedEvent — retryable // pod issues that emit a ReturnLease instead must not call this. @@ -34,3 +51,13 @@ func RecordJobFailure(category, subcategory string) { } jobFailureCategoryTotal.WithLabelValues(category, subcategory).Inc() } + +// RecordRuleEvaluationDuration records the time a single classification +// rule took to evaluate. Called for every rule regardless of match outcome. +// An empty category is a no-op to avoid an empty failure_category label. +func RecordRuleEvaluationDuration(category, subcategory string, duration time.Duration) { + if category == "" { + return + } + jobFailureRuleEvaluationDurationSeconds.WithLabelValues(category, subcategory).Observe(duration.Seconds()) +} diff --git a/internal/executor/metrics/metrics_test.go b/internal/executor/metrics/metrics_test.go index edcc22b8634..254d36b5476 100644 --- a/internal/executor/metrics/metrics_test.go +++ b/internal/executor/metrics/metrics_test.go @@ -2,9 +2,13 @@ package metrics import ( "testing" + "time" + "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" + dto "github.com/prometheus/client_model/go" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestRecordJobFailure(t *testing.T) { @@ -38,3 +42,48 @@ func TestRecordJobFailure(t *testing.T) { }) } } + +func TestRecordRuleEvaluationDuration(t *testing.T) { + tests := map[string]struct { + category string + subcategory string + duration time.Duration + expectedDelta uint64 + }{ + "records observation for category and subcategory": { + category: "rule-eval-test-cat", + subcategory: "rule-eval-test-sub", + duration: 100 * time.Microsecond, + expectedDelta: 1, + }, + "accepts empty subcategory": { + category: "rule-eval-test-no-sub", + subcategory: "", + duration: 50 * time.Microsecond, + expectedDelta: 1, + }, + "empty category is a no-op": { + category: "", + subcategory: "rule-eval-test-empty-cat", + duration: 75 * time.Microsecond, + expectedDelta: 0, + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + h := jobFailureRuleEvaluationDurationSeconds.WithLabelValues(tc.category, tc.subcategory).(prometheus.Histogram) + before := histogramSampleCount(t, h) + RecordRuleEvaluationDuration(tc.category, tc.subcategory, tc.duration) + after := histogramSampleCount(t, h) + assert.Equal(t, tc.expectedDelta, after-before) + }) + } +} + +func histogramSampleCount(t *testing.T, h prometheus.Histogram) uint64 { + t.Helper() + pb := &dto.Metric{} + require.NoError(t, h.Write(pb)) + require.NotNil(t, pb.Histogram) + return pb.Histogram.GetSampleCount() +}