diff --git a/internal/executor/categorizer/classifier.go b/internal/executor/categorizer/classifier.go index e50bb25a8af..dcf50fef50a 100644 --- a/internal/executor/categorizer/classifier.go +++ b/internal/executor/categorizer/classifier.go @@ -24,14 +24,29 @@ type rule struct { containerName string onExitCodes *errormatch.ExitCodeMatcher onTerminationMessage *regexp.Regexp + onPodError *regexp.Regexp onConditions []string subcategory string + hint string } // ClassifyResult holds the classification output for a failed pod. type ClassifyResult struct { Category string Subcategory string + // Hint is operator-supplied user-facing copy attached to the matching rule. + // Use AppendHint to attach it to the failure message before emitting events. + Hint string +} + +// AppendHint returns the message with this result's hint appended after a blank +// line. Returns the message unchanged when no hint is set. Centralizing the +// format here keeps both event-reporting call sites consistent. +func (r ClassifyResult) AppendHint(message string) string { + if r.Hint == "" { + return message + } + return fmt.Sprintf("%s\n\n%s", message, r.Hint) } // Classifier evaluates pods against a set of category rules and returns @@ -99,11 +114,14 @@ func buildRule(cfg CategoryRule) (rule, error) { if cfg.OnTerminationMessage != nil { matcherCount++ } + if cfg.OnPodError != nil { + matcherCount++ + } if matcherCount == 0 { - return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, or onTerminationMessage") + return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, onTerminationMessage, or onPodError") } if matcherCount > 1 { - return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, or onTerminationMessage") + return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, onTerminationMessage, or onPodError") } for _, cond := range cfg.OnConditions { @@ -126,29 +144,57 @@ func buildRule(cfg CategoryRule) (rule, error) { } } - var compiledRegex *regexp.Regexp + var terminationRegex *regexp.Regexp if cfg.OnTerminationMessage != nil { re, err := regexp.Compile(cfg.OnTerminationMessage.Pattern) if err != nil { - return rule{}, fmt.Errorf("invalid regex %q: %w", cfg.OnTerminationMessage.Pattern, err) + return rule{}, fmt.Errorf("invalid onTerminationMessage regex %q: %w", cfg.OnTerminationMessage.Pattern, err) + } + terminationRegex = re + } + + var podErrorRegex *regexp.Regexp + if cfg.OnPodError != nil { + re, err := regexp.Compile(cfg.OnPodError.Pattern) + if err != nil { + return rule{}, fmt.Errorf("invalid onPodError regex %q: %w", cfg.OnPodError.Pattern, err) } - compiledRegex = re + podErrorRegex = re } return rule{ containerName: cfg.ContainerName, onExitCodes: cfg.OnExitCodes, onConditions: cfg.OnConditions, - onTerminationMessage: compiledRegex, + onTerminationMessage: terminationRegex, + onPodError: podErrorRegex, subcategory: cfg.Subcategory, + hint: cfg.Hint, }, nil } -// Classify returns the category and subcategory for the given pod. -// Rules are evaluated in config order; the first matching rule wins. +// ClassifyContainerError returns the category and subcategory for a pod whose +// failure is described by its own state: terminated containers, exit codes, +// and Kubernetes conditions. Use it for terminated pods (PodFailed phase). +// Returns empty result if the receiver is nil or the pod is nil. +// Returns (defaultCategory, defaultSubcategory) if no rules match. +func (c *Classifier) ClassifyContainerError(pod *v1.Pod) ClassifyResult { + return c.classify(pod, "") +} + +// ClassifyPodError returns the category and subcategory for a pod-level failure +// captured by the executor (image pull, missing volume, stuck terminating, +// active deadline exceeded, etc.). It additionally matches podErrorMessage +// against onPodError rules (see CategoryRule.OnPodError); all other rule types +// are evaluated against pod state, preserving first-match-wins across config order. // Returns empty result if the receiver is nil or the pod is nil. // Returns (defaultCategory, defaultSubcategory) if no rules match. -func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult { +func (c *Classifier) ClassifyPodError(pod *v1.Pod, podErrorMessage string) ClassifyResult { + return c.classify(pod, podErrorMessage) +} + +// Rules are evaluated in config order; the first matching rule wins. +func (c *Classifier) classify(pod *v1.Pod, podErrorMessage string) ClassifyResult { if c == nil || pod == nil { return ClassifyResult{} } @@ -157,18 +203,19 @@ func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult { for _, cat := range c.categories { for _, r := range cat.rules { - if ruleMatches(r, containers, podReason) { - return ClassifyResult{Category: cat.name, Subcategory: r.subcategory} + if ruleMatches(r, containers, podReason, podErrorMessage) { + return ClassifyResult{Category: cat.name, Subcategory: r.subcategory, Hint: r.hint} } } } return ClassifyResult{Category: c.defaultCategory, Subcategory: c.defaultSubcategory} } -// ruleMatches evaluates a single rule. When containerName is set, only that -// container is considered. It checks the first non-nil matcher: -// conditions > exit codes > termination message. -func ruleMatches(r rule, containers []containerInfo, podReason string) bool { +// ruleMatches evaluates a single rule. Container-level matchers honor the +// rule's containerName scope (when set); onPodError ignores it because the +// pod-level error has no container attribution. Exactly one matcher is set +// per rule (validated at NewClassifier). +func ruleMatches(r rule, containers []containerInfo, podReason, podErrorMessage string) bool { filtered := containers if r.containerName != "" { filtered = filterByName(containers, r.containerName) @@ -182,6 +229,9 @@ func ruleMatches(r rule, containers []containerInfo, podReason string) bool { if r.onTerminationMessage != nil { return matchesTerminationMessage(r.onTerminationMessage, filtered) } + if r.onPodError != nil { + return podErrorMessage != "" && errormatch.MatchPattern(r.onPodError, podErrorMessage) + } return false } diff --git a/internal/executor/categorizer/classifier_test.go b/internal/executor/categorizer/classifier_test.go index d222f12b9cb..6ef47c35fb4 100644 --- a/internal/executor/categorizer/classifier_test.go +++ b/internal/executor/categorizer/classifier_test.go @@ -15,6 +15,7 @@ func TestClassify(t *testing.T) { tests := map[string]struct { config ErrorCategoriesConfig pod *v1.Pod + podErrorMessage string expectedCategory string expectedSubcategory string }{ @@ -281,19 +282,130 @@ func TestClassify(t *testing.T) { pod: podWithTerminatedContainer(1, "Error", ""), expectedCategory: "", }, + "onPodError matches the captured kubelet error": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infrastructure", Rules: []CategoryRule{ + {OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"}, + }}, + }}, + pod: &v1.Pod{Status: v1.PodStatus{ + Phase: v1.PodPending, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "main", State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"}, + }}, + }, + }}, + podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`, + expectedCategory: "infrastructure", + expectedSubcategory: "platform_mismatch", + }, + "empty podErrorMessage does not match onPodError": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infrastructure", Rules: []CategoryRule{ + {OnPodError: &errormatch.RegexMatcher{Pattern: "anything"}, Subcategory: "x"}, + }}, + }}, + pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}}, + podErrorMessage: "", + expectedCategory: "", + }, + // Pins the public contract that ClassifyContainerError never matches onPodError rules, + // even with a pattern (".*") that would otherwise match empty input. The guard is enforced + // at two layers (ruleMatches and errormatch.MatchPattern); this test fails only if both go, + // which is the right level to assert the contract regardless of internal layering. + "ClassifyContainerError must not match onPodError even when regex matches empty": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infrastructure", Rules: []CategoryRule{ + {OnPodError: &errormatch.RegexMatcher{Pattern: ".*"}, Subcategory: "should_not_fire"}, + }}, + }}, + pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}}, + podErrorMessage: "", + expectedCategory: "", + }, + "onPodError ignores ContainerName scope (pod-level error has no container attribution)": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infrastructure", Rules: []CategoryRule{ + {ContainerName: "init", OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"}, + }}, + }}, + pod: &v1.Pod{Status: v1.PodStatus{ + Phase: v1.PodPending, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "main", State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"}, + }}, + }, + }}, + podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`, + expectedCategory: "infrastructure", + expectedSubcategory: "platform_mismatch", + }, + "onTerminationMessage does not match pod-level podErrorMessage": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "infrastructure", Rules: []CategoryRule{ + {OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "should_not_fire"}, + }}, + }}, + pod: &v1.Pod{Status: v1.PodStatus{ + Phase: v1.PodPending, + ContainerStatuses: []v1.ContainerStatus{ + {Name: "main", State: v1.ContainerState{ + Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"}, + }}, + }, + }}, + podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`, + expectedCategory: "", + }, } for name, tc := range tests { t.Run(name, func(t *testing.T) { classifier, err := NewClassifier(tc.config) require.NoError(t, err) - result := classifier.Classify(tc.pod) + var result ClassifyResult + if tc.podErrorMessage == "" { + result = classifier.ClassifyContainerError(tc.pod) + } else { + result = classifier.ClassifyPodError(tc.pod, tc.podErrorMessage) + } assert.Equal(t, tc.expectedCategory, result.Category) assert.Equal(t, tc.expectedSubcategory, result.Subcategory) }) } } +func TestClassifyResult_AppendHint(t *testing.T) { + tests := map[string]struct { + result ClassifyResult + message string + expected string + }{ + "empty hint returns message unchanged": { + result: ClassifyResult{Category: "x", Subcategory: "y"}, + message: "raw runtime error", + expected: "raw runtime error", + }, + "non-empty hint is appended after a blank line": { + result: ClassifyResult{Hint: "operator guidance"}, + message: "raw runtime error", + expected: "raw runtime error\n\noperator guidance", + }, + "empty message with hint preserves separator": { + result: ClassifyResult{Hint: "guidance"}, + message: "", + expected: "\n\nguidance", + }, + } + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + assert.Equal(t, tc.expected, tc.result.AppendHint(tc.message)) + }) + } +} + func TestNewClassifier_ValidationErrors(t *testing.T) { tests := map[string]struct { config ErrorCategoriesConfig @@ -346,13 +458,21 @@ func TestNewClassifier_ValidationErrors(t *testing.T) { }}, errContains: "requires at least one value", }, - "invalid regex": { + "invalid onTerminationMessage regex": { config: ErrorCategoriesConfig{Categories: []CategoryConfig{ {Name: "bad", Rules: []CategoryRule{ {OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "[invalid"}}, }}, }}, - errContains: "invalid regex", + errContains: "invalid onTerminationMessage regex", + }, + "invalid onPodError regex": { + config: ErrorCategoriesConfig{Categories: []CategoryConfig{ + {Name: "bad", Rules: []CategoryRule{ + {OnPodError: &errormatch.RegexMatcher{Pattern: "[invalid"}}, + }}, + }}, + errContains: "invalid onPodError regex", }, "empty rules": { config: ErrorCategoriesConfig{Categories: []CategoryConfig{{Name: "empty", Rules: nil}}}, diff --git a/internal/executor/categorizer/doc.go b/internal/executor/categorizer/doc.go index add67dc02a5..c75527d32ca 100644 --- a/internal/executor/categorizer/doc.go +++ b/internal/executor/categorizer/doc.go @@ -11,9 +11,19 @@ // category name and the rule's optional subcategory. // // Each rule uses exactly one matcher: -// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded, AppError) +// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded) // - OnExitCodes: matches non-zero container exit codes using In/NotIn set operators // - OnTerminationMessage: matches container termination messages against a regex +// - OnPodError: matches a pod-level error message captured by the executor +// against a regex; covers failures with no useful container terminationMessage +// (image pull, missing volume, stuck terminating, deadline exceeded, etc.) +// +// Container-level matchers honor ContainerName scoping when set. OnPodError +// ignores it because pod-level error text has no container attribution. +// +// Each rule may also set Hint, an optional user-facing string that the executor +// appends to the failure message. Hints land in lookoutdb.job_run.error and +// are surfaced to users in Lookout alongside the raw runtime error. // // Exit code 0 is always skipped. Both regular and init containers are checked. // @@ -28,8 +38,13 @@ // rules: // - onConditions: ["OOMKilled"] // subcategory: "oom" +// hint: "Increase the memory request in your job spec" // - onConditions: ["Evicted"] // subcategory: "eviction" +// - onPodError: +// pattern: "no match for platform in manifest" +// subcategory: "platform_mismatch" +// hint: "Build the image for the cluster's CPU architecture (typically x64/arm64 mismatch)" // - name: user_code // rules: // - onExitCodes: @@ -52,5 +67,11 @@ // if err != nil { // // handle invalid config // } -// result := classifier.Classify(pod) // result.Category = "infrastructure", result.Subcategory = "oom" +// +// // Terminated pod: container state carries the relevant termination signals. +// result := classifier.ClassifyContainerError(pod) +// +// // Pre-terminal failure: an executor-captured error message is matched +// // against onPodError rules in addition to pod state. +// result = classifier.ClassifyPodError(pod, podErrorMessage) package categorizer diff --git a/internal/executor/categorizer/types.go b/internal/executor/categorizer/types.go index adbc4fe3b9c..b5114c52813 100644 --- a/internal/executor/categorizer/types.go +++ b/internal/executor/categorizer/types.go @@ -23,12 +23,27 @@ type CategoryConfig struct { // CategoryRule defines a single matching condition. Exactly one matcher must // be set per rule (validated by NewClassifier). Rules within a category are OR'd. -// When ContainerName is set, only failures from that container are considered. -// When empty, failures from any container can match (default). +// +// Container-level matchers (OnConditions, OnExitCodes, OnTerminationMessage) +// inspect per-container state from pod.Status; ContainerName scopes them to a +// specific container when set, otherwise any container can match. +// +// OnPodError is pod-level: it matches a regex against the failure message the +// executor captured for the issue. Use it for failures where no container has +// a useful terminationMessage, including kubelet/runtime errors (image pull, +// missing volume, missing config) and Armada-detected conditions (stuck +// terminating, active deadline exceeded, externally deleted). ContainerName +// is ignored for OnPodError because the message has no container attribution. type CategoryRule struct { ContainerName string `yaml:"containerName,omitempty"` OnExitCodes *errormatch.ExitCodeMatcher `yaml:"onExitCodes,omitempty"` OnTerminationMessage *errormatch.RegexMatcher `yaml:"onTerminationMessage,omitempty"` + OnPodError *errormatch.RegexMatcher `yaml:"onPodError,omitempty"` OnConditions []string `yaml:"onConditions,omitempty"` Subcategory string `yaml:"subcategory,omitempty"` + // Hint is operator-supplied user-facing copy describing this failure mode. + // When set, it is appended to the failure message that lands in + // lookoutdb.job_run.error so end users see actionable guidance alongside the + // raw runtime error. Optional; empty means no hint is added. + Hint string `yaml:"hint,omitempty"` } diff --git a/internal/executor/reporter/event.go b/internal/executor/reporter/event.go index 0b0f98456d8..1119d781310 100644 --- a/internal/executor/reporter/event.go +++ b/internal/executor/reporter/event.go @@ -83,9 +83,10 @@ func CreateEventForCurrentState(pod *v1.Pod, clusterId string, classifyResult ca }) return sequence, nil case v1.PodFailed: + reason := classifyResult.AppendHint(util.ExtractPodFailedReason(pod)) return CreateJobFailedEvent( pod, - util.ExtractPodFailedReason(pod), + reason, util.ExtractPodFailureCause(pod), "", util.ExtractFailedPodContainerStatuses(pod, clusterId), diff --git a/internal/executor/reporter/event_test.go b/internal/executor/reporter/event_test.go index 5ef48cd2eeb..3e8978eedea 100644 --- a/internal/executor/reporter/event_test.go +++ b/internal/executor/reporter/event_test.go @@ -1,6 +1,7 @@ package reporter import ( + "strings" "testing" "github.com/stretchr/testify/assert" @@ -81,7 +82,7 @@ func TestCreateEventForCurrentState_WhenPodFailed_WithClassifier(t *testing.T) { }) require.NoError(t, err) - result, err := CreateEventForCurrentState(pod, "cluster1", classifier.Classify(pod)) + result, err := CreateEventForCurrentState(pod, "cluster1", classifier.ClassifyContainerError(pod)) assert.NoError(t, err) assert.Len(t, result.Events, 1) @@ -93,6 +94,53 @@ func TestCreateEventForCurrentState_WhenPodFailed_WithClassifier(t *testing.T) { assert.Equal(t, "exit-74", event.JobRunErrors.Errors[0].GetFailureSubcategory()) } +func TestCreateEventForCurrentState_WhenPodFailed_HintAppendedAfterReason(t *testing.T) { + pod := makeTestPod(v1.PodFailed) + pod.Status.ContainerStatuses = []v1.ContainerStatus{ + { + Name: "main", + State: v1.ContainerState{ + Terminated: &v1.ContainerStateTerminated{ + ExitCode: 74, + Reason: "Error", + Message: "raw runtime error from container", + }, + }, + }, + } + hint := "Operator-supplied actionable guidance" + + classifier, err := categorizer.NewClassifier(categorizer.ErrorCategoriesConfig{ + Categories: []categorizer.CategoryConfig{ + { + Name: "custom-error", + Rules: []categorizer.CategoryRule{ + { + OnExitCodes: &errormatch.ExitCodeMatcher{Operator: errormatch.ExitCodeOperatorIn, Values: []int32{74}}, + Subcategory: "exit-74", + Hint: hint, + }, + }, + }, + }, + }) + require.NoError(t, err) + + result, err := CreateEventForCurrentState(pod, "cluster1", classifier.ClassifyContainerError(pod)) + require.NoError(t, err) + require.Len(t, result.Events, 1) + event, ok := result.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors) + require.True(t, ok) + require.Len(t, event.JobRunErrors.Errors, 1) + + message := event.JobRunErrors.Errors[0].GetPodError().Message + rawErrorIdx := strings.Index(message, "raw runtime error from container") + hintIdx := strings.Index(message, hint) + require.GreaterOrEqual(t, rawErrorIdx, 0, "raw container error must appear in message") + require.GreaterOrEqual(t, hintIdx, 0, "hint must appear in message") + assert.Greater(t, hintIdx, rawErrorIdx, "hint must come after the raw error, not before; defends against prepend regression") +} + func TestCreateEventForCurrentState_WhenPodFailed_NilClassifier(t *testing.T) { pod := makeTestPod(v1.PodFailed) pod.Status.ContainerStatuses = []v1.ContainerStatus{ diff --git a/internal/executor/service/job_state_reporter.go b/internal/executor/service/job_state_reporter.go index 18c19a72bf9..2bfd8c87155 100644 --- a/internal/executor/service/job_state_reporter.go +++ b/internal/executor/service/job_state_reporter.go @@ -93,7 +93,7 @@ func (stateReporter *JobStateReporter) reportCurrentStatus(pod *v1.Pod) { var classifyResult categorizer.ClassifyResult if pod.Status.Phase == v1.PodFailed { - classifyResult = stateReporter.classifier.Classify(pod) + classifyResult = stateReporter.classifier.ClassifyContainerError(pod) } event, err := reporter.CreateEventForCurrentState(pod, stateReporter.clusterContext.GetClusterId(), classifyResult) diff --git a/internal/executor/service/pod_issue_handler.go b/internal/executor/service/pod_issue_handler.go index 0d79052e2ee..768e5b709bc 100644 --- a/internal/executor/service/pod_issue_handler.go +++ b/internal/executor/service/pod_issue_handler.go @@ -425,12 +425,14 @@ func (p *PodIssueHandler) handleNonRetryableJobIssue(issue *issue) { log.Infof("Handling non-retryable issue detected for job %s run %s", issue.RunIssue.JobId, issue.RunIssue.RunId) podIssue := issue.RunIssue.PodIssue - result := p.classifier.Classify(podIssue.OriginalPodState) + result := p.classifier.ClassifyPodError(podIssue.OriginalPodState, podIssue.Message) clusterId := p.clusterContext.GetClusterId() + message := result.AppendHint(podIssue.Message) + failedEvent, err := reporter.CreateJobFailedEvent( podIssue.OriginalPodState, - podIssue.Message, + message, podIssue.Cause, podIssue.DebugMessage, util.ExtractFailedPodContainerStatuses(podIssue.OriginalPodState, clusterId), diff --git a/internal/executor/service/pod_issue_handler_test.go b/internal/executor/service/pod_issue_handler_test.go index dab019941dc..8b1a7f3794f 100644 --- a/internal/executor/service/pod_issue_handler_test.go +++ b/internal/executor/service/pod_issue_handler_test.go @@ -154,6 +154,65 @@ func TestPodIssueService_FailureCategorySet_WhenClassifierConfigured(t *testing. assert.NotEmpty(t, failedEvent.JobRunErrors.Errors[0].GetPodError().ContainerErrors) } +func TestPodIssueService_OnPodErrorClassifies(t *testing.T) { + tests := map[string]struct { + category string + subcategory string + pattern string + hint string + pod func() *v1.Pod + expectMessageContains string + }{ + "platform mismatch from kubelet error with hint": { + category: "infrastructure", + subcategory: "platform_mismatch", + pattern: "no match for platform in manifest", + hint: "Build for the cluster's CPU architecture (typically x64/arm64 mismatch)", + pod: func() *v1.Pod { + p := makeUnretryableStuckPod() + p.Status.ContainerStatuses[0].State.Waiting.Message = `Failed to pull image "amd64/busybox:latest": no match for platform in manifest` + return p + }, + expectMessageContains: "no match for platform in manifest", + }, + "active deadline exceeded from executor-side detection": { + category: "user_error", + subcategory: "deadline_exceeded", + pattern: "exceeded active deadline", + // 10 minutes old with 5 minute deadline -> exceeded. + pod: func() *v1.Pod { return makePodWithDeadline(time.Now().Add(-time.Minute*10), 300, 0) }, + expectMessageContains: "exceeded active deadline", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + classifier := podErrorClassifier(t, tc.category, tc.subcategory, tc.pattern, tc.hint) + podIssueService, _, fakeClusterContext, eventReporter, err := setupTestComponentsWithClassifier([]*job.RunState{}, classifier) + require.NoError(t, err) + addPod(t, fakeClusterContext, tc.pod()) + + podIssueService.HandlePodIssues() + + require.Len(t, eventReporter.ReceivedEvents, 1) + failedEvent, ok := eventReporter.ReceivedEvents[0].Event.Events[0].Event.(*armadaevents.EventSequence_Event_JobRunErrors) + require.True(t, ok) + + assert.Equal(t, tc.category, failedEvent.JobRunErrors.Errors[0].GetFailureCategory()) + assert.Equal(t, tc.subcategory, failedEvent.JobRunErrors.Errors[0].GetFailureSubcategory()) + message := failedEvent.JobRunErrors.Errors[0].GetPodError().Message + assert.Contains(t, message, tc.expectMessageContains) + if tc.hint != "" { + rawIdx := strings.Index(message, tc.expectMessageContains) + hintIdx := strings.Index(message, tc.hint) + require.GreaterOrEqual(t, rawIdx, 0, "raw error must appear in message") + require.GreaterOrEqual(t, hintIdx, 0, "hint must appear in message") + assert.Greater(t, hintIdx, rawIdx, "hint must come after raw error, not before; defends against prepend regression") + } + }) + } +} + func TestPodIssueService_OnlyDeletesPod_IfStuckTerminatingButDeletedByExecutor(t *testing.T) { podIssueService, _, fakeClusterContext, eventsReporter, err := setupTestComponents([]*job.RunState{}) require.NoError(t, err) @@ -552,6 +611,22 @@ func conditionClassifier(t *testing.T, category, subcategory, condition string) return c } +func podErrorClassifier(t *testing.T, category, subcategory, pattern, hint string) *categorizer.Classifier { + t.Helper() + c, err := categorizer.NewClassifier(categorizer.ErrorCategoriesConfig{ + Categories: []categorizer.CategoryConfig{ + { + Name: category, + Rules: []categorizer.CategoryRule{ + {OnPodError: &errormatch.RegexMatcher{Pattern: pattern}, Subcategory: subcategory, Hint: hint}, + }, + }, + }, + }) + require.NoError(t, err) + return c +} + // The metric counter itself is tested in the metrics package. These tests // cover the emission gating for the non-retryable issue path: only a // successful Report call should have led to a counter increment.