armadaproject · dejanzele · Apr 30, 2026 · May 6, 2026
diff --git a/internal/executor/categorizer/classifier.go b/internal/executor/categorizer/classifier.go
@@ -24,14 +24,29 @@ type rule struct {
 	containerName        string
 	onExitCodes          *errormatch.ExitCodeMatcher
 	onTerminationMessage *regexp.Regexp
+	onPodError           *regexp.Regexp
 	onConditions         []string
 	subcategory          string
+	hint                 string
 }
 
 // ClassifyResult holds the classification output for a failed pod.
 type ClassifyResult struct {
 	Category    string
 	Subcategory string
+	// Hint is operator-supplied user-facing copy attached to the matching rule.
+	// Use AppendHint to attach it to the failure message before emitting events.
+	Hint string
+}
+
+// AppendHint returns the message with this result's hint appended after a blank
+// line. Returns the message unchanged when no hint is set. Centralizing the
+// format here keeps both event-reporting call sites consistent.
+func (r ClassifyResult) AppendHint(message string) string {
+	if r.Hint == "" {
+		return message
+	}
+	return fmt.Sprintf("%s\n\n%s", message, r.Hint)
 }
 
 // Classifier evaluates pods against a set of category rules and returns
@@ -99,11 +114,14 @@ func buildRule(cfg CategoryRule) (rule, error) {
 	if cfg.OnTerminationMessage != nil {
 		matcherCount++
 	}
+	if cfg.OnPodError != nil {
+		matcherCount++
+	}
 	if matcherCount == 0 {
-		return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, or onTerminationMessage")
+		return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
 	}
 	if matcherCount > 1 {
-		return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, or onTerminationMessage")
+		return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
 	}
 
 	for _, cond := range cfg.OnConditions {
@@ -126,29 +144,57 @@ func buildRule(cfg CategoryRule) (rule, error) {
 		}
 	}
 
-	var compiledRegex *regexp.Regexp
+	var terminationRegex *regexp.Regexp
 	if cfg.OnTerminationMessage != nil {
 		re, err := regexp.Compile(cfg.OnTerminationMessage.Pattern)
 		if err != nil {
-			return rule{}, fmt.Errorf("invalid regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
+			return rule{}, fmt.Errorf("invalid onTerminationMessage regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
+		}
+		terminationRegex = re
+	}
+
+	var podErrorRegex *regexp.Regexp
+	if cfg.OnPodError != nil {
+		re, err := regexp.Compile(cfg.OnPodError.Pattern)
+		if err != nil {
+			return rule{}, fmt.Errorf("invalid onPodError regex %q: %w", cfg.OnPodError.Pattern, err)
 		}
-		compiledRegex = re
+		podErrorRegex = re
 	}
 
 	return rule{
 		containerName:        cfg.ContainerName,
 		onExitCodes:          cfg.OnExitCodes,
 		onConditions:         cfg.OnConditions,
-		onTerminationMessage: compiledRegex,
+		onTerminationMessage: terminationRegex,
+		onPodError:           podErrorRegex,
 		subcategory:          cfg.Subcategory,
+		hint:                 cfg.Hint,
 	}, nil
 }
 
-// Classify returns the category and subcategory for the given pod.
-// Rules are evaluated in config order; the first matching rule wins.
+// ClassifyContainerError returns the category and subcategory for a pod whose
+// failure is described by its own state: terminated containers, exit codes,
+// and Kubernetes conditions. Use it for terminated pods (PodFailed phase).
+// Returns empty result if the receiver is nil or the pod is nil.
+// Returns (defaultCategory, defaultSubcategory) if no rules match.
+func (c *Classifier) ClassifyContainerError(pod *v1.Pod) ClassifyResult {
+	return c.classify(pod, "")
+}
+
+// ClassifyPodError returns the category and subcategory for a pod-level failure
+// captured by the executor (image pull, missing volume, stuck terminating,
+// active deadline exceeded, etc.). It additionally matches podErrorMessage
+// against onPodError rules (see CategoryRule.OnPodError); all other rule types
+// are evaluated against pod state, preserving first-match-wins across config order.
 // Returns empty result if the receiver is nil or the pod is nil.
 // Returns (defaultCategory, defaultSubcategory) if no rules match.
-func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {
+func (c *Classifier) ClassifyPodError(pod *v1.Pod, podErrorMessage string) ClassifyResult {
+	return c.classify(pod, podErrorMessage)
+}
+
+// Rules are evaluated in config order; the first matching rule wins.
+func (c *Classifier) classify(pod *v1.Pod, podErrorMessage string) ClassifyResult {
 	if c == nil || pod == nil {
 		return ClassifyResult{}
 	}
@@ -157,18 +203,19 @@ func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {
 
 	for _, cat := range c.categories {
 		for _, r := range cat.rules {
-			if ruleMatches(r, containers, podReason) {
-				return ClassifyResult{Category: cat.name, Subcategory: r.subcategory}
+			if ruleMatches(r, containers, podReason, podErrorMessage) {
+				return ClassifyResult{Category: cat.name, Subcategory: r.subcategory, Hint: r.hint}
 			}
 		}
 	}
 	return ClassifyResult{Category: c.defaultCategory, Subcategory: c.defaultSubcategory}
 }
 
-// ruleMatches evaluates a single rule. When containerName is set, only that
-// container is considered. It checks the first non-nil matcher:
-// conditions > exit codes > termination message.
-func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
+// ruleMatches evaluates a single rule. Container-level matchers honor the
+// rule's containerName scope (when set); onPodError ignores it because the
+// pod-level error has no container attribution. Exactly one matcher is set
+// per rule (validated at NewClassifier).
+func ruleMatches(r rule, containers []containerInfo, podReason, podErrorMessage string) bool {
 	filtered := containers
 	if r.containerName != "" {
 		filtered = filterByName(containers, r.containerName)
@@ -182,6 +229,9 @@ func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
 	if r.onTerminationMessage != nil {
 		return matchesTerminationMessage(r.onTerminationMessage, filtered)
 	}
+	if r.onPodError != nil {
+		return podErrorMessage != "" && errormatch.MatchPattern(r.onPodError, podErrorMessage)
+	}
 	return false
 }
 

diff --git a/internal/executor/categorizer/classifier_test.go b/internal/executor/categorizer/classifier_test.go
@@ -15,6 +15,7 @@ func TestClassify(t *testing.T) {
 	tests := map[string]struct {
 		config              ErrorCategoriesConfig
 		pod                 *v1.Pod
+		podErrorMessage     string
 		expectedCategory    string
 		expectedSubcategory string
 	}{
@@ -281,19 +282,130 @@ func TestClassify(t *testing.T) {
 			pod:              podWithTerminatedContainer(1, "Error", ""),
 			expectedCategory: "",
 		},
+		"onPodError matches the captured kubelet error": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "infrastructure", Rules: []CategoryRule{
+					{OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
+				}},
+			}},
+			pod: &v1.Pod{Status: v1.PodStatus{
+				Phase: v1.PodPending,
+				ContainerStatuses: []v1.ContainerStatus{
+					{Name: "main", State: v1.ContainerState{
+						Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
+					}},
+				},
+			}},
+			podErrorMessage:     `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
+			expectedCategory:    "infrastructure",
+			expectedSubcategory: "platform_mismatch",
+		},
+		"empty podErrorMessage does not match onPodError": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "infrastructure", Rules: []CategoryRule{
+					{OnPodError: &errormatch.RegexMatcher{Pattern: "anything"}, Subcategory: "x"},
+				}},
+			}},
+			pod:              &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
+			podErrorMessage:  "",
+			expectedCategory: "",
+		},
+		// Pins the public contract that ClassifyContainerError never matches onPodError rules,
+		// even with a pattern (".*") that would otherwise match empty input. The guard is enforced
+		// at two layers (ruleMatches and errormatch.MatchPattern); this test fails only if both go,
+		// which is the right level to assert the contract regardless of internal layering.
+		"ClassifyContainerError must not match onPodError even when regex matches empty": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "infrastructure", Rules: []CategoryRule{
+					{OnPodError: &errormatch.RegexMatcher{Pattern: ".*"}, Subcategory: "should_not_fire"},
+				}},
+			}},
+			pod:              &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
+			podErrorMessage:  "",
+			expectedCategory: "",
+		},
+		"onPodError ignores ContainerName scope (pod-level error has no container attribution)": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "infrastructure", Rules: []CategoryRule{
+					{ContainerName: "init", OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
+				}},
+			}},
+			pod: &v1.Pod{Status: v1.PodStatus{
+				Phase: v1.PodPending,
+				ContainerStatuses: []v1.ContainerStatus{
+					{Name: "main", State: v1.ContainerState{
+						Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
+					}},
+				},
+			}},
+			podErrorMessage:     `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
+			expectedCategory:    "infrastructure",
+			expectedSubcategory: "platform_mismatch",
+		},
+		"onTerminationMessage does not match pod-level podErrorMessage": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "infrastructure", Rules: []CategoryRule{
+					{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "should_not_fire"},
+				}},
+			}},
+			pod: &v1.Pod{Status: v1.PodStatus{
+				Phase: v1.PodPending,
+				ContainerStatuses: []v1.ContainerStatus{
+					{Name: "main", State: v1.ContainerState{
+						Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
+					}},
+				},
+			}},
+			podErrorMessage:  `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
+			expectedCategory: "",
+		},
 	}
 
 	for name, tc := range tests {
 		t.Run(name, func(t *testing.T) {
 			classifier, err := NewClassifier(tc.config)
 			require.NoError(t, err)
-			result := classifier.Classify(tc.pod)
+			var result ClassifyResult
+			if tc.podErrorMessage == "" {
+				result = classifier.ClassifyContainerError(tc.pod)
+			} else {
+				result = classifier.ClassifyPodError(tc.pod, tc.podErrorMessage)
+			}
 			assert.Equal(t, tc.expectedCategory, result.Category)
 			assert.Equal(t, tc.expectedSubcategory, result.Subcategory)
 		})
 	}
 }
 
+func TestClassifyResult_AppendHint(t *testing.T) {
+	tests := map[string]struct {
+		result   ClassifyResult
+		message  string
+		expected string
+	}{
+		"empty hint returns message unchanged": {
+			result:   ClassifyResult{Category: "x", Subcategory: "y"},
+			message:  "raw runtime error",
+			expected: "raw runtime error",
+		},
+		"non-empty hint is appended after a blank line": {
+			result:   ClassifyResult{Hint: "operator guidance"},
+			message:  "raw runtime error",
+			expected: "raw runtime error\n\noperator guidance",
+		},
+		"empty message with hint preserves separator": {
+			result:   ClassifyResult{Hint: "guidance"},
+			message:  "",
+			expected: "\n\nguidance",
+		},
+	}
+	for name, tc := range tests {
+		t.Run(name, func(t *testing.T) {
+			assert.Equal(t, tc.expected, tc.result.AppendHint(tc.message))
+		})
+	}
+}
+
 func TestNewClassifier_ValidationErrors(t *testing.T) {
 	tests := map[string]struct {
 		config      ErrorCategoriesConfig
@@ -346,13 +458,21 @@ func TestNewClassifier_ValidationErrors(t *testing.T) {
 			}},
 			errContains: "requires at least one value",
 		},
-		"invalid regex": {
+		"invalid onTerminationMessage regex": {
 			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
 				{Name: "bad", Rules: []CategoryRule{
 					{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "[invalid"}},
 				}},
 			}},
-			errContains: "invalid regex",
+			errContains: "invalid onTerminationMessage regex",
+		},
+		"invalid onPodError regex": {
+			config: ErrorCategoriesConfig{Categories: []CategoryConfig{
+				{Name: "bad", Rules: []CategoryRule{
+					{OnPodError: &errormatch.RegexMatcher{Pattern: "[invalid"}},
+				}},
+			}},
+			errContains: "invalid onPodError regex",
 		},
 		"empty rules": {
 			config:      ErrorCategoriesConfig{Categories: []CategoryConfig{{Name: "empty", Rules: nil}}},

diff --git a/internal/executor/categorizer/doc.go b/internal/executor/categorizer/doc.go
@@ -11,9 +11,19 @@
 // category name and the rule's optional subcategory.
 //
 // Each rule uses exactly one matcher:
-//   - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded, AppError)
+//   - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded)
 //   - OnExitCodes: matches non-zero container exit codes using In/NotIn set operators
 //   - OnTerminationMessage: matches container termination messages against a regex
+//   - OnPodError: matches a pod-level error message captured by the executor
+//     against a regex; covers failures with no useful container terminationMessage
+//     (image pull, missing volume, stuck terminating, deadline exceeded, etc.)
+//
+// Container-level matchers honor ContainerName scoping when set. OnPodError
+// ignores it because pod-level error text has no container attribution.
+//
+// Each rule may also set Hint, an optional user-facing string that the executor
+// appends to the failure message. Hints land in lookoutdb.job_run.error and
+// are surfaced to users in Lookout alongside the raw runtime error.
 //
 // Exit code 0 is always skipped. Both regular and init containers are checked.
 //
@@ -28,8 +38,13 @@
 //	        rules:
 //	          - onConditions: ["OOMKilled"]
 //	            subcategory: "oom"
+//	            hint: "Increase the memory request in your job spec"
 //	          - onConditions: ["Evicted"]
 //	            subcategory: "eviction"
+//	          - onPodError:
+//	              pattern: "no match for platform in manifest"
+//	            subcategory: "platform_mismatch"
+//	            hint: "Build the image for the cluster's CPU architecture (typically x64/arm64 mismatch)"
 //	      - name: user_code
 //	        rules:
 //	          - onExitCodes:
@@ -52,5 +67,11 @@
 //	if err != nil {
 //	    // handle invalid config
 //	}
-//	result := classifier.Classify(pod) // result.Category = "infrastructure", result.Subcategory = "oom"
+//
+//	// Terminated pod: container state carries the relevant termination signals.
+//	result := classifier.ClassifyContainerError(pod)
+//
+//	// Pre-terminal failure: an executor-captured error message is matched
+//	// against onPodError rules in addition to pod state.
+//	result = classifier.ClassifyPodError(pod, podErrorMessage)
 package categorizer
diff --git a/internal/executor/categorizer/types.go b/internal/executor/categorizer/types.go
@@ -23,12 +23,27 @@ type CategoryConfig struct {
 
 // CategoryRule defines a single matching condition. Exactly one matcher must
 // be set per rule (validated by NewClassifier). Rules within a category are OR'd.
-// When ContainerName is set, only failures from that container are considered.
-// When empty, failures from any container can match (default).
+//
+// Container-level matchers (OnConditions, OnExitCodes, OnTerminationMessage)
+// inspect per-container state from pod.Status; ContainerName scopes them to a
+// specific container when set, otherwise any container can match.
+//
+// OnPodError is pod-level: it matches a regex against the failure message the
+// executor captured for the issue. Use it for failures where no container has
+// a useful terminationMessage, including kubelet/runtime errors (image pull,
+// missing volume, missing config) and Armada-detected conditions (stuck
+// terminating, active deadline exceeded, externally deleted). ContainerName
+// is ignored for OnPodError because the message has no container attribution.
 type CategoryRule struct {
 	ContainerName        string                      `yaml:"containerName,omitempty"`
 	OnExitCodes          *errormatch.ExitCodeMatcher `yaml:"onExitCodes,omitempty"`
 	OnTerminationMessage *errormatch.RegexMatcher    `yaml:"onTerminationMessage,omitempty"`
+	OnPodError           *errormatch.RegexMatcher    `yaml:"onPodError,omitempty"`
 	OnConditions         []string                    `yaml:"onConditions,omitempty"`
 	Subcategory          string                      `yaml:"subcategory,omitempty"`
+	// Hint is operator-supplied user-facing copy describing this failure mode.
+	// When set, it is appended to the failure message that lands in
+	// lookoutdb.job_run.error so end users see actionable guidance alongside the
+	// raw runtime error. Optional; empty means no hint is added.
+	Hint string `yaml:"hint,omitempty"`
 }