Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 65 additions & 15 deletions internal/executor/categorizer/classifier.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,29 @@ type rule struct {
containerName string
onExitCodes *errormatch.ExitCodeMatcher
onTerminationMessage *regexp.Regexp
onPodError *regexp.Regexp
onConditions []string
subcategory string
hint string
}

// ClassifyResult holds the classification output for a failed pod.
type ClassifyResult struct {
Category string
Subcategory string
// Hint is operator-supplied user-facing copy attached to the matching rule.
// Use AppendHint to attach it to the failure message before emitting events.
Hint string
}

// AppendHint returns the message with this result's hint appended after a blank
// line. Returns the message unchanged when no hint is set. Centralizing the
// format here keeps both event-reporting call sites consistent.
func (r ClassifyResult) AppendHint(message string) string {
if r.Hint == "" {
return message
}
return fmt.Sprintf("%s\n\n%s", message, r.Hint)
}

// Classifier evaluates pods against a set of category rules and returns
Expand Down Expand Up @@ -99,11 +114,14 @@ func buildRule(cfg CategoryRule) (rule, error) {
if cfg.OnTerminationMessage != nil {
matcherCount++
}
if cfg.OnPodError != nil {
matcherCount++
}
if matcherCount == 0 {
return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, or onTerminationMessage")
return rule{}, fmt.Errorf("rule must specify one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
}
if matcherCount > 1 {
return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, or onTerminationMessage")
return rule{}, fmt.Errorf("rule must specify only one of onConditions, onExitCodes, onTerminationMessage, or onPodError")
}

for _, cond := range cfg.OnConditions {
Expand All @@ -126,29 +144,57 @@ func buildRule(cfg CategoryRule) (rule, error) {
}
}

var compiledRegex *regexp.Regexp
var terminationRegex *regexp.Regexp
if cfg.OnTerminationMessage != nil {
re, err := regexp.Compile(cfg.OnTerminationMessage.Pattern)
if err != nil {
return rule{}, fmt.Errorf("invalid regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
return rule{}, fmt.Errorf("invalid onTerminationMessage regex %q: %w", cfg.OnTerminationMessage.Pattern, err)
}
terminationRegex = re
}

var podErrorRegex *regexp.Regexp
if cfg.OnPodError != nil {
re, err := regexp.Compile(cfg.OnPodError.Pattern)
if err != nil {
return rule{}, fmt.Errorf("invalid onPodError regex %q: %w", cfg.OnPodError.Pattern, err)
}
compiledRegex = re
podErrorRegex = re
}

return rule{
containerName: cfg.ContainerName,
onExitCodes: cfg.OnExitCodes,
onConditions: cfg.OnConditions,
onTerminationMessage: compiledRegex,
onTerminationMessage: terminationRegex,
onPodError: podErrorRegex,
subcategory: cfg.Subcategory,
hint: cfg.Hint,
}, nil
}

// Classify returns the category and subcategory for the given pod.
// Rules are evaluated in config order; the first matching rule wins.
// ClassifyContainerError returns the category and subcategory for a pod whose
// failure is described by its own state: terminated containers, exit codes,
// and Kubernetes conditions. Use it for terminated pods (PodFailed phase).
// Returns empty result if the receiver is nil or the pod is nil.
// Returns (defaultCategory, defaultSubcategory) if no rules match.
func (c *Classifier) ClassifyContainerError(pod *v1.Pod) ClassifyResult {
return c.classify(pod, "")
}

// ClassifyPodError returns the category and subcategory for a pod-level failure
// captured by the executor (image pull, missing volume, stuck terminating,
// active deadline exceeded, etc.). It additionally matches podErrorMessage
// against onPodError rules (see CategoryRule.OnPodError); all other rule types
// are evaluated against pod state, preserving first-match-wins across config order.
// Returns empty result if the receiver is nil or the pod is nil.
// Returns (defaultCategory, defaultSubcategory) if no rules match.
func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {
func (c *Classifier) ClassifyPodError(pod *v1.Pod, podErrorMessage string) ClassifyResult {
return c.classify(pod, podErrorMessage)
}

// Rules are evaluated in config order; the first matching rule wins.
func (c *Classifier) classify(pod *v1.Pod, podErrorMessage string) ClassifyResult {
if c == nil || pod == nil {
return ClassifyResult{}
}
Expand All @@ -157,18 +203,19 @@ func (c *Classifier) Classify(pod *v1.Pod) ClassifyResult {

for _, cat := range c.categories {
for _, r := range cat.rules {
if ruleMatches(r, containers, podReason) {
return ClassifyResult{Category: cat.name, Subcategory: r.subcategory}
if ruleMatches(r, containers, podReason, podErrorMessage) {
return ClassifyResult{Category: cat.name, Subcategory: r.subcategory, Hint: r.hint}
}
}
}
return ClassifyResult{Category: c.defaultCategory, Subcategory: c.defaultSubcategory}
}

// ruleMatches evaluates a single rule. When containerName is set, only that
// container is considered. It checks the first non-nil matcher:
// conditions > exit codes > termination message.
func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
// ruleMatches evaluates a single rule. Container-level matchers honor the
// rule's containerName scope (when set); onPodError ignores it because the
// pod-level error has no container attribution. Exactly one matcher is set
// per rule (validated at NewClassifier).
func ruleMatches(r rule, containers []containerInfo, podReason, podErrorMessage string) bool {
filtered := containers
if r.containerName != "" {
filtered = filterByName(containers, r.containerName)
Expand All @@ -182,6 +229,9 @@ func ruleMatches(r rule, containers []containerInfo, podReason string) bool {
if r.onTerminationMessage != nil {
return matchesTerminationMessage(r.onTerminationMessage, filtered)
}
if r.onPodError != nil {
return podErrorMessage != "" && errormatch.MatchPattern(r.onPodError, podErrorMessage)
}
return false
}

Expand Down
126 changes: 123 additions & 3 deletions internal/executor/categorizer/classifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ func TestClassify(t *testing.T) {
tests := map[string]struct {
config ErrorCategoriesConfig
pod *v1.Pod
podErrorMessage string
expectedCategory string
expectedSubcategory string
}{
Expand Down Expand Up @@ -281,19 +282,130 @@ func TestClassify(t *testing.T) {
pod: podWithTerminatedContainer(1, "Error", ""),
expectedCategory: "",
},
"onPodError matches the captured kubelet error": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "infrastructure",
expectedSubcategory: "platform_mismatch",
},
"empty podErrorMessage does not match onPodError": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "anything"}, Subcategory: "x"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
podErrorMessage: "",
expectedCategory: "",
},
// Pins the public contract that ClassifyContainerError never matches onPodError rules,
// even with a pattern (".*") that would otherwise match empty input. The guard is enforced
// at two layers (ruleMatches and errormatch.MatchPattern); this test fails only if both go,
// which is the right level to assert the contract regardless of internal layering.
"ClassifyContainerError must not match onPodError even when regex matches empty": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: ".*"}, Subcategory: "should_not_fire"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{Phase: v1.PodPending}},
podErrorMessage: "",
expectedCategory: "",
},
"onPodError ignores ContainerName scope (pod-level error has no container attribution)": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{ContainerName: "init", OnPodError: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "platform_mismatch"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "infrastructure",
expectedSubcategory: "platform_mismatch",
},
"onTerminationMessage does not match pod-level podErrorMessage": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "infrastructure", Rules: []CategoryRule{
{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "no match for platform in manifest"}, Subcategory: "should_not_fire"},
}},
}},
pod: &v1.Pod{Status: v1.PodStatus{
Phase: v1.PodPending,
ContainerStatuses: []v1.ContainerStatus{
{Name: "main", State: v1.ContainerState{
Waiting: &v1.ContainerStateWaiting{Reason: "ImagePullBackOff", Message: "Back-off pulling image"},
}},
},
}},
podErrorMessage: `Failed to pull image "amd64/busybox:latest": no match for platform in manifest: not found`,
expectedCategory: "",
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
classifier, err := NewClassifier(tc.config)
require.NoError(t, err)
result := classifier.Classify(tc.pod)
var result ClassifyResult
if tc.podErrorMessage == "" {
result = classifier.ClassifyContainerError(tc.pod)
} else {
result = classifier.ClassifyPodError(tc.pod, tc.podErrorMessage)
}
assert.Equal(t, tc.expectedCategory, result.Category)
assert.Equal(t, tc.expectedSubcategory, result.Subcategory)
})
}
}

func TestClassifyResult_AppendHint(t *testing.T) {
tests := map[string]struct {
result ClassifyResult
message string
expected string
}{
"empty hint returns message unchanged": {
result: ClassifyResult{Category: "x", Subcategory: "y"},
message: "raw runtime error",
expected: "raw runtime error",
},
"non-empty hint is appended after a blank line": {
result: ClassifyResult{Hint: "operator guidance"},
message: "raw runtime error",
expected: "raw runtime error\n\noperator guidance",
},
"empty message with hint preserves separator": {
result: ClassifyResult{Hint: "guidance"},
message: "",
expected: "\n\nguidance",
},
}
for name, tc := range tests {
t.Run(name, func(t *testing.T) {
assert.Equal(t, tc.expected, tc.result.AppendHint(tc.message))
})
}
}

func TestNewClassifier_ValidationErrors(t *testing.T) {
tests := map[string]struct {
config ErrorCategoriesConfig
Expand Down Expand Up @@ -346,13 +458,21 @@ func TestNewClassifier_ValidationErrors(t *testing.T) {
}},
errContains: "requires at least one value",
},
"invalid regex": {
"invalid onTerminationMessage regex": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "bad", Rules: []CategoryRule{
{OnTerminationMessage: &errormatch.RegexMatcher{Pattern: "[invalid"}},
}},
}},
errContains: "invalid regex",
errContains: "invalid onTerminationMessage regex",
},
"invalid onPodError regex": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{
{Name: "bad", Rules: []CategoryRule{
{OnPodError: &errormatch.RegexMatcher{Pattern: "[invalid"}},
}},
}},
errContains: "invalid onPodError regex",
},
"empty rules": {
config: ErrorCategoriesConfig{Categories: []CategoryConfig{{Name: "empty", Rules: nil}}},
Expand Down
25 changes: 23 additions & 2 deletions internal/executor/categorizer/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,19 @@
// category name and the rule's optional subcategory.
//
// Each rule uses exactly one matcher:
// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded, AppError)
// - OnConditions: matches Kubernetes failure signals (OOMKilled, Evicted, DeadlineExceeded)
// - OnExitCodes: matches non-zero container exit codes using In/NotIn set operators
// - OnTerminationMessage: matches container termination messages against a regex
// - OnPodError: matches a pod-level error message captured by the executor
// against a regex; covers failures with no useful container terminationMessage
// (image pull, missing volume, stuck terminating, deadline exceeded, etc.)
//
// Container-level matchers honor ContainerName scoping when set. OnPodError
// ignores it because pod-level error text has no container attribution.
//
// Each rule may also set Hint, an optional user-facing string that the executor
// appends to the failure message. Hints land in lookoutdb.job_run.error and
// are surfaced to users in Lookout alongside the raw runtime error.
//
// Exit code 0 is always skipped. Both regular and init containers are checked.
//
Expand All @@ -28,8 +38,13 @@
// rules:
// - onConditions: ["OOMKilled"]
// subcategory: "oom"
// hint: "Increase the memory request in your job spec"
// - onConditions: ["Evicted"]
// subcategory: "eviction"
// - onPodError:
// pattern: "no match for platform in manifest"
// subcategory: "platform_mismatch"
// hint: "Build the image for the cluster's CPU architecture (typically x64/arm64 mismatch)"
// - name: user_code
// rules:
// - onExitCodes:
Expand All @@ -52,5 +67,11 @@
// if err != nil {
// // handle invalid config
// }
// result := classifier.Classify(pod) // result.Category = "infrastructure", result.Subcategory = "oom"
//
// // Terminated pod: container state carries the relevant termination signals.
// result := classifier.ClassifyContainerError(pod)
//
// // Pre-terminal failure: an executor-captured error message is matched
// // against onPodError rules in addition to pod state.
// result = classifier.ClassifyPodError(pod, podErrorMessage)
package categorizer
19 changes: 17 additions & 2 deletions internal/executor/categorizer/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,27 @@ type CategoryConfig struct {

// CategoryRule defines a single matching condition. Exactly one matcher must
// be set per rule (validated by NewClassifier). Rules within a category are OR'd.
// When ContainerName is set, only failures from that container are considered.
// When empty, failures from any container can match (default).
//
// Container-level matchers (OnConditions, OnExitCodes, OnTerminationMessage)
// inspect per-container state from pod.Status; ContainerName scopes them to a
// specific container when set, otherwise any container can match.
//
// OnPodError is pod-level: it matches a regex against the failure message the
// executor captured for the issue. Use it for failures where no container has
// a useful terminationMessage, including kubelet/runtime errors (image pull,
// missing volume, missing config) and Armada-detected conditions (stuck
// terminating, active deadline exceeded, externally deleted). ContainerName
// is ignored for OnPodError because the message has no container attribution.
type CategoryRule struct {
ContainerName string `yaml:"containerName,omitempty"`
OnExitCodes *errormatch.ExitCodeMatcher `yaml:"onExitCodes,omitempty"`
OnTerminationMessage *errormatch.RegexMatcher `yaml:"onTerminationMessage,omitempty"`
OnPodError *errormatch.RegexMatcher `yaml:"onPodError,omitempty"`
OnConditions []string `yaml:"onConditions,omitempty"`
Subcategory string `yaml:"subcategory,omitempty"`
// Hint is operator-supplied user-facing copy describing this failure mode.
// When set, it is appended to the failure message that lands in
// lookoutdb.job_run.error so end users see actionable guidance alongside the
// raw runtime error. Optional; empty means no hint is added.
Hint string `yaml:"hint,omitempty"`
}
Loading
Loading