From bed4ffb19b2f4f68b16a0ef4e3f24506a4ed2cac Mon Sep 17 00:00:00 2001 From: Yasmine Hines Date: Tue, 28 Apr 2026 14:34:35 -0500 Subject: [PATCH 1/7] chore: update the active button for the Job and Job Sets tab Signed-off-by: Yasmine Hines --- internal/lookoutui/src/app/NavBar.css | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/internal/lookoutui/src/app/NavBar.css b/internal/lookoutui/src/app/NavBar.css index 573a7d3c548..97ddb066c37 100644 --- a/internal/lookoutui/src/app/NavBar.css +++ b/internal/lookoutui/src/app/NavBar.css @@ -21,6 +21,15 @@ margin-bottom: 0; } +.toolbar a.MuiButton-root { + transition: background-color 0.15s ease; +} + +.toolbar a.MuiButton-root.active { + background-color: rgba(255, 255, 255, 0.32); + font-weight: bold; +} + .toolbar .nav-end { display: flex; flex-direction: row; From 78dd1932ec2db4197093a3060af1800f9ccce748 Mon Sep 17 00:00:00 2001 From: Yasmine Hines Date: Tue, 28 Apr 2026 15:43:30 -0500 Subject: [PATCH 2/7] chore: update the background active state to be a bit darker Signed-off-by: Yasmine Hines yasmine.hines@nmc2.ai Signed-off-by: Yasmine Hines --- internal/lookoutui/src/app/NavBar.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/lookoutui/src/app/NavBar.css b/internal/lookoutui/src/app/NavBar.css index 97ddb066c37..00c5537a6d1 100644 --- a/internal/lookoutui/src/app/NavBar.css +++ b/internal/lookoutui/src/app/NavBar.css @@ -26,7 +26,7 @@ } .toolbar a.MuiButton-root.active { - background-color: rgba(255, 255, 255, 0.32); + background-color: rgba(255, 255, 255, 0.33); font-weight: bold; } From fd7980837de208a04b71f8d00c94d7a7159c0c58 Mon Sep 17 00:00:00 2001 From: Dejan Zele Pejchev Date: Wed, 29 Apr 2026 15:33:53 +0200 Subject: [PATCH 3/7] Drop failure_info column from job_run table (#4855) Drops the failure_info jsonb column from job_run. Nothing writes or reads it after #4843 and #4853, and the column was never populated in production outside the opt-in flag path anyway. 
Also drops the unused FailureInfo field from the queryapi sqlc model. Only merge after #4843 and #4853 have been deployed long enough that we are sure no consumer still depends on the column. Signed-off-by: Dejan Zele Pejchev Signed-off-by: Yasmine Hines --- .../migrations/033_drop_failure_info_from_job_run.sql | 1 + internal/lookouthc/schema/migrations/001_initial_schema.sql | 1 - internal/server/queryapi/database/models.go | 1 - internal/server/queryapi/database/query.sql.go | 6 ++---- 4 files changed, 3 insertions(+), 6 deletions(-) create mode 100644 internal/lookout/schema/migrations/033_drop_failure_info_from_job_run.sql diff --git a/internal/lookout/schema/migrations/033_drop_failure_info_from_job_run.sql b/internal/lookout/schema/migrations/033_drop_failure_info_from_job_run.sql new file mode 100644 index 00000000000..0fd4d21e278 --- /dev/null +++ b/internal/lookout/schema/migrations/033_drop_failure_info_from_job_run.sql @@ -0,0 +1 @@ +ALTER TABLE job_run DROP COLUMN IF EXISTS failure_info; diff --git a/internal/lookouthc/schema/migrations/001_initial_schema.sql b/internal/lookouthc/schema/migrations/001_initial_schema.sql index 3a5aaafebf3..ba848ba2d73 100644 --- a/internal/lookouthc/schema/migrations/001_initial_schema.sql +++ b/internal/lookouthc/schema/migrations/001_initial_schema.sql @@ -117,7 +117,6 @@ CREATE TABLE job_run ( debug bytea NULL, pool text NULL, ingress_addresses jsonb NULL, - failure_info jsonb NULL, failure_category varchar(63) NULL, failure_subcategory varchar(63) NULL ) WITH (fillfactor = 70); diff --git a/internal/server/queryapi/database/models.go b/internal/server/queryapi/database/models.go index 52f6a64edaf..9aca2755812 100644 --- a/internal/server/queryapi/database/models.go +++ b/internal/server/queryapi/database/models.go @@ -60,7 +60,6 @@ type JobRun struct { Debug []byte `db:"debug"` Pool *string `db:"pool"` IngressAddresses []byte `db:"ingress_addresses"` - FailureInfo []byte `db:"failure_info"` FailureCategory *string 
`db:"failure_category"` FailureSubcategory *string `db:"failure_subcategory"` } diff --git a/internal/server/queryapi/database/query.sql.go b/internal/server/queryapi/database/query.sql.go index aec6fce4c8b..fd8d26da00e 100644 --- a/internal/server/queryapi/database/query.sql.go +++ b/internal/server/queryapi/database/query.sql.go @@ -126,7 +126,7 @@ func (q *Queries) GetJobErrorsByJobIds(ctx context.Context, jobIds []string) ([] } const getJobRunsByJobIds = `-- name: GetJobRunsByJobIds :many -SELECT run_id, job_id, cluster, node, pending, started, finished, job_run_state, error, exit_code, leased, debug, pool, ingress_addresses, failure_info, failure_category, failure_subcategory FROM job_run WHERE job_id = ANY($1::text[]) order by leased desc +SELECT run_id, job_id, cluster, node, pending, started, finished, job_run_state, error, exit_code, leased, debug, pool, ingress_addresses, failure_category, failure_subcategory FROM job_run WHERE job_id = ANY($1::text[]) order by leased desc ` func (q *Queries) GetJobRunsByJobIds(ctx context.Context, jobIds []string) ([]JobRun, error) { @@ -153,7 +153,6 @@ func (q *Queries) GetJobRunsByJobIds(ctx context.Context, jobIds []string) ([]Jo &i.Debug, &i.Pool, &i.IngressAddresses, - &i.FailureInfo, &i.FailureCategory, &i.FailureSubcategory, ); err != nil { @@ -168,7 +167,7 @@ func (q *Queries) GetJobRunsByJobIds(ctx context.Context, jobIds []string) ([]Jo } const getJobRunsByRunIds = `-- name: GetJobRunsByRunIds :many -SELECT run_id, job_id, cluster, node, pending, started, finished, job_run_state, error, exit_code, leased, debug, pool, ingress_addresses, failure_info, failure_category, failure_subcategory FROM job_run WHERE run_id = ANY($1::text[]) +SELECT run_id, job_id, cluster, node, pending, started, finished, job_run_state, error, exit_code, leased, debug, pool, ingress_addresses, failure_category, failure_subcategory FROM job_run WHERE run_id = ANY($1::text[]) ` func (q *Queries) GetJobRunsByRunIds(ctx context.Context, 
runIds []string) ([]JobRun, error) { @@ -195,7 +194,6 @@ func (q *Queries) GetJobRunsByRunIds(ctx context.Context, runIds []string) ([]Jo &i.Debug, &i.Pool, &i.IngressAddresses, - &i.FailureInfo, &i.FailureCategory, &i.FailureSubcategory, ); err != nil { From 5b5c745994f62b436ca55de893b1e97da60870e8 Mon Sep 17 00:00:00 2001 From: dslear <48934402+dslear@users.noreply.github.com> Date: Wed, 29 Apr 2026 10:34:08 -0500 Subject: [PATCH 4/7] Add hot cold config flag and lookout pruner optimizations (#4885) #### What type of PR is this? #### What this PR does / why we need it Updating Lookout to include a hot/cold flag for utilizing the hot/cold partitioned jobs database, as well as updating the lookout pruner to only prune jobs from the `job_terminated` table when hot/cold is in use #### Which issue(s) this PR fixes Fixes # #### Special notes for your reviewer --------- Signed-off-by: David Slear Signed-off-by: Yasmine Hines --- cmd/lookout/main.go | 16 ++- internal/lookout/configuration/types.go | 2 + internal/lookout/pruner/pruner.go | 25 +++-- internal/lookout/pruner/pruner_test.go | 123 ++++++++++++++++++++++-- 4 files changed, 150 insertions(+), 16 deletions(-) diff --git a/cmd/lookout/main.go b/cmd/lookout/main.go index 24ed5b7c0cb..094a988e080 100644 --- a/cmd/lookout/main.go +++ b/cmd/lookout/main.go @@ -18,7 +18,8 @@ import ( "github.com/armadaproject/armada/internal/lookout/configuration" "github.com/armadaproject/armada/internal/lookout/gen/restapi" "github.com/armadaproject/armada/internal/lookout/pruner" - "github.com/armadaproject/armada/internal/lookout/schema" + lookoutschema "github.com/armadaproject/armada/internal/lookout/schema" + lookouthcschema "github.com/armadaproject/armada/internal/lookouthc/schema" armada_config "github.com/armadaproject/armada/internal/server/configuration" ) @@ -61,12 +62,19 @@ func makeContext() (*armadacontext.Context, func()) { } func migrate(ctx *armadacontext.Context, config configuration.LookoutConfig) { + var err 
error + var migrations []database.Migration + db, err := database.OpenPgxPool(config.Postgres) if err != nil { panic(err) } - migrations, err := schema.LookoutMigrations() + if config.ExperimentalHotColdSplit { + migrations, err = lookouthcschema.LookoutHCMigrations() + } else { + migrations, err = lookoutschema.LookoutMigrations() + } if err != nil { panic(err) } @@ -110,7 +118,9 @@ func prune(ctx *armadacontext.Context, config configuration.LookoutConfig) { config.PrunerConfig.ExpireAfter, config.PrunerConfig.DeduplicationExpireAfter, config.PrunerConfig.BatchSize, - clock.RealClock{}) + clock.RealClock{}, + config.ExperimentalHotColdSplit, + ) if err != nil { panic(err) } diff --git a/internal/lookout/configuration/types.go b/internal/lookout/configuration/types.go index 0e7b65feb45..0908f29ec63 100644 --- a/internal/lookout/configuration/types.go +++ b/internal/lookout/configuration/types.go @@ -22,6 +22,8 @@ type LookoutConfig struct { PrunerConfig PrunerConfig + ExperimentalHotColdSplit bool + UIConfig } diff --git a/internal/lookout/pruner/pruner.go b/internal/lookout/pruner/pruner.go index 425185a3e33..e8b3d41d9fc 100644 --- a/internal/lookout/pruner/pruner.go +++ b/internal/lookout/pruner/pruner.go @@ -1,6 +1,7 @@ package pruner import ( + "fmt" "time" "github.com/hashicorp/go-multierror" @@ -19,10 +20,11 @@ func PruneDb( deduplicationLifetime time.Duration, batchLimit int, clock clock.Clock, + hotColdSplit bool, ) error { var result *multierror.Error - if err := deleteJobs(ctx, db, jobLifetime, batchLimit, clock); err != nil { + if err := deleteJobs(ctx, db, jobLifetime, batchLimit, clock, hotColdSplit); err != nil { result = multierror.Append(result, err) } @@ -44,10 +46,10 @@ func deleteDeduplications(ctx *armadacontext.Context, db *pgx.Conn, deduplicatio return nil } -func deleteJobs(ctx *armadacontext.Context, db *pgx.Conn, jobLifetime time.Duration, batchLimit int, clock clock.Clock) error { +func deleteJobs(ctx *armadacontext.Context, db *pgx.Conn, 
jobLifetime time.Duration, batchLimit int, clock clock.Clock, hotColdSplit bool) error { now := clock.Now() cutOffTime := now.Add(-jobLifetime) - totalJobsToDelete, err := createJobIdsToDeleteTempTable(ctx, db, cutOffTime) + totalJobsToDelete, err := createJobIdsToDeleteTempTable(ctx, db, cutOffTime, hotColdSplit) if err != nil { return errors.WithStack(err) } @@ -94,10 +96,16 @@ func deleteJobs(ctx *armadacontext.Context, db *pgx.Conn, jobLifetime time.Durat } // Returns total number of jobs to delete -func createJobIdsToDeleteTempTable(ctx *armadacontext.Context, db *pgx.Conn, cutOffTime time.Time) (int, error) { - _, err := db.Exec(ctx, ` +func createJobIdsToDeleteTempTable(ctx *armadacontext.Context, db *pgx.Conn, cutOffTime time.Time, hotColdSplit bool) (int, error) { + table := "job" + if hotColdSplit { + table = "job_terminated" + } + + // Using interpolation for table name as parameterized queries do not allow it. The table name is controlled by us and not user input, so this is safe. 
+ query := fmt.Sprintf(` CREATE TEMP TABLE job_ids_to_delete AS ( - SELECT job_id FROM job + SELECT job_id FROM %s WHERE last_transition_time < $1 AND state in ( 4, -- Succeeded @@ -106,7 +114,9 @@ func createJobIdsToDeleteTempTable(ctx *armadacontext.Context, db *pgx.Conn, cut 7, -- Preempted 9 -- Rejected ) - )`, cutOffTime) + )`, table) + + _, err := db.Exec(ctx, query, cutOffTime) if err != nil { return -1, errors.WithStack(err) } @@ -135,6 +145,7 @@ func deleteBatch(ctx *armadacontext.Context, tx pgx.Tx, batchLimit int) (int, er DELETE FROM job WHERE job_id in (SELECT job_id from batch); DELETE FROM job_spec WHERE job_id in (SELECT job_id from batch); DELETE FROM job_run WHERE job_id in (SELECT job_id from batch); + DELETE FROM job_error WHERE job_id in (SELECT job_id from batch); DELETE FROM job_ids_to_delete WHERE job_id in (SELECT job_id from batch); TRUNCATE TABLE batch;`) if err != nil { diff --git a/internal/lookout/pruner/pruner_test.go b/internal/lookout/pruner/pruner_test.go index 474e63239fe..f82f2174a2d 100644 --- a/internal/lookout/pruner/pruner_test.go +++ b/internal/lookout/pruner/pruner_test.go @@ -31,10 +31,13 @@ type testJob struct { func TestPruneDb(t *testing.T) { type testCase struct { - testName string - expireAfter time.Duration - jobs []testJob - jobIdsLeft []string + testName string + expireAfter time.Duration + jobs []testJob + jobIdsLeft []string + activeJobIdsLeft []string // HC only: when non-nil, assert job_active contains exactly these + terminatedJobIdsLeft []string // HC only: when non-nil, assert job_terminated contains exactly these + jobErrorIdsLeft []string // when non-nil, assert job_error contains exactly these } nIds := 100 @@ -129,6 +132,79 @@ func TestPruneDb(t *testing.T) { ), jobIdsLeft: sampleJobIds[50:], }, + { + testName: "delete from job_terminated when hot cold split enabled", + expireAfter: 10 * time.Hour, + jobs: []testJob{ + { + jobId: sampleJobIds[0], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + 
state: lookout.JobSucceeded, + }, + }, + jobIdsLeft: []string{}, + terminatedJobIdsLeft: []string{}, + }, + { + testName: "active jobs in job_active are never touched by the pruner", + expireAfter: 10 * time.Hour, + jobs: []testJob{ + { + jobId: sampleJobIds[0], + ts: baseTime.Add(-11 * time.Hour), + state: lookout.JobRunning, // active, old — must NOT be pruned + }, + { + jobId: sampleJobIds[1], + ts: baseTime.Add(-11 * time.Hour), + state: lookout.JobSucceeded, // terminal, old — must be pruned + }, + }, + jobIdsLeft: []string{sampleJobIds[0]}, + activeJobIdsLeft: []string{sampleJobIds[0]}, + terminatedJobIdsLeft: []string{}, + }, + { + testName: "terminal jobs in job_terminated older than the cutoff are deleted", + expireAfter: 10 * time.Hour, + jobs: []testJob{ + { + jobId: sampleJobIds[0], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + state: lookout.JobSucceeded, + }, + { + jobId: sampleJobIds[1], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + state: lookout.JobFailed, + }, + { + jobId: sampleJobIds[2], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + state: lookout.JobCancelled, + }, + { + jobId: sampleJobIds[3], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + state: lookout.JobPreempted, + }, + }, + jobIdsLeft: []string{}, + terminatedJobIdsLeft: []string{}, + }, + { + testName: "related job_run, job_spec, job_error rows are also deleted", + expireAfter: 10 * time.Hour, + jobs: []testJob{ + { + jobId: sampleJobIds[0], + ts: baseTime.Add(-(10*time.Hour + 1*time.Minute)), + state: lookout.JobRejected, + }, + }, + jobIdsLeft: []string{}, + jobErrorIdsLeft: []string{}, + }, } for _, tc := range testCases { @@ -145,7 +221,8 @@ func TestPruneDb(t *testing.T) { dbConn, err := db.Acquire(ctx) assert.NoError(t, err) - err = PruneDb(ctx, dbConn.Conn(), tc.expireAfter, 0, 10, clock.NewFakeClock(baseTime)) + isHC := isHotColdSchema(ctx, db) + err = PruneDb(ctx, dbConn.Conn(), tc.expireAfter, 0, 10, clock.NewFakeClock(baseTime), isHC) 
assert.NoError(t, err) queriedJobIdsPerTable := []map[string]bool{ @@ -160,6 +237,19 @@ func TestPruneDb(t *testing.T) { assert.True(t, ok) } } + + if isHC { + if tc.activeJobIdsLeft != nil { + assertJobIds(t, db, "SELECT job_id FROM job_active", tc.activeJobIdsLeft) + } + if tc.terminatedJobIdsLeft != nil { + assertJobIds(t, db, "SELECT job_id FROM job_terminated", tc.terminatedJobIdsLeft) + } + } + if tc.jobErrorIdsLeft != nil { + assertJobIds(t, db, "SELECT job_id FROM job_error", tc.jobErrorIdsLeft) + } + return nil }) assert.NoError(t, err) @@ -202,7 +292,8 @@ func storeJob(job testJob, db *lookoutdb.LookoutDb, converter *instructions.Inst Build() case lookout.JobRejected: simulator. - Rejected("invalid", job.ts) + Rejected("invalid", job.ts). + Build() case lookout.JobRunning: simulator. Build() @@ -211,6 +302,26 @@ func storeJob(job testJob, db *lookoutdb.LookoutDb, converter *instructions.Inst } } +func isHotColdSchema(ctx *armadacontext.Context, db *pgxpool.Pool) bool { + var exists bool + err := db.QueryRow(ctx, ` + SELECT EXISTS ( + SELECT 1 FROM information_schema.tables + WHERE table_name = 'job_active' + )`).Scan(&exists) + return err == nil && exists +} + +func assertJobIds(t *testing.T, db *pgxpool.Pool, query string, expected []string) { + t.Helper() + got := selectStringSet(t, db, query) + assert.Equal(t, len(expected), len(got)) + for _, id := range expected { + _, ok := got[id] + assert.True(t, ok) + } +} + func selectStringSet(t *testing.T, db *pgxpool.Pool, query string) map[string]bool { t.Helper() rows, err := db.Query(armadacontext.TODO(), query) From bafe4465146a54c5e8d0460c2173b371b297bc41 Mon Sep 17 00:00:00 2001 From: Ian Hockett <32877705+ianhockett@users.noreply.github.com> Date: Fri, 1 May 2026 10:08:03 -0500 Subject: [PATCH 5/7] Add lookoutingestor metrics to track state changes (#4894) #### What type of PR is this? 
feature / observability #### What this PR does / why we need it Adds a Prometheus counter metric to the lookout ingester to track job state updates processed by UpdateJobs(). The metric (`lookout_ingester_job_state_updates_total`) is labeled by state, allowing operators to observe transition rates and specifically monitor terminal state updates, which trigger cross-partition row movement in the Lookout database. Also fixes error categorization schema in `_local/executor` configs that was broken by a prior change. #### Special notes for your reviewer The `terminal_state_updates_total` counter was initially added as a standalone metric but was removed in favor of deriving it via PromQL from the per-state counter (sum by state where state is terminal). --------- Signed-off-by: Ian Hockett Signed-off-by: Yasmine Hines --- _local/executor/config-auth.yaml | 12 +++++++ _local/executor/config.yaml | 19 ++++++----- .../lookoutingester/lookoutdb/insertion.go | 33 +++++++++++++++++++ .../lookoutdb/insertion_test.go | 24 ++++++++++++++ internal/lookoutingester/metrics/metrics.go | 14 ++++++++ .../lookoutingester/metrics/metrics_test.go | 22 +++++++++++++ 6 files changed, 116 insertions(+), 8 deletions(-) create mode 100644 internal/lookoutingester/metrics/metrics_test.go diff --git a/_local/executor/config-auth.yaml b/_local/executor/config-auth.yaml index 59026fe10e3..de2f10a1d38 100644 --- a/_local/executor/config-auth.yaml +++ b/_local/executor/config-auth.yaml @@ -12,3 +12,15 @@ metric: application: clusterId: "local-cluster" pool: "default" + enableJobErrorCategorization: true + errorCategories: + defaultCategory: uncategorized + categories: + - name: oom + rules: + - onConditions: ["OOMKilled"] + - name: user_error + rules: + - onExitCodes: + operator: In + values: [1, 2, 126, 127] diff --git a/_local/executor/config.yaml b/_local/executor/config.yaml index 2270916fdf6..3f2f3da3702 100644 --- a/_local/executor/config.yaml +++ b/_local/executor/config.yaml @@ -7,12 +7,15 
@@ metric: application: clusterId: "local-cluster" pool: "default" + enableJobErrorCategorization: true errorCategories: - - name: oom - rules: - - onConditions: ["OOMKilled"] - - name: user_error - rules: - - onExitCodes: - operator: In - values: [1, 2, 126, 127] + defaultCategory: uncategorized + categories: + - name: oom + rules: + - onConditions: ["OOMKilled"] + - name: user_error + rules: + - onExitCodes: + operator: In + values: [1, 2, 126, 127] diff --git a/internal/lookoutingester/lookoutdb/insertion.go b/internal/lookoutingester/lookoutdb/insertion.go index 6f92e79aef1..03c0b5779dc 100644 --- a/internal/lookoutingester/lookoutdb/insertion.go +++ b/internal/lookoutingester/lookoutdb/insertion.go @@ -141,6 +141,7 @@ func (l *LookoutDb) UpdateJobs(ctx *armadacontext.Context, instructions []*model taken := time.Since(start) l.metrics.RecordAvRowChangeTimeByOperation("job", commonmetrics.DBOperationUpdate, len(instructions), taken) l.metrics.RecordRowsChange("job", commonmetrics.DBOperationUpdate, len(instructions)) + l.recordStateUpdates(instructions) log.Infof("Updated %d jobs in %s", len(instructions), taken) } @@ -176,6 +177,38 @@ func (l *LookoutDb) UpdateJobRuns(ctx *armadacontext.Context, instructions []*mo log.Infof("Updated %d job runs in %s", len(instructions), taken) } +func (l *LookoutDb) recordStateUpdates(instructions []*model.UpdateJobInstruction) { + counts := make(map[string]int) + for _, instruction := range instructions { + if instruction.State == nil { + continue + } + switch *instruction.State { + case lookout.JobQueuedOrdinal: + counts["queued"]++ + case lookout.JobPendingOrdinal: + counts["pending"]++ + case lookout.JobRunningOrdinal: + counts["running"]++ + case lookout.JobLeasedOrdinal: + counts["leased"]++ + case lookout.JobSucceededOrdinal: + counts["succeeded"]++ + case lookout.JobFailedOrdinal: + counts["failed"]++ + case lookout.JobCancelledOrdinal: + counts["cancelled"]++ + case lookout.JobPreemptedOrdinal: + counts["preempted"]++ 
+ case lookout.JobRejectedOrdinal: + counts["rejected"]++ + } + } + for state, count := range counts { + l.metrics.RecordStateUpdates(state, count) + } +} + func (l *LookoutDb) CreateJobErrors(ctx *armadacontext.Context, instructions []*model.CreateJobErrorInstruction) { if len(instructions) == 0 { return diff --git a/internal/lookoutingester/lookoutdb/insertion_test.go b/internal/lookoutingester/lookoutdb/insertion_test.go index 2bdcf3cf771..0f53b5dfffe 100644 --- a/internal/lookoutingester/lookoutdb/insertion_test.go +++ b/internal/lookoutingester/lookoutdb/insertion_test.go @@ -939,6 +939,30 @@ func TestStoreEventsForAlreadyTerminalJobs(t *testing.T) { assert.NoError(t, err) } +func TestRecordTerminalStateUpdates(t *testing.T) { + ldb := NewLookoutDb(nil, fatalErrors, m, 10, 10) + + instructions := []*model.UpdateJobInstruction{ + {JobId: "job1", State: pointer.Int32(lookout.JobSucceededOrdinal)}, + {JobId: "job2", State: pointer.Int32(lookout.JobFailedOrdinal)}, + {JobId: "job3", State: pointer.Int32(lookout.JobCancelledOrdinal)}, + {JobId: "job4", State: pointer.Int32(lookout.JobRunningOrdinal)}, + {JobId: "job5", State: pointer.Int32(lookout.JobPreemptedOrdinal)}, + {JobId: "job6", State: pointer.Int32(lookout.JobRejectedOrdinal)}, + {JobId: "job7"}, + } + + // Should not panic; counts 6 states (job7=nil state is skipped) + ldb.recordStateUpdates(instructions) +} + +func TestRecordTerminalStateUpdates_Empty(t *testing.T) { + ldb := NewLookoutDb(nil, fatalErrors, m, 10, 10) + // Neither nil nor empty should panic + ldb.recordStateUpdates(nil) + ldb.recordStateUpdates([]*model.UpdateJobInstruction{}) +} + func makeCreateJobInstruction(jobId string) *model.CreateJobInstruction { return &model.CreateJobInstruction{ JobId: jobId, diff --git a/internal/lookoutingester/metrics/metrics.go b/internal/lookoutingester/metrics/metrics.go index 774c0c0dd07..aa39e155e28 100644 --- a/internal/lookoutingester/metrics/metrics.go +++ 
b/internal/lookoutingester/metrics/metrics.go @@ -35,6 +35,14 @@ var rowsChangedCounter = promauto.NewCounterVec( []string{"table", "operation"}, ) +var stateUpdatesCounter = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: metrics.ArmadaLookoutIngesterMetricsPrefix + "state_updates", + Help: "Number of job state updates, labelled by state", + }, + []string{"state"}, +) + type Metrics struct { *metrics.Metrics } @@ -64,3 +72,9 @@ func (m *Metrics) RecordRowsChange(table string, operation metrics.DBOperation, With(map[string]string{"table": table, "operation": string(operation)}). Add(float64(numRows)) } + +func (m *Metrics) RecordStateUpdates(state string, count int) { + stateUpdatesCounter. + With(map[string]string{"state": state}). + Add(float64(count)) +} diff --git a/internal/lookoutingester/metrics/metrics_test.go b/internal/lookoutingester/metrics/metrics_test.go new file mode 100644 index 00000000000..4372defc9c2 --- /dev/null +++ b/internal/lookoutingester/metrics/metrics_test.go @@ -0,0 +1,22 @@ +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/stretchr/testify/assert" +) + +func TestRecordStateUpdates(t *testing.T) { + m := Get() + + states := []string{"queued", "pending", "running", "leased", "succeeded", "failed", "cancelled", "preempted", "rejected"} + for _, state := range states { + t.Run(state, func(t *testing.T) { + before := testutil.ToFloat64(stateUpdatesCounter.WithLabelValues(state)) + m.RecordStateUpdates(state, 3) + after := testutil.ToFloat64(stateUpdatesCounter.WithLabelValues(state)) + assert.Equal(t, float64(3), after-before) + }) + } +} From 60bc2e18c44817d2abf5d3fcc903fd180c9efe6e Mon Sep 17 00:00:00 2001 From: Yasmine Hines Date: Mon, 4 May 2026 12:40:01 -0500 Subject: [PATCH 6/7] chore: change from hardcoded color Signed-off-by: Yasmine Hines --- internal/lookoutui/src/app/NavBar.css | 2 +- internal/lookoutui/src/app/NavBar.tsx | 3 ++- 2 files changed, 3 
insertions(+), 2 deletions(-) diff --git a/internal/lookoutui/src/app/NavBar.css b/internal/lookoutui/src/app/NavBar.css index 00c5537a6d1..a04b8ad2bc1 100644 --- a/internal/lookoutui/src/app/NavBar.css +++ b/internal/lookoutui/src/app/NavBar.css @@ -26,7 +26,7 @@ } .toolbar a.MuiButton-root.active { - background-color: rgba(255, 255, 255, 0.33); + background-color: var(--nav-active-bg); font-weight: bold; } diff --git a/internal/lookoutui/src/app/NavBar.tsx b/internal/lookoutui/src/app/NavBar.tsx index b80a94d6f4f..cc21b8e7675 100644 --- a/internal/lookoutui/src/app/NavBar.tsx +++ b/internal/lookoutui/src/app/NavBar.tsx @@ -1,7 +1,7 @@ import { forwardRef } from "react" import { Settings } from "@mui/icons-material" -import { AppBar, Button, IconButton, Stack, styled, Toolbar, Typography } from "@mui/material" +import { alpha, AppBar, Button, IconButton, Stack, styled, Toolbar, Typography } from "@mui/material" import { Link, NavLink, NavLinkProps } from "react-router-dom" import { SPACING } from "../common/spacing" @@ -18,6 +18,7 @@ const StyledAppBar = styled(AppBar)(({ theme }) => ({ backgroundColor: theme.palette.appBar.main, backgroundImage: "unset", color: theme.palette.appBar.contrastText, + "--nav-active-bg": alpha(theme.palette.appBar.contrastText, 0.33), })) interface Page { From f70ad1acab76266fcbc37ab89d4b5b56df62921d Mon Sep 17 00:00:00 2001 From: Yasmine Hines Date: Thu, 7 May 2026 10:36:46 -0500 Subject: [PATCH 7/7] chore: update based on comments Signed-off-by: Yasmine Hines --- internal/lookoutui/src/app/NavBar.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/lookoutui/src/app/NavBar.tsx b/internal/lookoutui/src/app/NavBar.tsx index cc21b8e7675..72cdfdbf0e3 100644 --- a/internal/lookoutui/src/app/NavBar.tsx +++ b/internal/lookoutui/src/app/NavBar.tsx @@ -24,12 +24,14 @@ const StyledAppBar = styled(AppBar)(({ theme }) => ({ interface Page { title: string location: string + end?: boolean } const PAGES: Page[] = [ { 
title: "Jobs", location: JOBS, + end: true, }, { title: "Job Sets", @@ -60,7 +62,7 @@ export const NavBar = ({ customTitle }: NavBarProps) => { - {PAGES.map(({ location, title }) => ( + {PAGES.map(({ location, title, end }) => (