From 57bd1cca4fa85657661177a0b7536e3b56fa32bb Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 17 Mar 2026 12:52:28 +0400 Subject: [PATCH 01/27] scale set --- .gitignore | 1 + src/go.mod | 8 +- src/go.sum | 8 + src/lib/config/config.go | 1 + src/lib/githubClient/githubClient.go | 8 + src/lib/jobQueue/changeEvent.go | 12 - src/lib/jobQueue/jobQueue.go | 107 ----- src/lib/jobQueue/jobQueue_test.go | 373 ---------------- src/lib/jobQueue/queueManager.go | 200 --------- src/lib/jobQueue/queueManager_test.go | 399 ------------------ src/lib/jobs/job.go | 32 -- src/lib/jobs/jobStatus.go | 19 - src/lib/metrics/metrics.go | 54 ++- .../repositories/iRestarterRepository.go | 12 +- .../mongodbRestarterRepository.go | 18 +- src/lib/restarter/workflowRestarter.go | 117 +++-- src/lib/scaleSetClient/scaleSetClient.go | 115 +++++ src/lib/scaleSetPoller/manager.go | 26 ++ src/lib/scaleSetPoller/poller.go | 143 +++++++ src/lib/trayManager/trayManager.go | 98 ++--- src/lib/trays/repositories/iTrayRepository.go | 2 +- .../repositories/mongodbTrayRepository.go | 6 +- src/lib/trays/tray.go | 1 + src/server/handlers/agentHandler.go | 24 +- src/server/handlers/rootHandler.go | 4 +- src/server/handlers/webhookHandler.go | 243 ----------- src/server/server.go | 77 ++-- 27 files changed, 524 insertions(+), 1584 deletions(-) delete mode 100644 src/lib/jobQueue/changeEvent.go delete mode 100644 src/lib/jobQueue/jobQueue.go delete mode 100644 src/lib/jobQueue/jobQueue_test.go delete mode 100644 src/lib/jobQueue/queueManager.go delete mode 100644 src/lib/jobQueue/queueManager_test.go delete mode 100644 src/lib/jobs/job.go delete mode 100644 src/lib/jobs/jobStatus.go create mode 100644 src/lib/scaleSetClient/scaleSetClient.go create mode 100644 src/lib/scaleSetPoller/manager.go create mode 100644 src/lib/scaleSetPoller/poller.go delete mode 100644 src/server/handlers/webhookHandler.go diff --git a/.gitignore b/.gitignore index d00a2fb..77e7742 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ bin /stuff/ .DS_Store +.claude \ No newline at end of file diff --git a/src/go.mod b/src/go.mod index 021a078..d3e7ad4 100644 --- a/src/go.mod +++ b/src/go.mod @@ -23,9 +23,10 @@ require ( cloud.google.com/go/auth v0.18.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect + github.com/actions/scaleset v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/davecgh/go-spew v1.1.1 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect @@ -36,14 +37,17 @@ require ( github.com/google/go-github/v75 v75.0.0 // indirect github.com/google/go-querystring v1.2.0 // indirect github.com/google/s2a-go v0.1.9 // indirect + github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.9 // indirect github.com/googleapis/gax-go/v2 v2.16.0 // indirect + github.com/hashicorp/go-cleanhttp v0.5.2 // indirect + github.com/hashicorp/go-retryablehttp v0.7.8 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/klauspost/compress v1.18.2 // indirect github.com/leodido/go-urn v1.4.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/procfs v0.19.2 // indirect diff --git a/src/go.sum b/src/go.sum index 48643d3..af5ec6f 100644 --- a/src/go.sum +++ b/src/go.sum @@ -8,6 +8,8 @@ cloud.google.com/go/compute v1.53.0 h1:dILGanjePNsYfZVYYv6K0d4+IPnKX1gn84Fk8jDPN cloud.google.com/go/compute v1.53.0/go.mod h1:zdogTa7daHhEtEX92+S5IARtQmi/RNVPUfoI8Jhl8Do= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= +github.com/actions/scaleset v0.2.0 h1:CKsDtTjOBCwjyT4ikwiMykMttzuKejimWRAvVr8xj9w= +github.com/actions/scaleset v0.2.0/go.mod h1:ncR5vzCCTUSyLgvclAtZ5dRBgF6qwA2nbTfTXmOJp84= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/bradleyfalzon/ghinstallation/v2 v2.17.0 h1:SmbUK/GxpAspRjSQbB6ARvH+ArzlNzTtHydNyXUQ6zg= @@ -18,6 +20,7 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6N github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -60,6 +63,10 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.9 h1:TOpi/QG8iDcZlkQlGlF github.com/googleapis/enterprise-certificate-proxy v0.3.9/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= github.com/googleapis/gax-go/v2 v2.16.0 h1:iHbQmKLLZrexmb0OSsNGTeSTS0HO4YvFOG8g5E4Zd0Y= github.com/googleapis/gax-go/v2 v2.16.0/go.mod h1:o1vfQjjNZn4+dPnRdl/4ZD7S9414Y4xA+a/6Icj6l14= +github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= +github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48= +github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/klauspost/compress v1.18.2 h1:iiPHWW0YrcFgpBYhsA6D1+fqHssJscY/Tm/y2Uqnapk= @@ -78,6 +85,7 @@ github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0 github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= diff --git a/src/lib/config/config.go b/src/lib/config/config.go index b9158fd..60780bc 100644 --- a/src/lib/config/config.go +++ b/src/lib/config/config.go @@ -146,6 +146,7 @@ type DatabaseConfig struct { type GitHubOrganization struct { Name string `yaml:"name" validate:"required"` AppId int64 `yaml:"appId" validate:"required"` + AppClientId string `yaml:"appClientId" validate:"required"` InstallationId int64 `yaml:"installationId" validate:"required"` WebhookSecret string `yaml:"webhookSecret"` PrivateKeyPath string `yaml:"privateKeyPath"` diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index 3e1f251..c3c5903 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -69,6 +69,14 @@ func (gc *GithubClient) RestartFailedJobs(repoName string, workflowId int64) err return err } +func (gc *GithubClient) GetWorkflowRunStatus(repoName string, workflowRunId int64) (string, string, error) { + wr, _, err := gc.client.Actions.GetWorkflowRunByID(context.Background(), gc.Org.Name, repoName, workflowRunId) + if err != nil { + return "", "", err + } + return wr.GetStatus(), wr.GetConclusion(), nil +} + func (gc *GithubClient) CheckJobCompleted(repoName string, jobId int64) (bool, error) { wfJob, resp, err := gc.client.Actions.GetWorkflowJobByID(context.Background(), gc.Org.Name, repoName, jobId) if err != nil { diff --git a/src/lib/jobQueue/changeEvent.go b/src/lib/jobQueue/changeEvent.go deleted file mode 100644 index c534632..0000000 --- a/src/lib/jobQueue/changeEvent.go +++ /dev/null @@ -1,12 +0,0 @@ -package jobQueue - -type changeEvent[T any] struct { - OperationType string `bson:"operationType"` - DocumentKey struct { - Id int64 `bson:"_id"` - } `bson:"documentKey"` - ns struct { - db string - } `bson:"ns"` - FullDocument T `bson:"fullDocument"` -} diff --git a/src/lib/jobQueue/jobQueue.go b/src/lib/jobQueue/jobQueue.go deleted file mode 100644 index 162b1ed..0000000 --- a/src/lib/jobQueue/jobQueue.go +++ /dev/null @@ -1,107 +0,0 @@ -package jobQueue - -import ( - "cattery/lib/jobs" - "cattery/lib/metrics" - "sync" -) - -type JobQueue struct { - rwMutex *sync.RWMutex - jobs map[int64]jobs.Job - groups map[string]map[int64]jobs.Job -} - -func NewJobQueue() *JobQueue { - return &JobQueue{ - rwMutex: &sync.RWMutex{}, - jobs: make(map[int64]jobs.Job), - groups: make(map[string]map[int64]jobs.Job), - } -} - -func (qm *JobQueue) GetGroup(groupName string) map[int64]jobs.Job { - qm.rwMutex.RLock() - defer qm.rwMutex.RUnlock() - - return qm.getGroup(groupName) -} - -func (qm *JobQueue) getGroup(groupName string) map[int64]jobs.Job { - if group, ok := qm.groups[groupName]; ok { - return group - } - - newGroup := make(map[int64]jobs.Job) - qm.groups[groupName] = newGroup - return newGroup -} - -func (qm *JobQueue) GetJobsCount() map[string]int { - result := make(map[string]int) - qm.rwMutex.RLock() - defer qm.rwMutex.RUnlock() - for groupName, group := range qm.groups { - result[groupName] = len(group) - } - return result -} - -func (qm *JobQueue) Get(jobId int64) *jobs.Job { - qm.rwMutex.RLock() - defer qm.rwMutex.RUnlock() - - if job, ok := qm.jobs[jobId]; ok { - return &job - } - - return nil -} - -func (qm *JobQueue) Add(job *jobs.Job) { - qm.rwMutex.Lock() - defer qm.rwMutex.Unlock() - - if _, exists := qm.jobs[job.Id]; exists { - // TODO: handle error or return - return // Job already exists - } - - qm.jobs[job.Id] = *job - - var group = qm.getGroup(job.TrayType) - group[job.Id] = *job - - metrics.JobsInQueueAdd(job.Organization, job.Repository, job.Name, job.TrayType, 1) -} - -func (qm *JobQueue) Delete(jobId int64) { - qm.rwMutex.Lock() - defer qm.rwMutex.Unlock() - - if job, exists := qm.jobs[jobId]; exists { - - delete(qm.jobs, jobId) - - var group = qm.getGroup(job.TrayType) - delete(group, job.Id) - - metrics.JobsInQueueAdd(job.Organization, job.Repository, job.Name, job.TrayType, -1) - } -} - -func (qm *JobQueue) DeleteJobsByWorkflowRunId(workflowRunId int64) { - qm.rwMutex.Lock() - defer qm.rwMutex.Unlock() - - for jobId, job := range qm.jobs { - if job.WorkflowId == workflowRunId { - delete(qm.jobs, jobId) - - var group = qm.getGroup(job.TrayType) - delete(group, job.Id) - - metrics.JobsInQueueAdd(job.Organization, job.Repository, job.Name, job.TrayType, -1) - } - } -} diff --git a/src/lib/jobQueue/jobQueue_test.go b/src/lib/jobQueue/jobQueue_test.go deleted file mode 100644 index 5775447..0000000 --- a/src/lib/jobQueue/jobQueue_test.go +++ /dev/null @@ -1,373 +0,0 @@ -package jobQueue - -import ( - "cattery/lib/jobs" - "sync" - "testing" -) - -func TestNewJobQueue(t *testing.T) { - queue := NewJobQueue() - - if queue == nil { - t.Error("Expected non-nil JobQueue") - } - - if queue.jobs == nil { - t.Error("Expected non-nil jobs map") - } - - if queue.groups == nil { - t.Error("Expected non-nil groups map") - } - - if queue.rwMutex == nil { - t.Error("Expected non-nil rwMutex") - } - - if len(queue.jobs) != 0 { - t.Errorf("Expected empty jobs map, got %d items", len(queue.jobs)) - } - - if len(queue.groups) != 0 { - t.Errorf("Expected empty groups map, got %d items", len(queue.groups)) - } -} - -func TestAdd(t *testing.T) { - queue := NewJobQueue() - job := &jobs.Job{ - Id: 1, - Name: "Test Job", - TrayType: "TestTray", - } - - // Test adding a job - queue.Add(job) - - if len(queue.jobs) != 1 { - t.Errorf("Expected 1 job, got %d", len(queue.jobs)) - } - - if len(queue.groups) != 1 { - t.Errorf("Expected 1 group, got %d", len(queue.groups)) - } - - if len(queue.groups["TestTray"]) != 1 { - t.Errorf("Expected 1 job in TestTray group, got %d", len(queue.groups["TestTray"])) - } - - // Test adding a duplicate job (should be ignored) - queue.Add(job) - - if len(queue.jobs) != 1 { - t.Errorf("Expected still 1 job after duplicate add, got %d", len(queue.jobs)) - } - - // Test adding a different job with the same tray type - job2 := &jobs.Job{ - Id: 2, - Name: "Test Job 2", - TrayType: "TestTray", - } - - queue.Add(job2) - - if len(queue.jobs) != 2 { - t.Errorf("Expected 2 jobs, got %d", len(queue.jobs)) - } - - if len(queue.groups["TestTray"]) != 2 { - t.Errorf("Expected 2 jobs in TestTray group, got %d", len(queue.groups["TestTray"])) - } - - // Test adding a job with a different tray type - job3 := &jobs.Job{ - Id: 3, - Name: "Test Job 3", - TrayType: "AnotherTray", - } - - queue.Add(job3) - - if len(queue.jobs) != 3 { - t.Errorf("Expected 3 jobs, got %d", len(queue.jobs)) - } - - if len(queue.groups) != 2 { - t.Errorf("Expected 2 groups, got %d", len(queue.groups)) - } - - if len(queue.groups["AnotherTray"]) != 1 { - t.Errorf("Expected 1 job in AnotherTray group, got %d", len(queue.groups["AnotherTray"])) - } -} - -func TestGet(t *testing.T) { - queue := NewJobQueue() - job := &jobs.Job{ - Id: 1, - Name: "Test Job", - TrayType: "TestTray", - } - - queue.Add(job) - - // Test getting an existing job - retrievedJob := queue.Get(1) - - if retrievedJob == nil { - t.Error("Expected non-nil job") - return - } - - if retrievedJob.Id != 1 { - t.Errorf("Expected job ID 1, got %d", retrievedJob.Id) - } - - if retrievedJob.Name != "Test Job" { - t.Errorf("Expected job name 'Test Job', got '%s'", retrievedJob.Name) - } - - if retrievedJob.TrayType != "TestTray" { - t.Errorf("Expected tray type 'TestTray', got '%s'", retrievedJob.TrayType) - } - - // Test getting a non-existent job - nonExistentJob := queue.Get(999) - - if nonExistentJob != nil { - t.Error("Expected nil for non-existent job") - } -} - -func TestGetGroup(t *testing.T) { - queue := NewJobQueue() - job1 := &jobs.Job{ - Id: 1, - Name: "Test Job 1", - TrayType: "TestTray", - } - - job2 := &jobs.Job{ - Id: 2, - Name: "Test Job 2", - TrayType: "TestTray", - } - - queue.Add(job1) - queue.Add(job2) - - // Test getting an existing group - group := queue.GetGroup("TestTray") - - if len(group) != 2 { - t.Errorf("Expected 2 jobs in group, got %d", len(group)) - } - - if _, exists := group[1]; !exists { - t.Error("Expected job with ID 1 in group") - } - - if _, exists := group[2]; !exists { - t.Error("Expected job with ID 2 in group") - } - - // Test getting a non-existent group (should create an empty group) - nonExistentGroup := queue.GetGroup("NonExistentTray") - - if nonExistentGroup == nil { - t.Error("Expected non-nil group for non-existent tray type") - } - - if len(nonExistentGroup) != 0 { - t.Errorf("Expected empty group for non-existent tray type, got %d items", len(nonExistentGroup)) - } - - // Verify the new group was created - if len(queue.groups) != 2 { - t.Errorf("Expected 2 groups after getting non-existent group, got %d", len(queue.groups)) - } -} - -func TestGetJobsCount(t *testing.T) { - queue := NewJobQueue() - - // Test with empty queue - counts := queue.GetJobsCount() - - if len(counts) != 0 { - t.Errorf("Expected empty counts map for empty queue, got %d items", len(counts)) - } - - // Add some jobs - job1 := &jobs.Job{ - Id: 1, - Name: "Test Job 1", - TrayType: "TestTray1", - } - - job2 := &jobs.Job{ - Id: 2, - Name: "Test Job 2", - TrayType: "TestTray1", - } - - job3 := &jobs.Job{ - Id: 3, - Name: "Test Job 3", - TrayType: "TestTray2", - } - - queue.Add(job1) - queue.Add(job2) - queue.Add(job3) - - // Test with populated queue - counts = queue.GetJobsCount() - - if len(counts) != 2 { - t.Errorf("Expected 2 items in counts map, got %d", len(counts)) - } - - if counts["TestTray1"] != 2 { - t.Errorf("Expected 2 jobs in TestTray1, got %d", counts["TestTray1"]) - } - - if counts["TestTray2"] != 1 { - t.Errorf("Expected 1 job in TestTray2, got %d", counts["TestTray2"]) - } -} - -func TestDelete(t *testing.T) { - queue := NewJobQueue() - job1 := &jobs.Job{ - Id: 1, - Name: "Test Job 1", - TrayType: "TestTray", - } - - job2 := &jobs.Job{ - Id: 2, - Name: "Test Job 2", - TrayType: "TestTray", - } - - queue.Add(job1) - queue.Add(job2) - - // Verify initial state - if len(queue.jobs) != 2 { - t.Errorf("Expected 2 jobs initially, got %d", len(queue.jobs)) - } - - if len(queue.groups["TestTray"]) != 2 { - t.Errorf("Expected 2 jobs in TestTray group initially, got %d", len(queue.groups["TestTray"])) - } - - // Test deleting an existing job - queue.Delete(1) - - if len(queue.jobs) != 1 { - t.Errorf("Expected 1 job after deletion, got %d", len(queue.jobs)) - } - - if len(queue.groups["TestTray"]) != 1 { - t.Errorf("Expected 1 job in TestTray group after deletion, got %d", len(queue.groups["TestTray"])) - } - - if _, exists := queue.jobs[1]; exists { - t.Error("Expected job with ID 1 to be deleted from jobs map") - } - - if _, exists := queue.groups["TestTray"][1]; exists { - t.Error("Expected job with ID 1 to be deleted from TestTray group") - } - - // Test deleting a non-existent job (should not cause errors) - queue.Delete(999) - - if len(queue.jobs) != 1 { - t.Errorf("Expected still 1 job after non-existent deletion, got %d", len(queue.jobs)) - } - - // Delete the last job - queue.Delete(2) - - if len(queue.jobs) != 0 { - t.Errorf("Expected 0 jobs after deleting all jobs, got %d", len(queue.jobs)) - } - - if len(queue.groups["TestTray"]) != 0 { - t.Errorf("Expected 0 jobs in TestTray group after deleting all jobs, got %d", len(queue.groups["TestTray"])) - } -} - -func TestConcurrentOperations(t *testing.T) { - queue := NewJobQueue() - - // Number of concurrent operations - const numOperations = 100 - - // WaitGroup to wait for all goroutines to finish - var wg sync.WaitGroup - wg.Add(numOperations * 3) // Add, Get, Delete operations - - // Test concurrent Add operations - for i := 0; i < numOperations; i++ { - go func(id int64) { - defer wg.Done() - job := &jobs.Job{ - Id: id, - Name: "Concurrent Job", - TrayType: "ConcurrentTray", - } - queue.Add(job) - }(int64(i + 1)) - } - - // Test concurrent Get operations - for i := 0; i < numOperations; i++ { - go func(id int64) { - defer wg.Done() - // Get may return nil if the job hasn't been added yet, which is fine - _ = queue.Get(id) - }(int64(i + 1)) - } - - // Test concurrent Delete operations - for i := 0; i < numOperations; i++ { - go func(id int64) { - defer wg.Done() - queue.Delete(id) - }(int64(i + 1)) - } - - // Wait for all goroutines to finish - wg.Wait() - - // Verify final state - // Since we're adding and deleting the same jobs concurrently, - // we can't predict exactly how many will be in the queue at the end. - // But we can verify that the queue is in a consistent state. - - // Get the count of jobs in each group - counts := queue.GetJobsCount() - - // Verify that the count in the ConcurrentTray group matches the actual number of jobs - if counts["ConcurrentTray"] != len(queue.GetGroup("ConcurrentTray")) { - t.Errorf("Inconsistent state: count %d doesn't match actual group size %d", - counts["ConcurrentTray"], len(queue.GetGroup("ConcurrentTray"))) - } - - // Verify that the total number of jobs matches the sum of jobs in all groups - totalJobsInGroups := 0 - for _, count := range counts { - totalJobsInGroups += count - } - - if len(queue.jobs) != totalJobsInGroups { - t.Errorf("Inconsistent state: total jobs %d doesn't match sum of jobs in groups %d", - len(queue.jobs), totalJobsInGroups) - } -} diff --git a/src/lib/jobQueue/queueManager.go b/src/lib/jobQueue/queueManager.go deleted file mode 100644 index 5e100c1..0000000 --- a/src/lib/jobQueue/queueManager.go +++ /dev/null @@ -1,200 +0,0 @@ -package jobQueue - -import ( - "cattery/lib/githubClient" - "cattery/lib/jobs" - "cattery/lib/metrics" - "context" - "errors" - "sync" - - log "github.com/sirupsen/logrus" - "go.mongodb.org/mongo-driver/v2/bson" - "go.mongodb.org/mongo-driver/v2/mongo" - "go.mongodb.org/mongo-driver/v2/mongo/options" -) - -type QueueManager struct { - jobQueue *JobQueue - waitGroup sync.WaitGroup - - collection *mongo.Collection - changeStream *mongo.ChangeStream - - logger *log.Entry -} - -func NewQueueManager() *QueueManager { - return &QueueManager{ - jobQueue: NewJobQueue(), - waitGroup: sync.WaitGroup{}, - logger: log.WithFields(log.Fields{"name": "QueueManager"}), - } -} - -func (qm *QueueManager) Connect(collection *mongo.Collection) { - qm.collection = collection -} - -func (qm *QueueManager) Load() error { - qm.waitGroup.Add(1) - defer qm.waitGroup.Done() - - collection := qm.collection - - ctx, _ := context.WithCancel(context.Background()) - - changeStream, err := collection.Watch(ctx, mongo.Pipeline{}, options.ChangeStream().SetFullDocument(options.UpdateLookup)) - if err != nil { - return err - } - qm.changeStream = changeStream - //options.ChangeStream().SetFullDocumentBeforeChange(options.UpdateLookup) - allJobs, err := collection.Find(context.Background(), bson.M{}) - if err != nil { - return err - } - - for allJobs.Next(nil) { - var job jobs.Job - decodeErr := allJobs.Decode(&job) - if decodeErr != nil { - return err - } - - qm.jobQueue.Add(&job) - } - - go func() { - for qm.changeStream.Next(ctx) { - qm.logger.Debug("changeStream event") - var event changeEvent[jobs.Job] - decodeErr := qm.changeStream.Decode(&event) - if decodeErr != nil { - qm.logger.Error("Failed to decode change stream: ", decodeErr) - continue - } - - switch event.OperationType { - case "replace": - fallthrough - case "update": - fallthrough - case "insert": - qm.jobQueue.Add(&event.FullDocument) - qm.logger.Debug("Inserted object from changeStream: ", event.FullDocument) - case "delete": - qm.jobQueue.Delete(event.DocumentKey.Id) - qm.logger.Debug("Deleted object from changeStream: ", event.DocumentKey.Id) - default: - qm.logger.Warn("Unknown operation type: ", event.OperationType) - } - } - qm.logger.Debug("changeStream finished") - if err := qm.changeStream.Err(); err != nil { - qm.logger.Error("changeStream error: ", err) - } - changeStream.Close(nil) - }() - - return nil -} - -func (qm *QueueManager) AddJob(job *jobs.Job) error { - qm.jobQueue.Add(job) - _, err := qm.collection.InsertOne(context.Background(), job) - if err != nil { - return err - } - - return nil -} - -func (qm *QueueManager) JobInProgress(jobId int64) error { - //TODO: remove method, use UpdateJobStatus - job := qm.jobQueue.Get(jobId) - if job == nil { - qm.logger.Errorf("No job found with id %v", jobId) - return errors.New("No job found with id ") - } - - err := qm.deleteJob(jobId) - if err != nil { - return err - } - - return nil -} - -func (qm *QueueManager) UpdateJobStatus(jobId int64, status jobs.JobStatus) error { - - job := qm.jobQueue.Get(jobId) - if job == nil { - return nil - } - - switch status { - case jobs.JobStatusInProgress: - err := qm.deleteJob(jobId) - if err != nil { - return err - } - case jobs.JobStatusFinished: - err := qm.deleteJob(jobId) - if err != nil { - return err - } - default: - return nil - } - - return nil -} - -func (qm *QueueManager) deleteJob(jobId int64) error { - qm.jobQueue.Delete(jobId) - _, err := qm.collection.DeleteOne(context.Background(), bson.M{"_id": jobId}) - if err != nil { - return err - } - - return nil -} - -func (qm *QueueManager) GetJobsCount() map[string]int { - return qm.jobQueue.GetJobsCount() -} - -func (qm *QueueManager) CleanupByWorkflowRun(workflowRunId int64) error { - qm.jobQueue.DeleteJobsByWorkflowRunId(workflowRunId) - _, err := qm.collection.DeleteMany(context.Background(), bson.M{"workflowRunId": workflowRunId}) - if err != nil { - return err - } - - return nil -} - -func (qm *QueueManager) CleanupCompletedJobs() error { - - for _, job := range qm.jobQueue.jobs { - var ghClient, err = githubClient.NewGithubClientWithOrgName(job.Organization) - if err != nil { - return err - } - - completed, err := ghClient.CheckJobCompleted(job.Repository, job.Id) - if err != nil { - return err - } - - if completed { - qm.logger.Warn("Removed completed job: ", job.Id) - qm.deleteJob(job.Id) - - metrics.StaleJobsInc(job.Organization, job.Repository, job.Name, job.TrayType) - } - } - - return nil -} diff --git a/src/lib/jobQueue/queueManager_test.go b/src/lib/jobQueue/queueManager_test.go deleted file mode 100644 index 68258da..0000000 --- a/src/lib/jobQueue/queueManager_test.go +++ /dev/null @@ -1,399 +0,0 @@ -package jobQueue - -import ( - "cattery/lib/jobs" - "context" - "testing" - "time" - - "go.mongodb.org/mongo-driver/v2/bson" - "go.mongodb.org/mongo-driver/v2/mongo" - "go.mongodb.org/mongo-driver/v2/mongo/options" -) - -// setupTestCollection creates a test collection and returns a client and collection -func setupTestCollection(t *testing.T) (*mongo.Client, *mongo.Collection) { - t.Helper() - - // Connect to MongoDB - serverAPI := options.ServerAPI(options.ServerAPIVersion1) - opts := options.Client().ApplyURI("mongodb://localhost").SetServerAPIOptions(serverAPI) - - client, err := mongo.Connect(opts) - if err != nil { - t.Fatalf("Failed to connect to MongoDB: %v", err) - } - - // Ping the database to verify connection - err = client.Ping(context.Background(), nil) - if err != nil { - t.Fatalf("Failed to ping MongoDB: %v", err) - } - - // Create a test collection - collection := client.Database("test").Collection("jobs_test_queue_manager") - - // Clear the collection - err = collection.Drop(context.Background()) - if err != nil { - t.Fatalf("Failed to drop collection: %v", err) - } - - return client, collection -} - -// createTestJob creates a test job with the given parameters -func createTestJob(id int64, name string, trayType string) *jobs.Job { - return &jobs.Job{ - Id: id, - Name: name, - TrayType: trayType, - } -} - -// insertTestJobs inserts test jobs into the collection -func insertTestJobs(t *testing.T, collection *mongo.Collection, jobs []*jobs.Job) { - t.Helper() - - for _, job := range jobs { - _, err := collection.InsertOne(context.Background(), job) - if err != nil { - t.Fatalf("Failed to insert test job: %v", err) - } - } -} - -// deleteTestJobs deletes test jobs from the collection -func deleteTestJobs(t *testing.T, collection *mongo.Collection, jobIds []int64) { - t.Helper() - - for _, id := range jobIds { - _, err := collection.DeleteOne(context.Background(), bson.M{"_id": id}) - if err != nil { - t.Fatalf("Failed to insert test job: %v", err) - } - } -} - -// TestNewQueueManager tests the NewQueueManager function -func TestNewQueueManager(t *testing.T) { - qm := NewQueueManager() - if qm == nil { - t.Error("Expected non-nil QueueManager") - } - if qm.jobQueue == nil { - t.Error("Expected non-nil jobQueue") - } -} - -// TestConnect tests the Connect method -func TestConnect(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - if qm.collection != collection { - t.Error("Expected collection to be set") - } -} - -// TestLoad tests the Load method -func TestLoad(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - // Create test jobs - job1 := createTestJob(1, "Test Job 1", "TestTray") - job2 := createTestJob(2, "Test Job 2", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job1, job2}) - - qm := NewQueueManager() - qm.Connect(collection) - err := qm.Load() - if err != nil { - t.Fatalf("Load failed: %v", err) - } - - // Verify jobs were loaded - if qm.jobQueue.Get(1) == nil { - t.Error("Expected job 1 to be loaded") - } - if qm.jobQueue.Get(2) == nil { - t.Error("Expected job 2 to be loaded") - } - - job3 := createTestJob(3, "Test Job 3", "TestTray") - job4 := createTestJob(4, "Test Job 4", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job3, job4}) - - time.Sleep(2 * time.Second) - - // Verify jobs were sync - if qm.jobQueue.Get(3) == nil { - t.Error("Expected job 3 to be sync (add)") - } - if qm.jobQueue.Get(4) == nil { - t.Error("Expected job 4 to be sync (add)") - } - - deleteTestJobs(t, collection, []int64{3, 4}) - - time.Sleep(2 * time.Second) - - // Verify jobs were sync - if qm.jobQueue.Get(3) != nil { - t.Error("Expected job 3 to be sync (delete)") - } - if qm.jobQueue.Get(4) != nil { - t.Error("Expected job 4 to be sync (delete)") - } - - // Note: Change stream listening is now always enabled but requires a MongoDB replica set - // In a real environment, this would be tested with a properly configured MongoDB replica set - t.Log("Change stream listening requires a MongoDB replica set") -} - -// TestAddJob tests the AddJob method -func TestAddJob(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - // Create a test job - job := createTestJob(1, "Test Job", "TestTray") - - // Test AddJob - err := qm.AddJob(job) - if err != nil { - t.Fatalf("AddJob failed: %v", err) - } - - // Verify job was added to the queue - if qm.jobQueue.Get(1) == nil { - t.Error("Expected job to be added to the queue") - } - - // Verify job was added to the database - var dbJob jobs.Job - err = collection.FindOne(context.Background(), bson.M{"_id": 1}).Decode(&dbJob) - if err != nil { - t.Fatalf("Failed to find job in database: %v", err) - } - - if dbJob.Id != 1 { - t.Errorf("Expected job ID 1, got %d", dbJob.Id) - } - if dbJob.Name != "Test Job" { - t.Errorf("Expected job name 'Test Job', got '%s'", dbJob.Name) - } - if dbJob.TrayType != "TestTray" { - t.Errorf("Expected tray type 'TestTray', got '%s'", dbJob.TrayType) - } -} - -// TestJobInProgress tests the JobInProgress method -func TestJobInProgress(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - // Create and add a test job - job := createTestJob(1, "Test Job", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job}) - qm.jobQueue.Add(job) - - // Test JobInProgress - err := qm.JobInProgress(1) - if err != nil { - t.Fatalf("JobInProgress failed: %v", err) - } - - // Verify job was removed from the queue - if qm.jobQueue.Get(1) != nil { - t.Error("Expected job to be removed from the queue") - } - - // Verify job was removed from the database - count, err := collection.CountDocuments(context.Background(), bson.M{"id": 1}) - if err != nil { - t.Fatalf("Failed to count documents: %v", err) - } - if count != 0 { - t.Errorf("Expected 0 jobs in database, got %d", count) - } - - // Test JobInProgress with non-existent job - err = qm.JobInProgress(999) - if err == nil { - t.Error("Expected error for non-existent job, got nil") - } -} - -// TestUpdateJobStatus tests the UpdateJobStatus method -func TestUpdateJobStatus(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - // Create and add a test job - job := createTestJob(1, "Test Job", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job}) - qm.jobQueue.Add(job) - - // Test UpdateJobStatus with JobStatusInProgress - err := qm.UpdateJobStatus(1, jobs.JobStatusInProgress) - if err != nil { - t.Fatalf("UpdateJobStatus failed: %v", err) - } - - // Verify job was removed from the queue - if qm.jobQueue.Get(1) != nil { - t.Error("Expected job to be removed from the queue") - } - - // Verify job was removed from the database - count, err := collection.CountDocuments(context.Background(), bson.M{"id": 1}) - if err != nil { - t.Fatalf("Failed to count documents: %v", err) - } - if count != 0 { - t.Errorf("Expected 0 jobs in database, got %d", count) - } - - // Add the job back for the next test - job = createTestJob(2, "Test Job", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job}) - qm.jobQueue.Add(job) - - // Test UpdateJobStatus with JobStatusFinished - err = qm.UpdateJobStatus(2, jobs.JobStatusFinished) - if err != nil { - t.Fatalf("UpdateJobStatus failed: %v", err) - } - - // Verify job was removed from the queue - if qm.jobQueue.Get(2) != nil { - t.Error("Expected job to be removed from the queue") - } - - // Verify job was removed from the database - count, err = collection.CountDocuments(context.Background(), bson.M{"_id": 2}) - if err != nil { - t.Fatalf("Failed to count documents: %v", err) - } - if count != 0 { - t.Errorf("Expected 0 jobs in database, got %d", count) - } - - // Add the job back for the next test - job = createTestJob(3, "Test Job", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job}) - qm.jobQueue.Add(job) - - // Test UpdateJobStatus with other status (should do nothing) - err = qm.UpdateJobStatus(3, jobs.JobStatusQueued) - if err != nil { - t.Fatalf("UpdateJobStatus failed: %v", err) - } - - // Verify job is still in the queue - if qm.jobQueue.Get(3) == nil { - t.Error("Expected job to still be in the queue") - } - - // Verify job is still in the database - count, err = collection.CountDocuments(context.Background(), bson.M{"_id": 3}) - if err != nil { - t.Fatalf("Failed to count documents: %v", err) - } - if count != 1 { - t.Errorf("Expected 1 job in database, got %d", count) - } - - // Test UpdateJobStatus with non-existent job - err = qm.UpdateJobStatus(999, jobs.JobStatusInProgress) - if err != nil { - t.Fatalf("Did not expect error for non-existent job, got: %v", err) - } -} - -// TestDeleteJob tests the deleteJob method indirectly through JobInProgress -func TestDeleteJob(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - // Create and add a test job - job := createTestJob(1, "Test Job", "TestTray") - insertTestJobs(t, collection, []*jobs.Job{job}) - qm.jobQueue.Add(job) - - // Test deleteJob through JobInProgress - err := qm.JobInProgress(1) - if err != nil { - t.Fatalf("JobInProgress failed: %v", err) - } - - // Verify job was removed from the queue - if qm.jobQueue.Get(1) != nil { - t.Error("Expected job to be removed from the queue") - } - - // Verify job was removed from the database - count, err := collection.CountDocuments(context.Background(), bson.M{"id": 1}) - if err != nil { - t.Fatalf("Failed to count documents: %v", err) - } - if count != 0 { - t.Errorf("Expected 0 jobs in database, got %d", count) - } -} - -// TestQueueManagerGetJobsCount tests the GetJobsCount method -func TestQueueManagerGetJobsCount(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - qm := NewQueueManager() - qm.Connect(collection) - - // Test with empty queue - counts := qm.GetJobsCount() - if len(counts) != 0 { - t.Errorf("Expected empty counts map for empty queue, got %d items", len(counts)) - } - - // Add some jobs - job1 := createTestJob(1, "Test Job 1", "TestTray1") - job2 := createTestJob(2, "Test Job 2", "TestTray1") - job3 := createTestJob(3, "Test Job 3", "TestTray2") - - qm.jobQueue.Add(job1) - qm.jobQueue.Add(job2) - qm.jobQueue.Add(job3) - - // Test with populated queue - counts = qm.GetJobsCount() - - if len(counts) != 2 { - t.Errorf("Expected 2 items in counts map, got %d", len(counts)) - } - - if counts["TestTray1"] != 2 { - t.Errorf("Expected 2 jobs in TestTray1, got %d", counts["TestTray1"]) - } - - if counts["TestTray2"] != 1 { - t.Errorf("Expected 1 job in TestTray2, got %d", counts["TestTray2"]) - } -} diff --git a/src/lib/jobs/job.go b/src/lib/jobs/job.go deleted file mode 100644 index 933cd8e..0000000 --- a/src/lib/jobs/job.go +++ /dev/null @@ -1,32 +0,0 @@ -package jobs - -import "github.com/google/go-github/v70/github" - -type Job struct { - Id int64 `bson:"_id"` - Name string `bson:"name"` - Action string `bson:"action"` - WorkflowId int64 `bson:"workflowId"` - WorkflowName string `bson:"workflowName"` - Repository string `bson:"repository"` - Organization string `bson:"organization"` - Labels []string `bson:"labels"` - RunnerName string `bson:"runnerName"` - TrayType string `bson:"trayType"` - CreatedAt github.Timestamp `bson:"createdAt"` -} - -func FromGithubModel(workflowJobEvent *github.WorkflowJobEvent) *Job { - return &Job{ - Id: workflowJobEvent.GetWorkflowJob().GetID(), - Name: workflowJobEvent.GetWorkflowJob().GetName(), - Action: workflowJobEvent.GetAction(), - WorkflowId: workflowJobEvent.GetWorkflowJob().GetRunID(), - WorkflowName: workflowJobEvent.GetWorkflowJob().GetWorkflowName(), - Repository: workflowJobEvent.GetRepo().GetName(), - Organization: workflowJobEvent.GetOrg().GetLogin(), - RunnerName: workflowJobEvent.GetWorkflowJob().GetRunnerName(), - Labels: workflowJobEvent.GetWorkflowJob().Labels, - CreatedAt: workflowJobEvent.GetWorkflowJob().GetCreatedAt(), - } -} diff --git a/src/lib/jobs/jobStatus.go b/src/lib/jobs/jobStatus.go deleted file mode 100644 index a04cbe2..0000000 --- a/src/lib/jobs/jobStatus.go +++ /dev/null @@ -1,19 +0,0 @@ -package jobs - -type JobStatus int - -const ( - JobStatusQueued JobStatus = iota - JobStatusInProgress - JobStatusFinished -) - -var stateName = map[JobStatus]string{ - JobStatusQueued: "queued", - JobStatusInProgress: "in_progress", - JobStatusFinished: "finished", -} - -func (js JobStatus) String() string { - return stateName[js] -} diff --git a/src/lib/metrics/metrics.go b/src/lib/metrics/metrics.go index a37f1e5..f990b2a 100644 --- a/src/lib/metrics/metrics.go +++ b/src/lib/metrics/metrics.go @@ -10,35 +10,35 @@ var ( staleTraysCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "cattery_stale_trays_count", - Help: "", + Help: "Number of stale trays cleaned up", }, []string{"org", "tray_type"}) - staleJobsCount = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "cattery_stale_jobs_count", - Help: "", - }, []string{"org", "repository", "job_name", "tray_type"}) - preemptedTraysCount = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "cattery_preempted_trays_count", - Help: "", + Help: "Number of preempted trays", }, []string{"org", "tray_type"}) trayProviderErrors = promauto.NewCounterVec(prometheus.CounterOpts{ Name: "cattery_tray_provider_errors", - Help: "", + Help: "Number of provider errors during tray operations", }, []string{"org", "provider", "tray_type", "operation_type"}) + scaleSetPollErrors = promauto.NewCounterVec(prometheus.CounterOpts{ + Name: "cattery_scaleset_poll_errors", + Help: "Number of scale set polling errors", + }, []string{"org", "tray_type"}) + // Gauges registeredTraysTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{ Name: "cattery_registered_trays", - Help: "", + Help: "Number of currently registered trays", }, []string{"org", "tray_type"}) - jobsInQueueTotal = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cattery_jobs_in_queue", - Help: "", - }, []string{"org", "repository", "job_name", "tray_type"}) + scaleSetPendingJobs = promauto.NewGaugeVec(prometheus.GaugeOpts{ + Name: "cattery_scaleset_pending_jobs", + Help: "Number of pending jobs reported by scale set statistics", + }, []string{"org", "tray_type"}) ) // StaleTrays @@ -61,30 +61,24 @@ func PreemptedTraysInc(org string, trayType string) { PreemptedTraysAdd(org, trayType, 1) } -// StaleJobs - -func StaleJobsAdd(org string, repository string, jobName string, trayType string, count int) { - staleJobsCount.WithLabelValues(org, repository, jobName, trayType).Add(float64(count)) -} - -func StaleJobsInc(org string, repository string, jobName string, trayType string) { - StaleJobsAdd(org, repository, jobName, trayType, 1) -} - -// registeredTraysTotal +// RegisteredTrays func RegisteredTraysAdd(org string, trayType string, count int) { registeredTraysTotal.WithLabelValues(org, trayType).Add(float64(count)) } -// jobsInQueueTotal - -func JobsInQueueAdd(org string, repository string, jobName string, trayType string, count int) { - jobsInQueueTotal.WithLabelValues(org, repository, jobName, trayType).Add(float64(count)) -} - // TrayProviderErrors func TrayProviderErrors(org string, provider, trayType string, operationType string) { trayProviderErrors.WithLabelValues(org, provider, trayType, operationType).Inc() } + +// ScaleSet metrics + +func ScaleSetPollErrorsInc(org string, trayType string) { + scaleSetPollErrors.WithLabelValues(org, trayType).Inc() +} + +func ScaleSetPendingJobsSet(org string, trayType string, count int) { + scaleSetPendingJobs.WithLabelValues(org, trayType).Set(float64(count)) +} diff --git a/src/lib/restarter/repositories/iRestarterRepository.go b/src/lib/restarter/repositories/iRestarterRepository.go index 753924e..9971dfa 100644 --- a/src/lib/restarter/repositories/iRestarterRepository.go +++ b/src/lib/restarter/repositories/iRestarterRepository.go @@ -1,7 +1,17 @@ package repositories +import "time" + +type RestartRequest struct { + WorkflowRunId int64 `bson:"workflowRunId"` + OrgName string `bson:"orgName"` + RepoName string `bson:"repoName"` + CreatedAt time.Time `bson:"createdAt"` +} + type IRestarterRepository interface { - SaveRestartRequest(workflowRunId int64) error + SaveRestartRequest(workflowRunId int64, orgName string, repoName string) error DeleteRestartRequest(workflowRunId int64) error CheckRestartRequest(workflowRunId int64) (bool, error) + GetAllPendingRestartRequests() ([]RestartRequest, error) } diff --git a/src/lib/restarter/repositories/mongodbRestarterRepository.go b/src/lib/restarter/repositories/mongodbRestarterRepository.go index b6ba2e8..ff1703e 100644 --- a/src/lib/restarter/repositories/mongodbRestarterRepository.go +++ b/src/lib/restarter/repositories/mongodbRestarterRepository.go @@ -22,7 +22,7 @@ func (m *MongodbRestarterRepository) Connect(collection *mongo.Collection) { m.collection = collection } -func (m *MongodbRestarterRepository) SaveRestartRequest(workflowRunId int64) error { +func (m *MongodbRestarterRepository) SaveRestartRequest(workflowRunId int64, orgName string, repoName string) error { _, err := m.collection.UpdateOne( context.Background(), bson.M{ @@ -31,6 +31,8 @@ func (m *MongodbRestarterRepository) SaveRestartRequest(workflowRunId int64) err bson.M{ "$set": bson.M{ "workflowRunId": workflowRunId, + "orgName": orgName, + "repoName": repoName, "createdAt": time.Now().UTC(), }, }, @@ -50,7 +52,6 @@ func (m *MongodbRestarterRepository) DeleteRestartRequest(workflowRunId int64) e } func (m *MongodbRestarterRepository) CheckRestartRequest(workflowRunId int64) (bool, error) { - // log.Debugf("Checking restart request for workflow run id %d in MongoDB", workflowRunId) dbResult := m.collection.FindOne( context.Background(), bson.M{ @@ -68,3 +69,16 @@ func (m *MongodbRestarterRepository) CheckRestartRequest(workflowRunId int64) (b return true, nil } + +func (m *MongodbRestarterRepository) GetAllPendingRestartRequests() ([]RestartRequest, error) { + cursor, err := m.collection.Find(context.Background(), bson.M{}) + if err != nil { + return nil, err + } + + var requests []RestartRequest + if err := cursor.All(context.Background(), &requests); err != nil { + return nil, err + } + return requests, nil +} diff --git a/src/lib/restarter/workflowRestarter.go b/src/lib/restarter/workflowRestarter.go index c27974a..aa52e1c 100644 --- a/src/lib/restarter/workflowRestarter.go +++ b/src/lib/restarter/workflowRestarter.go @@ -3,6 +3,8 @@ package restarter import ( "cattery/lib/githubClient" "cattery/lib/restarter/repositories" + "context" + "time" log "github.com/sirupsen/logrus" ) @@ -17,64 +19,89 @@ func NewWorkflowRestarter(repository repositories.IRestarterRepository) *Workflo } } -func (wr *WorkflowRestarter) RequestRestart(workflowRunId int64) error { - log.Debugf("Requesting restart for workflow run id %d", workflowRunId) - return wr.repository.SaveRestartRequest(workflowRunId) +func (wr *WorkflowRestarter) RequestRestart(workflowRunId int64, orgName string, repoName string) error { + log.Debugf("Requesting restart for workflow run id %d (%s/%s)", workflowRunId, orgName, repoName) + return wr.repository.SaveRestartRequest(workflowRunId, orgName, repoName) } -func (wr *WorkflowRestarter) Restart(workflowRunId int64, ghOrg string, repoName string) error { +// StartPoller starts a background goroutine that periodically checks pending restart +// requests and triggers restarts when workflows have completed with failure. +// This replaces the webhook-based workflow_run event handling. +func (wr *WorkflowRestarter) StartPoller(ctx context.Context) { + const pollInterval = 30 * time.Second + const requestTTL = 1 * time.Hour - // check that workflow is in db - log.Debugf("Checking restart request for workflow run id %d", workflowRunId) - exists, err := wr.repository.CheckRestartRequest(workflowRunId) + logger := log.WithField("component", "restarterPoller") + + go func() { + for { + select { + case <-ctx.Done(): + logger.Info("Restart poller shutting down") + return + default: + time.Sleep(pollInterval) + wr.pollPendingRestarts(logger, requestTTL) + } + } + }() + + logger.Info("Restart poller started") +} + +func (wr *WorkflowRestarter) pollPendingRestarts(logger *log.Entry, ttl time.Duration) { + requests, err := wr.repository.GetAllPendingRestartRequests() if err != nil { - log.Errorf("Failed to check restart request: %s", err.Error()) - return err + logger.Errorf("Failed to get pending restart requests: %v", err) + return } - if !exists { - log.Debugf("No restart request found for workflow run id %d", workflowRunId) - return nil - } - ghClient, err := githubClient.NewGithubClientWithOrgName(ghOrg) - if err != nil { - log.Errorf("Failed to get GitHub client: %s", err.Error()) - return err + + for _, req := range requests { + // TTL safety net: delete stale requests + if time.Since(req.CreatedAt) > ttl { + logger.Warnf("Restart request for workflow %d expired (age: %v), deleting", req.WorkflowRunId, time.Since(req.CreatedAt)) + _ = wr.repository.DeleteRestartRequest(req.WorkflowRunId) + continue + } + + wr.handleRestartRequest(logger, req) } - log.Debugf("Restarting failed jobs for workflow run id %d", workflowRunId) - err = ghClient.RestartFailedJobs(repoName, workflowRunId) +} + +func (wr *WorkflowRestarter) handleRestartRequest(logger *log.Entry, req repositories.RestartRequest) { + ghClient, err := githubClient.NewGithubClientWithOrgName(req.OrgName) if err != nil { - log.Errorf("Failed to restart workflow run id %d: %v", workflowRunId, err) - return err + logger.Errorf("Failed to get GitHub client for org %s: %v", req.OrgName, err) + return } - log.Debugf("Successfully restarted failed jobs for workflow run id %d, removing restart request from DB", workflowRunId) - err = wr.repository.DeleteRestartRequest(workflowRunId) + + status, conclusion, err := ghClient.GetWorkflowRunStatus(req.RepoName, req.WorkflowRunId) if err != nil { - log.Errorf("Failed to delete restart request for workflow run id %d: %v", workflowRunId, err) - return err + logger.Errorf("Failed to get workflow run status for %d: %v", req.WorkflowRunId, err) + return } - log.Debugf("Finished restart request for workflow run id %d", workflowRunId) - return nil -} -// Cleanup clean db on cancelled or completed workflow runs -func (wr *WorkflowRestarter) Cleanup(workflowRunId int64, ghOrg string, repoName string) error { - log.Debugf("Cleanup for workflow run id %d", workflowRunId) - log.Debugf("Checking restart request for workflow run id %d", workflowRunId) - exists, err := wr.repository.CheckRestartRequest(workflowRunId) - if err != nil { - log.Errorf("Failed to check restart request: %s", err.Error()) - return err + if status != "completed" { + // Workflow still running, skip for now + return } - if !exists { - log.Debugf("No restart request found for workflow run id %d", workflowRunId) - return nil + + switch conclusion { + case "failure": + logger.Infof("Restarting failed jobs for workflow run %d (%s/%s)", req.WorkflowRunId, req.OrgName, req.RepoName) + err = ghClient.RestartFailedJobs(req.RepoName, req.WorkflowRunId) + if err != nil { + logger.Errorf("Failed to restart workflow run %d: %v", req.WorkflowRunId, err) + return + } + logger.Infof("Successfully restarted failed jobs for workflow run %d", req.WorkflowRunId) + default: + // success, cancelled, or other — just clean up + logger.Debugf("Workflow run %d completed with conclusion '%s', cleaning up restart request", req.WorkflowRunId, conclusion) } - log.Debugf("Successfully cleaned up restart request for workflow run id %d, removing restart request from DB", workflowRunId) - err = wr.repository.DeleteRestartRequest(workflowRunId) + + err = wr.repository.DeleteRestartRequest(req.WorkflowRunId) if err != nil { - log.Errorf("Failed to delete restart request for workflow run id %d: %v", workflowRunId, err) - return err + logger.Errorf("Failed to delete restart request for workflow %d: %v", req.WorkflowRunId, err) } - log.Debugf("Finished cleanup restart request for workflow run id %d", workflowRunId) - return nil } diff --git a/src/lib/scaleSetClient/scaleSetClient.go b/src/lib/scaleSetClient/scaleSetClient.go new file mode 100644 index 0000000..3c13b85 --- /dev/null +++ b/src/lib/scaleSetClient/scaleSetClient.go @@ -0,0 +1,115 @@ +package scaleSetClient + +import ( + "cattery/lib/config" + "context" + "fmt" + "os" + + "github.com/actions/scaleset" + log "github.com/sirupsen/logrus" +) + +type ScaleSetClient struct { + client *scaleset.Client + session *scaleset.MessageSessionClient + scaleSet *scaleset.RunnerScaleSet + org *config.GitHubOrganization + trayType *config.TrayType + logger *log.Entry +} + +func NewScaleSetClient(org *config.GitHubOrganization, trayType *config.TrayType) (*ScaleSetClient, error) { + privateKey, err := os.ReadFile(org.PrivateKeyPath) + if err != nil { + return nil, fmt.Errorf("failed to read private key: %w", err) + } + + client, err := scaleset.NewClientWithGitHubApp(scaleset.ClientWithGitHubAppConfig{ + GitHubConfigURL: fmt.Sprintf("https://github.com/%s", org.Name), + GitHubAppAuth: scaleset.GitHubAppAuth{ + ClientID: org.AppClientId, + InstallationID: org.InstallationId, + PrivateKey: string(privateKey), + }, + }) + if err != nil { + return nil, fmt.Errorf("failed to create scale set client: %w", err) + } + + return &ScaleSetClient{ + client: client, + org: org, + trayType: trayType, + logger: log.WithFields(log.Fields{ + "component": "scaleSetClient", + "trayType": trayType.Name, + "org": org.Name, + }), + }, nil +} + +func (sc *ScaleSetClient) EnsureScaleSet(ctx context.Context) error { + existing, err := sc.client.GetRunnerScaleSet(ctx, int(sc.trayType.RunnerGroupId), sc.trayType.Name) + if err == nil && existing != nil { + sc.scaleSet = existing + sc.logger.Infof("Found existing scale set: %s (ID: %d)", existing.Name, existing.ID) + return nil + } + + sc.logger.Infof("Creating new scale set: %s", sc.trayType.Name) + created, err := sc.client.CreateRunnerScaleSet(ctx, &scaleset.RunnerScaleSet{ + Name: sc.trayType.Name, + RunnerGroupID: int(sc.trayType.RunnerGroupId), + Labels: []scaleset.Label{ + {Name: sc.trayType.Name, Type: "User"}, + }, + }) + if err != nil { + return fmt.Errorf("failed to create scale set: %w", err) + } + + sc.scaleSet = created + sc.logger.Infof("Created scale set: %s (ID: %d)", created.Name, created.ID) + return nil +} + +func (sc *ScaleSetClient) CreateSession(ctx context.Context) error { + hostname, _ := os.Hostname() + session, err := sc.client.MessageSessionClient(ctx, sc.scaleSet.ID, hostname) + if err != nil { + return fmt.Errorf("failed to create message session: %w", err) + } + sc.session = session + sc.logger.Info("Message session created") + return nil +} + +func (sc *ScaleSetClient) Poll(ctx context.Context, lastMessageID int, maxCapacity int) (*scaleset.RunnerScaleSetMessage, error) { + return sc.session.GetMessage(ctx, lastMessageID, maxCapacity) +} + +func (sc *ScaleSetClient) Ack(ctx context.Context, messageID int) error { + return sc.session.DeleteMessage(ctx, messageID) +} + +func (sc *ScaleSetClient) GenerateJitRunnerConfig(ctx context.Context, runnerName string) (*scaleset.RunnerScaleSetJitRunnerConfig, error) { + return sc.client.GenerateJitRunnerConfig(ctx, &scaleset.RunnerScaleSetJitRunnerSetting{ + Name: runnerName, + WorkFolder: "_work", + }, sc.scaleSet.ID) +} + +func (sc *ScaleSetClient) Close(ctx context.Context) error { + if sc.session != nil { + return sc.session.Close(ctx) + } + return nil +} + +func (sc *ScaleSetClient) GetScaleSetID() int { + if sc.scaleSet != nil { + return sc.scaleSet.ID + } + return 0 +} diff --git a/src/lib/scaleSetPoller/manager.go b/src/lib/scaleSetPoller/manager.go new file mode 100644 index 0000000..0b3027d --- /dev/null +++ b/src/lib/scaleSetPoller/manager.go @@ -0,0 +1,26 @@ +package scaleSetPoller + +import "sync" + +type Manager struct { + mu sync.RWMutex + pollers map[string]*Poller +} + +func NewManager() *Manager { + return &Manager{ + pollers: make(map[string]*Poller), + } +} + +func (m *Manager) Register(trayTypeName string, poller *Poller) { + m.mu.Lock() + defer m.mu.Unlock() + m.pollers[trayTypeName] = poller +} + +func (m *Manager) GetPoller(trayTypeName string) *Poller { + m.mu.RLock() + defer m.mu.RUnlock() + return m.pollers[trayTypeName] +} diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go new file mode 100644 index 0000000..3d6f459 --- /dev/null +++ b/src/lib/scaleSetPoller/poller.go @@ -0,0 +1,143 @@ +package scaleSetPoller + +import ( + "cattery/lib/config" + "cattery/lib/metrics" + "cattery/lib/scaleSetClient" + "cattery/lib/trayManager" + "context" + "fmt" + "strconv" + + "github.com/actions/scaleset" + "github.com/actions/scaleset/listener" + log "github.com/sirupsen/logrus" +) + +type Poller struct { + client *scaleSetClient.ScaleSetClient + trayType *config.TrayType + trayManager *trayManager.TrayManager + logger *log.Entry +} + +func NewPoller( + client *scaleSetClient.ScaleSetClient, + trayType *config.TrayType, + tm *trayManager.TrayManager, +) *Poller { + return &Poller{ + client: client, + trayType: trayType, + trayManager: tm, + logger: log.WithFields(log.Fields{ + "component": "scaleSetPoller", + "trayType": trayType.Name, + }), + } +} + +func (p *Poller) Client() *scaleSetClient.ScaleSetClient { + return p.client +} + +func (p *Poller) Run(ctx context.Context) error { + p.logger.Info("Starting scale set poller") + + if err := p.client.EnsureScaleSet(ctx); err != nil { + return fmt.Errorf("failed to ensure scale set: %w", err) + } + + if err := p.client.CreateSession(ctx); err != nil { + return fmt.Errorf("failed to create session: %w", err) + } + defer p.client.Close(ctx) + + scaleSetID := p.client.GetScaleSetID() + + l, err := listener.New( + &sessionAdapter{client: p.client}, + listener.Config{ + ScaleSetID: scaleSetID, + MaxRunners: p.trayType.MaxTrays, + }, + ) + if err != nil { + return fmt.Errorf("failed to create listener: %w", err) + } + + scaler := &catteryScaler{ + poller: p, + } + + p.logger.Info("Entering listener loop") + return l.Run(ctx, scaler) +} + +// sessionAdapter adapts our ScaleSetClient to the listener.Client interface. +type sessionAdapter struct { + client *scaleSetClient.ScaleSetClient +} + +func (s *sessionAdapter) GetMessage(ctx context.Context, lastMessageID, maxCapacity int) (*scaleset.RunnerScaleSetMessage, error) { + return s.client.Poll(ctx, lastMessageID, maxCapacity) +} + +func (s *sessionAdapter) DeleteMessage(ctx context.Context, messageID int) error { + return s.client.Ack(ctx, messageID) +} + +func (s *sessionAdapter) Session() scaleset.RunnerScaleSetSession { + // The listener needs this for logging/metadata only. + return scaleset.RunnerScaleSetSession{} +} + +// catteryScaler implements the listener.Scaler interface. +type catteryScaler struct { + poller *Poller +} + +func (cs *catteryScaler) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) { + err := cs.poller.trayManager.ScaleForDemand(cs.poller.trayType, count) + if err != nil { + cs.poller.logger.Errorf("Failed to scale for demand (%d): %v", count, err) + return 0, err + } + + total, err := cs.poller.trayManager.CountTrays(cs.poller.trayType.Name) + if err != nil { + return 0, err + } + return total, nil +} + +func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset.JobStarted) error { + cs.poller.logger.Infof("Job started: %s on runner %s (workflow run %d)", + jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.WorkflowRunID) + + jobID, _ := strconv.ParseInt(jobInfo.JobID, 10, 64) + repository := fmt.Sprintf("%s/%s", jobInfo.OwnerName, jobInfo.RepositoryName) + + _, err := cs.poller.trayManager.SetJob(jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, repository) + if err != nil { + cs.poller.logger.Errorf("Failed to set job on tray %s: %v", jobInfo.RunnerName, err) + return err + } + + metrics.RegisteredTraysAdd(cs.poller.trayType.GitHubOrg, cs.poller.trayType.Name, 0) + return nil +} + +func (cs *catteryScaler) HandleJobCompleted(ctx context.Context, jobInfo *scaleset.JobCompleted) error { + cs.poller.logger.Infof("Job completed: %s on runner %s (result: %s)", + jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.Result) + + _, err := cs.poller.trayManager.DeleteTray(jobInfo.RunnerName) + if err != nil { + cs.poller.logger.Errorf("Failed to delete tray %s: %v", jobInfo.RunnerName, err) + return err + } + + metrics.RegisteredTraysAdd(cs.poller.trayType.GitHubOrg, cs.poller.trayType.Name, -1) + return nil +} diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 80dd02e..303c1ad 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -3,7 +3,6 @@ package trayManager import ( "cattery/lib/config" "cattery/lib/githubClient" - "cattery/lib/jobQueue" "cattery/lib/metrics" "cattery/lib/trays" "cattery/lib/trays/providers" @@ -18,14 +17,11 @@ import ( type TrayManager struct { trayRepository repositories.ITrayRepository - - isStaleTraysFound bool } func NewTrayManager(trayRepository repositories.ITrayRepository) *TrayManager { return &TrayManager{ - trayRepository: trayRepository, - isStaleTraysFound: false, + trayRepository: trayRepository, } } @@ -81,7 +77,7 @@ func (tm *TrayManager) GetTrayById(trayId string) (*trays.Tray, error) { } func (tm *TrayManager) Registering(trayId string) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistering, 0, 0, 0) + tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistering, 0, 0, 0, "") if err != nil { return nil, err } @@ -94,7 +90,7 @@ func (tm *TrayManager) Registering(trayId string) (*trays.Tray, error) { } func (tm *TrayManager) Registered(trayId string, ghRunnerId int64) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistered, 0, 0, ghRunnerId) + tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistered, 0, 0, ghRunnerId, "") if err != nil { return nil, err } @@ -106,8 +102,8 @@ func (tm *TrayManager) Registered(trayId string, ghRunnerId int64) (*trays.Tray, return tray, nil } -func (tm *TrayManager) SetJob(trayId string, jobRunId int64, workflowRunId int64) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRunning, jobRunId, workflowRunId, 0) +func (tm *TrayManager) SetJob(trayId string, jobRunId int64, workflowRunId int64, repository string) (*trays.Tray, error) { + tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRunning, jobRunId, workflowRunId, 0, repository) if err != nil { return nil, err } @@ -121,7 +117,7 @@ func (tm *TrayManager) SetJob(trayId string, jobRunId int64, workflowRunId int64 func (tm *TrayManager) DeleteTray(trayId string) (*trays.Tray, error) { - var tray, err = tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusDeleting, 0, 0, 0) + var tray, err = tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusDeleting, 0, 0, 0, "") if err != nil { return nil, err } @@ -180,7 +176,6 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { if len(stale) > 0 { log.Infof("Found %d stale trays: %v", len(stale), stale) - tm.isStaleTraysFound = true } for _, tray := range stale { @@ -198,84 +193,49 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { }() } -func (tm *TrayManager) HandleJobsQueue(ctx context.Context, manager *jobQueue.QueueManager) { - go func() { - for { - select { - case <-ctx.Done(): - return - default: - - if tm.isStaleTraysFound { - err := manager.CleanupCompletedJobs() - if err != nil { - log.Errorf("Failed to cleanup completed jobs: %v", err) - } - tm.isStaleTraysFound = false - } - - var groups = manager.GetJobsCount() - for typeName, jobsCount := range groups { - err := tm.handleType(typeName, jobsCount) - if err != nil { - log.Error(err) - } - } - - time.Sleep(10 * time.Second) - } - } - }() -} - -func (tm *TrayManager) handleType(trayTypeName string, jobsInQueue int) error { - // log.Debugf("Handling tray type %s with %d jobs in queue", trayTypeName, jobsInQueue) - countByStatus, total, err := tm.trayRepository.CountByTrayType(trayTypeName) +// ScaleForDemand scales trays for a given tray type based on pending job count. +// Called by the scale set poller with statistics from GitHub. +func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, pendingJobs int) error { + countByStatus, total, err := tm.trayRepository.CountByTrayType(trayType.Name) if err != nil { - log.Errorf("Failed to count trays for type %s: %v", trayTypeName, err) + log.Errorf("Failed to count trays for type %s: %v", trayType.Name, err) return err } - var traysWithNoJob = countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] - // log.Debugf("Tray type %s has %d trays, %d with no job", trayTypeName, total, traysWithNoJob) - if jobsInQueue > traysWithNoJob { - var trayType = getTrayType(trayTypeName) - if trayType == nil { - log.Warnf("Tray type '%s' not found in config; skipping creation", trayTypeName) - return nil - } + traysWithNoJob := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] - var remainingTrays = trayType.MaxTrays - total - var traysToCreate = jobsInQueue - traysWithNoJob - if traysToCreate > remainingTrays { - traysToCreate = remainingTrays + if pendingJobs > traysWithNoJob { + remainingCapacity := trayType.MaxTrays - total + traysToCreate := pendingJobs - traysWithNoJob + if traysToCreate > remainingCapacity { + traysToCreate = remainingCapacity } - - err := tm.createTrays(trayType, traysToCreate) - if err != nil { - return err + if traysToCreate > 0 { + err := tm.createTrays(trayType, traysToCreate) + if err != nil { + return err + } } } - if jobsInQueue < traysWithNoJob { - var traysToDelete = traysWithNoJob - jobsInQueue - redundant, err := tm.trayRepository.MarkRedundant(trayTypeName, traysToDelete) + if pendingJobs < traysWithNoJob { + traysToDelete := traysWithNoJob - pendingJobs + redundant, err := tm.trayRepository.MarkRedundant(trayType.Name, traysToDelete) if err != nil { return err } - for _, tray := range redundant { if _, delErr := tm.DeleteTray(tray.Id); delErr != nil { log.Errorf("Failed to delete redundant tray %s: %v", tray.Id, delErr) } } - } return nil } -func getTrayType(trayTypeName string) *config.TrayType { - var trayType = config.AppConfig.GetTrayType(trayTypeName) - return trayType +// CountTrays returns the total number of trays for a given tray type. +func (tm *TrayManager) CountTrays(trayTypeName string) (int, error) { + _, total, err := tm.trayRepository.CountByTrayType(trayTypeName) + return total, err } diff --git a/src/lib/trays/repositories/iTrayRepository.go b/src/lib/trays/repositories/iTrayRepository.go index fd22ed7..a979f3c 100644 --- a/src/lib/trays/repositories/iTrayRepository.go +++ b/src/lib/trays/repositories/iTrayRepository.go @@ -9,7 +9,7 @@ type ITrayRepository interface { GetById(trayId string) (*trays.Tray, error) Save(tray *trays.Tray) error Delete(trayId string) error - UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64) (*trays.Tray, error) + UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) CountByTrayType(trayType string) (map[trays.TrayStatus]int, int, error) MarkRedundant(trayType string, limit int) ([]*trays.Tray, error) GetStale(d time.Duration, rd time.Duration) ([]*trays.Tray, error) diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index 9164820..c9c388c 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -113,7 +113,7 @@ func (m *MongodbTrayRepository) Save(tray *trays.Tray) error { return nil } -func (m *MongodbTrayRepository) UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64) (*trays.Tray, error) { +func (m *MongodbTrayRepository) UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) { var setQuery = bson.M{"status": status, "statusChanged": time.Now().UTC()} @@ -129,6 +129,10 @@ func (m *MongodbTrayRepository) UpdateStatus(trayId string, status trays.TraySta setQuery["workflowRunId"] = workflowRunId } + if repository != "" { + setQuery["repository"] = repository + } + dbResult := m.collection.FindOneAndUpdate( context.Background(), bson.M{"id": trayId}, diff --git a/src/lib/trays/tray.go b/src/lib/trays/tray.go index 1aa2c48..338ceb0 100644 --- a/src/lib/trays/tray.go +++ b/src/lib/trays/tray.go @@ -18,6 +18,7 @@ type Tray struct { GitHubRunnerId int64 `bson:"gitHubRunnerId"` JobRunId int64 `bson:"jobRunId"` WorkflowRunId int64 `bson:"workflowRunId"` + Repository string `bson:"repository"` Status TrayStatus `bson:"status"` StatusChanged time.Time `bson:"statusChanged"` diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index 589b5cf..639cabf 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -3,7 +3,6 @@ package handlers import ( "cattery/lib/agents" "cattery/lib/config" - "cattery/lib/githubClient" "cattery/lib/messages" "cattery/lib/metrics" "cattery/lib/trays" @@ -60,33 +59,26 @@ func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { logger.Debugf("Found tray %s for agent %s, with organization %s", tray.GetId(), agentId, tray.GetGitHubOrgName()) - // TODO handle - client, err := githubClient.NewGithubClientWithOrgName(tray.GetGitHubOrgName()) - if err != nil { - var errMsg = fmt.Sprintf("Organization '%s' is invalid: %v", tray.GetGitHubOrgName(), err) + poller := ScaleSetManager.GetPoller(trayType.Name) + if poller == nil { + var errMsg = fmt.Sprintf("No scale set poller found for tray type '%s'", trayType.Name) logger.Error(errMsg) http.Error(responseWriter, errMsg, http.StatusInternalServerError) return } - logger = logger.WithFields(log.Fields{"githubOrg": tray.GetGitHubOrgName()}) - - jitRunnerConfig, err := client.CreateJITConfig( - tray.GetId(), - trayType.RunnerGroupId, - []string{trayType.Name}, - ) + jitRunnerConfig, err := poller.Client().GenerateJitRunnerConfig(r.Context(), tray.GetId()) if err != nil { logger.Errorf("Failed to generate jitRunnerConfig: %v", err) http.Error(responseWriter, "Failed to generate jitRunnerConfig", http.StatusInternalServerError) return } - var jitConfig = jitRunnerConfig.GetEncodedJITConfig() + var jitConfig = jitRunnerConfig.EncodedJITConfig var newAgent = agents.Agent{ AgentId: agentId, - RunnerId: jitRunnerConfig.GetRunner().GetID(), + RunnerId: int64(jitRunnerConfig.Runner.ID), Shutdown: trayType.Shutdown, } @@ -102,7 +94,7 @@ func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { return } - _, err = TrayManager.Registered(agentId, jitRunnerConfig.GetRunner().GetID()) + _, err = TrayManager.Registered(agentId, int64(jitRunnerConfig.Runner.ID)) if err != nil { logger.Errorf("%v", err) } @@ -326,5 +318,5 @@ func AgentInterrupt(responseWriter http.ResponseWriter, r *http.Request) { return } workflowRunId := tray.WorkflowRunId - RestartManager.RequestRestart(workflowRunId) + RestartManager.RequestRestart(workflowRunId, tray.GitHubOrgName, tray.Repository) } diff --git a/src/server/handlers/rootHandler.go b/src/server/handlers/rootHandler.go index ff1c54e..d3eaab6 100644 --- a/src/server/handlers/rootHandler.go +++ b/src/server/handlers/rootHandler.go @@ -1,15 +1,15 @@ package handlers import ( - "cattery/lib/jobQueue" "cattery/lib/restarter" + "cattery/lib/scaleSetPoller" "cattery/lib/trayManager" "net/http" ) -var QueueManager *jobQueue.QueueManager var TrayManager *trayManager.TrayManager var RestartManager *restarter.WorkflowRestarter +var ScaleSetManager *scaleSetPoller.Manager func Index(responseWriter http.ResponseWriter, r *http.Request) { return diff --git a/src/server/handlers/webhookHandler.go b/src/server/handlers/webhookHandler.go deleted file mode 100644 index d701ad7..0000000 --- a/src/server/handlers/webhookHandler.go +++ /dev/null @@ -1,243 +0,0 @@ -package handlers - -import ( - "cattery/lib/config" - "cattery/lib/jobs" - "fmt" - "net/http" - - "github.com/google/go-github/v70/github" - log "github.com/sirupsen/logrus" -) - -func Webhook(responseWriter http.ResponseWriter, r *http.Request) { - - var logger = log.WithFields( - log.Fields{ - "handler": "webhook", - "call": "Webhook", - }, - ) - - logger.Tracef("Webhook received") - - if r.Method != http.MethodPost { - http.Error(responseWriter, "Method not allowed", http.StatusMethodNotAllowed) - return - } - - event := r.Header.Get("X-GitHub-Event") - - switch event { - case "workflow_job": - handleWorkflowJobWebhook(responseWriter, r, logger) - case "workflow_run": - handleWorkflowRunWebhook(responseWriter, r, logger) - default: - logger.Debugf("Ignoring webhook request: X-GitHub-Event is not 'workflow_job' or 'workflow_run', got '%s'", event) - return - } -} - -func handleWorkflowJobWebhook(responseWriter http.ResponseWriter, r *http.Request, logger *log.Entry) { - var webhookData *github.WorkflowJobEvent - - var organizationName = r.PathValue("org") - var org = config.AppConfig.GetGitHubOrg(organizationName) - if org == nil { - var errMsg = fmt.Sprintf("Organization '%s' not found in config", organizationName) - logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusBadRequest) - return - } - logger = logger.WithField("githubOrg", organizationName) - logger = logger.WithField("type", "workflow_job") - - payload, err := github.ValidatePayload(r, []byte(org.WebhookSecret)) - if err != nil { - logger.Errorf("Failed to validate payload: %v", err) - http.Error(responseWriter, "Failed to validate payload", http.StatusBadRequest) - return - } - - hook, err := github.ParseWebHook(r.Header.Get("X-GitHub-Event"), payload) - if err != nil { - logger.Errorf("Failed to parse webhook: %v", err) - return - } - webhookData, ok := hook.(*github.WorkflowJobEvent) - if !ok { - logger.Errorf("Webhook payload is not WorkflowJobEvent") - return - } - - logger.Tracef("Event payload: %v", payload) - - trayType := getTrayType(webhookData) - if trayType == nil { - logger.Tracef("Ignoring action: '%s', for job '%s', no tray type found for labels: %v", webhookData.GetAction(), *webhookData.WorkflowJob.Name, webhookData.WorkflowJob.Labels) - return - } - logger = logger.WithField("jobRunId", webhookData.WorkflowJob.GetID()) - - logger.Debugf("Action: %s", webhookData.GetAction()) - - job := jobs.FromGithubModel(webhookData) - job.TrayType = trayType.Name - - logger = logger.WithField("trayType", trayType.Name) - - switch webhookData.GetAction() { - case "queued": - handleQueuedWorkflowJob(responseWriter, logger, job) - case "in_progress": - handleInProgressWorkflowJob(responseWriter, logger, job) - case "completed": - handleCompletedWorkflowJob(responseWriter, logger, job) - default: - logger.Debugf("Ignoring action: '%s', for job '%s'", webhookData.GetAction(), *webhookData.WorkflowJob.Name) - return - } -} - -func handleWorkflowRunWebhook(responseWriter http.ResponseWriter, r *http.Request, logger *log.Entry) { - log.Debugf("Received workflow_run webhook") - logger = logger.WithField("type", "workflow_run") - var webhookData *github.WorkflowRunEvent - organizationName := r.PathValue("org") - org := config.AppConfig.GetGitHubOrg(organizationName) - if org == nil { - errMsg := fmt.Sprintf("Organization '%s' not found in config", organizationName) - logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusBadRequest) - return - } - payload, err := github.ValidatePayload(r, []byte(org.WebhookSecret)) - if err != nil { - logger.Errorf("Error validating payload: %v", err) - http.Error(responseWriter, "Error validating payload", http.StatusBadRequest) - return - } - hook, err := github.ParseWebHook(r.Header.Get("X-GitHub-Event"), payload) - if err != nil { - logger.Errorf("Error parsing webhook: %v", err) - http.Error(responseWriter, "Error parsing webhook", http.StatusBadRequest) - return - } - webhookData, ok := hook.(*github.WorkflowRunEvent) - if !ok { - logger.Errorf("Webhook payload is not WorkflowRunEvent") - http.Error(responseWriter, "Webhook payload is not WorkflowRunEvent", http.StatusBadRequest) - return - } - conclusion := webhookData.GetWorkflowRun().GetConclusion() - repoName := webhookData.GetRepo().GetName() - orgName := webhookData.GetOrg().GetLogin() - logger.Debugf("Action: %s, Org: %s, Repo: %s, Workflow run ID: %d, conclusion: %s", webhookData.GetAction(), orgName, repoName, webhookData.GetWorkflowRun().GetID(), conclusion) - - // On "completed" action and "failure" conclusion trigger restart - if webhookData.GetAction() == "completed" && conclusion == "failure" { - logger.Infof("Requesting restart for failed jobs in workflow run ID: %d", webhookData.GetWorkflowRun().GetID()) - err := RestartManager.Restart(*webhookData.WorkflowRun.ID, orgName, repoName) - if err != nil { - logger.Errorf("Failed to request restart: %v", err) - http.Error(responseWriter, "Failed to request restart", http.StatusInternalServerError) - } - return - } - // On "completed" action and "cancelled" or "success" conclusion trigger cleanup - if webhookData.GetAction() == "completed" && (conclusion == "cancelled" || conclusion == "success") { - if conclusion == "cancelled" { - logger.Infof("Cleaning up jobs for workflow run ID: %d", webhookData.GetWorkflowRun().GetID()) - err := QueueManager.CleanupByWorkflowRun(*webhookData.WorkflowRun.ID) - if err != nil { - logger.Errorf("Failed to cleanup jobs: %v", err) - http.Error(responseWriter, "Failed to cleanup jobs", http.StatusInternalServerError) - } - } - logger.Infof("Cleaning up restart requests for workflow run ID: %d", webhookData.GetWorkflowRun().GetID()) - err = RestartManager.Cleanup(*webhookData.WorkflowRun.ID, orgName, repoName) - if err != nil { - logger.Errorf("Failed to cleanup restart requests: %v", err) - http.Error(responseWriter, "Failed to cleanup restart requests", http.StatusInternalServerError) - } - return - } - -} - -// handleCompletedWorkflowJob -// handles the 'completed' action of the workflow job event -func handleCompletedWorkflowJob(responseWriter http.ResponseWriter, logger *log.Entry, job *jobs.Job) { - - err := QueueManager.UpdateJobStatus(job.Id, jobs.JobStatusFinished) - if err != nil { - logger.Errorf("Failed to update job status: %v", err) - } - - _, err = TrayManager.DeleteTray(job.RunnerName) - if err != nil { - logger.Errorf("Failed to delete tray: %v", err) - } -} - -// handleInProgressWorkflowJob -// handles the 'in_progress' action of the workflow job event -func handleInProgressWorkflowJob(responseWriter http.ResponseWriter, logger *log.Entry, job *jobs.Job) { - - err := QueueManager.JobInProgress(job.Id) - if err != nil { - var errMsg = fmt.Sprintf("Failed to mark job '%s/%s' as in progress: %v", job.WorkflowName, job.Name, err) - logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusInternalServerError) - } - - tray, err := TrayManager.SetJob(job.RunnerName, job.Id, job.WorkflowId) - if tray == nil { - logger.Errorf("Failed to set job '%s/%s' as in progress to tray, tray not found: %v", job.WorkflowName, job.Name, err) - } - if err != nil { - logger.Errorf("Failed to set job '%s/%s' as in progress to tray: %v", job.WorkflowName, job.Name, err) - } - - logger.Infof("Tray '%s' is running '%s/%s/%s/%s'", - job.RunnerName, - job.Organization, job.Repository, job.WorkflowName, job.Name, - ) -} - -// handleQueuedWorkflowJob -// handles the 'handleQueuedWorkflowJob' action of the workflow job event -func handleQueuedWorkflowJob(responseWriter http.ResponseWriter, logger *log.Entry, job *jobs.Job) { - err := QueueManager.AddJob(job) - if err != nil { - var errMsg = fmt.Sprintf("Failed to enqueue job '%s/%s/%s': %v", job.Repository, job.WorkflowName, job.Name, err) - logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusInternalServerError) - return - } - - logger.Infof("Enqueued job %s/%s/%s/%s ", job.Organization, job.Repository, job.WorkflowName, job.Name) -} - -func getTrayType(webhookData *github.WorkflowJobEvent) *config.TrayType { - - if len(webhookData.WorkflowJob.Labels) != 1 { - // Cattery only support one label for now - return nil - } - - // find tray type based on labels (runs_on) - var label = webhookData.WorkflowJob.Labels[0] - var trayType = config.AppConfig.GetTrayType(label) - - if trayType == nil { - return nil - } - - if trayType.GitHubOrg != webhookData.GetOrg().GetLogin() { - return nil - } - - return trayType -} diff --git a/src/server/server.go b/src/server/server.go index 6583f98..fd37329 100644 --- a/src/server/server.go +++ b/src/server/server.go @@ -2,9 +2,10 @@ package server import ( "cattery/lib/config" - "cattery/lib/jobQueue" "cattery/lib/restarter" restarterRepo "cattery/lib/restarter/repositories" + "cattery/lib/scaleSetClient" + "cattery/lib/scaleSetPoller" "cattery/lib/trayManager" "cattery/lib/trays/repositories" "cattery/server/handlers" @@ -24,33 +25,33 @@ func Start() { var logger = log.New() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + sigs := make(chan os.Signal, 1) signal.Notify(sigs, syscall.SIGINT) signal.Notify(sigs, syscall.SIGTERM) signal.Notify(sigs, syscall.SIGKILL) - var webhookMux = http.NewServeMux() - webhookMux.HandleFunc("/{$}", handlers.Index) - webhookMux.HandleFunc("GET /agent/register/{id}", handlers.AgentRegister) - webhookMux.HandleFunc("POST /agent/unregister/{id}", handlers.AgentUnregister) - webhookMux.HandleFunc("GET /agent/download", handlers.AgentDownloadBinary) - webhookMux.HandleFunc("POST /agent/interrupt/{id}", handlers.AgentInterrupt) - webhookMux.HandleFunc("POST /agent/ping/{id}", handlers.AgentPing) - - webhookMux.HandleFunc("POST /github/{org}", handlers.Webhook) + var mux = http.NewServeMux() + mux.HandleFunc("/{$}", handlers.Index) + mux.HandleFunc("GET /agent/register/{id}", handlers.AgentRegister) + mux.HandleFunc("POST /agent/unregister/{id}", handlers.AgentUnregister) + mux.HandleFunc("GET /agent/download", handlers.AgentDownloadBinary) + mux.HandleFunc("POST /agent/interrupt/{id}", handlers.AgentInterrupt) + mux.HandleFunc("POST /agent/ping/{id}", handlers.AgentPing) + mux.HandleFunc("/metrics", promhttp.Handler().ServeHTTP) - webhookMux.HandleFunc("/metrics", promhttp.Handler().ServeHTTP) - - var webhookServer = &http.Server{ + var httpServer = &http.Server{ Addr: config.AppConfig.Server.ListenAddress, - Handler: webhookMux, + Handler: mux, } // Db connection serverAPI := options.ServerAPI(options.ServerAPIVersion1) opts := options.Client(). ApplyURI(config.AppConfig.Database.Uri). - SetServerAPIOptions(serverAPI) //.SetTimeout(3 * time.Second) + SetServerAPIOptions(serverAPI) client, err := mongo.Connect(opts) if err != nil { @@ -68,31 +69,46 @@ func Start() { // Initialize tray manager and repository var trayRepository = repositories.NewMongodbTrayRepository() trayRepository.Connect(database.Collection("trays")) - handlers.TrayManager = trayManager.NewTrayManager(trayRepository) - //QueueManager initialization - handlers.QueueManager = jobQueue.NewQueueManager() - handlers.QueueManager.Connect(database.Collection("jobs")) - - // Initialize restarter repository + // Initialize restarter var restartManagerRepository = restarterRepo.NewMongodbRestarterRepository() restartManagerRepository.Connect(database.Collection("restarters")) - handlers.RestartManager = restarter.NewWorkflowRestarter(restartManagerRepository) - err = handlers.QueueManager.Load() - if err != nil { - logger.Errorf("Failed to load queue manager: %v", err) + // Initialize scale set pollers — one per TrayType + handlers.ScaleSetManager = scaleSetPoller.NewManager() + for _, trayType := range config.AppConfig.TrayTypes { + org := config.AppConfig.GetGitHubOrg(trayType.GitHubOrg) + if org == nil { + logger.Fatalf("GitHub organization '%s' not found for tray type '%s'", trayType.GitHubOrg, trayType.Name) + } + + ssClient, err := scaleSetClient.NewScaleSetClient(org, trayType) + if err != nil { + logger.Fatalf("Failed to create scale set client for tray type '%s': %v", trayType.Name, err) + } + + poller := scaleSetPoller.NewPoller(ssClient, trayType, handlers.TrayManager) + handlers.ScaleSetManager.Register(trayType.Name, poller) + + go func(p *scaleSetPoller.Poller, name string) { + if err := p.Run(ctx); err != nil { + logger.Errorf("Scale set poller for '%s' exited with error: %v", name, err) + } + }(poller, trayType.Name) } - handlers.TrayManager.HandleJobsQueue(context.Background(), handlers.QueueManager) - handlers.TrayManager.HandleStale(context.Background()) + // Start restart poller (replaces workflow_run webhook) + handlers.RestartManager.StartPoller(ctx) + + // Start stale tray cleanup + handlers.TrayManager.HandleStale(ctx) - // Start the server + // Start HTTP server go func() { - logger.Infof("Starting webhook server on %s", config.AppConfig.Server.ListenAddress) - err := webhookServer.ListenAndServe() + logger.Infof("Starting server on %s", config.AppConfig.Server.ListenAddress) + err := httpServer.ListenAndServe() if err != nil { logger.Fatal(err) return @@ -101,4 +117,5 @@ func Start() { sig := <-sigs logger.Info("Got signal ", sig) + cancel() } From f6630c05245ebae021ab3bacc2830c61e623035b Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 03:32:38 +0400 Subject: [PATCH 02/27] remove unused --- src/lib/maps/concurrentMap.go | 47 --------- src/lib/maps/mongoSyncMap.go | 154 ------------------------------ src/lib/maps/mongoSyncMap_test.go | 113 ---------------------- 3 files changed, 314 deletions(-) delete mode 100644 src/lib/maps/concurrentMap.go delete mode 100644 src/lib/maps/mongoSyncMap.go delete mode 100644 src/lib/maps/mongoSyncMap_test.go diff --git a/src/lib/maps/concurrentMap.go b/src/lib/maps/concurrentMap.go deleted file mode 100644 index ebcd0b7..0000000 --- a/src/lib/maps/concurrentMap.go +++ /dev/null @@ -1,47 +0,0 @@ -package maps - -import "sync" - -type ConcurrentMap[T comparable, Y interface{}] struct { - rwMutex *sync.RWMutex - _map map[T]*Y -} - -func NewConcurrentMap[T comparable, Y interface{}]() *ConcurrentMap[T, Y] { - return &ConcurrentMap[T, Y]{ - rwMutex: &sync.RWMutex{}, - _map: make(map[T]*Y), - } -} - -func (m *ConcurrentMap[T, Y]) Get(key T) *Y { - m.rwMutex.RLock() - defer m.rwMutex.RUnlock() - - if value, ok := m._map[key]; ok { - return value - } - - return nil -} - -func (m *ConcurrentMap[T, Y]) Set(key T, value *Y) { - m.rwMutex.Lock() - defer m.rwMutex.Unlock() - - m._map[key] = value -} - -func (m *ConcurrentMap[T, Y]) Delete(key T) { - m.rwMutex.Lock() - defer m.rwMutex.Unlock() - - delete(m._map, key) -} - -func (m *ConcurrentMap[T, Y]) Len() int { - m.rwMutex.RLock() - defer m.rwMutex.RUnlock() - - return len(m._map) -} diff --git a/src/lib/maps/mongoSyncMap.go b/src/lib/maps/mongoSyncMap.go deleted file mode 100644 index a2ce9b8..0000000 --- a/src/lib/maps/mongoSyncMap.go +++ /dev/null @@ -1,154 +0,0 @@ -package maps - -import ( - "context" - log "github.com/sirupsen/logrus" - "go.mongodb.org/mongo-driver/v2/bson" - "go.mongodb.org/mongo-driver/v2/mongo" - "go.mongodb.org/mongo-driver/v2/mongo/options" - "sync" -) - -type changeEvent[T any] struct { - OperationType string `bson:"operationType"` - FullDocument T `bson:"fullDocument"` -} - -type MongoSyncMap[T comparable, Y any] struct { - _map *ConcurrentMap[T, Y] - collection *mongo.Collection - idField string - listen bool - - changeStream *mongo.ChangeStream - waitGroup *sync.WaitGroup -} - -func NewMongoSyncMap[T comparable, Y any](idField string, listen bool) *MongoSyncMap[T, Y] { - return &MongoSyncMap[T, Y]{ - _map: NewConcurrentMap[T, Y](), - idField: idField, - listen: listen, - waitGroup: &sync.WaitGroup{}, - } -} - -func (m *MongoSyncMap[T, Y]) Load(collection *mongo.Collection) error { - - m.waitGroup.Add(1) - defer m.waitGroup.Done() - - m.collection = collection - - if m.listen { - changeStream, err := m.collection.Watch(nil, mongo.Pipeline{}) - if err != nil { - return err - } - m.changeStream = changeStream - } - - allTrays, err := m.collection.Find(nil, bson.M{}) - if err != nil { - return err - } - - for allTrays.Next(nil) { - var tray Y - decodeErr := allTrays.Decode(&tray) - if decodeErr != nil { - return err - } - - var id T - err := allTrays.Current.Lookup(m.idField).Unmarshal(&id) - if err != nil { - return err - } - m._map.Set(id, &tray) - } - - if m.listen { - go func() { - for m.changeStream.Next(nil) { - var event changeEvent[Y] - decodeErr := m.changeStream.Decode(&event) - if decodeErr != nil { - log.Error("Failed to decode change stream: ", decodeErr) - m.Load(collection) - } - - var id T - err := m.changeStream.Current.Lookup("fullDocument", m.idField).Unmarshal(&id) - if err != nil { - panic(err) - } - - switch event.OperationType { - case "replace": - fallthrough - case "update": - fallthrough - case "insert": - m._map.Set(id, &event.FullDocument) - case "delete": - m._map.Delete(id) - default: - log.Warn("Unknown operation type: ", event.OperationType) - } - } - }() - } - - return nil -} - -func (m *MongoSyncMap[T, Y]) Stop() error { - if m.listen { - err := m.changeStream.Close(nil) - if err != nil { - return err - } - } - return nil -} - -func (m *MongoSyncMap[T, Y]) Get(key T) *Y { - m.waitGroup.Wait() - return m._map.Get(key) -} - -func (m *MongoSyncMap[T, Y]) Set(key T, value *Y) error { - m.waitGroup.Wait() - - _, err := m.collection.UpdateOne(context.Background(), bson.M{m.idField: key}, value, options.UpdateOne().SetUpsert(true)) - if err != nil { - return err - } - - m._map.Set(key, value) - return nil -} - -func (m *MongoSyncMap[T, Y]) Delete(key T) error { - m.waitGroup.Wait() - - _, err := m.collection.DeleteOne(context.Background(), bson.M{m.idField: key}) - if err != nil { - return err - } - - m._map.Delete(key) - return nil -} - -func (m *MongoSyncMap[T, Y]) Len() int { - m.waitGroup.Wait() - - return m._map.Len() -} - -func (m *MongoSyncMap[T, Y]) GetAll() map[T]*Y { - m.waitGroup.Wait() - return m._map._map -} diff --git a/src/lib/maps/mongoSyncMap_test.go b/src/lib/maps/mongoSyncMap_test.go deleted file mode 100644 index 578137f..0000000 --- a/src/lib/maps/mongoSyncMap_test.go +++ /dev/null @@ -1,113 +0,0 @@ -package maps - -import ( - "context" - "go.mongodb.org/mongo-driver/v2/mongo" - "go.mongodb.org/mongo-driver/v2/mongo/options" - "testing" - "time" -) - -type Obj struct { - Id string - Name string -} - -func init() { - serverAPI := options.ServerAPI(options.ServerAPIVersion1) - opts := options.Client().ApplyURI("mongodb://localhost").SetServerAPIOptions(serverAPI) - - client, err := mongo.Connect(opts) - if err != nil { - panic(err) - } - - var collection = client.Database("test").Collection("test") - collection.Drop(context.Background()) - - collection.InsertOne(context.Background(), Obj{Id: "1", Name: "test"}) - collection.InsertOne(context.Background(), Obj{Id: "2", Name: "test2"}) - collection.InsertOne(context.Background(), Obj{Id: "3", Name: "test3"}) - collection.InsertOne(context.Background(), Obj{Id: "4", Name: "test4"}) - collection.InsertOne(context.Background(), Obj{Id: "5", Name: "test5"}) -} - -func TestConnectLoad(t *testing.T) { - - serverAPI := options.ServerAPI(options.ServerAPIVersion1) - opts := options.Client().ApplyURI("mongodb://localhost").SetServerAPIOptions(serverAPI) - - client, err := mongo.Connect(opts) - if err != nil { - panic(err) - } - - var collection = client.Database("test").Collection("test") - - var msm = NewMongoSyncMap[string, Obj]("id", false) - - msm.Load(collection) - - if msm.Len() != 5 { - t.Errorf("Expected 5, got %d", msm.Len()) - } -} - -func TestListen(t *testing.T) { - - serverAPI := options.ServerAPI(options.ServerAPIVersion1) - opts := options.Client().ApplyURI("mongodb://localhost").SetServerAPIOptions(serverAPI) - - client, err := mongo.Connect(opts) - if err != nil { - panic(err) - } - - var collection = client.Database("test").Collection("test") - - var msm = NewMongoSyncMap[string, Obj]("id", true) - msm.Load(collection) - - collection.InsertOne(context.Background(), Obj{Id: "6", Name: "test6"}) - collection.InsertOne(context.Background(), Obj{Id: "7", Name: "test7"}) - collection.InsertOne(context.Background(), Obj{Id: "8", Name: "test8"}) - - time.Sleep(1 * time.Second) - - if msm.Len() != 8 { - t.Errorf("Expected 8, got %d", msm.Len()) - } -} - -func TestListenMultiple(t *testing.T) { - - serverAPI := options.ServerAPI(options.ServerAPIVersion1) - opts := options.Client().ApplyURI("mongodb://localhost").SetServerAPIOptions(serverAPI) - - client, err := mongo.Connect(opts) - if err != nil { - panic(err) - } - - var collection = client.Database("test").Collection("test") - - var msm1 = NewMongoSyncMap[string, Obj]("id", true) - msm1.Load(collection) - - var msm2 = NewMongoSyncMap[string, Obj]("id", true) - msm2.Load(collection) - - msm1.Set("6", &Obj{Id: "6", Name: "test6"}) - msm1.Set("7", &Obj{Id: "7", Name: "test7"}) - msm1.Set("8", &Obj{Id: "8", Name: "test8"}) - - time.Sleep(1 * time.Second) - - if msm1.Len() != 8 { - t.Errorf("Expected 8, got %d", msm1.Len()) - } - - if msm2.Len() != 8 { - t.Errorf("Expected 8, got %d", msm2.Len()) - } -} From 8f954ac76adb87b9a8140346db34331ee3fa81ff Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 03:39:32 +0400 Subject: [PATCH 03/27] cleanup --- src/agent/Watchers/{signalWather.go => signalWatcher.go} | 0 src/agent/agent.go | 7 +------ src/lib/trays/repositories/mongodbTrayRepository.go | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) rename src/agent/Watchers/{signalWather.go => signalWatcher.go} (100%) diff --git a/src/agent/Watchers/signalWather.go b/src/agent/Watchers/signalWatcher.go similarity index 100% rename from src/agent/Watchers/signalWather.go rename to src/agent/Watchers/signalWatcher.go diff --git a/src/agent/agent.go b/src/agent/agent.go index fe571ac..40086aa 100644 --- a/src/agent/agent.go +++ b/src/agent/agent.go @@ -39,7 +39,7 @@ func NewCatteryAgent(runnerFolder string, catteryServerUrl string, agentId strin return &CatteryAgent{ mutex: sync.Mutex{}, logger: log.WithFields(log.Fields{"name": "agent", "agentId": agentId}), - catteryClient: createClient(catteryServerUrl, agentId), + catteryClient: catteryClient.NewCatteryClient(catteryServerUrl, agentId), listenerExecPath: path.Join(runnerFolder, "bin", "Runner.Listener"), agentId: agentId, interrupted: false, @@ -92,8 +92,3 @@ func (a *CatteryAgent) stop(event shutdownEvents.ShutdownEvent) { tools.Shutdown() } } - -// createClient creates a new http client -func createClient(baseUrl string, agentId string) *catteryClient.CatteryClient { - return catteryClient.NewCatteryClient(baseUrl, agentId) -} diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index c9c388c..833be1d 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -63,7 +63,6 @@ func (m *MongodbTrayRepository) GetStale(d time.Duration, rd time.Duration) ([]* func (m *MongodbTrayRepository) MarkRedundant(trayType string, limit int) ([]*trays.Tray, error) { var resultTrays = make([]*trays.Tray, 0) - var ids = make([]string, 0) for i := 0; i < limit; i++ { dbResult := m.collection.FindOneAndUpdate( @@ -82,7 +81,6 @@ func (m *MongodbTrayRepository) MarkRedundant(trayType string, limit int) ([]*tr } resultTrays = append(resultTrays, &result) - ids = append(ids, result.Id) } return resultTrays, nil From fdd8c69ec6140eb39f4c62b893356ac2e43d1f51 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 03:48:47 +0400 Subject: [PATCH 04/27] -rd --- src/lib/trayManager/trayManager.go | 2 +- src/lib/trays/repositories/iTrayRepository.go | 2 +- src/lib/trays/repositories/mongodbTrayRepository.go | 11 ++++------- .../trays/repositories/mongodbTrayRepository_test.go | 4 ++-- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 303c1ad..737ab50 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -168,7 +168,7 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { time.Sleep(interval / 2) - stale, err := tm.trayRepository.GetStale(interval, interval*2) + stale, err := tm.trayRepository.GetStale(interval) if err != nil { log.Errorf("Failed to get stale trays: %v", err) continue diff --git a/src/lib/trays/repositories/iTrayRepository.go b/src/lib/trays/repositories/iTrayRepository.go index a979f3c..85e0cdd 100644 --- a/src/lib/trays/repositories/iTrayRepository.go +++ b/src/lib/trays/repositories/iTrayRepository.go @@ -12,5 +12,5 @@ type ITrayRepository interface { UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) CountByTrayType(trayType string) (map[trays.TrayStatus]int, int, error) MarkRedundant(trayType string, limit int) ([]*trays.Tray, error) - GetStale(d time.Duration, rd time.Duration) ([]*trays.Tray, error) + GetStale(d time.Duration) ([]*trays.Tray, error) } diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index 833be1d..e041eda 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -39,14 +39,11 @@ func (m *MongodbTrayRepository) GetById(trayId string) (*trays.Tray, error) { return &result, nil } -func (m *MongodbTrayRepository) GetStale(d time.Duration, rd time.Duration) ([]*trays.Tray, error) { +func (m *MongodbTrayRepository) GetStale(d time.Duration) ([]*trays.Tray, error) { dbResult, err := m.collection.Find(context.Background(), - bson.M{"$or": []bson.M{ - { - "status": bson.M{"$ne": trays.TrayStatusRunning}, - "statusChanged": bson.M{"$lte": time.Now().UTC().Add(-d)}, - }, - }, + bson.M{ + "status": bson.M{"$ne": trays.TrayStatusRunning}, + "statusChanged": bson.M{"$lte": time.Now().UTC().Add(-d)}, }) if err != nil { return nil, err diff --git a/src/lib/trays/repositories/mongodbTrayRepository_test.go b/src/lib/trays/repositories/mongodbTrayRepository_test.go index 91f671d..ae042ee 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository_test.go +++ b/src/lib/trays/repositories/mongodbTrayRepository_test.go @@ -500,7 +500,7 @@ func TestGetStale(t *testing.T) { insertTestTrays(t, collection, []*TestTray{staleTray1, staleTray2, freshTray1, freshTray2}) // Test GetStale with 5 minute duration - staleTrays, err := repo.GetStale(5*time.Minute, 5*time.Minute) + staleTrays, err := repo.GetStale(5*time.Minute) if err != nil { t.Fatalf("GetStale failed: %v", err) } @@ -545,7 +545,7 @@ func TestGetStale(t *testing.T) { insertTestTrays(t, collection, []*TestTray{freshTray1, freshTray2}) // Test GetStale again with 5 minute duration - staleTrays, err = repo.GetStale(5*time.Minute, 5*time.Minute) + staleTrays, err = repo.GetStale(5*time.Minute) if err != nil { t.Fatalf("GetStale failed: %v", err) } From 1fe909fe20eef849760bdae72c7e90bdaa562778 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 03:50:04 +0400 Subject: [PATCH 05/27] refactor --- src/agent/Watchers/fileWatcher.go | 66 ------------ src/agent/Watchers/pingWatcher.go | 48 --------- src/agent/Watchers/signalWatcher.go | 25 ----- src/agent/agent.go | 114 ++++++++++++++++++--- src/agent/githubListener/githubListener.go | 18 ++-- src/agent/shutdownEvents/channel.go | 35 ------- src/lib/githubClient/githubClient.go | 8 +- src/server/handlers/agentHandler.go | 24 ++--- src/server/handlers/rootHandler.go | 10 +- src/server/server.go | 52 +++++----- 10 files changed, 162 insertions(+), 238 deletions(-) delete mode 100644 src/agent/Watchers/fileWatcher.go delete mode 100644 src/agent/Watchers/pingWatcher.go delete mode 100644 src/agent/Watchers/signalWatcher.go delete mode 100644 src/agent/shutdownEvents/channel.go diff --git a/src/agent/Watchers/fileWatcher.go b/src/agent/Watchers/fileWatcher.go deleted file mode 100644 index b5b2a5d..0000000 --- a/src/agent/Watchers/fileWatcher.go +++ /dev/null @@ -1,66 +0,0 @@ -package Watchers - -import ( - "cattery/agent/shutdownEvents" - "cattery/lib/messages" - "context" - "os" - - "github.com/fsnotify/fsnotify" - log "github.com/sirupsen/logrus" -) - -var filename = "./shutdown_file" - -func WatchFile(ctx context.Context) { - go func() { - watcher, err := fsnotify.NewWatcher() - if err != nil { - log.Fatal(err) - } - defer watcher.Close() - - createFile(filename) - - err = watcher.Add(filename) - if err != nil { - log.Fatal(err) - } - - var message string - - if ctx == nil { - ctx = context.Background() - } - - select { - case <-ctx.Done(): - return - case event := <-watcher.Events: - if event.Op.Has(fsnotify.Write) { - message = "Modified file: " + event.Name - } - if event.Op.Has(fsnotify.Remove) { - message = "Removed file: " + event.Name - } - if event.Op.Has(fsnotify.Rename) { - message = "Renamed file: " + event.Name - } - case err := <-watcher.Errors: - message = "File error: " + err.Error() - log.Error(message) - } - - log.Info(message) - - shutdownEvents.Emit(messages.UnregisterReasonPreempted, message) - }() -} - -func createFile(filename string) { - f, err := os.OpenFile(filename, os.O_CREATE|os.O_APPEND, 0644) - if err != nil { - log.Fatal(err) - } - f.Close() -} diff --git a/src/agent/Watchers/pingWatcher.go b/src/agent/Watchers/pingWatcher.go deleted file mode 100644 index 1979614..0000000 --- a/src/agent/Watchers/pingWatcher.go +++ /dev/null @@ -1,48 +0,0 @@ -package Watchers - -import ( - "cattery/agent/catteryClient" - "cattery/agent/shutdownEvents" - "cattery/lib/messages" - "context" - "time" - - log "github.com/sirupsen/logrus" -) - -func WatchPing(ctx context.Context, client *catteryClient.CatteryClient) { - go func() { - var msg string - var finished = false - - if ctx == nil { - ctx = context.Background() - } - - for !finished { - select { - case <-ctx.Done(): // selected when context is canceled or times out - msg = "cattery client shutdown" - finished = true - break - default: - pingResponse, err := client.Ping() - if err != nil { - msg = "error pinging controller: " + err.Error() - log.Error(msg) - continue - } - - if pingResponse.Terminate { - msg = "controller ping receive 'terminate': " + pingResponse.Message - finished = true - break - } - - time.Sleep(60 * time.Second) - } - } - - shutdownEvents.Emit(messages.UnregisterReasonControllerKill, msg) - }() -} diff --git a/src/agent/Watchers/signalWatcher.go b/src/agent/Watchers/signalWatcher.go deleted file mode 100644 index ba80a7d..0000000 --- a/src/agent/Watchers/signalWatcher.go +++ /dev/null @@ -1,25 +0,0 @@ -package Watchers - -import ( - "cattery/agent/shutdownEvents" - "cattery/lib/messages" - "os" - "os/signal" - "syscall" - - log "github.com/sirupsen/logrus" -) - -func WatchSignal() { - go func() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT) - signal.Notify(sigs, syscall.SIGTERM) - signal.Notify(sigs, syscall.SIGKILL) - - sig := <-sigs - log.Info("Got signal ", sig) - - shutdownEvents.Emit(messages.UnregisterReasonSigTerm, "Got signal "+sig.String()) - }() -} diff --git a/src/agent/agent.go b/src/agent/agent.go index 40086aa..553d652 100644 --- a/src/agent/agent.go +++ b/src/agent/agent.go @@ -1,16 +1,19 @@ package agent import ( - "cattery/agent/Watchers" "cattery/agent/catteryClient" "cattery/agent/githubListener" - "cattery/agent/shutdownEvents" "cattery/agent/tools" "cattery/lib/agents" - "context" + "cattery/lib/messages" + "os" + "os/signal" "path" "sync" + "syscall" + "time" + "github.com/fsnotify/fsnotify" log "github.com/sirupsen/logrus" ) @@ -20,7 +23,6 @@ var Id string func Start() { var catteryAgent = NewCatteryAgent(RunnerFolder, CatteryServerUrl, Id) - catteryAgent.Start() } @@ -47,28 +49,28 @@ func NewCatteryAgent(runnerFolder string, catteryServerUrl string, agentId strin } func (a *CatteryAgent) Start() { - a.logger.Info("Starting Cattery Agent") agent, jitConfig, err := a.catteryClient.RegisterAgent(a.agentId) if err != nil { - errMsg := "Failed to register agent: " + err.Error() - a.logger.Error(errMsg) + a.logger.Errorf("Failed to register agent: %v", err) return } a.agent = agent a.logger.Info("Agent registered, starting Listener") - Watchers.WatchSignal() - Watchers.WatchFile(context.Background()) - Watchers.WatchPing(context.Background(), a.catteryClient) + shutdownCh := make(chan githubListener.ShutdownEvent, 1) + + a.watchSignal(shutdownCh) + a.watchFile(shutdownCh) + a.watchPing(shutdownCh) var ghListener = githubListener.NewGithubListener(a.listenerExecPath) - ghListener.Start(jitConfig) + ghListener.Start(jitConfig, shutdownCh) - // blocking call - var event = shutdownEvents.WaitEvent() + // Block until first shutdown event + event := <-shutdownCh a.logger.Infof("Received shutdown event: %s, reason: %d", event.Message, event.Reason) @@ -76,15 +78,93 @@ func (a *CatteryAgent) Start() { a.stop(event) } -// stop stops the runner process -func (a *CatteryAgent) stop(event shutdownEvents.ShutdownEvent) { +func (a *CatteryAgent) watchSignal(ch chan<- githubListener.ShutdownEvent) { + go func() { + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + + sig := <-sigs + a.logger.Info("Got signal ", sig) + + ch <- githubListener.ShutdownEvent{ + Reason: messages.UnregisterReasonSigTerm, + Message: "Got signal " + sig.String(), + } + }() +} + +func (a *CatteryAgent) watchFile(ch chan<- githubListener.ShutdownEvent) { + const shutdownFile = "./shutdown_file" + + go func() { + watcher, err := fsnotify.NewWatcher() + if err != nil { + a.logger.Fatalf("Failed to create file watcher: %v", err) + } + defer watcher.Close() + + // Create the shutdown file if it doesn't exist + f, err := os.OpenFile(shutdownFile, os.O_CREATE|os.O_APPEND, 0644) + if err != nil { + a.logger.Fatalf("Failed to create shutdown file: %v", err) + } + f.Close() + + if err := watcher.Add(shutdownFile); err != nil { + a.logger.Fatalf("Failed to watch shutdown file: %v", err) + } + + select { + case event := <-watcher.Events: + msg := "Shutdown file changed: " + event.Name + a.logger.Info(msg) + ch <- githubListener.ShutdownEvent{ + Reason: messages.UnregisterReasonPreempted, + Message: msg, + } + case err := <-watcher.Errors: + msg := "File watcher error: " + err.Error() + a.logger.Error(msg) + ch <- githubListener.ShutdownEvent{ + Reason: messages.UnregisterReasonPreempted, + Message: msg, + } + } + }() +} +func (a *CatteryAgent) watchPing(ch chan<- githubListener.ShutdownEvent) { + go func() { + for { + pingResponse, err := a.catteryClient.Ping() + if err != nil { + a.logger.Errorf("Error pinging controller: %v", err) + time.Sleep(60 * time.Second) + continue + } + + if pingResponse.Terminate { + msg := "Controller requested termination: " + pingResponse.Message + a.logger.Info(msg) + ch <- githubListener.ShutdownEvent{ + Reason: messages.UnregisterReasonControllerKill, + Message: msg, + } + return + } + + time.Sleep(60 * time.Second) + } + }() +} + +// stop stops the runner process +func (a *CatteryAgent) stop(event githubListener.ShutdownEvent) { log.Infof("Stopping Cattery Agent with reason: %d, message: `%s`", event.Reason, event.Message) err := a.catteryClient.UnregisterAgent(a.agent, event.Reason, event.Message) if err != nil { - var errMsg = "Failed to unregister agent: " + err.Error() - a.logger.Error(errMsg) + a.logger.Errorf("Failed to unregister agent: %v", err) } if a.agent.Shutdown { diff --git a/src/agent/githubListener/githubListener.go b/src/agent/githubListener/githubListener.go index def22a0..f723b9a 100644 --- a/src/agent/githubListener/githubListener.go +++ b/src/agent/githubListener/githubListener.go @@ -1,7 +1,6 @@ package githubListener import ( - "cattery/agent/shutdownEvents" "cattery/lib/messages" "os" "os/exec" @@ -10,6 +9,11 @@ import ( log "github.com/sirupsen/logrus" ) +type ShutdownEvent struct { + Reason messages.UnregisterReason + Message string +} + type GithubListener struct { listenerPath string process *os.Process @@ -23,32 +27,34 @@ func NewGithubListener(listenerPath string) *GithubListener { } } -func (l *GithubListener) Start(jitConfig *string) { +func (l *GithubListener) Start(jitConfig *string, shutdownCh chan<- ShutdownEvent) { var commandRun = exec.Command(l.listenerPath, "run", "--jitconfig", *jitConfig) commandRun.Stdout = os.Stdout commandRun.Stderr = os.Stderr go func() { var msg = "Listener finished" + var reason = messages.UnregisterReasonDone err := commandRun.Start() if err != nil { msg = "Listener failed to start: " + err.Error() log.Error(msg) - shutdownEvents.Emit(messages.UnregisterReasonUnknown, msg) + shutdownCh <- ShutdownEvent{Reason: messages.UnregisterReasonUnknown, Message: msg} return } + l.mut.Lock() l.process = commandRun.Process + l.mut.Unlock() + err = commandRun.Wait() if err != nil { msg = "Runner failed: " + err.Error() log.Error(msg) } - //TODO: check startup errors, like deprecated runner - - shutdownEvents.Emit(messages.UnregisterReasonDone, msg) + shutdownCh <- ShutdownEvent{Reason: reason, Message: msg} }() } diff --git a/src/agent/shutdownEvents/channel.go b/src/agent/shutdownEvents/channel.go deleted file mode 100644 index 5616e50..0000000 --- a/src/agent/shutdownEvents/channel.go +++ /dev/null @@ -1,35 +0,0 @@ -package shutdownEvents - -import ( - "cattery/lib/messages" - "sync" -) - -type ShutdownEvent struct { - Reason messages.UnregisterReason - Message string -} - -var mut = new(sync.Mutex) - -var channel = make(chan ShutdownEvent, 1) -var emitted = false - -func Emit(unregisterReason messages.UnregisterReason, message string) { - mut.Lock() - defer mut.Unlock() - - var event = ShutdownEvent{ - Reason: unregisterReason, - Message: message, - } - - if !emitted { - channel <- event - emitted = true - } -} - -func WaitEvent() ShutdownEvent { - return <-channel -} diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index c3c5903..42c57b7 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -5,13 +5,17 @@ import ( "context" "errors" "net/http" + "sync" "github.com/bradleyfalzon/ghinstallation/v2" "github.com/google/go-github/v70/github" log "github.com/sirupsen/logrus" ) -var githubClients = make(map[string]*github.Client) +var ( + githubClientsMu sync.Mutex + githubClients = make(map[string]*github.Client) +) type GithubClient struct { client *github.Client @@ -94,6 +98,8 @@ func (gc *GithubClient) CheckJobCompleted(repoName string, jobId int64) (bool, e // createClient creates a new GitHub client func createClient(org *config.GitHubOrganization) *github.Client { + githubClientsMu.Lock() + defer githubClientsMu.Unlock() if githubClient, ok := githubClients[org.Name]; ok { return githubClient diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index 639cabf..36707e6 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -17,7 +17,7 @@ import ( ) // AgentRegister is a handler for agent registration requests -func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { +func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { var logger = log.WithFields(log.Fields{ "handler": "agent", @@ -40,7 +40,7 @@ func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { logger.Debug("Agent registration request") - var tray, err = TrayManager.Registering(agentId) + var tray, err = h.TrayManager.Registering(agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to update tray status for agent '%s': %v", agentId, err) logger.Error(errMsg) @@ -59,7 +59,7 @@ func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { logger.Debugf("Found tray %s for agent %s, with organization %s", tray.GetId(), agentId, tray.GetGitHubOrgName()) - poller := ScaleSetManager.GetPoller(trayType.Name) + poller := h.ScaleSetManager.GetPoller(trayType.Name) if poller == nil { var errMsg = fmt.Sprintf("No scale set poller found for tray type '%s'", trayType.Name) logger.Error(errMsg) @@ -94,7 +94,7 @@ func AgentRegister(responseWriter http.ResponseWriter, r *http.Request) { return } - _, err = TrayManager.Registered(agentId, int64(jitRunnerConfig.Runner.ID)) + _, err = h.TrayManager.Registered(agentId, int64(jitRunnerConfig.Runner.ID)) if err != nil { logger.Errorf("%v", err) } @@ -110,7 +110,7 @@ func validateAgentId(agentId string) string { } // AgentUnregister is a handler for agent unregister requests -func AgentUnregister(responseWriter http.ResponseWriter, r *http.Request) { +func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.Request) { var logger = log.WithFields(log.Fields{ "handler": "agent", "call": "AgentUnregister", @@ -125,7 +125,7 @@ func AgentUnregister(responseWriter http.ResponseWriter, r *http.Request) { var trayId = r.PathValue("id") - var tray, err = TrayManager.GetTrayById(trayId) + var tray, err = h.TrayManager.GetTrayById(trayId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray for agent '%s': %v", trayId, err) logger.Error(errMsg) @@ -152,7 +152,7 @@ func AgentUnregister(responseWriter http.ResponseWriter, r *http.Request) { logger.Tracef("Agent unregister request") - _, err = TrayManager.DeleteTray(tray.Id) + _, err = h.TrayManager.DeleteTray(tray.Id) if err != nil { logger.Errorf("Failed to delete tray: %v", err) @@ -215,7 +215,7 @@ func AgentDownloadBinary(responseWriter http.ResponseWriter, r *http.Request) { logger.Infof("Binary file served: %s (%d bytes)", execPath, fileInfo.Size()) } -func AgentPing(responseWriter http.ResponseWriter, r *http.Request) { +func (h *Handlers) AgentPing(responseWriter http.ResponseWriter, r *http.Request) { var logger = log.WithFields(log.Fields{ "handler": "agent", "call": "AgentPing", @@ -231,7 +231,7 @@ func AgentPing(responseWriter http.ResponseWriter, r *http.Request) { Message: "", } - tray, err := TrayManager.GetTrayById(agentId) + tray, err := h.TrayManager.GetTrayById(agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray by id '%s': %v", agentId, err) logger.Error(errMsg) @@ -282,7 +282,7 @@ func writeResponse(responseWriter http.ResponseWriter, pingResponse any, logger } } -func AgentInterrupt(responseWriter http.ResponseWriter, r *http.Request) { +func (h *Handlers) AgentInterrupt(responseWriter http.ResponseWriter, r *http.Request) { var logger = log.WithFields(log.Fields{ "handler": "agent", "call": "AgentRestart", @@ -304,7 +304,7 @@ func AgentInterrupt(responseWriter http.ResponseWriter, r *http.Request) { logger.Debug("Agent restart request with id " + agentId) - tray, err := TrayManager.GetTrayById(agentId) + tray, err := h.TrayManager.GetTrayById(agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray by id '%s': %v", agentId, err) logger.Error(errMsg) @@ -318,5 +318,5 @@ func AgentInterrupt(responseWriter http.ResponseWriter, r *http.Request) { return } workflowRunId := tray.WorkflowRunId - RestartManager.RequestRestart(workflowRunId, tray.GitHubOrgName, tray.Repository) + h.RestartManager.RequestRestart(workflowRunId, tray.GitHubOrgName, tray.Repository) } diff --git a/src/server/handlers/rootHandler.go b/src/server/handlers/rootHandler.go index d3eaab6..965f936 100644 --- a/src/server/handlers/rootHandler.go +++ b/src/server/handlers/rootHandler.go @@ -7,10 +7,12 @@ import ( "net/http" ) -var TrayManager *trayManager.TrayManager -var RestartManager *restarter.WorkflowRestarter -var ScaleSetManager *scaleSetPoller.Manager +type Handlers struct { + TrayManager *trayManager.TrayManager + RestartManager *restarter.WorkflowRestarter + ScaleSetManager *scaleSetPoller.Manager +} -func Index(responseWriter http.ResponseWriter, r *http.Request) { +func (h *Handlers) Index(responseWriter http.ResponseWriter, r *http.Request) { return } diff --git a/src/server/server.go b/src/server/server.go index fd37329..f5f89b0 100644 --- a/src/server/server.go +++ b/src/server/server.go @@ -29,23 +29,7 @@ func Start() { defer cancel() sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT) - signal.Notify(sigs, syscall.SIGTERM) - signal.Notify(sigs, syscall.SIGKILL) - - var mux = http.NewServeMux() - mux.HandleFunc("/{$}", handlers.Index) - mux.HandleFunc("GET /agent/register/{id}", handlers.AgentRegister) - mux.HandleFunc("POST /agent/unregister/{id}", handlers.AgentUnregister) - mux.HandleFunc("GET /agent/download", handlers.AgentDownloadBinary) - mux.HandleFunc("POST /agent/interrupt/{id}", handlers.AgentInterrupt) - mux.HandleFunc("POST /agent/ping/{id}", handlers.AgentPing) - mux.HandleFunc("/metrics", promhttp.Handler().ServeHTTP) - - var httpServer = &http.Server{ - Addr: config.AppConfig.Server.ListenAddress, - Handler: mux, - } + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) // Db connection serverAPI := options.ServerAPI(options.ServerAPIVersion1) @@ -69,15 +53,15 @@ func Start() { // Initialize tray manager and repository var trayRepository = repositories.NewMongodbTrayRepository() trayRepository.Connect(database.Collection("trays")) - handlers.TrayManager = trayManager.NewTrayManager(trayRepository) + tm := trayManager.NewTrayManager(trayRepository) // Initialize restarter var restartManagerRepository = restarterRepo.NewMongodbRestarterRepository() restartManagerRepository.Connect(database.Collection("restarters")) - handlers.RestartManager = restarter.NewWorkflowRestarter(restartManagerRepository) + rm := restarter.NewWorkflowRestarter(restartManagerRepository) // Initialize scale set pollers — one per TrayType - handlers.ScaleSetManager = scaleSetPoller.NewManager() + ssm := scaleSetPoller.NewManager() for _, trayType := range config.AppConfig.TrayTypes { org := config.AppConfig.GetGitHubOrg(trayType.GitHubOrg) if org == nil { @@ -89,8 +73,8 @@ func Start() { logger.Fatalf("Failed to create scale set client for tray type '%s': %v", trayType.Name, err) } - poller := scaleSetPoller.NewPoller(ssClient, trayType, handlers.TrayManager) - handlers.ScaleSetManager.Register(trayType.Name, poller) + poller := scaleSetPoller.NewPoller(ssClient, trayType, tm) + ssm.Register(trayType.Name, poller) go func(p *scaleSetPoller.Poller, name string) { if err := p.Run(ctx); err != nil { @@ -100,10 +84,30 @@ func Start() { } // Start restart poller (replaces workflow_run webhook) - handlers.RestartManager.StartPoller(ctx) + rm.StartPoller(ctx) // Start stale tray cleanup - handlers.TrayManager.HandleStale(ctx) + tm.HandleStale(ctx) + + h := &handlers.Handlers{ + TrayManager: tm, + RestartManager: rm, + ScaleSetManager: ssm, + } + + var mux = http.NewServeMux() + mux.HandleFunc("/{$}", h.Index) + mux.HandleFunc("GET /agent/register/{id}", h.AgentRegister) + mux.HandleFunc("POST /agent/unregister/{id}", h.AgentUnregister) + mux.HandleFunc("GET /agent/download", handlers.AgentDownloadBinary) + mux.HandleFunc("POST /agent/interrupt/{id}", h.AgentInterrupt) + mux.HandleFunc("POST /agent/ping/{id}", h.AgentPing) + mux.HandleFunc("/metrics", promhttp.Handler().ServeHTTP) + + var httpServer = &http.Server{ + Addr: config.AppConfig.Server.ListenAddress, + Handler: mux, + } // Start HTTP server go func() { From c0f24a78834648de328950dd2af6f2edbe1c7d2e Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 04:23:43 +0400 Subject: [PATCH 06/27] more cleanup --- src/agent/agent.go | 5 -- .../repositories/iRestarterRepository.go | 1 - .../mongodbRestarterRepository.go | 20 ------- src/lib/trays/providers/dockerProvider.go | 10 ---- src/lib/trays/providers/gceProvider.go | 10 ---- src/lib/trays/providers/iTrayProvider.go | 6 --- .../repositories/mongodbTrayRepository.go | 15 ------ src/lib/trays/repositories/traysRepository.go | 54 ------------------- 8 files changed, 121 deletions(-) delete mode 100644 src/lib/trays/repositories/traysRepository.go diff --git a/src/agent/agent.go b/src/agent/agent.go index 553d652..0301988 100644 --- a/src/agent/agent.go +++ b/src/agent/agent.go @@ -9,7 +9,6 @@ import ( "os" "os/signal" "path" - "sync" "syscall" "time" @@ -27,24 +26,20 @@ func Start() { } type CatteryAgent struct { - mutex sync.Mutex logger *log.Entry catteryClient *catteryClient.CatteryClient agent *agents.Agent agentId string - interrupted bool listenerExecPath string } func NewCatteryAgent(runnerFolder string, catteryServerUrl string, agentId string) *CatteryAgent { return &CatteryAgent{ - mutex: sync.Mutex{}, logger: log.WithFields(log.Fields{"name": "agent", "agentId": agentId}), catteryClient: catteryClient.NewCatteryClient(catteryServerUrl, agentId), listenerExecPath: path.Join(runnerFolder, "bin", "Runner.Listener"), agentId: agentId, - interrupted: false, } } diff --git a/src/lib/restarter/repositories/iRestarterRepository.go b/src/lib/restarter/repositories/iRestarterRepository.go index 9971dfa..e24786d 100644 --- a/src/lib/restarter/repositories/iRestarterRepository.go +++ b/src/lib/restarter/repositories/iRestarterRepository.go @@ -12,6 +12,5 @@ type RestartRequest struct { type IRestarterRepository interface { SaveRestartRequest(workflowRunId int64, orgName string, repoName string) error DeleteRestartRequest(workflowRunId int64) error - CheckRestartRequest(workflowRunId int64) (bool, error) GetAllPendingRestartRequests() ([]RestartRequest, error) } diff --git a/src/lib/restarter/repositories/mongodbRestarterRepository.go b/src/lib/restarter/repositories/mongodbRestarterRepository.go index ff1703e..34b296a 100644 --- a/src/lib/restarter/repositories/mongodbRestarterRepository.go +++ b/src/lib/restarter/repositories/mongodbRestarterRepository.go @@ -2,7 +2,6 @@ package repositories import ( "context" - "errors" "time" "go.mongodb.org/mongo-driver/v2/bson" @@ -51,25 +50,6 @@ func (m *MongodbRestarterRepository) DeleteRestartRequest(workflowRunId int64) e return err } -func (m *MongodbRestarterRepository) CheckRestartRequest(workflowRunId int64) (bool, error) { - dbResult := m.collection.FindOne( - context.Background(), - bson.M{ - "workflowRunId": workflowRunId, - }, - ) - var result bson.M - err := dbResult.Decode(&result) - if err != nil { - if errors.Is(err, mongo.ErrNoDocuments) { - return false, nil - } - return false, err - } - - return true, nil -} - func (m *MongodbRestarterRepository) GetAllPendingRestartRequests() ([]RestartRequest, error) { cursor, err := m.collection.Find(context.Background(), bson.M{}) if err != nil { diff --git a/src/lib/trays/providers/dockerProvider.go b/src/lib/trays/providers/dockerProvider.go index 102b8a3..af2bfe9 100644 --- a/src/lib/trays/providers/dockerProvider.go +++ b/src/lib/trays/providers/dockerProvider.go @@ -36,16 +36,6 @@ func (d *DockerProvider) GetProviderName() string { return d.name } -func (d *DockerProvider) GetTray(id string) (*trays.Tray, error) { - //TODO implement me - panic("implement me") -} - -func (d *DockerProvider) ListTrays() ([]*trays.Tray, error) { - //TODO implement me - panic("implement me") -} - func (d *DockerProvider) RunTray(tray *trays.Tray) error { var containerName = tray.GetId() diff --git a/src/lib/trays/providers/gceProvider.go b/src/lib/trays/providers/gceProvider.go index 8076371..317c387 100644 --- a/src/lib/trays/providers/gceProvider.go +++ b/src/lib/trays/providers/gceProvider.go @@ -47,16 +47,6 @@ func (g *GceProvider) GetProviderName() string { return g.Name } -func (g *GceProvider) GetTray(id string) (*trays.Tray, error) { - //TODO implement me - panic("implement me") -} - -func (g *GceProvider) ListTrays() ([]*trays.Tray, error) { - //TODO implement me - panic("implement me") -} - func (g *GceProvider) RunTray(tray *trays.Tray) error { ctx := context.Background() diff --git a/src/lib/trays/providers/iTrayProvider.go b/src/lib/trays/providers/iTrayProvider.go index f4b3d98..f1c1018 100644 --- a/src/lib/trays/providers/iTrayProvider.go +++ b/src/lib/trays/providers/iTrayProvider.go @@ -7,12 +7,6 @@ import ( type ITrayProvider interface { GetProviderName() string - // GetTray returns the tray with the given ID. - GetTray(id string) (*trays.Tray, error) - - // ListTrays returns all trays. - ListTrays() ([]*trays.Tray, error) - // RunTray spawns a new tray. RunTray(tray *trays.Tray) error diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index e041eda..4b2fab8 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -83,21 +83,6 @@ func (m *MongodbTrayRepository) MarkRedundant(trayType string, limit int) ([]*tr return resultTrays, nil } -func (m *MongodbTrayRepository) GetByJobRunId(jobRunId int64) (*trays.Tray, error) { - dbResult := m.collection.FindOne(context.Background(), bson.M{"jobRunId": jobRunId}) - - var result trays.Tray - err := dbResult.Decode(&result) - if err != nil { - if errors.Is(err, mongo.ErrNoDocuments) { - return nil, nil - } - return nil, err - } - - return &result, nil -} - func (m *MongodbTrayRepository) Save(tray *trays.Tray) error { tray.StatusChanged = time.Now().UTC() _, err := m.collection.InsertOne(context.Background(), tray) diff --git a/src/lib/trays/repositories/traysRepository.go b/src/lib/trays/repositories/traysRepository.go deleted file mode 100644 index cd40362..0000000 --- a/src/lib/trays/repositories/traysRepository.go +++ /dev/null @@ -1,54 +0,0 @@ -package repositories - -import ( - "cattery/lib/trays" - "sync" -) - -type MemTrayRepository struct { - ITrayRepository - trays map[string]*trays.Tray - mutex sync.RWMutex -} - -func NewMemTrayRepository() *MemTrayRepository { - return &MemTrayRepository{ - trays: make(map[string]*trays.Tray), - mutex: sync.RWMutex{}, - } -} - -func (r *MemTrayRepository) GetById(trayId string) (*trays.Tray, error) { - r.mutex.RLock() - defer r.mutex.RUnlock() - - tray, exists := r.trays[trayId] - if !exists { - return nil, nil - } - - return tray, nil -} - -func (r *MemTrayRepository) Save(tray *trays.Tray) error { - r.mutex.Lock() - defer r.mutex.Unlock() - - r.trays[tray.GetId()] = tray - return nil -} - -func (r *MemTrayRepository) Delete(trayId string) error { - r.mutex.Lock() - defer r.mutex.Unlock() - - delete(r.trays, trayId) - return nil -} - -func (r *MemTrayRepository) Len() int { - r.mutex.RLock() - defer r.mutex.RUnlock() - - return len(r.trays) -} From d6dd4b427f36908d06239dad3af4e8690fc6e3f2 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 13:30:02 +0400 Subject: [PATCH 07/27] agent shutdown --- src/agent/agent.go | 142 ++++++++++++++------- src/agent/githubListener/githubListener.go | 26 +--- 2 files changed, 101 insertions(+), 67 deletions(-) diff --git a/src/agent/agent.go b/src/agent/agent.go index 0301988..f688d3c 100644 --- a/src/agent/agent.go +++ b/src/agent/agent.go @@ -6,6 +6,8 @@ import ( "cattery/agent/tools" "cattery/lib/agents" "cattery/lib/messages" + "context" + "errors" "os" "os/signal" "path" @@ -20,6 +22,14 @@ var RunnerFolder string var CatteryServerUrl string var Id string +// shutdownCause is used as context.Cause to carry the termination reason. +type shutdownCause struct { + reason messages.UnregisterReason + message string +} + +func (s *shutdownCause) Error() string { return s.message } + func Start() { var catteryAgent = NewCatteryAgent(RunnerFolder, CatteryServerUrl, Id) catteryAgent.Start() @@ -55,40 +65,83 @@ func (a *CatteryAgent) Start() { a.logger.Info("Agent registered, starting Listener") - shutdownCh := make(chan githubListener.ShutdownEvent, 1) + ctx, cancel := context.WithCancelCause(context.Background()) + defer cancel(nil) - a.watchSignal(shutdownCh) - a.watchFile(shutdownCh) - a.watchPing(shutdownCh) + a.watchSignal(ctx, cancel) + a.watchFile(ctx, cancel) + a.watchPing(ctx, cancel) var ghListener = githubListener.NewGithubListener(a.listenerExecPath) - ghListener.Start(jitConfig, shutdownCh) + ghListener.Start(ctx, cancel, jitConfig) + + // Block until any source triggers cancellation + <-ctx.Done() + + // Determine what happened + reason, msg := a.resolveShutdownCause(ctx) + a.logger.Infof("Shutdown: reason=%d, message=%s", reason, msg) + + // Kill listener if it wasn't the one that finished + if reason != messages.UnregisterReasonDone { + ghListener.Stop() + } + + a.unregisterAndShutdown(reason, msg) +} + +// resolveShutdownCause extracts the termination reason from the context cause. +// - shutdownCause: a watcher triggered shutdown (signal, file, ping) +// - nil cause: listener exited cleanly +// - other error: listener exited with error +func (a *CatteryAgent) resolveShutdownCause(ctx context.Context) (messages.UnregisterReason, string) { + cause := context.Cause(ctx) - // Block until first shutdown event - event := <-shutdownCh + var sc *shutdownCause + if errors.As(cause, &sc) { + return sc.reason, sc.message + } + + // Listener finished (cancel was called with nil or a process error) + if cause == nil { + return messages.UnregisterReasonDone, "Listener finished" + } + return messages.UnregisterReasonDone, "Listener exited: " + cause.Error() +} + +func (a *CatteryAgent) unregisterAndShutdown(reason messages.UnregisterReason, msg string) { + log.Infof("Stopping Cattery Agent with reason: %d, message: `%s`", reason, msg) - a.logger.Infof("Received shutdown event: %s, reason: %d", event.Message, event.Reason) + err := a.catteryClient.UnregisterAgent(a.agent, reason, msg) + if err != nil { + a.logger.Errorf("Failed to unregister agent: %v", err) + } - ghListener.Stop() - a.stop(event) + if a.agent.Shutdown { + a.logger.Debugf("Shutdown now") + tools.Shutdown() + } } -func (a *CatteryAgent) watchSignal(ch chan<- githubListener.ShutdownEvent) { +func (a *CatteryAgent) watchSignal(ctx context.Context, cancel context.CancelCauseFunc) { go func() { sigs := make(chan os.Signal, 1) signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - sig := <-sigs - a.logger.Info("Got signal ", sig) - - ch <- githubListener.ShutdownEvent{ - Reason: messages.UnregisterReasonSigTerm, - Message: "Got signal " + sig.String(), + select { + case <-ctx.Done(): + return + case sig := <-sigs: + a.logger.Info("Got signal ", sig) + cancel(&shutdownCause{ + reason: messages.UnregisterReasonSigTerm, + message: "Got signal " + sig.String(), + }) } }() } -func (a *CatteryAgent) watchFile(ch chan<- githubListener.ShutdownEvent) { +func (a *CatteryAgent) watchFile(ctx context.Context, cancel context.CancelCauseFunc) { const shutdownFile = "./shutdown_file" go func() { @@ -110,27 +163,35 @@ func (a *CatteryAgent) watchFile(ch chan<- githubListener.ShutdownEvent) { } select { + case <-ctx.Done(): + return case event := <-watcher.Events: msg := "Shutdown file changed: " + event.Name a.logger.Info(msg) - ch <- githubListener.ShutdownEvent{ - Reason: messages.UnregisterReasonPreempted, - Message: msg, - } - case err := <-watcher.Errors: - msg := "File watcher error: " + err.Error() + cancel(&shutdownCause{ + reason: messages.UnregisterReasonPreempted, + message: msg, + }) + case watchErr := <-watcher.Errors: + msg := "File watcher error: " + watchErr.Error() a.logger.Error(msg) - ch <- githubListener.ShutdownEvent{ - Reason: messages.UnregisterReasonPreempted, - Message: msg, - } + cancel(&shutdownCause{ + reason: messages.UnregisterReasonPreempted, + message: msg, + }) } }() } -func (a *CatteryAgent) watchPing(ch chan<- githubListener.ShutdownEvent) { +func (a *CatteryAgent) watchPing(ctx context.Context, cancel context.CancelCauseFunc) { go func() { for { + select { + case <-ctx.Done(): + return + default: + } + pingResponse, err := a.catteryClient.Ping() if err != nil { a.logger.Errorf("Error pinging controller: %v", err) @@ -141,10 +202,10 @@ func (a *CatteryAgent) watchPing(ch chan<- githubListener.ShutdownEvent) { if pingResponse.Terminate { msg := "Controller requested termination: " + pingResponse.Message a.logger.Info(msg) - ch <- githubListener.ShutdownEvent{ - Reason: messages.UnregisterReasonControllerKill, - Message: msg, - } + cancel(&shutdownCause{ + reason: messages.UnregisterReasonControllerKill, + message: msg, + }) return } @@ -152,18 +213,3 @@ func (a *CatteryAgent) watchPing(ch chan<- githubListener.ShutdownEvent) { } }() } - -// stop stops the runner process -func (a *CatteryAgent) stop(event githubListener.ShutdownEvent) { - log.Infof("Stopping Cattery Agent with reason: %d, message: `%s`", event.Reason, event.Message) - - err := a.catteryClient.UnregisterAgent(a.agent, event.Reason, event.Message) - if err != nil { - a.logger.Errorf("Failed to unregister agent: %v", err) - } - - if a.agent.Shutdown { - a.logger.Debugf("Shutdown now") - tools.Shutdown() - } -} diff --git a/src/agent/githubListener/githubListener.go b/src/agent/githubListener/githubListener.go index f723b9a..602ba3f 100644 --- a/src/agent/githubListener/githubListener.go +++ b/src/agent/githubListener/githubListener.go @@ -1,7 +1,7 @@ package githubListener import ( - "cattery/lib/messages" + "context" "os" "os/exec" "sync" @@ -9,11 +9,6 @@ import ( log "github.com/sirupsen/logrus" ) -type ShutdownEvent struct { - Reason messages.UnregisterReason - Message string -} - type GithubListener struct { listenerPath string process *os.Process @@ -27,20 +22,18 @@ func NewGithubListener(listenerPath string) *GithubListener { } } -func (l *GithubListener) Start(jitConfig *string, shutdownCh chan<- ShutdownEvent) { +// Start launches the GitHub runner listener in a background goroutine. +// When the process exits, it cancels ctx with the resulting error (nil on success). +func (l *GithubListener) Start(ctx context.Context, cancel context.CancelCauseFunc, jitConfig *string) { var commandRun = exec.Command(l.listenerPath, "run", "--jitconfig", *jitConfig) commandRun.Stdout = os.Stdout commandRun.Stderr = os.Stderr go func() { - var msg = "Listener finished" - var reason = messages.UnregisterReasonDone - err := commandRun.Start() if err != nil { - msg = "Listener failed to start: " + err.Error() - log.Error(msg) - shutdownCh <- ShutdownEvent{Reason: messages.UnregisterReasonUnknown, Message: msg} + log.Errorf("Listener failed to start: %v", err) + cancel(err) return } @@ -49,12 +42,7 @@ func (l *GithubListener) Start(jitConfig *string, shutdownCh chan<- ShutdownEven l.mut.Unlock() err = commandRun.Wait() - if err != nil { - msg = "Runner failed: " + err.Error() - log.Error(msg) - } - - shutdownCh <- ShutdownEvent{Reason: reason, Message: msg} + cancel(err) // nil means clean exit }() } From 0ef73c9e66fee6b579dd784bb81e5bdf37e5775e Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Tue, 24 Mar 2026 13:39:21 +0400 Subject: [PATCH 08/27] tests fix --- .../mongodbTrayRepository_test.go | 49 ++----------------- 1 file changed, 3 insertions(+), 46 deletions(-) diff --git a/src/lib/trays/repositories/mongodbTrayRepository_test.go b/src/lib/trays/repositories/mongodbTrayRepository_test.go index ae042ee..13afa9c 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository_test.go +++ b/src/lib/trays/repositories/mongodbTrayRepository_test.go @@ -201,7 +201,7 @@ func TestUpdateStatus(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray}) // Test UpdateStatus with jobRunId only - updatedTray, err := repo.UpdateStatus("test-tray-1", trays.TrayStatusRegistered, 123, 0, 0) + updatedTray, err := repo.UpdateStatus("test-tray-1", trays.TrayStatusRegistered, 123, 0, 0, "") if err != nil { t.Fatalf("UpdateStatus failed: %v", err) } @@ -219,7 +219,7 @@ func TestUpdateStatus(t *testing.T) { } // Test UpdateStatus with ghRunnerId - updatedTray, err = repo.UpdateStatus("test-tray-1", trays.TrayStatusRunning, 456, 333, 789) + updatedTray, err = repo.UpdateStatus("test-tray-1", trays.TrayStatusRunning, 456, 333, 789, "") if err != nil { t.Fatalf("UpdateStatus with ghRunnerId failed: %v", err) } @@ -241,7 +241,7 @@ func TestUpdateStatus(t *testing.T) { } // Test UpdateStatus with non-existent ID - updatedTray, err = repo.UpdateStatus("non-existent", trays.TrayStatusRegistered, 123, 0, 0) + updatedTray, err = repo.UpdateStatus("non-existent", trays.TrayStatusRegistered, 123, 0, 0, "") if err != nil { t.Fatalf("UpdateStatus with non-existent ID failed: %v", err) } @@ -287,49 +287,6 @@ func TestDelete(t *testing.T) { } } -// TestGetByJobRunId tests the GetByJobRunId method -func TestGetByJobRunId(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - // Create test repository - repo := NewMongodbTrayRepository() - repo.Connect(collection) - - // Insert test data - testTray1 := createTestTray("test-tray-1", "test-type", trays.TrayStatusRunning, 123) - testTray2 := createTestTray("test-tray-2", "test-type", trays.TrayStatusCreating, 0) - insertTestTrays(t, collection, []*TestTray{testTray1, testTray2}) - - // Test GetByJobRunId - tray, err := repo.GetByJobRunId(123) - if err != nil { - t.Fatalf("GetByJobRunId failed: %v", err) - } - - if tray == nil { - t.Fatal("GetByJobRunId returned nil tray") - } - - if tray.Id != "test-tray-1" { - t.Errorf("Expected tray ID 'test-tray-1', got '%s'", tray.Id) - } - - if tray.JobRunId != 123 { - t.Errorf("Expected JobRunId 123, got %d", tray.JobRunId) - } - - // Test GetByJobRunId with non-existent JobRunId - tray, err = repo.GetByJobRunId(999) - if err != nil { - t.Fatalf("GetByJobRunId with non-existent JobRunId failed: %v", err) - } - - if tray != nil { - t.Error("Expected nil tray for non-existent JobRunId, got non-nil") - } -} - // TestMarkRedundant tests the MarkRedundant method func TestMarkRedundant(t *testing.T) { client, collection := setupTestCollection(t) From 070ffb37ca52dac1fc0df2d7d7a0a81b0e90c9ca Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Wed, 25 Mar 2026 19:57:27 +0400 Subject: [PATCH 09/27] err fix --- src/agent/catteryClient/client.go | 10 ++++++++-- src/lib/githubClient/githubClient.go | 2 +- src/lib/scaleSetPoller/poller.go | 2 +- src/lib/trays/providers/trayProviderFactory.go | 4 ++++ src/server/handlers/agentHandler.go | 6 ++---- 5 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/agent/catteryClient/client.go b/src/agent/catteryClient/client.go index c245743..3065b93 100644 --- a/src/agent/catteryClient/client.go +++ b/src/agent/catteryClient/client.go @@ -103,9 +103,15 @@ func (c *CatteryClient) UnregisterAgent(agent *agents.Agent, reason messages.Unr func (c *CatteryClient) Ping() (*messages.PingResponse, error) { - var response, err = c.get("/agent", "ping", c.agentId) + requestUrl, err := url.JoinPath(c.baseURL, "/agent", "ping", c.agentId) if err != nil { - return nil, errors.New("get error: " + err.Error()) + return nil, errors.New("failed to join path: " + err.Error()) + } + + request, _ := http.NewRequest("POST", requestUrl, nil) + response, err := c.httpClient.Do(request) + if err != nil { + return nil, errors.New("post error: " + err.Error()) } defer response.Body.Close() diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index 42c57b7..e52ea67 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -66,7 +66,7 @@ func (gc *GithubClient) RestartFailedJobs(repoName string, workflowId int64) err wr, _, err := gc.client.Actions.GetWorkflowRunByID(context.Background(), gc.Org.Name, repoName, workflowId) if err != nil { log.Errorf("Failed to get workflow run by id %d: %v", workflowId, err) - // return err + return err } log.Debugf("Workflow run status: %s, conclusion: %s", wr.GetStatus(), wr.GetConclusion()) _, err = gc.client.Actions.RerunFailedJobsByID(context.Background(), gc.Org.Name, repoName, workflowId) diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index 3d6f459..1699827 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -116,7 +116,7 @@ func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.WorkflowRunID) jobID, _ := strconv.ParseInt(jobInfo.JobID, 10, 64) - repository := fmt.Sprintf("%s/%s", jobInfo.OwnerName, jobInfo.RepositoryName) + repository := jobInfo.RepositoryName _, err := cs.poller.trayManager.SetJob(jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, repository) if err != nil { diff --git a/src/lib/trays/providers/trayProviderFactory.go b/src/lib/trays/providers/trayProviderFactory.go index 0ee887f..c152668 100644 --- a/src/lib/trays/providers/trayProviderFactory.go +++ b/src/lib/trays/providers/trayProviderFactory.go @@ -56,6 +56,10 @@ func GetProvider(providerName string) (ITrayProvider, error) { return nil, errors.New(errMsg) } + if result == nil { + return nil, errors.New("failed to initialize provider: " + providerName) + } + providers[providerName] = result return result, nil diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index 36707e6..38511e7 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -235,9 +235,8 @@ func (h *Handlers) AgentPing(responseWriter http.ResponseWriter, r *http.Request if err != nil { var errMsg = fmt.Sprintf("Failed to get tray by id '%s': %v", agentId, err) logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusInternalServerError) - pingResponse.Message = "Failed to get tray by id: " + errMsg + pingResponse.Message = errMsg pingResponse.Terminate = true writeResponse(responseWriter, pingResponse, logger) @@ -246,9 +245,8 @@ func (h *Handlers) AgentPing(responseWriter http.ResponseWriter, r *http.Request if tray == nil { var errMsg = fmt.Sprintf("Tray with id '%s' not found", agentId) logger.Error(errMsg) - http.Error(responseWriter, errMsg, http.StatusGone) - pingResponse.Message = "Failed to get tray by id: " + errMsg + pingResponse.Message = errMsg pingResponse.Terminate = true writeResponse(responseWriter, pingResponse, logger) From dd9516025c848ee6ed32ca3d17c8136b507bf7d8 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Wed, 25 Mar 2026 20:09:12 +0400 Subject: [PATCH 10/27] RemoveRunner --- src/lib/githubClient/githubClient.go | 5 ----- src/lib/trayManager/trayManager.go | 11 ----------- 2 files changed, 16 deletions(-) diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index e52ea67..a461910 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -57,11 +57,6 @@ func (gc *GithubClient) CreateJITConfig(name string, runnerGroupId int64, labels return jitConfig, err } -func (gc *GithubClient) RemoveRunner(runnerId int64) error { - _, err := gc.client.Actions.RemoveOrganizationRunner(context.Background(), gc.Org.Name, runnerId) - return err -} - func (gc *GithubClient) RestartFailedJobs(repoName string, workflowId int64) error { wr, _, err := gc.client.Actions.GetWorkflowRunByID(context.Background(), gc.Org.Name, repoName, workflowId) if err != nil { diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 737ab50..6d30772 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -2,7 +2,6 @@ package trayManager import ( "cattery/lib/config" - "cattery/lib/githubClient" "cattery/lib/metrics" "cattery/lib/trays" "cattery/lib/trays/providers" @@ -125,16 +124,6 @@ func (tm *TrayManager) DeleteTray(trayId string) (*trays.Tray, error) { return nil, nil // Tray not found, nothing to delete } - ghClient, err := githubClient.NewGithubClientWithOrgName(tray.GetGitHubOrgName()) - if err != nil { - return nil, err - } - - err = ghClient.RemoveRunner(tray.GitHubRunnerId) - if err != nil { - return nil, err - } - provider, err := providers.GetProviderForTray(tray) if err != nil { return nil, err From fdeda389f445794ab05e2ec0969a9a47aac77e5a Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Wed, 25 Mar 2026 21:01:02 +0400 Subject: [PATCH 11/27] errs --- src/lib/trayManager/trayManager.go | 12 ++++++++---- src/lib/trays/providers/dockerProvider.go | 4 +++- src/lib/trays/repositories/mongodbTrayRepository.go | 2 +- src/lib/trays/tray.go | 6 +++++- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 6d30772..96da9b4 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -55,9 +55,12 @@ func (tm *TrayManager) CreateTray(trayType *config.TrayType) error { err = tm.trayRepository.Save(tray) if err != nil { - var errMsg = fmt.Sprintf("Failed to save tray %s: %v", trayType.Name, err) - log.Error(errMsg) - return errors.New(errMsg) + log.Errorf("Failed to save tray %s: %v — cleaning up provider resource", trayType.Name, err) + if cleanErr := provider.CleanTray(tray); cleanErr != nil { + log.Errorf("Failed to clean up tray %s after save failure: %v", tray.GetId(), cleanErr) + metrics.TrayProviderErrors(tray.GitHubOrgName, tray.ProviderName, tray.TrayTypeName, "delete") + } + return fmt.Errorf("failed to save tray %s: %w", trayType.Name, err) } return nil @@ -192,9 +195,10 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, pendingJobs int } traysWithNoJob := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] + activeTotal := total - countByStatus[trays.TrayStatusDeleting] if pendingJobs > traysWithNoJob { - remainingCapacity := trayType.MaxTrays - total + remainingCapacity := trayType.MaxTrays - activeTotal traysToCreate := pendingJobs - traysWithNoJob if traysToCreate > remainingCapacity { traysToCreate = remainingCapacity diff --git a/src/lib/trays/providers/dockerProvider.go b/src/lib/trays/providers/dockerProvider.go index af2bfe9..9e7dbd3 100644 --- a/src/lib/trays/providers/dockerProvider.go +++ b/src/lib/trays/providers/dockerProvider.go @@ -44,11 +44,13 @@ func (d *DockerProvider) RunTray(tray *trays.Tray) error { var image = trayConfig.Image + var serverUrl = config.AppConfig.Server.AdvertiseUrl + var dockerCommand = exec.Command("docker", "run", "-d", "--rm", "--add-host=host.docker.internal:host-gateway", "--name", containerName, image, - "/action-runner/cattery/cattery", "agent", "-i", tray.GetId(), "-s", "http://host.docker.internal:5137", "--runner-folder", "/action-runner") + "/action-runner/cattery/cattery", "agent", "-i", tray.GetId(), "-s", serverUrl, "--runner-folder", "/action-runner") d.logger.Info("Running docker command: ", dockerCommand.String()) err := dockerCommand.Run() diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index 4b2fab8..d0db328 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -42,7 +42,7 @@ func (m *MongodbTrayRepository) GetById(trayId string) (*trays.Tray, error) { func (m *MongodbTrayRepository) GetStale(d time.Duration) ([]*trays.Tray, error) { dbResult, err := m.collection.Find(context.Background(), bson.M{ - "status": bson.M{"$ne": trays.TrayStatusRunning}, + "status": bson.M{"$nin": bson.A{trays.TrayStatusRunning, trays.TrayStatusDeleting}}, "statusChanged": bson.M{"$lte": time.Now().UTC().Add(-d)}, }) if err != nil { diff --git a/src/lib/trays/tray.go b/src/lib/trays/tray.go index 338ceb0..23f95e0 100644 --- a/src/lib/trays/tray.go +++ b/src/lib/trays/tray.go @@ -68,7 +68,11 @@ func (tray *Tray) GetTrayType() config.TrayType { } func (tray *Tray) GetTrayConfig() config.TrayConfig { - return config.AppConfig.GetTrayType(tray.TrayTypeName).Config + tt := config.AppConfig.GetTrayType(tray.TrayTypeName) + if tt == nil { + return nil + } + return tt.Config } func (tray *Tray) String() string { From 6fca1cc348ff88e21bc1168e2c672ec21d1c52d2 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Wed, 25 Mar 2026 21:27:53 +0400 Subject: [PATCH 12/27] cleanup --- src/agent/catteryClient/client.go | 18 -------------- src/lib/githubClient/githubClient.go | 37 ---------------------------- src/lib/scaleSetPoller/poller.go | 1 - src/server/handlers/agentHandler.go | 20 --------------- 4 files changed, 76 deletions(-) diff --git a/src/agent/catteryClient/client.go b/src/agent/catteryClient/client.go index 3065b93..c586c62 100644 --- a/src/agent/catteryClient/client.go +++ b/src/agent/catteryClient/client.go @@ -6,11 +6,9 @@ import ( "cattery/lib/messages" "encoding/json" "errors" - "fmt" "io" "net/http" "net/url" - "strings" "github.com/sirupsen/logrus" ) @@ -161,19 +159,3 @@ func (c *CatteryClient) InterruptAgent(agent *agents.Agent) error { return nil } - -// get -func (c *CatteryClient) get(path ...string) (*http.Response, error) { - client := c.httpClient - requestUrl, err := url.JoinPath(c.baseURL, path...) - if err != nil { - return nil, errors.New(fmt.Sprintf("failed to join path %s, %s", strings.Join(path, " "), err.Error())) - } - - response, err := client.Get(requestUrl) - if err != nil { - return nil, errors.New(fmt.Sprintf("failed to do request %s, %s", requestUrl, err.Error())) - } - - return response, nil -} diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index a461910..b380a13 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -22,13 +22,6 @@ type GithubClient struct { Org *config.GitHubOrganization } -func NewGithubClientWithOrgConfig(org *config.GitHubOrganization) *GithubClient { - return &GithubClient{ - client: createClient(org), - Org: org, - } -} - func NewGithubClientWithOrgName(orgName string) (*GithubClient, error) { var orgConfig = config.AppConfig.GetGitHubOrg(orgName) @@ -42,21 +35,6 @@ func NewGithubClientWithOrgName(orgName string) (*GithubClient, error) { }, nil } -// CreateJITConfig creates a new JIT config -func (gc *GithubClient) CreateJITConfig(name string, runnerGroupId int64, labels []string) (*github.JITRunnerConfig, error) { - jitConfig, _, err := gc.client.Actions.GenerateOrgJITConfig( - context.Background(), - gc.Org.Name, - &github.GenerateJITConfigRequest{ - Name: name, - RunnerGroupID: runnerGroupId, - Labels: labels, - }, - ) - - return jitConfig, err -} - func (gc *GithubClient) RestartFailedJobs(repoName string, workflowId int64) error { wr, _, err := gc.client.Actions.GetWorkflowRunByID(context.Background(), gc.Org.Name, repoName, workflowId) if err != nil { @@ -76,21 +54,6 @@ func (gc *GithubClient) GetWorkflowRunStatus(repoName string, workflowRunId int6 return wr.GetStatus(), wr.GetConclusion(), nil } -func (gc *GithubClient) CheckJobCompleted(repoName string, jobId int64) (bool, error) { - wfJob, resp, err := gc.client.Actions.GetWorkflowJobByID(context.Background(), gc.Org.Name, repoName, jobId) - if err != nil { - if resp != nil && resp.StatusCode == http.StatusNotFound { - log.Tracef("Workflow job not found: %s/%s %d", gc.Org.Name, repoName, jobId) - return true, nil - } - return false, err - } - - var status = wfJob.GetStatus() - - return status == "completed", nil -} - // createClient creates a new GitHub client func createClient(org *config.GitHubOrganization) *github.Client { githubClientsMu.Lock() diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index 1699827..d3a1b32 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -124,7 +124,6 @@ func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset return err } - metrics.RegisteredTraysAdd(cs.poller.trayType.GitHubOrg, cs.poller.trayType.Name, 0) return nil } diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index 38511e7..c80ec31 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -26,11 +26,6 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req logger.Tracef("AgentRegister: %v", r) - if r.Method != http.MethodGet { - http.Error(responseWriter, "Method not allowed", http.StatusMethodNotAllowed) - return - } - var id = r.PathValue("id") var agentId = validateAgentId(id) @@ -118,11 +113,6 @@ func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.R logger.Tracef("AgentUnregister: %v", r) - if r.Method != http.MethodPost { - http.Error(responseWriter, "Method not allowed", http.StatusMethodNotAllowed) - return - } - var trayId = r.PathValue("id") var tray, err = h.TrayManager.GetTrayById(trayId) @@ -174,11 +164,6 @@ func AgentDownloadBinary(responseWriter http.ResponseWriter, r *http.Request) { }) logger.Tracef("AgentDownloadBinary: %v", r) - if r.Method != http.MethodGet { - http.Error(responseWriter, "Method not allowed", http.StatusMethodNotAllowed) - return - } - // Get the current executable path execPath, err := os.Executable() if err != nil { @@ -288,11 +273,6 @@ func (h *Handlers) AgentInterrupt(responseWriter http.ResponseWriter, r *http.Re logger.Tracef("AgentRestart: %v", r) - if r.Method != http.MethodPost { - http.Error(responseWriter, "Method not allowed", http.StatusMethodNotAllowed) - return - } - var id = r.PathValue("id") var agentId = validateAgentId(id) From 987074960fc03b3fbc3fa00434eed1841878268c Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Wed, 25 Mar 2026 23:59:03 +0400 Subject: [PATCH 13/27] fix --- src/lib/scaleSetClient/scaleSetClient.go | 12 +++++++++- src/lib/scaleSetPoller/poller.go | 3 +-- src/lib/trayManager/trayManager.go | 30 +++++++++++++++--------- src/server/server.go | 18 ++++++++++++-- 4 files changed, 47 insertions(+), 16 deletions(-) diff --git a/src/lib/scaleSetClient/scaleSetClient.go b/src/lib/scaleSetClient/scaleSetClient.go index 3c13b85..211bfdf 100644 --- a/src/lib/scaleSetClient/scaleSetClient.go +++ b/src/lib/scaleSetClient/scaleSetClient.go @@ -51,7 +51,10 @@ func NewScaleSetClient(org *config.GitHubOrganization, trayType *config.TrayType func (sc *ScaleSetClient) EnsureScaleSet(ctx context.Context) error { existing, err := sc.client.GetRunnerScaleSet(ctx, int(sc.trayType.RunnerGroupId), sc.trayType.Name) - if err == nil && existing != nil { + if err != nil { + return fmt.Errorf("failed to get scale set: %w", err) + } + if existing != nil { sc.scaleSet = existing sc.logger.Infof("Found existing scale set: %s (ID: %d)", existing.Name, existing.ID) return nil @@ -113,3 +116,10 @@ func (sc *ScaleSetClient) GetScaleSetID() int { } return 0 } + +func (sc *ScaleSetClient) Session() scaleset.RunnerScaleSetSession { + if sc.session != nil { + return sc.session.Session() + } + return scaleset.RunnerScaleSetSession{} +} diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index d3a1b32..15ead67 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -88,8 +88,7 @@ func (s *sessionAdapter) DeleteMessage(ctx context.Context, messageID int) error } func (s *sessionAdapter) Session() scaleset.RunnerScaleSetSession { - // The listener needs this for logging/metadata only. - return scaleset.RunnerScaleSetSession{} + return s.client.Session() } // catteryScaler implements the listener.Scaler interface. diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 96da9b4..a63003b 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -185,21 +185,22 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { }() } -// ScaleForDemand scales trays for a given tray type based on pending job count. -// Called by the scale set poller with statistics from GitHub. -func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, pendingJobs int) error { +// ScaleForDemand scales trays for a given tray type based on the desired runner count. +// The desiredCount is TotalAssignedJobs from GitHub scale set statistics — the total +// number of runners that should exist (running + idle) to serve all assigned jobs. +func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, desiredCount int) error { countByStatus, total, err := tm.trayRepository.CountByTrayType(trayType.Name) if err != nil { log.Errorf("Failed to count trays for type %s: %v", trayType.Name, err) return err } - traysWithNoJob := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] + idleTrays := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] activeTotal := total - countByStatus[trays.TrayStatusDeleting] - if pendingJobs > traysWithNoJob { + if desiredCount > activeTotal { remainingCapacity := trayType.MaxTrays - activeTotal - traysToCreate := pendingJobs - traysWithNoJob + traysToCreate := desiredCount - activeTotal if traysToCreate > remainingCapacity { traysToCreate = remainingCapacity } @@ -211,8 +212,12 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, pendingJobs int } } - if pendingJobs < traysWithNoJob { - traysToDelete := traysWithNoJob - pendingJobs + if desiredCount < activeTotal && idleTrays > 0 { + excess := activeTotal - desiredCount + traysToDelete := excess + if traysToDelete > idleTrays { + traysToDelete = idleTrays + } redundant, err := tm.trayRepository.MarkRedundant(trayType.Name, traysToDelete) if err != nil { return err @@ -227,8 +232,11 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, pendingJobs int return nil } -// CountTrays returns the total number of trays for a given tray type. +// CountTrays returns the number of active (non-deleting) trays for a given tray type. func (tm *TrayManager) CountTrays(trayTypeName string) (int, error) { - _, total, err := tm.trayRepository.CountByTrayType(trayTypeName) - return total, err + countByStatus, total, err := tm.trayRepository.CountByTrayType(trayTypeName) + if err != nil { + return 0, err + } + return total - countByStatus[trays.TrayStatusDeleting], nil } diff --git a/src/server/server.go b/src/server/server.go index f5f89b0..c3eecc3 100644 --- a/src/server/server.go +++ b/src/server/server.go @@ -14,6 +14,7 @@ import ( "os" "os/signal" "syscall" + "time" "github.com/prometheus/client_golang/prometheus/promhttp" log "github.com/sirupsen/logrus" @@ -77,8 +78,21 @@ func Start() { ssm.Register(trayType.Name, poller) go func(p *scaleSetPoller.Poller, name string) { - if err := p.Run(ctx); err != nil { - logger.Errorf("Scale set poller for '%s' exited with error: %v", name, err) + for { + if err := p.Run(ctx); err != nil { + if ctx.Err() != nil { + logger.Infof("Scale set poller for '%s' stopped: %v", name, err) + return + } + logger.Errorf("Scale set poller for '%s' exited with error: %v — restarting in 30s", name, err) + select { + case <-ctx.Done(): + return + case <-time.After(30 * time.Second): + } + continue + } + return } }(poller, trayType.Name) } From 45dad843b61ce35dfa0c76b847df69073beda9df Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 00:57:04 +0400 Subject: [PATCH 14/27] err handling --- src/lib/config/config.go | 6 +++--- src/lib/trayManager/trayManager.go | 14 ++++--------- .../trays/providers/trayProviderFactory.go | 21 ++++++++++++------- src/server/handlers/agentHandler.go | 9 +++++++- 4 files changed, 28 insertions(+), 22 deletions(-) diff --git a/src/lib/config/config.go b/src/lib/config/config.go index 60780bc..2b32cb8 100644 --- a/src/lib/config/config.go +++ b/src/lib/config/config.go @@ -92,10 +92,8 @@ func LoadConfig(configPath *string) (*CatteryConfig, error) { } } - AppConfig = appConfig - validate := validator.New() - err = validate.Struct(AppConfig) + err = validate.Struct(appConfig) if err != nil { // err is of type validator.ValidationErrors for _, fieldErr := range err.(validator.ValidationErrors) { @@ -103,6 +101,8 @@ func LoadConfig(configPath *string) (*CatteryConfig, error) { } } + AppConfig = appConfig + return appConfig, nil } diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index a63003b..5b9b8c5 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -7,7 +7,6 @@ import ( "cattery/lib/trays/providers" "cattery/lib/trays/repositories" "context" - "errors" "fmt" "time" @@ -39,9 +38,7 @@ func (tm *TrayManager) CreateTray(trayType *config.TrayType) error { provider, err := providers.GetProvider(trayType.Provider) if err != nil { - var errMsg = fmt.Sprintf("Failed to get provider for type %s: %v", trayType.Name, err) - log.Error(errMsg) - return errors.New(errMsg) + return fmt.Errorf("failed to get provider for type %s: %w", trayType.Name, err) } tray := trays.NewTray(*trayType) @@ -84,8 +81,7 @@ func (tm *TrayManager) Registering(trayId string) (*trays.Tray, error) { return nil, err } if tray == nil { - var errorMsg = fmt.Sprintf("Failed to update tray status for tray '%s'", trayId) - return nil, errors.New(errorMsg) + return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } return tray, nil @@ -97,8 +93,7 @@ func (tm *TrayManager) Registered(trayId string, ghRunnerId int64) (*trays.Tray, return nil, err } if tray == nil { - var errorMsg = fmt.Sprintf("Failed to update tray status for tray '%s'", trayId) - return nil, errors.New(errorMsg) + return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } return tray, nil @@ -110,8 +105,7 @@ func (tm *TrayManager) SetJob(trayId string, jobRunId int64, workflowRunId int64 return nil, err } if tray == nil { - var errorMsg = fmt.Sprintf("Failed to update tray status for tray '%s'", trayId) - return nil, errors.New(errorMsg) + return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } return tray, nil diff --git a/src/lib/trays/providers/trayProviderFactory.go b/src/lib/trays/providers/trayProviderFactory.go index c152668..bac9b19 100644 --- a/src/lib/trays/providers/trayProviderFactory.go +++ b/src/lib/trays/providers/trayProviderFactory.go @@ -4,10 +4,15 @@ import ( "cattery/lib/config" "cattery/lib/trays" "errors" + "sync" + log "github.com/sirupsen/logrus" ) -var providers = make(map[string]ITrayProvider) +var ( + providersMu sync.RWMutex + providers = make(map[string]ITrayProvider) +) var logger = log.WithFields(log.Fields{ "name": "trayProviderFactory", @@ -28,19 +33,19 @@ func GetProviderByTrayTypeName(trayTypeName string) (ITrayProvider, error) { } func GetProvider(providerName string) (ITrayProvider, error) { - + providersMu.RLock() if existingProvider, ok := providers[providerName]; ok { + providersMu.RUnlock() return existingProvider, nil } + providersMu.RUnlock() var result ITrayProvider var p = config.AppConfig.GetProvider(providerName) if p == nil { - var err = errors.New("No provider found for " + providerName) - logger.Error(err.Error()) - return nil, err + return nil, errors.New("no provider found for " + providerName) } var provider = *p @@ -51,16 +56,16 @@ func GetProvider(providerName string) (ITrayProvider, error) { case "google": result = NewGceProvider(providerName, provider) default: - var errMsg = "Unknown provider: " + providerName - logger.Error(errMsg) - return nil, errors.New(errMsg) + return nil, errors.New("unknown provider type: " + provider["type"]) } if result == nil { return nil, errors.New("failed to initialize provider: " + providerName) } + providersMu.Lock() providers[providerName] = result + providersMu.Unlock() return result, nil } diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index c80ec31..b764f16 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -82,6 +82,7 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req JitConfig: jitConfig, } + responseWriter.Header().Set("Content-Type", "application/json") err = json.NewEncoder(responseWriter).Encode(registerResponse) if err != nil { logger.Errorf("Failed to encode response: %v", err) @@ -146,6 +147,8 @@ func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.R if err != nil { logger.Errorf("Failed to delete tray: %v", err) + http.Error(responseWriter, "Failed to delete tray", http.StatusInternalServerError) + return } logger.Infof("Agent %s unregistered, reason: %d", unregisterRequest.Agent.AgentId, unregisterRequest.Reason) @@ -296,5 +299,9 @@ func (h *Handlers) AgentInterrupt(responseWriter http.ResponseWriter, r *http.Re return } workflowRunId := tray.WorkflowRunId - h.RestartManager.RequestRestart(workflowRunId, tray.GitHubOrgName, tray.Repository) + if err := h.RestartManager.RequestRestart(workflowRunId, tray.GitHubOrgName, tray.Repository); err != nil { + logger.Errorf("Failed to request restart for workflow %d: %v", workflowRunId, err) + http.Error(responseWriter, "Failed to request restart", http.StatusInternalServerError) + return + } } From 4d4656b57947e5ab0156b0f256d0cc716cfc7f9e Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 01:02:35 +0400 Subject: [PATCH 15/27] warnings fix --- src/go.mod | 2 +- src/go.sum | 14 ++++++++++++-- .../trays/repositories/mongodbTrayRepository.go | 8 ++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/src/go.mod b/src/go.mod index d3e7ad4..b3e7782 100644 --- a/src/go.mod +++ b/src/go.mod @@ -4,6 +4,7 @@ go 1.25.5 require ( cloud.google.com/go/compute v1.53.0 + github.com/actions/scaleset v0.2.0 github.com/bradleyfalzon/ghinstallation/v2 v2.17.0 github.com/fsnotify/fsnotify v1.9.0 github.com/go-playground/validator v9.31.0+incompatible @@ -23,7 +24,6 @@ require ( cloud.google.com/go/auth v0.18.0 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect - github.com/actions/scaleset v0.2.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect diff --git a/src/go.sum b/src/go.sum index af5ec6f..4fff80a 100644 --- a/src/go.sum +++ b/src/go.sum @@ -18,9 +18,11 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/fatih/color v1.16.0 h1:zmkK9Ngbjj+K0yRhTVONQh1p/HknKYSlNT+vZCzyokM= +github.com/fatih/color v1.16.0/go.mod h1:fL2Sau1YI5c0pdGEVCbKQbLXB6edEj1ZgiY4NijnWvE= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= @@ -65,6 +67,8 @@ github.com/googleapis/gax-go/v2 v2.16.0 h1:iHbQmKLLZrexmb0OSsNGTeSTS0HO4YvFOG8g5 github.com/googleapis/gax-go/v2 v2.16.0/go.mod h1:o1vfQjjNZn4+dPnRdl/4ZD7S9414Y4xA+a/6Icj6l14= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= +github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= +github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-retryablehttp v0.7.8 h1:ylXZWnqa7Lhqpk0L1P1LzDtGcCR0rPVUrx/c8Unxc48= github.com/hashicorp/go-retryablehttp v0.7.8/go.mod h1:rjiScheydd+CxvumBsIrFKlx3iS0jrZ7LvzFGFmuKbw= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= @@ -79,12 +83,16 @@ github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0 github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= +github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= @@ -113,6 +121,8 @@ github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3A github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index d0db328..4abf9d4 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -143,12 +143,12 @@ func (m *MongodbTrayRepository) Delete(trayId string) error { func (m *MongodbTrayRepository) CountByTrayType(trayType string) (map[trays.TrayStatus]int, int, error) { var matchStage = bson.D{ - {"$match", bson.D{{"trayTypeName", trayType}}}, + {Key: "$match", Value: bson.D{{Key: "trayTypeName", Value: trayType}}}, } var groupStage = bson.D{ - {"$group", bson.D{ - {"_id", "$status"}, - {"count", bson.D{{"$sum", 1}}}, + {Key: "$group", Value: bson.D{ + {Key: "_id", Value: "$status"}, + {Key: "count", Value: bson.D{{Key: "$sum", Value: 1}}}, }}} cursor, err := m.collection.Aggregate(context.Background(), mongo.Pipeline{matchStage, groupStage}) From fc93fd28c9a4481b9f7571b2e380c69175ef3312 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 01:29:25 +0400 Subject: [PATCH 16/27] great cleanup --- src/agent/catteryClient/client.go | 32 -------- .../repositories/iRestarterRepository.go | 11 ++- .../mongodbRestarterRepository.go | 23 ++---- src/lib/restarter/workflowRestarter.go | 21 ++--- src/lib/scaleSetPoller/poller.go | 16 ++-- src/lib/trayManager/trayManager.go | 74 +++++++---------- src/lib/trays/providers/dockerProvider.go | 10 +-- src/lib/trays/providers/gceProvider.go | 17 ++-- src/lib/trays/repositories/iTrayRepository.go | 15 ++-- .../repositories/mongodbTrayRepository.go | 81 +++++++------------ .../mongodbTrayRepository_test.go | 44 +++++----- src/lib/trays/tray.go | 38 +++------ src/server/handlers/agentHandler.go | 54 +++---------- 13 files changed, 160 insertions(+), 276 deletions(-) diff --git a/src/agent/catteryClient/client.go b/src/agent/catteryClient/client.go index c586c62..68e419b 100644 --- a/src/agent/catteryClient/client.go +++ b/src/agent/catteryClient/client.go @@ -127,35 +127,3 @@ func (c *CatteryClient) Ping() (*messages.PingResponse, error) { return pingResponse, nil } - -func (c *CatteryClient) InterruptAgent(agent *agents.Agent) error { - - var client = c.httpClient - - requestJson, err := json.Marshal(messages.UnregisterRequest{ - Agent: *agent, - }) - if err != nil { - return err - } - - requestUrl, err := url.JoinPath(c.baseURL, "/agent", "interrupt/", agent.AgentId) - if err != nil { - return err - } - - var request, _ = http.NewRequest("POST", requestUrl, bytes.NewBuffer(requestJson)) - response, err := client.Do(request) - if err != nil { - return err - } - - defer response.Body.Close() - - if response.StatusCode != http.StatusOK { - bodyBytes, _ := io.ReadAll(response.Body) - return errors.New("response status code: " + response.Status + " body: " + string(bodyBytes)) - } - - return nil -} diff --git a/src/lib/restarter/repositories/iRestarterRepository.go b/src/lib/restarter/repositories/iRestarterRepository.go index e24786d..0fcbda3 100644 --- a/src/lib/restarter/repositories/iRestarterRepository.go +++ b/src/lib/restarter/repositories/iRestarterRepository.go @@ -1,6 +1,9 @@ package repositories -import "time" +import ( + "context" + "time" +) type RestartRequest struct { WorkflowRunId int64 `bson:"workflowRunId"` @@ -10,7 +13,7 @@ type RestartRequest struct { } type IRestarterRepository interface { - SaveRestartRequest(workflowRunId int64, orgName string, repoName string) error - DeleteRestartRequest(workflowRunId int64) error - GetAllPendingRestartRequests() ([]RestartRequest, error) + SaveRestartRequest(ctx context.Context, workflowRunId int64, orgName string, repoName string) error + DeleteRestartRequest(ctx context.Context, workflowRunId int64) error + GetAllPendingRestartRequests(ctx context.Context) ([]RestartRequest, error) } diff --git a/src/lib/restarter/repositories/mongodbRestarterRepository.go b/src/lib/restarter/repositories/mongodbRestarterRepository.go index 34b296a..7e98018 100644 --- a/src/lib/restarter/repositories/mongodbRestarterRepository.go +++ b/src/lib/restarter/repositories/mongodbRestarterRepository.go @@ -21,12 +21,10 @@ func (m *MongodbRestarterRepository) Connect(collection *mongo.Collection) { m.collection = collection } -func (m *MongodbRestarterRepository) SaveRestartRequest(workflowRunId int64, orgName string, repoName string) error { +func (m *MongodbRestarterRepository) SaveRestartRequest(ctx context.Context, workflowRunId int64, orgName string, repoName string) error { _, err := m.collection.UpdateOne( - context.Background(), - bson.M{ - "workflowRunId": workflowRunId, - }, + ctx, + bson.M{"workflowRunId": workflowRunId}, bson.M{ "$set": bson.M{ "workflowRunId": workflowRunId, @@ -40,24 +38,19 @@ func (m *MongodbRestarterRepository) SaveRestartRequest(workflowRunId int64, org return err } -func (m *MongodbRestarterRepository) DeleteRestartRequest(workflowRunId int64) error { - _, err := m.collection.DeleteOne( - context.Background(), - bson.M{ - "workflowRunId": workflowRunId, - }, - ) +func (m *MongodbRestarterRepository) DeleteRestartRequest(ctx context.Context, workflowRunId int64) error { + _, err := m.collection.DeleteOne(ctx, bson.M{"workflowRunId": workflowRunId}) return err } -func (m *MongodbRestarterRepository) GetAllPendingRestartRequests() ([]RestartRequest, error) { - cursor, err := m.collection.Find(context.Background(), bson.M{}) +func (m *MongodbRestarterRepository) GetAllPendingRestartRequests(ctx context.Context) ([]RestartRequest, error) { + cursor, err := m.collection.Find(ctx, bson.M{}) if err != nil { return nil, err } var requests []RestartRequest - if err := cursor.All(context.Background(), &requests); err != nil { + if err := cursor.All(ctx, &requests); err != nil { return nil, err } return requests, nil diff --git a/src/lib/restarter/workflowRestarter.go b/src/lib/restarter/workflowRestarter.go index aa52e1c..da18a1c 100644 --- a/src/lib/restarter/workflowRestarter.go +++ b/src/lib/restarter/workflowRestarter.go @@ -21,12 +21,11 @@ func NewWorkflowRestarter(repository repositories.IRestarterRepository) *Workflo func (wr *WorkflowRestarter) RequestRestart(workflowRunId int64, orgName string, repoName string) error { log.Debugf("Requesting restart for workflow run id %d (%s/%s)", workflowRunId, orgName, repoName) - return wr.repository.SaveRestartRequest(workflowRunId, orgName, repoName) + return wr.repository.SaveRestartRequest(context.Background(), workflowRunId, orgName, repoName) } // StartPoller starts a background goroutine that periodically checks pending restart // requests and triggers restarts when workflows have completed with failure. -// This replaces the webhook-based workflow_run event handling. func (wr *WorkflowRestarter) StartPoller(ctx context.Context) { const pollInterval = 30 * time.Second const requestTTL = 1 * time.Hour @@ -41,7 +40,7 @@ func (wr *WorkflowRestarter) StartPoller(ctx context.Context) { return default: time.Sleep(pollInterval) - wr.pollPendingRestarts(logger, requestTTL) + wr.pollPendingRestarts(ctx, logger, requestTTL) } } }() @@ -49,26 +48,25 @@ func (wr *WorkflowRestarter) StartPoller(ctx context.Context) { logger.Info("Restart poller started") } -func (wr *WorkflowRestarter) pollPendingRestarts(logger *log.Entry, ttl time.Duration) { - requests, err := wr.repository.GetAllPendingRestartRequests() +func (wr *WorkflowRestarter) pollPendingRestarts(ctx context.Context, logger *log.Entry, ttl time.Duration) { + requests, err := wr.repository.GetAllPendingRestartRequests(ctx) if err != nil { logger.Errorf("Failed to get pending restart requests: %v", err) return } for _, req := range requests { - // TTL safety net: delete stale requests if time.Since(req.CreatedAt) > ttl { logger.Warnf("Restart request for workflow %d expired (age: %v), deleting", req.WorkflowRunId, time.Since(req.CreatedAt)) - _ = wr.repository.DeleteRestartRequest(req.WorkflowRunId) + _ = wr.repository.DeleteRestartRequest(ctx, req.WorkflowRunId) continue } - wr.handleRestartRequest(logger, req) + wr.handleRestartRequest(ctx, logger, req) } } -func (wr *WorkflowRestarter) handleRestartRequest(logger *log.Entry, req repositories.RestartRequest) { +func (wr *WorkflowRestarter) handleRestartRequest(ctx context.Context, logger *log.Entry, req repositories.RestartRequest) { ghClient, err := githubClient.NewGithubClientWithOrgName(req.OrgName) if err != nil { logger.Errorf("Failed to get GitHub client for org %s: %v", req.OrgName, err) @@ -82,7 +80,6 @@ func (wr *WorkflowRestarter) handleRestartRequest(logger *log.Entry, req reposit } if status != "completed" { - // Workflow still running, skip for now return } @@ -96,12 +93,10 @@ func (wr *WorkflowRestarter) handleRestartRequest(logger *log.Entry, req reposit } logger.Infof("Successfully restarted failed jobs for workflow run %d", req.WorkflowRunId) default: - // success, cancelled, or other — just clean up logger.Debugf("Workflow run %d completed with conclusion '%s', cleaning up restart request", req.WorkflowRunId, conclusion) } - err = wr.repository.DeleteRestartRequest(req.WorkflowRunId) - if err != nil { + if err := wr.repository.DeleteRestartRequest(ctx, req.WorkflowRunId); err != nil { logger.Errorf("Failed to delete restart request for workflow %d: %v", req.WorkflowRunId, err) } } diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index 15ead67..ce102da 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -1,7 +1,6 @@ package scaleSetPoller import ( - "cattery/lib/config" "cattery/lib/metrics" "cattery/lib/scaleSetClient" "cattery/lib/trayManager" @@ -9,6 +8,8 @@ import ( "fmt" "strconv" + "cattery/lib/config" + "github.com/actions/scaleset" "github.com/actions/scaleset/listener" log "github.com/sirupsen/logrus" @@ -66,9 +67,7 @@ func (p *Poller) Run(ctx context.Context) error { return fmt.Errorf("failed to create listener: %w", err) } - scaler := &catteryScaler{ - poller: p, - } + scaler := &catteryScaler{poller: p} p.logger.Info("Entering listener loop") return l.Run(ctx, scaler) @@ -97,13 +96,13 @@ type catteryScaler struct { } func (cs *catteryScaler) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) { - err := cs.poller.trayManager.ScaleForDemand(cs.poller.trayType, count) + err := cs.poller.trayManager.ScaleForDemand(ctx, cs.poller.trayType, count) if err != nil { cs.poller.logger.Errorf("Failed to scale for demand (%d): %v", count, err) return 0, err } - total, err := cs.poller.trayManager.CountTrays(cs.poller.trayType.Name) + total, err := cs.poller.trayManager.CountTrays(ctx, cs.poller.trayType.Name) if err != nil { return 0, err } @@ -115,9 +114,8 @@ func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.WorkflowRunID) jobID, _ := strconv.ParseInt(jobInfo.JobID, 10, 64) - repository := jobInfo.RepositoryName - _, err := cs.poller.trayManager.SetJob(jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, repository) + _, err := cs.poller.trayManager.SetJob(ctx, jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, jobInfo.RepositoryName) if err != nil { cs.poller.logger.Errorf("Failed to set job on tray %s: %v", jobInfo.RunnerName, err) return err @@ -130,7 +128,7 @@ func (cs *catteryScaler) HandleJobCompleted(ctx context.Context, jobInfo *scales cs.poller.logger.Infof("Job completed: %s on runner %s (result: %s)", jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.Result) - _, err := cs.poller.trayManager.DeleteTray(jobInfo.RunnerName) + _, err := cs.poller.trayManager.DeleteTray(ctx, jobInfo.RunnerName) if err != nil { cs.poller.logger.Errorf("Failed to delete tray %s: %v", jobInfo.RunnerName, err) return err diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 5b9b8c5..da693ff 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -23,19 +23,17 @@ func NewTrayManager(trayRepository repositories.ITrayRepository) *TrayManager { } } -func (tm *TrayManager) createTrays(trayType *config.TrayType, n int) error { +func (tm *TrayManager) createTrays(ctx context.Context, trayType *config.TrayType, n int) error { for i := 0; i < n; i++ { log.Infof("Creating tray %d for type: %s", i+1, trayType.Name) - err := tm.CreateTray(trayType) - if err != nil { + if err := tm.CreateTray(ctx, trayType); err != nil { return err } } return nil } -func (tm *TrayManager) CreateTray(trayType *config.TrayType) error { - +func (tm *TrayManager) CreateTray(ctx context.Context, trayType *config.TrayType) error { provider, err := providers.GetProvider(trayType.Provider) if err != nil { return fmt.Errorf("failed to get provider for type %s: %w", trayType.Name, err) @@ -45,16 +43,16 @@ func (tm *TrayManager) CreateTray(trayType *config.TrayType) error { err = provider.RunTray(tray) if err != nil { - log.Errorf("Failed to run tray for provider '%s', tray '%s': %v", trayType.Provider, tray.GetId(), err) + log.Errorf("Failed to run tray for provider '%s', tray '%s': %v", trayType.Provider, tray.Id, err) metrics.TrayProviderErrors(tray.GitHubOrgName, tray.ProviderName, tray.TrayTypeName, "create") return err } - err = tm.trayRepository.Save(tray) + err = tm.trayRepository.Save(ctx, tray) if err != nil { log.Errorf("Failed to save tray %s: %v — cleaning up provider resource", trayType.Name, err) if cleanErr := provider.CleanTray(tray); cleanErr != nil { - log.Errorf("Failed to clean up tray %s after save failure: %v", tray.GetId(), cleanErr) + log.Errorf("Failed to clean up tray %s after save failure: %v", tray.Id, cleanErr) metrics.TrayProviderErrors(tray.GitHubOrgName, tray.ProviderName, tray.TrayTypeName, "delete") } return fmt.Errorf("failed to save tray %s: %w", trayType.Name, err) @@ -63,8 +61,8 @@ func (tm *TrayManager) CreateTray(trayType *config.TrayType) error { return nil } -func (tm *TrayManager) GetTrayById(trayId string) (*trays.Tray, error) { - tray, err := tm.trayRepository.GetById(trayId) +func (tm *TrayManager) GetTrayById(ctx context.Context, trayId string) (*trays.Tray, error) { + tray, err := tm.trayRepository.GetById(ctx, trayId) if err != nil { return nil, err } @@ -75,50 +73,46 @@ func (tm *TrayManager) GetTrayById(trayId string) (*trays.Tray, error) { return tray, nil } -func (tm *TrayManager) Registering(trayId string) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistering, 0, 0, 0, "") +func (tm *TrayManager) Registering(ctx context.Context, trayId string) (*trays.Tray, error) { + tray, err := tm.trayRepository.UpdateStatus(ctx, trayId, trays.TrayStatusRegistering, 0, 0, 0, "") if err != nil { return nil, err } if tray == nil { return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } - return tray, nil } -func (tm *TrayManager) Registered(trayId string, ghRunnerId int64) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRegistered, 0, 0, ghRunnerId, "") +func (tm *TrayManager) Registered(ctx context.Context, trayId string, ghRunnerId int64) (*trays.Tray, error) { + tray, err := tm.trayRepository.UpdateStatus(ctx, trayId, trays.TrayStatusRegistered, 0, 0, ghRunnerId, "") if err != nil { return nil, err } if tray == nil { return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } - return tray, nil } -func (tm *TrayManager) SetJob(trayId string, jobRunId int64, workflowRunId int64, repository string) (*trays.Tray, error) { - tray, err := tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusRunning, jobRunId, workflowRunId, 0, repository) +func (tm *TrayManager) SetJob(ctx context.Context, trayId string, jobRunId int64, workflowRunId int64, repository string) (*trays.Tray, error) { + tray, err := tm.trayRepository.UpdateStatus(ctx, trayId, trays.TrayStatusRunning, jobRunId, workflowRunId, 0, repository) if err != nil { return nil, err } if tray == nil { return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) } - return tray, nil } -func (tm *TrayManager) DeleteTray(trayId string) (*trays.Tray, error) { - - var tray, err = tm.trayRepository.UpdateStatus(trayId, trays.TrayStatusDeleting, 0, 0, 0, "") +func (tm *TrayManager) DeleteTray(ctx context.Context, trayId string) (*trays.Tray, error) { + tray, err := tm.trayRepository.UpdateStatus(ctx, trayId, trays.TrayStatusDeleting, 0, 0, 0, "") if err != nil { return nil, err } if tray == nil { - return nil, nil // Tray not found, nothing to delete + return nil, nil } provider, err := providers.GetProviderForTray(tray) @@ -128,12 +122,12 @@ func (tm *TrayManager) DeleteTray(trayId string) (*trays.Tray, error) { err = provider.CleanTray(tray) if err != nil { - log.Errorf("Failed to delete tray for provider %s, tray %s: %v", provider.GetProviderName(), tray.GetId(), err) + log.Errorf("Failed to delete tray for provider %s, tray %s: %v", provider.GetProviderName(), tray.Id, err) metrics.TrayProviderErrors(tray.GitHubOrgName, tray.ProviderName, tray.TrayTypeName, "delete") return nil, err } - err = tm.trayRepository.Delete(trayId) + err = tm.trayRepository.Delete(ctx, trayId) if err != nil { return nil, err } @@ -142,8 +136,7 @@ func (tm *TrayManager) DeleteTray(trayId string) (*trays.Tray, error) { } func (tm *TrayManager) HandleStale(ctx context.Context) { - - var interval = time.Minute * 2 + interval := time.Minute * 2 go func() { for { @@ -151,10 +144,9 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { case <-ctx.Done(): return default: - time.Sleep(interval / 2) - stale, err := tm.trayRepository.GetStale(interval) + stale, err := tm.trayRepository.GetStale(ctx, interval) if err != nil { log.Errorf("Failed to get stale trays: %v", err) continue @@ -165,13 +157,10 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { } for _, tray := range stale { - log.Debugf("Deleting stale tray: %s", tray.GetId()) - - _, err := tm.DeleteTray(tray.GetId()) - if err != nil { - log.Errorf("Failed to delete tray %s: %v", tray.GetId(), err) + log.Debugf("Deleting stale tray: %s", tray.Id) + if _, err := tm.DeleteTray(ctx, tray.Id); err != nil { + log.Errorf("Failed to delete tray %s: %v", tray.Id, err) } - metrics.StaleTraysInc(tray.GitHubOrgName, tray.TrayTypeName) } } @@ -182,8 +171,8 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { // ScaleForDemand scales trays for a given tray type based on the desired runner count. // The desiredCount is TotalAssignedJobs from GitHub scale set statistics — the total // number of runners that should exist (running + idle) to serve all assigned jobs. -func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, desiredCount int) error { - countByStatus, total, err := tm.trayRepository.CountByTrayType(trayType.Name) +func (tm *TrayManager) ScaleForDemand(ctx context.Context, trayType *config.TrayType, desiredCount int) error { + countByStatus, total, err := tm.trayRepository.CountByTrayType(ctx, trayType.Name) if err != nil { log.Errorf("Failed to count trays for type %s: %v", trayType.Name, err) return err @@ -199,8 +188,7 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, desiredCount in traysToCreate = remainingCapacity } if traysToCreate > 0 { - err := tm.createTrays(trayType, traysToCreate) - if err != nil { + if err := tm.createTrays(ctx, trayType, traysToCreate); err != nil { return err } } @@ -212,12 +200,12 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, desiredCount in if traysToDelete > idleTrays { traysToDelete = idleTrays } - redundant, err := tm.trayRepository.MarkRedundant(trayType.Name, traysToDelete) + redundant, err := tm.trayRepository.MarkRedundant(ctx, trayType.Name, traysToDelete) if err != nil { return err } for _, tray := range redundant { - if _, delErr := tm.DeleteTray(tray.Id); delErr != nil { + if _, delErr := tm.DeleteTray(ctx, tray.Id); delErr != nil { log.Errorf("Failed to delete redundant tray %s: %v", tray.Id, delErr) } } @@ -227,8 +215,8 @@ func (tm *TrayManager) ScaleForDemand(trayType *config.TrayType, desiredCount in } // CountTrays returns the number of active (non-deleting) trays for a given tray type. -func (tm *TrayManager) CountTrays(trayTypeName string) (int, error) { - countByStatus, total, err := tm.trayRepository.CountByTrayType(trayTypeName) +func (tm *TrayManager) CountTrays(ctx context.Context, trayTypeName string) (int, error) { + countByStatus, total, err := tm.trayRepository.CountByTrayType(ctx, trayTypeName) if err != nil { return 0, err } diff --git a/src/lib/trays/providers/dockerProvider.go b/src/lib/trays/providers/dockerProvider.go index 9e7dbd3..93d2a3d 100644 --- a/src/lib/trays/providers/dockerProvider.go +++ b/src/lib/trays/providers/dockerProvider.go @@ -38,9 +38,9 @@ func (d *DockerProvider) GetProviderName() string { func (d *DockerProvider) RunTray(tray *trays.Tray) error { - var containerName = tray.GetId() + var containerName = tray.Id - var trayConfig = tray.GetTrayConfig().(config.DockerTrayConfig) + var trayConfig = tray.TrayConfig().(config.DockerTrayConfig) var image = trayConfig.Image @@ -50,7 +50,7 @@ func (d *DockerProvider) RunTray(tray *trays.Tray) error { "--add-host=host.docker.internal:host-gateway", "--name", containerName, image, - "/action-runner/cattery/cattery", "agent", "-i", tray.GetId(), "-s", serverUrl, "--runner-folder", "/action-runner") + "/action-runner/cattery/cattery", "agent", "-i", tray.Id, "-s", serverUrl, "--runner-folder", "/action-runner") d.logger.Info("Running docker command: ", dockerCommand.String()) err := dockerCommand.Run() @@ -64,14 +64,14 @@ func (d *DockerProvider) RunTray(tray *trays.Tray) error { } func (d *DockerProvider) CleanTray(tray *trays.Tray) error { - var dockerCommand = exec.Command("docker", "container", "stop", tray.GetId()) + var dockerCommand = exec.Command("docker", "container", "stop", tray.Id) dockerCommandOutput, err := dockerCommand.CombinedOutput() if err != nil { output := string(dockerCommandOutput) d.logger.Trace(output) if strings.Contains(strings.ToLower(output), "no such container") { - d.logger.Trace("No such container: ", tray.GetId()) + d.logger.Trace("No such container: ", tray.Id) return nil } return err diff --git a/src/lib/trays/providers/gceProvider.go b/src/lib/trays/providers/gceProvider.go index 317c387..7350fbb 100644 --- a/src/lib/trays/providers/gceProvider.go +++ b/src/lib/trays/providers/gceProvider.go @@ -50,7 +50,7 @@ func (g *GceProvider) GetProviderName() string { func (g *GceProvider) RunTray(tray *trays.Tray) error { ctx := context.Background() - var trayConfig = tray.GetTrayConfig().(config.GoogleTrayConfig) + var trayConfig = tray.TrayConfig().(config.GoogleTrayConfig) var ( project = g.providerConfig.Get("project") @@ -59,12 +59,17 @@ func (g *GceProvider) RunTray(tray *trays.Tray) error { machineType = trayConfig.MachineType ) + var extraMetadata config.TrayExtraMetadata + if tt := tray.TrayType(); tt != nil { + extraMetadata = tt.ExtraMetadata + } + var metadata = createGcpMetadata( map[string]string{ "cattery-url": config.AppConfig.Server.AdvertiseUrl, - "cattery-agent-id": tray.GetId(), + "cattery-agent-id": tray.Id, }, - tray.GetTrayType().ExtraMetadata, + extraMetadata, ) var zone = zones[rand.Intn(len(zones))] @@ -75,7 +80,7 @@ func (g *GceProvider) RunTray(tray *trays.Tray) error { SourceInstanceTemplate: &instanceTemplate, InstanceResource: &computepb.Instance{ MachineType: proto.String(fmt.Sprintf("zones/%s/machineTypes/%s", zone, machineType)), - Name: proto.String(tray.GetId()), + Name: proto.String(tray.Id), Metadata: metadata, }, }) @@ -101,7 +106,7 @@ func (g *GceProvider) CleanTray(tray *trays.Tray) error { ) _, err = client.Delete(context.Background(), &computepb.DeleteInstanceRequest{ - Instance: tray.GetId(), + Instance: tray.Id, Project: project, Zone: zone, }) @@ -111,7 +116,7 @@ func (g *GceProvider) CleanTray(tray *trays.Tray) error { if e.Code != 404 { return err } else { - g.logger.Tracef("Tray not found during deletion; skipping: %v (tray %s)", err, tray.GetId()) + g.logger.Tracef("Tray not found during deletion; skipping: %v (tray %s)", err, tray.Id) return nil } } diff --git a/src/lib/trays/repositories/iTrayRepository.go b/src/lib/trays/repositories/iTrayRepository.go index 85e0cdd..25ff366 100644 --- a/src/lib/trays/repositories/iTrayRepository.go +++ b/src/lib/trays/repositories/iTrayRepository.go @@ -2,15 +2,16 @@ package repositories import ( "cattery/lib/trays" + "context" "time" ) type ITrayRepository interface { - GetById(trayId string) (*trays.Tray, error) - Save(tray *trays.Tray) error - Delete(trayId string) error - UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) - CountByTrayType(trayType string) (map[trays.TrayStatus]int, int, error) - MarkRedundant(trayType string, limit int) ([]*trays.Tray, error) - GetStale(d time.Duration) ([]*trays.Tray, error) + GetById(ctx context.Context, trayId string) (*trays.Tray, error) + Save(ctx context.Context, tray *trays.Tray) error + Delete(ctx context.Context, trayId string) error + UpdateStatus(ctx context.Context, trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) + CountByTrayType(ctx context.Context, trayType string) (map[trays.TrayStatus]int, int, error) + MarkRedundant(ctx context.Context, trayType string, limit int) ([]*trays.Tray, error) + GetStale(ctx context.Context, d time.Duration) ([]*trays.Tray, error) } diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index 4abf9d4..43e46a0 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -23,14 +23,13 @@ func (m *MongodbTrayRepository) Connect(collection *mongo.Collection) { m.collection = collection } -func (m *MongodbTrayRepository) GetById(trayId string) (*trays.Tray, error) { - dbResult := m.collection.FindOne(context.Background(), bson.M{"id": trayId}) +func (m *MongodbTrayRepository) GetById(ctx context.Context, trayId string) (*trays.Tray, error) { + dbResult := m.collection.FindOne(ctx, bson.M{"id": trayId}) var result trays.Tray err := dbResult.Decode(&result) if err != nil { if errors.Is(err, mongo.ErrNoDocuments) { - // Handle the "not found" case implicitly return nil, nil } return nil, err @@ -39,8 +38,8 @@ func (m *MongodbTrayRepository) GetById(trayId string) (*trays.Tray, error) { return &result, nil } -func (m *MongodbTrayRepository) GetStale(d time.Duration) ([]*trays.Tray, error) { - dbResult, err := m.collection.Find(context.Background(), +func (m *MongodbTrayRepository) GetStale(ctx context.Context, d time.Duration) ([]*trays.Tray, error) { + dbResult, err := m.collection.Find(ctx, bson.M{ "status": bson.M{"$nin": bson.A{trays.TrayStatusRunning, trays.TrayStatusDeleting}}, "statusChanged": bson.M{"$lte": time.Now().UTC().Add(-d)}, @@ -50,20 +49,18 @@ func (m *MongodbTrayRepository) GetStale(d time.Duration) ([]*trays.Tray, error) } var traysArr []*trays.Tray - if err := dbResult.All(context.Background(), &traysArr); err != nil { + if err := dbResult.All(ctx, &traysArr); err != nil { return nil, err } return traysArr, nil - } -func (m *MongodbTrayRepository) MarkRedundant(trayType string, limit int) ([]*trays.Tray, error) { - - var resultTrays = make([]*trays.Tray, 0) +func (m *MongodbTrayRepository) MarkRedundant(ctx context.Context, trayType string, limit int) ([]*trays.Tray, error) { + resultTrays := make([]*trays.Tray, 0, limit) for i := 0; i < limit; i++ { dbResult := m.collection.FindOneAndUpdate( - context.Background(), + ctx, bson.M{"status": trays.TrayStatusCreating, "trayTypeName": trayType}, bson.M{"$set": bson.M{"status": trays.TrayStatusDeleting, "statusChanged": time.Now().UTC(), "jobRunId": 0}}, options.FindOneAndUpdate().SetReturnDocument(options.After)) @@ -83,38 +80,30 @@ func (m *MongodbTrayRepository) MarkRedundant(trayType string, limit int) ([]*tr return resultTrays, nil } -func (m *MongodbTrayRepository) Save(tray *trays.Tray) error { +func (m *MongodbTrayRepository) Save(ctx context.Context, tray *trays.Tray) error { tray.StatusChanged = time.Now().UTC() - _, err := m.collection.InsertOne(context.Background(), tray) - if err != nil { - return err - } - - return nil + _, err := m.collection.InsertOne(ctx, tray) + return err } -func (m *MongodbTrayRepository) UpdateStatus(trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) { - - var setQuery = bson.M{"status": status, "statusChanged": time.Now().UTC()} +func (m *MongodbTrayRepository) UpdateStatus(ctx context.Context, trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) { + setQuery := bson.M{"status": status, "statusChanged": time.Now().UTC()} if jobRunId != 0 { setQuery["jobRunId"] = jobRunId } - if ghRunnerId != 0 { setQuery["gitHubRunnerId"] = ghRunnerId } - if workflowRunId != 0 { setQuery["workflowRunId"] = workflowRunId } - if repository != "" { setQuery["repository"] = repository } dbResult := m.collection.FindOneAndUpdate( - context.Background(), + ctx, bson.M{"id": trayId}, bson.M{"$set": setQuery}, options.FindOneAndUpdate().SetReturnDocument(options.After)) @@ -131,53 +120,45 @@ func (m *MongodbTrayRepository) UpdateStatus(trayId string, status trays.TraySta return &result, nil } -func (m *MongodbTrayRepository) Delete(trayId string) error { - _, err := m.collection.DeleteOne(context.Background(), bson.M{"id": trayId}) - if err != nil { - return err - } - - return nil +func (m *MongodbTrayRepository) Delete(ctx context.Context, trayId string) error { + _, err := m.collection.DeleteOne(ctx, bson.M{"id": trayId}) + return err } -func (m *MongodbTrayRepository) CountByTrayType(trayType string) (map[trays.TrayStatus]int, int, error) { - - var matchStage = bson.D{ +func (m *MongodbTrayRepository) CountByTrayType(ctx context.Context, trayType string) (map[trays.TrayStatus]int, int, error) { + matchStage := bson.D{ {Key: "$match", Value: bson.D{{Key: "trayTypeName", Value: trayType}}}, } - var groupStage = bson.D{ + groupStage := bson.D{ {Key: "$group", Value: bson.D{ {Key: "_id", Value: "$status"}, {Key: "count", Value: bson.D{{Key: "$sum", Value: 1}}}, }}} - cursor, err := m.collection.Aggregate(context.Background(), mongo.Pipeline{matchStage, groupStage}) + cursor, err := m.collection.Aggregate(ctx, mongo.Pipeline{matchStage, groupStage}) if err != nil { return nil, 0, err } var dbResults []bson.M - if err = cursor.All(context.TODO(), &dbResults); err != nil { + if err = cursor.All(ctx, &dbResults); err != nil { return nil, 0, err } - var result = make(map[trays.TrayStatus]int) - result[trays.TrayStatusCreating] = 0 - result[trays.TrayStatusRegistering] = 0 - result[trays.TrayStatusDeleting] = 0 - result[trays.TrayStatusRegistered] = 0 - result[trays.TrayStatusRunning] = 0 - - var total = 0 + result := map[trays.TrayStatus]int{ + trays.TrayStatusCreating: 0, + trays.TrayStatusRegistering: 0, + trays.TrayStatusDeleting: 0, + trays.TrayStatusRegistered: 0, + trays.TrayStatusRunning: 0, + } + total := 0 for _, res := range dbResults { - var int32Status = res["_id"].(int32) - - status := int32Status + status := res["_id"].(int32) cnt, _ := res["count"].(int32) result[trays.TrayStatus(status)] = int(cnt) total += int(cnt) } return result, total, nil - } diff --git a/src/lib/trays/repositories/mongodbTrayRepository_test.go b/src/lib/trays/repositories/mongodbTrayRepository_test.go index 13afa9c..6f21cd7 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository_test.go +++ b/src/lib/trays/repositories/mongodbTrayRepository_test.go @@ -92,7 +92,7 @@ func TestGetById(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray}) // Test GetById - tray, err := repo.GetById("test-tray-1") + tray, err := repo.GetById(context.Background(),"test-tray-1") if err != nil { t.Fatalf("GetById failed: %v", err) } @@ -114,7 +114,7 @@ func TestGetById(t *testing.T) { } // Test GetById with non-existent ID - tray, err = repo.GetById("non-existent") + tray, err = repo.GetById(context.Background(),"non-existent") if err != nil { t.Error("Expected no error for non-existent tray, got: ", err) } @@ -148,13 +148,13 @@ func TestSave(t *testing.T) { tray.ProviderData["something"] = "worker-1" // Test Save - err := repo.Save(tray) + err := repo.Save(context.Background(),tray) if err != nil { t.Fatalf("Save failed: %v", err) } // Verify the tray was saved - savedTray, err := repo.GetById(tray.Id) + savedTray, err := repo.GetById(context.Background(),tray.Id) if err != nil { t.Fatalf("Failed to get saved tray: %v", err) } @@ -201,7 +201,7 @@ func TestUpdateStatus(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray}) // Test UpdateStatus with jobRunId only - updatedTray, err := repo.UpdateStatus("test-tray-1", trays.TrayStatusRegistered, 123, 0, 0, "") + updatedTray, err := repo.UpdateStatus(context.Background(),"test-tray-1", trays.TrayStatusRegistered, 123, 0, 0, "") if err != nil { t.Fatalf("UpdateStatus failed: %v", err) } @@ -219,7 +219,7 @@ func TestUpdateStatus(t *testing.T) { } // Test UpdateStatus with ghRunnerId - updatedTray, err = repo.UpdateStatus("test-tray-1", trays.TrayStatusRunning, 456, 333, 789, "") + updatedTray, err = repo.UpdateStatus(context.Background(),"test-tray-1", trays.TrayStatusRunning, 456, 333, 789, "") if err != nil { t.Fatalf("UpdateStatus with ghRunnerId failed: %v", err) } @@ -241,7 +241,7 @@ func TestUpdateStatus(t *testing.T) { } // Test UpdateStatus with non-existent ID - updatedTray, err = repo.UpdateStatus("non-existent", trays.TrayStatusRegistered, 123, 0, 0, "") + updatedTray, err = repo.UpdateStatus(context.Background(),"non-existent", trays.TrayStatusRegistered, 123, 0, 0, "") if err != nil { t.Fatalf("UpdateStatus with non-existent ID failed: %v", err) } @@ -265,13 +265,13 @@ func TestDelete(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray}) // Test Delete - err := repo.Delete("test-tray-1") + err := repo.Delete(context.Background(),"test-tray-1") if err != nil { t.Fatalf("Delete failed: %v", err) } // Verify the tray was deleted - deletedTray, err := repo.GetById("test-tray-1") + deletedTray, err := repo.GetById(context.Background(),"test-tray-1") if err != nil { t.Error("Expected no error for deleted tray, got: ", err) } @@ -281,7 +281,7 @@ func TestDelete(t *testing.T) { } // Test Delete with non-existent ID - err = repo.Delete("non-existent") + err = repo.Delete(context.Background(),"non-existent") if err != nil { t.Fatalf("Delete with non-existent ID failed: %v", err) } @@ -304,7 +304,7 @@ func TestMarkRedundant(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray1, testTray2, testTray3, testTray4}) // Test MarkRedundant - redundantTrays, err := repo.MarkRedundant("test-type", 2) + redundantTrays, err := repo.MarkRedundant(context.Background(),"test-type", 2) if err != nil { t.Fatalf("MarkRedundant failed: %v", err) } @@ -356,7 +356,7 @@ func TestMarkRedundant(t *testing.T) { } // Verify that trays with different status or type were not affected - unchangedTray, err := repo.GetById("test-tray-3") + unchangedTray, err := repo.GetById(context.Background(),"test-tray-3") if err != nil { t.Fatalf("Failed to get test-tray-3: %v", err) } @@ -365,7 +365,7 @@ func TestMarkRedundant(t *testing.T) { t.Errorf("Expected test-tray-3 status to remain %v, got %v", trays.TrayStatusRegistered, unchangedTray.Status) } - unchangedTray, err = repo.GetById("test-tray-4") + unchangedTray, err = repo.GetById(context.Background(),"test-tray-4") if err != nil { t.Fatalf("Failed to get test-tray-4: %v", err) } @@ -381,7 +381,7 @@ func TestMarkRedundant(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray5, testTray6}) // Mark only 1 tray as redundant - redundantTrays, err = repo.MarkRedundant("test-type", 1) + redundantTrays, err = repo.MarkRedundant(context.Background(),"test-type", 1) if err != nil { t.Fatalf("MarkRedundant with limit failed: %v", err) } @@ -402,7 +402,7 @@ func TestMarkRedundant(t *testing.T) { } // Test MarkRedundant with non-existent tray type - redundantTrays, err = repo.MarkRedundant("non-existent", 2) + redundantTrays, err = repo.MarkRedundant(context.Background(),"non-existent", 2) if err != nil { t.Fatalf("MarkRedundant with non-existent tray type failed: %v", err) } @@ -419,7 +419,7 @@ func TestMarkRedundant(t *testing.T) { } // Try to mark redundant trays in an empty collection - redundantTrays, err = repo.MarkRedundant("test-type", 2) + redundantTrays, err = repo.MarkRedundant(context.Background(),"test-type", 2) if err != nil { t.Fatalf("MarkRedundant with empty collection failed: %v", err) } @@ -457,7 +457,7 @@ func TestGetStale(t *testing.T) { insertTestTrays(t, collection, []*TestTray{staleTray1, staleTray2, freshTray1, freshTray2}) // Test GetStale with 5 minute duration - staleTrays, err := repo.GetStale(5*time.Minute) + staleTrays, err := repo.GetStale(context.Background(),5*time.Minute) if err != nil { t.Fatalf("GetStale failed: %v", err) } @@ -502,7 +502,7 @@ func TestGetStale(t *testing.T) { insertTestTrays(t, collection, []*TestTray{freshTray1, freshTray2}) // Test GetStale again with 5 minute duration - staleTrays, err = repo.GetStale(5*time.Minute) + staleTrays, err = repo.GetStale(context.Background(),5*time.Minute) if err != nil { t.Fatalf("GetStale failed: %v", err) } @@ -562,7 +562,7 @@ func TestConnect(t *testing.T) { insertTestTrays(t, collection, []*TestTray{testTray}) // Try to get the tray using the repository - tray, err := repo.GetById("test-connect") + tray, err := repo.GetById(context.Background(),"test-connect") if err != nil { t.Fatalf("GetById failed after Connect: %v", err) } @@ -603,7 +603,7 @@ func TestCountByTrayType(t *testing.T) { insertTestTrays(t, collection, testTrays) // Test CountByTrayType for test-type - counts, total, err := repo.CountByTrayType("test-type") + counts, total, err := repo.CountByTrayType(context.Background(),"test-type") if err != nil { t.Fatalf("CountByTrayType failed: %v", err) } @@ -630,7 +630,7 @@ func TestCountByTrayType(t *testing.T) { } // Test CountByTrayType for other-type - counts, total, err = repo.CountByTrayType("other-type") + counts, total, err = repo.CountByTrayType(context.Background(),"other-type") if err != nil { t.Fatalf("CountByTrayType for other-type failed: %v", err) } @@ -657,7 +657,7 @@ func TestCountByTrayType(t *testing.T) { } // Test CountByTrayType with non-existent tray type - counts, total, err = repo.CountByTrayType("non-existent") + counts, total, err = repo.CountByTrayType(context.Background(),"non-existent") if err != nil { t.Fatalf("CountByTrayType with non-existent tray type failed: %v", err) } diff --git a/src/lib/trays/tray.go b/src/lib/trays/tray.go index 23f95e0..2cac404 100644 --- a/src/lib/trays/tray.go +++ b/src/lib/trays/tray.go @@ -11,7 +11,6 @@ import ( type Tray struct { Id string `bson:"id"` TrayTypeName string `bson:"trayTypeName"` - trayType config.TrayType ProviderName string `bson:"providerName"` GitHubOrgName string `bson:"gitHubOrgName"` @@ -26,7 +25,6 @@ type Tray struct { } func NewTray(trayType config.TrayType) *Tray { - b := make([]byte, 8) _, err := rand.Read(b) if err != nil { @@ -34,41 +32,27 @@ func NewTray(trayType config.TrayType) *Tray { } id := hex.EncodeToString(b) - var trayId = fmt.Sprintf("%s-%s", trayType.Name, id) - var tray = &Tray{ - Id: trayId, + return &Tray{ + Id: fmt.Sprintf("%s-%s", trayType.Name, id), TrayTypeName: trayType.Name, - trayType: trayType, ProviderName: trayType.Provider, Status: TrayStatusCreating, GitHubOrgName: trayType.GitHubOrg, - JobRunId: 0, - WorkflowRunId: 0, ProviderData: make(map[string]string), } - - return tray -} - -func (tray *Tray) GetId() string { - return tray.Id -} - -func (tray *Tray) GetGitHubOrgName() string { - return tray.GitHubOrgName -} - -func (tray *Tray) GetTrayTypeName() string { - return tray.TrayTypeName } -func (tray *Tray) GetTrayType() config.TrayType { - return tray.trayType +// TrayType returns the configuration for this tray's type from the current config. +// Returns nil if the tray type no longer exists in config. +func (tray *Tray) TrayType() *config.TrayType { + return config.AppConfig.GetTrayType(tray.TrayTypeName) } -func (tray *Tray) GetTrayConfig() config.TrayConfig { - tt := config.AppConfig.GetTrayType(tray.TrayTypeName) +// TrayConfig returns the provider-specific config (DockerTrayConfig, GoogleTrayConfig, etc.). +// Returns nil if the tray type no longer exists in config. +func (tray *Tray) TrayConfig() config.TrayConfig { + tt := tray.TrayType() if tt == nil { return nil } @@ -76,6 +60,6 @@ func (tray *Tray) GetTrayConfig() config.TrayConfig { } func (tray *Tray) String() string { - return fmt.Sprintf("id: %s, trayTypeName: %s, status: %s, gitHubOrgName: %s, statusChanged: %s", + return fmt.Sprintf("id: %s, trayTypeName: %s, status: %s, gitHubOrgName: %s, statusChanged: %s", tray.Id, tray.TrayTypeName, tray.Status, tray.GitHubOrgName, tray.StatusChanged.Format(time.RFC3339)) } diff --git a/src/server/handlers/agentHandler.go b/src/server/handlers/agentHandler.go index b764f16..27b05a0 100644 --- a/src/server/handlers/agentHandler.go +++ b/src/server/handlers/agentHandler.go @@ -35,7 +35,7 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req logger.Debug("Agent registration request") - var tray, err = h.TrayManager.Registering(agentId) + var tray, err = h.TrayManager.Registering(r.Context(), agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to update tray status for agent '%s': %v", agentId, err) logger.Error(errMsg) @@ -43,16 +43,16 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req return } - var trayType = config.AppConfig.GetTrayType(tray.GetTrayTypeName()) + var trayType = config.AppConfig.GetTrayType(tray.TrayTypeName) if trayType == nil { - var errMsg = fmt.Sprintf("Tray type '%s' not found", tray.GetTrayTypeName()) + var errMsg = fmt.Sprintf("Tray type '%s' not found", tray.TrayTypeName) logger.Error(errMsg) http.Error(responseWriter, errMsg, http.StatusInternalServerError) return } logger = logger.WithFields(log.Fields{"trayType": trayType.Name}) - logger.Debugf("Found tray %s for agent %s, with organization %s", tray.GetId(), agentId, tray.GetGitHubOrgName()) + logger.Debugf("Found tray %s for agent %s, with organization %s", tray.Id, agentId, tray.GitHubOrgName) poller := h.ScaleSetManager.GetPoller(trayType.Name) if poller == nil { @@ -62,7 +62,7 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req return } - jitRunnerConfig, err := poller.Client().GenerateJitRunnerConfig(r.Context(), tray.GetId()) + jitRunnerConfig, err := poller.Client().GenerateJitRunnerConfig(r.Context(), tray.Id) if err != nil { logger.Errorf("Failed to generate jitRunnerConfig: %v", err) http.Error(responseWriter, "Failed to generate jitRunnerConfig", http.StatusInternalServerError) @@ -90,7 +90,7 @@ func (h *Handlers) AgentRegister(responseWriter http.ResponseWriter, r *http.Req return } - _, err = h.TrayManager.Registered(agentId, int64(jitRunnerConfig.Runner.ID)) + _, err = h.TrayManager.Registered(r.Context(), agentId, int64(jitRunnerConfig.Runner.ID)) if err != nil { logger.Errorf("%v", err) } @@ -116,7 +116,7 @@ func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.R var trayId = r.PathValue("id") - var tray, err = h.TrayManager.GetTrayById(trayId) + var tray, err = h.TrayManager.GetTrayById(r.Context(), trayId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray for agent '%s': %v", trayId, err) logger.Error(errMsg) @@ -143,7 +143,7 @@ func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.R logger.Tracef("Agent unregister request") - _, err = h.TrayManager.DeleteTray(tray.Id) + _, err = h.TrayManager.DeleteTray(r.Context(), tray.Id) if err != nil { logger.Errorf("Failed to delete tray: %v", err) @@ -161,46 +161,14 @@ func (h *Handlers) AgentUnregister(responseWriter http.ResponseWriter, r *http.R } func AgentDownloadBinary(responseWriter http.ResponseWriter, r *http.Request) { - var logger = log.WithFields(log.Fields{ - "handler": "agent", - "call": "AgentDownloadBinary", - }) - logger.Tracef("AgentDownloadBinary: %v", r) - - // Get the current executable path execPath, err := os.Executable() if err != nil { - logger.Errorf("Failed to get executable path: %v", err) http.Error(responseWriter, "Failed to get binary path", http.StatusInternalServerError) return } - // Open the binary file - file, err := os.Open(execPath) - if err != nil { - logger.Errorf("Failed to open binary file: %v", err) - http.Error(responseWriter, "Failed to open binary file", http.StatusInternalServerError) - return - } - defer file.Close() - - // Get file info for size and name - fileInfo, err := file.Stat() - if err != nil { - logger.Errorf("Failed to get file info: %v", err) - http.Error(responseWriter, "Failed to get file info", http.StatusInternalServerError) - return - } - - // Set appropriate headers - responseWriter.Header().Set("Content-Type", "application/octet-stream") responseWriter.Header().Set("Content-Disposition", fmt.Sprintf("attachment; filename=\"%s\"", filepath.Base(execPath))) - responseWriter.Header().Set("Content-Length", fmt.Sprintf("%d", fileInfo.Size())) - - // Serve the file - http.ServeContent(responseWriter, r, filepath.Base(execPath), fileInfo.ModTime(), file) - - logger.Infof("Binary file served: %s (%d bytes)", execPath, fileInfo.Size()) + http.ServeFile(responseWriter, r, execPath) } func (h *Handlers) AgentPing(responseWriter http.ResponseWriter, r *http.Request) { @@ -219,7 +187,7 @@ func (h *Handlers) AgentPing(responseWriter http.ResponseWriter, r *http.Request Message: "", } - tray, err := h.TrayManager.GetTrayById(agentId) + tray, err := h.TrayManager.GetTrayById(r.Context(), agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray by id '%s': %v", agentId, err) logger.Error(errMsg) @@ -285,7 +253,7 @@ func (h *Handlers) AgentInterrupt(responseWriter http.ResponseWriter, r *http.Re logger.Debug("Agent restart request with id " + agentId) - tray, err := h.TrayManager.GetTrayById(agentId) + tray, err := h.TrayManager.GetTrayById(r.Context(), agentId) if err != nil { var errMsg = fmt.Sprintf("Failed to get tray by id '%s': %v", agentId, err) logger.Error(errMsg) From 069196b9583e87d0fbee05a1200efd9be8c54c81 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 01:35:24 +0400 Subject: [PATCH 17/27] tests fix --- src/lib/config/config_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/config/config_test.go b/src/lib/config/config_test.go index e7a263b..4cf326c 100644 --- a/src/lib/config/config_test.go +++ b/src/lib/config/config_test.go @@ -28,6 +28,7 @@ database: github: - name: "test-org" appId: 12345 + appClientId: "Iv1.test123" installationId: 67890 webhookSecret: "secret" privateKeyPath: "path/to/key.pem" From 16ce3bb34e97a94c86c4e9893d72849d548ee7a4 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 18:15:49 +0400 Subject: [PATCH 18/27] graceful shutdown --- src/lib/scaleSetClient/scaleSetClient.go | 32 +++++++++++++++++++----- src/lib/scaleSetPoller/manager.go | 1 + src/lib/scaleSetPoller/poller.go | 9 ++++++- src/server/server.go | 19 +++++++++++--- 4 files changed, 50 insertions(+), 11 deletions(-) diff --git a/src/lib/scaleSetClient/scaleSetClient.go b/src/lib/scaleSetClient/scaleSetClient.go index 211bfdf..90602fb 100644 --- a/src/lib/scaleSetClient/scaleSetClient.go +++ b/src/lib/scaleSetClient/scaleSetClient.go @@ -5,6 +5,8 @@ import ( "context" "fmt" "os" + "strings" + "time" "github.com/actions/scaleset" log "github.com/sirupsen/logrus" @@ -79,13 +81,31 @@ func (sc *ScaleSetClient) EnsureScaleSet(ctx context.Context) error { func (sc *ScaleSetClient) CreateSession(ctx context.Context) error { hostname, _ := os.Hostname() - session, err := sc.client.MessageSessionClient(ctx, sc.scaleSet.ID, hostname) - if err != nil { - return fmt.Errorf("failed to create message session: %w", err) + + const maxRetries = 5 + const retryDelay = 30 * time.Second + + for attempt := range maxRetries { + session, err := sc.client.MessageSessionClient(ctx, sc.scaleSet.ID, hostname) + if err == nil { + sc.session = session + sc.logger.Info("Message session created") + return nil + } + + if !strings.Contains(err.Error(), "409 Conflict") || attempt == maxRetries-1 { + return fmt.Errorf("failed to create message session: %w", err) + } + + sc.logger.Warnf("Session conflict (attempt %d/%d), stale session likely exists — retrying in %v", attempt+1, maxRetries, retryDelay) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryDelay): + } } - sc.session = session - sc.logger.Info("Message session created") - return nil + + return fmt.Errorf("unreachable") } func (sc *ScaleSetClient) Poll(ctx context.Context, lastMessageID int, maxCapacity int) (*scaleset.RunnerScaleSetMessage, error) { diff --git a/src/lib/scaleSetPoller/manager.go b/src/lib/scaleSetPoller/manager.go index 0b3027d..37e0d8e 100644 --- a/src/lib/scaleSetPoller/manager.go +++ b/src/lib/scaleSetPoller/manager.go @@ -5,6 +5,7 @@ import "sync" type Manager struct { mu sync.RWMutex pollers map[string]*Poller + Wg sync.WaitGroup } func NewManager() *Manager { diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index ce102da..6bfc78f 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "strconv" + "time" "cattery/lib/config" @@ -52,7 +53,13 @@ func (p *Poller) Run(ctx context.Context) error { if err := p.client.CreateSession(ctx); err != nil { return fmt.Errorf("failed to create session: %w", err) } - defer p.client.Close(ctx) + defer func() { + closeCtx, closeCancel := context.WithTimeout(context.Background(), 10*time.Second) + defer closeCancel() + if err := p.client.Close(closeCtx); err != nil { + p.logger.Errorf("Failed to close session: %v", err) + } + }() scaleSetID := p.client.GetScaleSetID() diff --git a/src/server/server.go b/src/server/server.go index c3eecc3..b99622a 100644 --- a/src/server/server.go +++ b/src/server/server.go @@ -43,10 +43,15 @@ func Start() { logger.Fatal(err) } - err = client.Ping(context.Background(), nil) - if err != nil { - logger.Errorf("Failed to connect to MongoDB: %v", err) - os.Exit(1) + { + timeoutCtx, cf := context.WithTimeout(context.Background(), 3*time.Second) + defer cf() + + err = client.Ping(timeoutCtx, nil) + if err != nil { + logger.Errorf("Failed to connect to MongoDB: %v", err) + os.Exit(1) + } } var database = client.Database(config.AppConfig.Database.Database) @@ -77,7 +82,9 @@ func Start() { poller := scaleSetPoller.NewPoller(ssClient, trayType, tm) ssm.Register(trayType.Name, poller) + ssm.Wg.Add(1) go func(p *scaleSetPoller.Poller, name string) { + defer ssm.Wg.Done() for { if err := p.Run(ctx); err != nil { if ctx.Err() != nil { @@ -136,4 +143,8 @@ func Start() { sig := <-sigs logger.Info("Got signal ", sig) cancel() + + logger.Info("Waiting for pollers to shut down...") + ssm.Wg.Wait() + logger.Info("All pollers stopped") } From 735b583a5986c720a467e430858c7d800dbd1233 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 18:18:22 +0400 Subject: [PATCH 19/27] update examples --- .dockerignore | 7 ++++++- examples/docker-compose.yaml | 39 ++++++++++++++++++++++++++++++++++++ examples/example-config.yaml | 2 ++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 examples/docker-compose.yaml diff --git a/.dockerignore b/.dockerignore index c3265dd..31d11c5 100644 --- a/.dockerignore +++ b/.dockerignore @@ -3,4 +3,9 @@ bin/ examples/ README.MD LICENCE -.github/ \ No newline at end of file +.github/ +.git/ +docs +tests +.dockerignore +.gitignore \ No newline at end of file diff --git a/examples/docker-compose.yaml b/examples/docker-compose.yaml new file mode 100644 index 0000000..a8bb3c2 --- /dev/null +++ b/examples/docker-compose.yaml @@ -0,0 +1,39 @@ +version: '3.8' + +services: + mongo1: + image: mongo:latest + container_name: mongo1 + command: mongod --replSet rs0 --bind_ip_all + ports: + - "27017:27017" + volumes: + - mongo1-data:/data/db + networks: + - mongo-cluster + + mongo-config: + image: mongo:latest + container_name: mongo-config + depends_on: + - mongo1 + networks: + - mongo-cluster + # This command runs a script to initiate the replica set + command: > + bash -c "sleep 10 && mongosh --host mongo1:27017 < Date: Thu, 26 Mar 2026 18:29:22 +0400 Subject: [PATCH 20/27] agent docker --- examples/Dockerfile-cattery-tiny | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/Dockerfile-cattery-tiny b/examples/Dockerfile-cattery-tiny index e6a54eb..ab62069 100644 --- a/examples/Dockerfile-cattery-tiny +++ b/examples/Dockerfile-cattery-tiny @@ -5,14 +5,20 @@ RUN apt-get update && apt-get install -y \ WORKDIR /action-runner -RUN curl -sL -o actions-runner-linux-x64-2.323.0.tar.gz https://github.com/actions/runner/releases/download/v2.323.0/actions-runner-linux-x64-2.323.0.tar.gz +ARG RUNNER_VERISON='2.333' +ENV RUNNER_VERISON=$RUNNER_VERISON + +RUN curl -sL -o actions-runner-linux-x64-${RUNNER_VERISON}.0.tar.gz https://github.com/actions/runner/releases/download/v${RUNNER_VERISON}.0/actions-runner-linux-x64-${RUNNER_VERISON}.0.tar.gz RUN ls -al -RUN tar xzf ./actions-runner-linux-x64-2.323.0.tar.gz +RUN tar xzf ./actions-runner-linux-x64-${RUNNER_VERISON}.0.tar.gz WORKDIR /cattery-agent COPY . . -RUN go build -o /action-runner/cattery/cattery +RUN ls -al /action-runner +RUN ls -al /cattery-agent +RUN ls -al . + -#ENTRYPOINT ["/action-runner/cattery/cattery", "agent", "-r","/action-runner", "-s", "http://10.10.10.116:5137"] \ No newline at end of file +RUN cd src && go build -o /action-runner/cattery/cattery \ No newline at end of file From d2892a4934422b9032f40449f10a9cc27b67f259 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 19:31:21 +0400 Subject: [PATCH 21/27] fix nill provider --- src/lib/trays/providers/trayProviderFactory.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/lib/trays/providers/trayProviderFactory.go b/src/lib/trays/providers/trayProviderFactory.go index bac9b19..9e89b2d 100644 --- a/src/lib/trays/providers/trayProviderFactory.go +++ b/src/lib/trays/providers/trayProviderFactory.go @@ -52,9 +52,13 @@ func GetProvider(providerName string) (ITrayProvider, error) { switch provider["type"] { case "docker": - result = NewDockerProvider(providerName, provider) + if p := NewDockerProvider(providerName, provider); p != nil { + result = p + } case "google": - result = NewGceProvider(providerName, provider) + if p := NewGceProvider(providerName, provider); p != nil { + result = p + } default: return nil, errors.New("unknown provider type: " + provider["type"]) } From 1e7b9200fe11c1ef1e46a21b6b78c40b88b9dc39 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 22:24:30 +0400 Subject: [PATCH 22/27] wait gcp api --- src/lib/trays/providers/gceProvider.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/lib/trays/providers/gceProvider.go b/src/lib/trays/providers/gceProvider.go index 7350fbb..e8c2785 100644 --- a/src/lib/trays/providers/gceProvider.go +++ b/src/lib/trays/providers/gceProvider.go @@ -74,7 +74,7 @@ func (g *GceProvider) RunTray(tray *trays.Tray) error { var zone = zones[rand.Intn(len(zones))] - _, err := g.instanceClient.Insert(ctx, &computepb.InsertInstanceRequest{ + op, err := g.instanceClient.Insert(ctx, &computepb.InsertInstanceRequest{ Project: project, Zone: zone, SourceInstanceTemplate: &instanceTemplate, @@ -89,6 +89,11 @@ func (g *GceProvider) RunTray(tray *trays.Tray) error { return err } + if err := op.Wait(ctx); err != nil { + g.logger.Errorf("Failed waiting for tray creation to complete: %v", err) + return err + } + tray.ProviderData["zone"] = zone return nil From 3f9dad36b1b9ef7132ac060f3a43ca2dddc1fbea Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Thu, 26 Mar 2026 23:05:28 +0400 Subject: [PATCH 23/27] parallel trays --- src/lib/config/config.go | 19 +++++----- src/lib/trayManager/trayManager.go | 56 +++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/src/lib/config/config.go b/src/lib/config/config.go index 2b32cb8..966bc03 100644 --- a/src/lib/config/config.go +++ b/src/lib/config/config.go @@ -152,15 +152,18 @@ type GitHubOrganization struct { PrivateKeyPath string `yaml:"privateKeyPath"` } +const DefaultMaxParallelCreation = 10 + type TrayType struct { - Name string `yaml:"name" validate:"required"` - Provider string `yaml:"provider" validate:"required"` - RunnerGroupId int64 `yaml:"runnerGroupId" validate:"required"` - Shutdown bool `yaml:"shutdown"` - GitHubOrg string `yaml:"githubOrg" validate:"required"` - MaxTrays int `yaml:"limit"` - Config TrayConfig `yaml:"config"` - ExtraMetadata TrayExtraMetadata + Name string `yaml:"name" validate:"required"` + Provider string `yaml:"provider" validate:"required"` + RunnerGroupId int64 `yaml:"runnerGroupId" validate:"required"` + Shutdown bool `yaml:"shutdown"` + GitHubOrg string `yaml:"githubOrg" validate:"required"` + MaxTrays int `yaml:"limit"` + MaxParallelCreation int `yaml:"maxParallelCreation"` + Config TrayConfig `yaml:"config"` + ExtraMetadata TrayExtraMetadata } type TrayExtraMetadata map[string]string diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index da693ff..de5c147 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -8,6 +8,7 @@ import ( "cattery/lib/trays/repositories" "context" "fmt" + "sync" "time" log "github.com/sirupsen/logrus" @@ -23,13 +24,58 @@ func NewTrayManager(trayRepository repositories.ITrayRepository) *TrayManager { } } -func (tm *TrayManager) createTrays(ctx context.Context, trayType *config.TrayType, n int) error { - for i := 0; i < n; i++ { - log.Infof("Creating tray %d for type: %s", i+1, trayType.Name) - if err := tm.CreateTray(ctx, trayType); err != nil { - return err +func (tm *TrayManager) createTrays(ctx context.Context, trayType *config.TrayType, count int) error { + maxParallel := trayType.MaxParallelCreation + if maxParallel <= 0 { + maxParallel = config.DefaultMaxParallelCreation + } + + results := tm.createTraysParallel(ctx, trayType, count, maxParallel) + return tm.logCreationResults(trayType.Name, results) +} + +// createTraysParallel creates trays concurrently, limited to maxParallel at a time. +// Returns a slice of errors, one per tray (nil means success). +func (tm *TrayManager) createTraysParallel(ctx context.Context, trayType *config.TrayType, count int, maxParallel int) []error { + var wg sync.WaitGroup + semaphore := make(chan struct{}, maxParallel) + errors := make([]error, count) + + for i := 0; i < count; i++ { + semaphore <- struct{}{} // block if maxParallel goroutines are already running + wg.Add(1) + + go func(index int) { + defer wg.Done() + defer func() { <-semaphore }() + + log.Infof("Creating tray %d/%d for type: %s", index+1, count, trayType.Name) + errors[index] = tm.CreateTray(ctx, trayType) + }(i) + } + + wg.Wait() + return errors +} + +func (tm *TrayManager) logCreationResults(trayTypeName string, results []error) error { + total := len(results) + failed := 0 + + for _, err := range results { + if err != nil { + log.Errorf("Failed to create tray for type %s: %v", trayTypeName, err) + failed++ } } + + if failed == total { + return fmt.Errorf("all %d tray creations failed for type %s", total, trayTypeName) + } + if failed > 0 { + log.Warnf("%d out of %d tray creations failed for type %s", failed, total, trayTypeName) + } + return nil } From f245d2e5d21a601de084742f7242e8f6413aa3f4 Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Fri, 27 Mar 2026 01:34:35 +0400 Subject: [PATCH 24/27] stats --- src/lib/scaleSetPoller/poller.go | 33 +++++++++++++++++++++++++----- src/lib/trayManager/trayManager.go | 20 ++++++++++++------ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index 6bfc78f..1d6acb5 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -63,19 +63,20 @@ func (p *Poller) Run(ctx context.Context) error { scaleSetID := p.client.GetScaleSetID() + scaler := &catteryScaler{poller: p} + l, err := listener.New( &sessionAdapter{client: p.client}, listener.Config{ ScaleSetID: scaleSetID, MaxRunners: p.trayType.MaxTrays, }, + listener.WithMetricsRecorder(scaler), ) if err != nil { return fmt.Errorf("failed to create listener: %w", err) } - scaler := &catteryScaler{poller: p} - p.logger.Info("Entering listener loop") return l.Run(ctx, scaler) } @@ -97,13 +98,29 @@ func (s *sessionAdapter) Session() scaleset.RunnerScaleSetSession { return s.client.Session() } -// catteryScaler implements the listener.Scaler interface. +// catteryScaler implements the listener.Scaler and listener.MetricsRecorder interfaces. type catteryScaler struct { - poller *Poller + poller *Poller + latestStats *scaleset.RunnerScaleSetStatistic +} + +// MetricsRecorder implementation — captures GitHub statistics for ghost tray detection. + +func (cs *catteryScaler) RecordStatistics(statistics *scaleset.RunnerScaleSetStatistic) { + cs.latestStats = statistics } +func (cs *catteryScaler) RecordJobStarted(msg *scaleset.JobStarted) {} +func (cs *catteryScaler) RecordJobCompleted(msg *scaleset.JobCompleted) {} +func (cs *catteryScaler) RecordDesiredRunners(count int) {} + func (cs *catteryScaler) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) { - err := cs.poller.trayManager.ScaleForDemand(ctx, cs.poller.trayType, count) + githubIdleRunners := 0 + if cs.latestStats != nil { + githubIdleRunners = cs.latestStats.TotalIdleRunners + } + + err := cs.poller.trayManager.ScaleForDemand(ctx, cs.poller.trayType, count, githubIdleRunners) if err != nil { cs.poller.logger.Errorf("Failed to scale for demand (%d): %v", count, err) return 0, err @@ -135,6 +152,12 @@ func (cs *catteryScaler) HandleJobCompleted(ctx context.Context, jobInfo *scales cs.poller.logger.Infof("Job completed: %s on runner %s (result: %s)", jobInfo.JobDisplayName, jobInfo.RunnerName, jobInfo.Result) + if jobInfo.RunnerName == "" { + cs.poller.logger.Warnf("Job completed with empty runner name (result: %s, job: %s) — skipping tray deletion", + jobInfo.Result, jobInfo.JobDisplayName) + return nil + } + _, err := cs.poller.trayManager.DeleteTray(ctx, jobInfo.RunnerName) if err != nil { cs.poller.logger.Errorf("Failed to delete tray %s: %v", jobInfo.RunnerName, err) diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index de5c147..9a967e0 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -215,17 +215,25 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { } // ScaleForDemand scales trays for a given tray type based on the desired runner count. -// The desiredCount is TotalAssignedJobs from GitHub scale set statistics — the total -// number of runners that should exist (running + idle) to serve all assigned jobs. -func (tm *TrayManager) ScaleForDemand(ctx context.Context, trayType *config.TrayType, desiredCount int) error { - countByStatus, total, err := tm.trayRepository.CountByTrayType(ctx, trayType.Name) +// The desiredCount is TotalAssignedJobs from GitHub scale set statistics. +// githubIdleRunners is GitHub's reported idle runner count, used as source of truth +// for how many of our Registered trays are actually confirmed by GitHub. +func (tm *TrayManager) ScaleForDemand(ctx context.Context, trayType *config.TrayType, desiredCount int, githubIdleRunners int) error { + countByStatus, _, err := tm.trayRepository.CountByTrayType(ctx, trayType.Name) if err != nil { log.Errorf("Failed to count trays for type %s: %v", trayType.Name, err) return err } - idleTrays := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + countByStatus[trays.TrayStatusRegistered] - activeTotal := total - countByStatus[trays.TrayStatusDeleting] + // Trust GitHub's idle count over our local Registered count. + // If we have 4 Registered trays but GitHub says 0 idle, those 4 are ghosts + // from cancelled workflows — the stale handler will clean them up. + provisioningTrays := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] + confirmedIdle := min(countByStatus[trays.TrayStatusRegistered], githubIdleRunners) + runningTrays := countByStatus[trays.TrayStatusRunning] + + activeTotal := provisioningTrays + confirmedIdle + runningTrays + idleTrays := provisioningTrays + confirmedIdle if desiredCount > activeTotal { remainingCapacity := trayType.MaxTrays - activeTotal From 1abf5b4edaf7aab3d4bb76caad25f157348367fa Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Fri, 27 Mar 2026 03:23:42 +0400 Subject: [PATCH 25/27] scale logic fix --- src/lib/scaleSetPoller/poller.go | 36 +-- src/lib/trayManager/trayManager.go | 61 +---- src/lib/trays/repositories/iTrayRepository.go | 3 +- .../repositories/mongodbTrayRepository.go | 67 +---- .../mongodbTrayRepository_test.go | 247 ++---------------- 5 files changed, 60 insertions(+), 354 deletions(-) diff --git a/src/lib/scaleSetPoller/poller.go b/src/lib/scaleSetPoller/poller.go index 1d6acb5..1b7dd2c 100644 --- a/src/lib/scaleSetPoller/poller.go +++ b/src/lib/scaleSetPoller/poller.go @@ -100,37 +100,24 @@ func (s *sessionAdapter) Session() scaleset.RunnerScaleSetSession { // catteryScaler implements the listener.Scaler and listener.MetricsRecorder interfaces. type catteryScaler struct { - poller *Poller - latestStats *scaleset.RunnerScaleSetStatistic + poller *Poller } -// MetricsRecorder implementation — captures GitHub statistics for ghost tray detection. +// MetricsRecorder implementation. -func (cs *catteryScaler) RecordStatistics(statistics *scaleset.RunnerScaleSetStatistic) { - cs.latestStats = statistics -} - -func (cs *catteryScaler) RecordJobStarted(msg *scaleset.JobStarted) {} -func (cs *catteryScaler) RecordJobCompleted(msg *scaleset.JobCompleted) {} -func (cs *catteryScaler) RecordDesiredRunners(count int) {} +func (cs *catteryScaler) RecordStatistics(statistics *scaleset.RunnerScaleSetStatistic) {} +func (cs *catteryScaler) RecordJobStarted(msg *scaleset.JobStarted) {} +func (cs *catteryScaler) RecordJobCompleted(msg *scaleset.JobCompleted) {} +func (cs *catteryScaler) RecordDesiredRunners(count int) {} func (cs *catteryScaler) HandleDesiredRunnerCount(ctx context.Context, count int) (int, error) { - githubIdleRunners := 0 - if cs.latestStats != nil { - githubIdleRunners = cs.latestStats.TotalIdleRunners - } - - err := cs.poller.trayManager.ScaleForDemand(ctx, cs.poller.trayType, count, githubIdleRunners) + err := cs.poller.trayManager.ScaleForDemand(ctx, cs.poller.trayType, count) if err != nil { cs.poller.logger.Errorf("Failed to scale for demand (%d): %v", count, err) return 0, err } - total, err := cs.poller.trayManager.CountTrays(ctx, cs.poller.trayType.Name) - if err != nil { - return 0, err - } - return total, nil + return cs.poller.trayManager.CountTrays(ctx, cs.poller.trayType.Name) } func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset.JobStarted) error { @@ -139,12 +126,17 @@ func (cs *catteryScaler) HandleJobStarted(ctx context.Context, jobInfo *scaleset jobID, _ := strconv.ParseInt(jobInfo.JobID, 10, 64) - _, err := cs.poller.trayManager.SetJob(ctx, jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, jobInfo.RepositoryName) + tray, err := cs.poller.trayManager.SetJob(ctx, jobInfo.RunnerName, jobID, jobInfo.WorkflowRunID, jobInfo.RepositoryName) if err != nil { cs.poller.logger.Errorf("Failed to set job on tray %s: %v", jobInfo.RunnerName, err) return err } + if tray == nil { + cs.poller.logger.Warnf("Tray %s not found for job %s (workflow run %d) — tray already removed", + jobInfo.RunnerName, jobInfo.JobDisplayName, jobInfo.WorkflowRunID) + } + return nil } diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 9a967e0..75e7f20 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -146,9 +146,6 @@ func (tm *TrayManager) SetJob(ctx context.Context, trayId string, jobRunId int64 if err != nil { return nil, err } - if tray == nil { - return nil, fmt.Errorf("failed to update tray status for tray '%s'", trayId) - } return tray, nil } @@ -215,64 +212,26 @@ func (tm *TrayManager) HandleStale(ctx context.Context) { } // ScaleForDemand scales trays for a given tray type based on the desired runner count. -// The desiredCount is TotalAssignedJobs from GitHub scale set statistics. -// githubIdleRunners is GitHub's reported idle runner count, used as source of truth -// for how many of our Registered trays are actually confirmed by GitHub. -func (tm *TrayManager) ScaleForDemand(ctx context.Context, trayType *config.TrayType, desiredCount int, githubIdleRunners int) error { - countByStatus, _, err := tm.trayRepository.CountByTrayType(ctx, trayType.Name) +// Follows ARC's pattern: scale up when needed, let HandleJobCompleted and the stale +// handler take care of scale-down. No ghost detection — trust local tray state. +func (tm *TrayManager) ScaleForDemand(ctx context.Context, trayType *config.TrayType, desiredCount int) error { + activeCount, err := tm.CountTrays(ctx, trayType.Name) if err != nil { - log.Errorf("Failed to count trays for type %s: %v", trayType.Name, err) return err } - // Trust GitHub's idle count over our local Registered count. - // If we have 4 Registered trays but GitHub says 0 idle, those 4 are ghosts - // from cancelled workflows — the stale handler will clean them up. - provisioningTrays := countByStatus[trays.TrayStatusCreating] + countByStatus[trays.TrayStatusRegistering] - confirmedIdle := min(countByStatus[trays.TrayStatusRegistered], githubIdleRunners) - runningTrays := countByStatus[trays.TrayStatusRunning] - - activeTotal := provisioningTrays + confirmedIdle + runningTrays - idleTrays := provisioningTrays + confirmedIdle - - if desiredCount > activeTotal { - remainingCapacity := trayType.MaxTrays - activeTotal - traysToCreate := desiredCount - activeTotal - if traysToCreate > remainingCapacity { - traysToCreate = remainingCapacity - } - if traysToCreate > 0 { - if err := tm.createTrays(ctx, trayType, traysToCreate); err != nil { - return err - } - } + if desiredCount <= activeCount { + return nil } - if desiredCount < activeTotal && idleTrays > 0 { - excess := activeTotal - desiredCount - traysToDelete := excess - if traysToDelete > idleTrays { - traysToDelete = idleTrays - } - redundant, err := tm.trayRepository.MarkRedundant(ctx, trayType.Name, traysToDelete) - if err != nil { - return err - } - for _, tray := range redundant { - if _, delErr := tm.DeleteTray(ctx, tray.Id); delErr != nil { - log.Errorf("Failed to delete redundant tray %s: %v", tray.Id, delErr) - } - } + traysToCreate := min(desiredCount-activeCount, trayType.MaxTrays-activeCount) + if traysToCreate > 0 { + return tm.createTrays(ctx, trayType, traysToCreate) } - return nil } // CountTrays returns the number of active (non-deleting) trays for a given tray type. func (tm *TrayManager) CountTrays(ctx context.Context, trayTypeName string) (int, error) { - countByStatus, total, err := tm.trayRepository.CountByTrayType(ctx, trayTypeName) - if err != nil { - return 0, err - } - return total - countByStatus[trays.TrayStatusDeleting], nil + return tm.trayRepository.CountActive(ctx, trayTypeName) } diff --git a/src/lib/trays/repositories/iTrayRepository.go b/src/lib/trays/repositories/iTrayRepository.go index 25ff366..1ddbdf6 100644 --- a/src/lib/trays/repositories/iTrayRepository.go +++ b/src/lib/trays/repositories/iTrayRepository.go @@ -11,7 +11,6 @@ type ITrayRepository interface { Save(ctx context.Context, tray *trays.Tray) error Delete(ctx context.Context, trayId string) error UpdateStatus(ctx context.Context, trayId string, status trays.TrayStatus, jobRunId int64, workflowRunId int64, ghRunnerId int64, repository string) (*trays.Tray, error) - CountByTrayType(ctx context.Context, trayType string) (map[trays.TrayStatus]int, int, error) - MarkRedundant(ctx context.Context, trayType string, limit int) ([]*trays.Tray, error) + CountActive(ctx context.Context, trayType string) (int, error) GetStale(ctx context.Context, d time.Duration) ([]*trays.Tray, error) } diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index 43e46a0..cc8ee74 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -55,29 +55,15 @@ func (m *MongodbTrayRepository) GetStale(ctx context.Context, d time.Duration) ( return traysArr, nil } -func (m *MongodbTrayRepository) MarkRedundant(ctx context.Context, trayType string, limit int) ([]*trays.Tray, error) { - resultTrays := make([]*trays.Tray, 0, limit) - - for i := 0; i < limit; i++ { - dbResult := m.collection.FindOneAndUpdate( - ctx, - bson.M{"status": trays.TrayStatusCreating, "trayTypeName": trayType}, - bson.M{"$set": bson.M{"status": trays.TrayStatusDeleting, "statusChanged": time.Now().UTC(), "jobRunId": 0}}, - options.FindOneAndUpdate().SetReturnDocument(options.After)) - - var result trays.Tray - err := dbResult.Decode(&result) - if err != nil { - if errors.Is(err, mongo.ErrNoDocuments) { - break - } - return nil, err - } - - resultTrays = append(resultTrays, &result) +func (m *MongodbTrayRepository) CountActive(ctx context.Context, trayType string) (int, error) { + count, err := m.collection.CountDocuments(ctx, bson.M{ + "trayTypeName": trayType, + "status": bson.M{"$ne": trays.TrayStatusDeleting}, + }) + if err != nil { + return 0, err } - - return resultTrays, nil + return int(count), nil } func (m *MongodbTrayRepository) Save(ctx context.Context, tray *trays.Tray) error { @@ -125,40 +111,3 @@ func (m *MongodbTrayRepository) Delete(ctx context.Context, trayId string) error return err } -func (m *MongodbTrayRepository) CountByTrayType(ctx context.Context, trayType string) (map[trays.TrayStatus]int, int, error) { - matchStage := bson.D{ - {Key: "$match", Value: bson.D{{Key: "trayTypeName", Value: trayType}}}, - } - groupStage := bson.D{ - {Key: "$group", Value: bson.D{ - {Key: "_id", Value: "$status"}, - {Key: "count", Value: bson.D{{Key: "$sum", Value: 1}}}, - }}} - - cursor, err := m.collection.Aggregate(ctx, mongo.Pipeline{matchStage, groupStage}) - if err != nil { - return nil, 0, err - } - - var dbResults []bson.M - if err = cursor.All(ctx, &dbResults); err != nil { - return nil, 0, err - } - - result := map[trays.TrayStatus]int{ - trays.TrayStatusCreating: 0, - trays.TrayStatusRegistering: 0, - trays.TrayStatusDeleting: 0, - trays.TrayStatusRegistered: 0, - trays.TrayStatusRunning: 0, - } - - total := 0 - for _, res := range dbResults { - status := res["_id"].(int32) - cnt, _ := res["count"].(int32) - result[trays.TrayStatus(status)] = int(cnt) - total += int(cnt) - } - return result, total, nil -} diff --git a/src/lib/trays/repositories/mongodbTrayRepository_test.go b/src/lib/trays/repositories/mongodbTrayRepository_test.go index 6f21cd7..fa6128a 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository_test.go +++ b/src/lib/trays/repositories/mongodbTrayRepository_test.go @@ -8,7 +8,6 @@ import ( "testing" "time" - "go.mongodb.org/mongo-driver/v2/bson" "go.mongodb.org/mongo-driver/v2/mongo" "go.mongodb.org/mongo-driver/v2/mongo/options" ) @@ -287,145 +286,51 @@ func TestDelete(t *testing.T) { } } -// TestMarkRedundant tests the MarkRedundant method -func TestMarkRedundant(t *testing.T) { +// TestCountActive tests the CountActive method +func TestCountActive(t *testing.T) { client, collection := setupTestCollection(t) defer client.Disconnect(context.Background()) - // Create test repository repo := NewMongodbTrayRepository() repo.Connect(collection) - // Insert test data - testTray1 := createTestTray("test-tray-1", "test-type", trays.TrayStatusCreating, 0) - testTray2 := createTestTray("test-tray-2", "test-type", trays.TrayStatusCreating, 0) - testTray3 := createTestTray("test-tray-3", "test-type", trays.TrayStatusRegistered, 0) - testTray4 := createTestTray("test-tray-4", "other-type", trays.TrayStatusCreating, 0) - insertTestTrays(t, collection, []*TestTray{testTray1, testTray2, testTray3, testTray4}) - - // Test MarkRedundant - redundantTrays, err := repo.MarkRedundant(context.Background(),"test-type", 2) - if err != nil { - t.Fatalf("MarkRedundant failed: %v", err) - } - - // Verify that the correct number of trays were marked as redundant - if len(redundantTrays) != 2 { - t.Errorf("Expected 2 redundant trays, got %d", len(redundantTrays)) - } - - // Verify that the trays were actually marked as deleting in the database - // by querying the database directly - cursor, err := collection.Find(context.Background(), bson.M{"trayTypeName": "test-type", "status": trays.TrayStatusDeleting}) - if err != nil { - t.Fatalf("Failed to query database: %v", err) - } - - var deletingTrays []TestTray - err = cursor.All(context.Background(), &deletingTrays) - if err != nil { - t.Fatalf("Failed to decode cursor: %v", err) - } - - if len(deletingTrays) != 2 { - t.Errorf("Expected 2 trays marked as deleting in the database, got %d", len(deletingTrays)) - } - - // Verify that the correct trays were marked as deleting - deletingTrayIds := make(map[string]bool) - for _, tray := range deletingTrays { - deletingTrayIds[tray.Id] = true - - // Verify the status and jobRunId were updated correctly - if tray.Status != trays.TrayStatusDeleting { - t.Errorf("Expected tray status %v, got %v", trays.TrayStatusDeleting, tray.Status) - } - - if tray.JobRunId != 0 { - t.Errorf("Expected JobRunId 0, got %d", tray.JobRunId) - } - } - - // Check that the correct trays were marked as deleting - if !deletingTrayIds["test-tray-1"] { - t.Error("Expected test-tray-1 to be marked as deleting") - } - - if !deletingTrayIds["test-tray-2"] { - t.Error("Expected test-tray-2 to be marked as deleting") - } - - // Verify that trays with different status or type were not affected - unchangedTray, err := repo.GetById(context.Background(),"test-tray-3") - if err != nil { - t.Fatalf("Failed to get test-tray-3: %v", err) - } - - if unchangedTray.Status != trays.TrayStatusRegistered { - t.Errorf("Expected test-tray-3 status to remain %v, got %v", trays.TrayStatusRegistered, unchangedTray.Status) - } - - unchangedTray, err = repo.GetById(context.Background(),"test-tray-4") - if err != nil { - t.Fatalf("Failed to get test-tray-4: %v", err) - } - - if unchangedTray.Status != trays.TrayStatusCreating { - t.Errorf("Expected test-tray-4 status to remain %v, got %v", trays.TrayStatusCreating, unchangedTray.Status) - } - - // Test MarkRedundant with limit - // Add more test trays - testTray5 := createTestTray("test-tray-5", "test-type", trays.TrayStatusCreating, 0) - testTray6 := createTestTray("test-tray-6", "test-type", trays.TrayStatusCreating, 0) - insertTestTrays(t, collection, []*TestTray{testTray5, testTray6}) - - // Mark only 1 tray as redundant - redundantTrays, err = repo.MarkRedundant(context.Background(),"test-type", 1) - if err != nil { - t.Fatalf("MarkRedundant with limit failed: %v", err) - } - - // Verify that only 1 more tray was marked as deleting - cursor, err = collection.Find(context.Background(), bson.M{"trayTypeName": "test-type", "status": trays.TrayStatusDeleting}) - if err != nil { - t.Fatalf("Failed to query database: %v", err) + // Insert test data: 2 Creating, 1 Registered, 1 Running, 2 Deleting for test-type + testTrays := []*TestTray{ + createTestTray("test-tray-1", "test-type", trays.TrayStatusCreating, 0), + createTestTray("test-tray-2", "test-type", trays.TrayStatusCreating, 0), + createTestTray("test-tray-3", "test-type", trays.TrayStatusRegistered, 0), + createTestTray("test-tray-4", "test-type", trays.TrayStatusRunning, 0), + createTestTray("test-tray-5", "test-type", trays.TrayStatusDeleting, 0), + createTestTray("test-tray-6", "test-type", trays.TrayStatusDeleting, 0), + createTestTray("other-tray-1", "other-type", trays.TrayStatusCreating, 0), } + insertTestTrays(t, collection, testTrays) - err = cursor.All(context.Background(), &deletingTrays) + // Active = all non-deleting = 2 + 1 + 1 = 4 + count, err := repo.CountActive(context.Background(), "test-type") if err != nil { - t.Fatalf("Failed to decode cursor: %v", err) + t.Fatalf("CountActive failed: %v", err) } - - if len(deletingTrays) != 3 { - t.Errorf("Expected 3 trays marked as deleting in the database, got %d", len(deletingTrays)) + if count != 4 { + t.Errorf("Expected 4 active trays, got %d", count) } - // Test MarkRedundant with non-existent tray type - redundantTrays, err = repo.MarkRedundant(context.Background(),"non-existent", 2) + // other-type: 1 active + count, err = repo.CountActive(context.Background(), "other-type") if err != nil { - t.Fatalf("MarkRedundant with non-existent tray type failed: %v", err) - } - - if len(redundantTrays) != 0 { - t.Errorf("Expected 0 redundant trays for non-existent type, got %d", len(redundantTrays)) + t.Fatalf("CountActive for other-type failed: %v", err) } - - // Test MarkRedundant with empty collection - // Clear the collection - err = collection.Drop(context.Background()) - if err != nil { - t.Fatalf("Failed to drop collection: %v", err) + if count != 1 { + t.Errorf("Expected 1 active tray for other-type, got %d", count) } - // Try to mark redundant trays in an empty collection - redundantTrays, err = repo.MarkRedundant(context.Background(),"test-type", 2) + // non-existent type: 0 + count, err = repo.CountActive(context.Background(), "non-existent") if err != nil { - t.Fatalf("MarkRedundant with empty collection failed: %v", err) + t.Fatalf("CountActive for non-existent type failed: %v", err) } - - if len(redundantTrays) != 0 { - t.Errorf("Expected 0 redundant trays for empty collection, got %d", len(redundantTrays)) + if count != 0 { + t.Errorf("Expected 0 active trays for non-existent type, got %d", count) } } @@ -576,101 +481,3 @@ func TestConnect(t *testing.T) { } } -// TestCountByTrayType tests the CountByTrayType method -func TestCountByTrayType(t *testing.T) { - client, collection := setupTestCollection(t) - defer client.Disconnect(context.Background()) - - // Create test repository - repo := NewMongodbTrayRepository() - repo.Connect(collection) - - // Insert test data with specific counts for each status - // 2 Creating, 3 Registered, 1 Running, 2 Deleting for test-type - testTrays := []*TestTray{ - createTestTray("test-tray-1", "test-type", trays.TrayStatusCreating, 0), - createTestTray("test-tray-2", "test-type", trays.TrayStatusCreating, 0), - createTestTray("test-tray-3", "test-type", trays.TrayStatusRegistered, 0), - createTestTray("test-tray-4", "test-type", trays.TrayStatusRegistered, 0), - createTestTray("test-tray-5", "test-type", trays.TrayStatusRegistered, 0), - createTestTray("test-tray-6", "test-type", trays.TrayStatusRunning, 0), - createTestTray("test-tray-7", "test-type", trays.TrayStatusDeleting, 0), - createTestTray("test-tray-8", "test-type", trays.TrayStatusDeleting, 0), - // Different tray type - createTestTray("other-tray-1", "other-type", trays.TrayStatusCreating, 0), - createTestTray("other-tray-2", "other-type", trays.TrayStatusRegistered, 0), - } - insertTestTrays(t, collection, testTrays) - - // Test CountByTrayType for test-type - counts, total, err := repo.CountByTrayType(context.Background(),"test-type") - if err != nil { - t.Fatalf("CountByTrayType failed: %v", err) - } - - // Verify the total count - expectedTotal := 8 // Total number of test-type trays - if total != expectedTotal { - t.Errorf("Expected total count %d, got %d", expectedTotal, total) - } - - // Verify counts for each status - expectedCounts := map[trays.TrayStatus]int{ - trays.TrayStatusCreating: 2, - trays.TrayStatusRegistered: 3, - trays.TrayStatusRunning: 1, - trays.TrayStatusDeleting: 2, - trays.TrayStatusRegistering: 0, // No trays with this status - } - - for status, expectedCount := range expectedCounts { - if counts[status] != expectedCount { - t.Errorf("Expected count %d for status %v, got %d", expectedCount, status, counts[status]) - } - } - - // Test CountByTrayType for other-type - counts, total, err = repo.CountByTrayType(context.Background(),"other-type") - if err != nil { - t.Fatalf("CountByTrayType for other-type failed: %v", err) - } - - // Verify the total count for other-type - expectedTotal = 2 // Total number of other-type trays - if total != expectedTotal { - t.Errorf("Expected total count %d for other-type, got %d", expectedTotal, total) - } - - // Verify counts for each status for other-type - expectedCounts = map[trays.TrayStatus]int{ - trays.TrayStatusCreating: 1, - trays.TrayStatusRegistered: 1, - trays.TrayStatusRunning: 0, - trays.TrayStatusDeleting: 0, - trays.TrayStatusRegistering: 0, - } - - for status, expectedCount := range expectedCounts { - if counts[status] != expectedCount { - t.Errorf("Expected count %d for status %v in other-type, got %d", expectedCount, status, counts[status]) - } - } - - // Test CountByTrayType with non-existent tray type - counts, total, err = repo.CountByTrayType(context.Background(),"non-existent") - if err != nil { - t.Fatalf("CountByTrayType with non-existent tray type failed: %v", err) - } - - // Verify the total count for non-existent type - if total != 0 { - t.Errorf("Expected total count 0 for non-existent type, got %d", total) - } - - // Verify that all status counts are 0 for non-existent type - for status, count := range counts { - if count != 0 { - t.Errorf("Expected count 0 for status %v in non-existent type, got %d", status, count) - } - } -} From ff604247d4fbdd242d111e7acc451d9d52d20cea Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Fri, 27 Mar 2026 03:31:41 +0400 Subject: [PATCH 26/27] stale fix --- src/lib/trays/repositories/mongodbTrayRepository.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/trays/repositories/mongodbTrayRepository.go b/src/lib/trays/repositories/mongodbTrayRepository.go index cc8ee74..b022693 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository.go +++ b/src/lib/trays/repositories/mongodbTrayRepository.go @@ -41,7 +41,7 @@ func (m *MongodbTrayRepository) GetById(ctx context.Context, trayId string) (*tr func (m *MongodbTrayRepository) GetStale(ctx context.Context, d time.Duration) ([]*trays.Tray, error) { dbResult, err := m.collection.Find(ctx, bson.M{ - "status": bson.M{"$nin": bson.A{trays.TrayStatusRunning, trays.TrayStatusDeleting}}, + "status": bson.M{"$ne": trays.TrayStatusRunning}, "statusChanged": bson.M{"$lte": time.Now().UTC().Add(-d)}, }) if err != nil { From d225daa746419ff336006433790bc438125b240a Mon Sep 17 00:00:00 2001 From: Evgeny Snitko Date: Fri, 27 Mar 2026 05:19:54 +0400 Subject: [PATCH 27/27] errors handling --- src/agent/catteryClient/client.go | 14 +++++++------- src/agent/githubListener/githubListener.go | 6 ++++++ src/agent/githubListener/kill.go | 4 ++-- src/agent/githubListener/kill_linux.go | 5 ++--- src/lib/config/config.go | 2 +- src/lib/githubClient/githubClient.go | 16 +++++++++++----- src/lib/trayManager/trayManager.go | 5 ++++- src/lib/trays/providers/dockerProvider.go | 6 +++++- src/lib/trays/providers/gceProvider.go | 5 ++++- .../repositories/mongodbTrayRepository_test.go | 7 +++++-- src/lib/trays/tray.go | 6 +++--- 11 files changed, 50 insertions(+), 26 deletions(-) diff --git a/src/agent/catteryClient/client.go b/src/agent/catteryClient/client.go index 68e419b..4846bac 100644 --- a/src/agent/catteryClient/client.go +++ b/src/agent/catteryClient/client.go @@ -5,7 +5,7 @@ import ( "cattery/lib/agents" "cattery/lib/messages" "encoding/json" - "errors" + "fmt" "io" "net/http" "net/url" @@ -52,7 +52,7 @@ func (c *CatteryClient) RegisterAgent(id string) (*agents.Agent, *string, error) if response.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(response.Body) - return nil, nil, errors.New("response status code: " + response.Status + " body: " + string(bodyBytes)) + return nil, nil, fmt.Errorf("response status code: %s body: %s", response.Status, string(bodyBytes)) } var registerResponse = &messages.RegisterResponse{} @@ -93,7 +93,7 @@ func (c *CatteryClient) UnregisterAgent(agent *agents.Agent, reason messages.Unr if response.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(response.Body) - return errors.New("response status code: " + response.Status + " body: " + string(bodyBytes)) + return fmt.Errorf("response status code: %s body: %s", response.Status, string(bodyBytes)) } return nil @@ -103,26 +103,26 @@ func (c *CatteryClient) Ping() (*messages.PingResponse, error) { requestUrl, err := url.JoinPath(c.baseURL, "/agent", "ping", c.agentId) if err != nil { - return nil, errors.New("failed to join path: " + err.Error()) + return nil, fmt.Errorf("failed to join path: %w", err) } request, _ := http.NewRequest("POST", requestUrl, nil) response, err := c.httpClient.Do(request) if err != nil { - return nil, errors.New("post error: " + err.Error()) + return nil, fmt.Errorf("post error: %w", err) } defer response.Body.Close() if response.StatusCode != http.StatusOK { bodyBytes, _ := io.ReadAll(response.Body) - return nil, errors.New("response status code: " + response.Status + " body: " + string(bodyBytes)) + return nil, fmt.Errorf("response status code: %s body: %s", response.Status, string(bodyBytes)) } var pingResponse = &messages.PingResponse{} err = json.NewDecoder(response.Body).Decode(pingResponse) if err != nil { - return nil, errors.New("error decoding ping response: " + err.Error()) + return nil, fmt.Errorf("error decoding ping response: %w", err) } return pingResponse, nil diff --git a/src/agent/githubListener/githubListener.go b/src/agent/githubListener/githubListener.go index 602ba3f..5090edb 100644 --- a/src/agent/githubListener/githubListener.go +++ b/src/agent/githubListener/githubListener.go @@ -12,6 +12,7 @@ import ( type GithubListener struct { listenerPath string process *os.Process + started chan struct{} // closed once process has started (or failed) mut sync.Mutex } @@ -19,6 +20,7 @@ type GithubListener struct { func NewGithubListener(listenerPath string) *GithubListener { return &GithubListener{ listenerPath: listenerPath, + started: make(chan struct{}), } } @@ -33,6 +35,7 @@ func (l *GithubListener) Start(ctx context.Context, cancel context.CancelCauseFu err := commandRun.Start() if err != nil { log.Errorf("Listener failed to start: %v", err) + close(l.started) cancel(err) return } @@ -40,6 +43,7 @@ func (l *GithubListener) Start(ctx context.Context, cancel context.CancelCauseFu l.mut.Lock() l.process = commandRun.Process l.mut.Unlock() + close(l.started) err = commandRun.Wait() cancel(err) // nil means clean exit @@ -47,6 +51,8 @@ func (l *GithubListener) Start(ctx context.Context, cancel context.CancelCauseFu } func (l *GithubListener) Stop() { + <-l.started // wait for process to be set before attempting kill + l.mut.Lock() defer l.mut.Unlock() diff --git a/src/agent/githubListener/kill.go b/src/agent/githubListener/kill.go index ae7301a..6543fa7 100644 --- a/src/agent/githubListener/kill.go +++ b/src/agent/githubListener/kill.go @@ -3,14 +3,14 @@ package githubListener import ( - "errors" + "fmt" "os" ) func kill(l *GithubListener) error { err := l.process.Signal(os.Kill) if err != nil { - return errors.New("Failed to kill process: " + err.Error()) + return fmt.Errorf("failed to kill process: %w", err) } return nil diff --git a/src/agent/githubListener/kill_linux.go b/src/agent/githubListener/kill_linux.go index 9e98987..3c2743a 100644 --- a/src/agent/githubListener/kill_linux.go +++ b/src/agent/githubListener/kill_linux.go @@ -1,7 +1,7 @@ package githubListener import ( - "errors" + "fmt" "os/exec" ) @@ -9,8 +9,7 @@ func kill(l *GithubListener) error { var commandInterruptRun = exec.Command("pkill", "--signal", "SIGINT", "Runner.Listener") err := commandInterruptRun.Run() if err != nil { - var errMsg = "Failed to interrupt runner: " + err.Error() - return errors.New(errMsg) + return fmt.Errorf("failed to interrupt runner: %w", err) } return nil diff --git a/src/lib/config/config.go b/src/lib/config/config.go index 966bc03..10d6a7b 100644 --- a/src/lib/config/config.go +++ b/src/lib/config/config.go @@ -160,7 +160,7 @@ type TrayType struct { RunnerGroupId int64 `yaml:"runnerGroupId" validate:"required"` Shutdown bool `yaml:"shutdown"` GitHubOrg string `yaml:"githubOrg" validate:"required"` - MaxTrays int `yaml:"limit"` + MaxTrays int `yaml:"maxTrays"` MaxParallelCreation int `yaml:"maxParallelCreation"` Config TrayConfig `yaml:"config"` ExtraMetadata TrayExtraMetadata diff --git a/src/lib/githubClient/githubClient.go b/src/lib/githubClient/githubClient.go index b380a13..78371bc 100644 --- a/src/lib/githubClient/githubClient.go +++ b/src/lib/githubClient/githubClient.go @@ -4,6 +4,7 @@ import ( "cattery/lib/config" "context" "errors" + "fmt" "net/http" "sync" @@ -29,8 +30,13 @@ func NewGithubClientWithOrgName(orgName string) (*GithubClient, error) { return nil, errors.New("GitHub organization not found") } + client, err := createClient(orgConfig) + if err != nil { + return nil, err + } + return &GithubClient{ - client: createClient(orgConfig), + client: client, Org: orgConfig, }, nil } @@ -55,12 +61,12 @@ func (gc *GithubClient) GetWorkflowRunStatus(repoName string, workflowRunId int6 } // createClient creates a new GitHub client -func createClient(org *config.GitHubOrganization) *github.Client { +func createClient(org *config.GitHubOrganization) (*github.Client, error) { githubClientsMu.Lock() defer githubClientsMu.Unlock() if githubClient, ok := githubClients[org.Name]; ok { - return githubClient + return githubClient, nil } tr := http.DefaultTransport @@ -73,7 +79,7 @@ func createClient(org *config.GitHubOrganization) *github.Client { ) if err != nil { - log.Fatal(err) + return nil, fmt.Errorf("failed to load GitHub App private key for org %s: %w", org.Name, err) } // Use installation transport with github.com/google/go-github @@ -81,5 +87,5 @@ func createClient(org *config.GitHubOrganization) *github.Client { githubClients[org.Name] = client - return client + return client, nil } diff --git a/src/lib/trayManager/trayManager.go b/src/lib/trayManager/trayManager.go index 75e7f20..ccfc8b0 100644 --- a/src/lib/trayManager/trayManager.go +++ b/src/lib/trayManager/trayManager.go @@ -85,7 +85,10 @@ func (tm *TrayManager) CreateTray(ctx context.Context, trayType *config.TrayType return fmt.Errorf("failed to get provider for type %s: %w", trayType.Name, err) } - tray := trays.NewTray(*trayType) + tray, err := trays.NewTray(*trayType) + if err != nil { + return err + } err = provider.RunTray(tray) if err != nil { diff --git a/src/lib/trays/providers/dockerProvider.go b/src/lib/trays/providers/dockerProvider.go index 93d2a3d..fac5738 100644 --- a/src/lib/trays/providers/dockerProvider.go +++ b/src/lib/trays/providers/dockerProvider.go @@ -3,6 +3,7 @@ package providers import ( "cattery/lib/config" "cattery/lib/trays" + "fmt" "os/exec" "strings" @@ -40,7 +41,10 @@ func (d *DockerProvider) RunTray(tray *trays.Tray) error { var containerName = tray.Id - var trayConfig = tray.TrayConfig().(config.DockerTrayConfig) + trayConfig, ok := tray.TrayConfig().(config.DockerTrayConfig) + if !ok { + return fmt.Errorf("unexpected tray config type for docker provider, tray %s", tray.Id) + } var image = trayConfig.Image diff --git a/src/lib/trays/providers/gceProvider.go b/src/lib/trays/providers/gceProvider.go index e8c2785..3278960 100644 --- a/src/lib/trays/providers/gceProvider.go +++ b/src/lib/trays/providers/gceProvider.go @@ -50,7 +50,10 @@ func (g *GceProvider) GetProviderName() string { func (g *GceProvider) RunTray(tray *trays.Tray) error { ctx := context.Background() - var trayConfig = tray.TrayConfig().(config.GoogleTrayConfig) + trayConfig, ok := tray.TrayConfig().(config.GoogleTrayConfig) + if !ok { + return fmt.Errorf("unexpected tray config type for gce provider, tray %s", tray.Id) + } var ( project = g.providerConfig.Get("project") diff --git a/src/lib/trays/repositories/mongodbTrayRepository_test.go b/src/lib/trays/repositories/mongodbTrayRepository_test.go index fa6128a..26cac86 100644 --- a/src/lib/trays/repositories/mongodbTrayRepository_test.go +++ b/src/lib/trays/repositories/mongodbTrayRepository_test.go @@ -141,13 +141,16 @@ func TestSave(t *testing.T) { Config: &config.DockerTrayConfig{Image: "alpine", NamePrefix: "test"}, } - tray := trays.NewTray(trayType) + tray, err := trays.NewTray(trayType) + if err != nil { + t.Fatalf("NewTray failed: %v", err) + } // Set ProviderData and verify it round-trips tray.ProviderData["zone"] = "abc123" tray.ProviderData["something"] = "worker-1" // Test Save - err := repo.Save(context.Background(),tray) + err = repo.Save(context.Background(),tray) if err != nil { t.Fatalf("Save failed: %v", err) } diff --git a/src/lib/trays/tray.go b/src/lib/trays/tray.go index 2cac404..aee685c 100644 --- a/src/lib/trays/tray.go +++ b/src/lib/trays/tray.go @@ -24,11 +24,11 @@ type Tray struct { ProviderData map[string]string `bson:"providerData"` } -func NewTray(trayType config.TrayType) *Tray { +func NewTray(trayType config.TrayType) (*Tray, error) { b := make([]byte, 8) _, err := rand.Read(b) if err != nil { - panic(err) + return nil, fmt.Errorf("failed to generate tray ID: %w", err) } id := hex.EncodeToString(b) @@ -40,7 +40,7 @@ func NewTray(trayType config.TrayType) *Tray { Status: TrayStatusCreating, GitHubOrgName: trayType.GitHubOrg, ProviderData: make(map[string]string), - } + }, nil } // TrayType returns the configuration for this tray's type from the current config.