From 9dfd4aee1667f1cb79e2fd4ce7e415b24bea624f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Fri, 8 May 2026 23:21:55 +0800 Subject: [PATCH 01/58] feat(sdk-go): add User-Agent header to HTTP requests Set User-Agent: OpenSandbox-Go-SDK/1.0.1 on all outgoing requests (doRequestOnce, doStreamRequest, GetCommandLogs, UploadFiles, DownloadFile). Co-Authored-By: Claude Opus 4.7 --- sdks/sandbox/go/constants.go | 3 +++ sdks/sandbox/go/execd.go | 3 +++ sdks/sandbox/go/http.go | 2 ++ 3 files changed, 8 insertions(+) diff --git a/sdks/sandbox/go/constants.go b/sdks/sandbox/go/constants.go index de0bc557c..a3f659f8e 100644 --- a/sdks/sandbox/go/constants.go +++ b/sdks/sandbox/go/constants.go @@ -38,6 +38,9 @@ const ( // DefaultCodeInterpreterTimeoutSeconds is the default TTL for code interpreter sandboxes. DefaultCodeInterpreterTimeoutSeconds = 900 + // Version is the SDK version reported in the User-Agent header. + Version = "1.0.1" + // APIVersion is the lifecycle API version prefix. 
APIVersion = "v1" diff --git a/sdks/sandbox/go/execd.go b/sdks/sandbox/go/execd.go index b085a562e..f48303849 100644 --- a/sdks/sandbox/go/execd.go +++ b/sdks/sandbox/go/execd.go @@ -161,6 +161,7 @@ func (e *ExecdClient) GetCommandLogs(ctx context.Context, commandID string, curs if err != nil { return fmt.Errorf("opensandbox: create request: %w", err) } + req.Header.Set("User-Agent", "OpenSandbox-Go-SDK/"+Version) for k, v := range e.client.headers { req.Header.Set(k, v) } @@ -274,6 +275,7 @@ func (e *ExecdClient) UploadFiles(ctx context.Context, entries []UploadFileEntry } defer bodyCloser.Close() + req.Header.Set("User-Agent", "OpenSandbox-Go-SDK/"+Version) for k, v := range e.client.headers { req.Header.Set(k, v) } @@ -371,6 +373,7 @@ func (e *ExecdClient) DownloadFile(ctx context.Context, remotePath string, range if err != nil { return fmt.Errorf("opensandbox: create request: %w", err) } + req.Header.Set("User-Agent", "OpenSandbox-Go-SDK/"+Version) for k, v := range e.client.headers { req.Header.Set(k, v) } diff --git a/sdks/sandbox/go/http.go b/sdks/sandbox/go/http.go index ca09e6eba..3d4e86349 100644 --- a/sdks/sandbox/go/http.go +++ b/sdks/sandbox/go/http.go @@ -143,6 +143,7 @@ func (c *Client) doRequestOnce(ctx context.Context, method, path string, body an return fmt.Errorf("opensandbox: create request: %w", err) } + req.Header.Set("User-Agent", "OpenSandbox-Go-SDK/"+Version) for k, v := range c.headers { req.Header.Set(k, v) } @@ -197,6 +198,7 @@ func (c *Client) doStreamRequest(ctx context.Context, method, path string, body return fmt.Errorf("opensandbox: create request: %w", err) } + req.Header.Set("User-Agent", "OpenSandbox-Go-SDK/"+Version) for k, v := range c.headers { req.Header.Set(k, v) } From 59ef7600f7ab688808f6142f43cfb09ea24180ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sat, 9 May 2026 00:51:48 +0800 Subject: [PATCH 02/58] fix(go-sdk): drain response bodies, add io.Closer, remove deprecated DSA - Drain HTTP response 
bodies after read to preserve connection reuse - Change Sandbox.Close / SandboxManager.Close to return error (io.Closer) - Add tests for Sandbox lifecycle, health, and Close methods - Replace manual url.QueryEscape with url.Values for consistency - Remove deprecated crypto/dsa import and DSA public-key branch - Remove unused encoding/json import workaround in retry_test.go - Clean up unused go.mod dependencies Co-Authored-By: Claude Opus 4.7 --- sdks/sandbox/go/crypto_policy.go | 33 +---- sdks/sandbox/go/execd.go | 28 +++- sdks/sandbox/go/go.mod | 8 -- sdks/sandbox/go/go.sum | 19 --- sdks/sandbox/go/http.go | 2 + sdks/sandbox/go/manager.go | 2 +- sdks/sandbox/go/retry_test.go | 4 - sdks/sandbox/go/sandbox.go | 3 +- sdks/sandbox/go/sandbox_test.go | 234 +++++++++++++++++++++++++++++++ 9 files changed, 265 insertions(+), 68 deletions(-) create mode 100644 sdks/sandbox/go/sandbox_test.go diff --git a/sdks/sandbox/go/crypto_policy.go b/sdks/sandbox/go/crypto_policy.go index 08ea4e17c..644a871bd 100644 --- a/sdks/sandbox/go/crypto_policy.go +++ b/sdks/sandbox/go/crypto_policy.go @@ -15,7 +15,6 @@ package opensandbox import ( - "crypto/dsa" "crypto/ecdsa" "crypto/ed25519" "crypto/rsa" @@ -25,20 +24,18 @@ import ( ) const ( - nistMinRSABits = 2048 - nistMinDLKeyBits = 224 - nistMinDLGroupBits = 2048 - nistMinECBits = 224 - nistMinHashBits = 224 + nistMinRSABits = 2048 + nistMinECBits = 224 + nistMinHashBits = 224 ) func minHashBitsForSignatureAlgorithm(algo x509.SignatureAlgorithm) (int, error) { switch algo { case x509.MD2WithRSA, x509.MD5WithRSA: return 128, nil - case x509.SHA1WithRSA, x509.DSAWithSHA1, x509.ECDSAWithSHA1: + case x509.SHA1WithRSA, x509.ECDSAWithSHA1: return 160, nil - case x509.DSAWithSHA256, x509.SHA256WithRSA, x509.ECDSAWithSHA256: + case x509.SHA256WithRSA, x509.ECDSAWithSHA256: return 256, nil case x509.SHA384WithRSA, x509.ECDSAWithSHA384: return 384, nil @@ -103,26 +100,6 @@ func ensureCertPublicKeyMeetsNISTMinimums(cert *x509.Certificate) error { 
nistMinECBits, ) } - case *dsa.PublicKey: - if pub.Parameters.P == nil || pub.Parameters.Q == nil { - return fmt.Errorf("certificate DSA public key parameters are incomplete") - } - subgroupBits := pub.Parameters.Q.BitLen() - groupBits := pub.Parameters.P.BitLen() - if subgroupBits < nistMinDLKeyBits { - return fmt.Errorf( - "certificate DSA subgroup (Q) length %d bits is below NIST minimum %d bits", - subgroupBits, - nistMinDLKeyBits, - ) - } - if groupBits < nistMinDLGroupBits { - return fmt.Errorf( - "certificate DSA group (P) length %d bits is below NIST minimum %d bits", - groupBits, - nistMinDLGroupBits, - ) - } case ed25519.PublicKey: bits := len(pub) * 8 if bits < nistMinECBits { diff --git a/sdks/sandbox/go/execd.go b/sdks/sandbox/go/execd.go index b085a562e..99ce63af2 100644 --- a/sdks/sandbox/go/execd.go +++ b/sdks/sandbox/go/execd.go @@ -51,7 +51,9 @@ func (e *ExecdClient) Ping(ctx context.Context) error { // ListContexts returns all active code execution contexts for the given language. func (e *ExecdClient) ListContexts(ctx context.Context, language string) ([]CodeContext, error) { var result []CodeContext - path := "/code/contexts?language=" + url.QueryEscape(language) + params := url.Values{} + params.Set("language", language) + path := "/code/contexts?" + params.Encode() err := e.client.doRequest(ctx, http.MethodGet, path, nil, &result) return result, err } @@ -85,7 +87,9 @@ func (e *ExecdClient) DeleteContext(ctx context.Context, contextID string) error // DeleteContextsByLanguage deletes all code execution contexts for the given language. func (e *ExecdClient) DeleteContextsByLanguage(ctx context.Context, language string) error { - path := "/code/contexts?language=" + url.QueryEscape(language) + params := url.Values{} + params.Set("language", language) + path := "/code/contexts?" 
+ params.Encode() return e.client.doRequest(ctx, http.MethodDelete, path, nil, nil) } @@ -97,7 +101,9 @@ func (e *ExecdClient) ExecuteCode(ctx context.Context, req RunCodeRequest, handl // InterruptCode interrupts the currently running code execution. func (e *ExecdClient) InterruptCode(ctx context.Context, sessionID string) error { - path := "/code?id=" + url.QueryEscape(sessionID) + params := url.Values{} + params.Set("id", sessionID) + path := "/code?" + params.Encode() return e.client.doRequest(ctx, http.MethodDelete, path, nil, nil) } @@ -131,7 +137,9 @@ func (e *ExecdClient) RunCommand(ctx context.Context, req RunCommandRequest, han // InterruptCommand interrupts the currently running command execution. func (e *ExecdClient) InterruptCommand(ctx context.Context, sessionID string) error { - path := "/command?id=" + url.QueryEscape(sessionID) + params := url.Values{} + params.Set("id", sessionID) + path := "/command?" + params.Encode() return e.client.doRequest(ctx, http.MethodDelete, path, nil, nil) } @@ -206,7 +214,9 @@ func (e *ExecdClient) GetCommandLogs(ctx context.Context, commandID string, curs // GetFileInfo retrieves metadata for the file at the given path. func (e *ExecdClient) GetFileInfo(ctx context.Context, path string) (map[string]FileInfo, error) { var result map[string]FileInfo - reqPath := "/files/info?path=" + url.QueryEscape(path) + params := url.Values{} + params.Set("path", path) + reqPath := "/files/info?" + params.Encode() err := e.client.doRequest(ctx, http.MethodGet, reqPath, nil, &result) return result, err } @@ -363,7 +373,9 @@ func (e *ExecdClient) newUploadFilesRequest(ctx context.Context, entries []Uploa // returned io.ReadCloser. Pass rangeHeader (e.g. "bytes=0-1023") for partial // content, or empty string for the full file. 
func (e *ExecdClient) DownloadFile(ctx context.Context, remotePath string, rangeHeader string) (io.ReadCloser, error) { - reqPath := "/files/download?path=" + url.QueryEscape(remotePath) + params := url.Values{} + params.Set("path", remotePath) + reqPath := "/files/download?" + params.Encode() var resp *http.Response err := e.client.withRetry(ctx, func() error { @@ -418,7 +430,9 @@ func OctalMode(m os.FileMode) int { // DeleteDirectory deletes a directory and all its contents recursively. func (e *ExecdClient) DeleteDirectory(ctx context.Context, path string) error { - reqPath := "/directories?path=" + url.QueryEscape(path) + params := url.Values{} + params.Set("path", path) + reqPath := "/directories?" + params.Encode() return e.client.doRequest(ctx, http.MethodDelete, reqPath, nil, nil) } diff --git a/sdks/sandbox/go/go.mod b/sdks/sandbox/go/go.mod index 102c1d54f..c17718eac 100644 --- a/sdks/sandbox/go/go.mod +++ b/sdks/sandbox/go/go.mod @@ -1,11 +1,3 @@ module github.com/alibaba/OpenSandbox/sdks/sandbox/go go 1.20 - -require github.com/oapi-codegen/runtime v1.2.0 - -require ( - github.com/apapsch/go-jsonmerge/v2 v2.0.0 // indirect - github.com/google/uuid v1.6.0 // indirect - github.com/stretchr/testify v1.11.1 // indirect -) diff --git a/sdks/sandbox/go/go.sum b/sdks/sandbox/go/go.sum index 0659558c5..e69de29bb 100644 --- a/sdks/sandbox/go/go.sum +++ b/sdks/sandbox/go/go.sum @@ -1,19 +0,0 @@ -github.com/RaveNoX/go-jsoncommentstrip v1.0.0/go.mod h1:78ihd09MekBnJnxpICcwzCMzGrKSKYe4AqU6PDYYpjk= -github.com/apapsch/go-jsonmerge/v2 v2.0.0 h1:axGnT1gRIfimI7gJifB699GoE/oq+F2MU7Dml6nw9rQ= -github.com/apapsch/go-jsonmerge/v2 v2.0.0/go.mod h1:lvDnEdqiQrp0O42VQGgmlKpxL1AP2+08jFMw88y4klk= -github.com/bmatcuk/doublestar v1.1.1/go.mod h1:UD6OnuiIn0yFxxA2le/rnRU1G4RaI4UvFv1sNto9p6w= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 
-github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= -github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= -github.com/juju/gnuflag v0.0.0-20171113085948-2ce1bb71843d/go.mod h1:2PavIy+JPciBPrBUjwbNvtwB6RQlve+hkpll6QSNmOE= -github.com/oapi-codegen/runtime v1.2.0 h1:RvKc1CVS1QeKSNzO97FBQbSMZyQ8s6rZd+LpmzwHMP4= -github.com/oapi-codegen/runtime v1.2.0/go.mod h1:Y7ZhmmlE8ikZOmuHRRndiIm7nf3xcVv+YMweKgG1DT0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/spkg/bom v0.0.0-20160624110644-59b7046e48ad/go.mod h1:qLr4V1qq6nMqFKkMo8ZTx3f+BZEkzsRUY10Xsm2mwU0= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= -github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= -github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/sdks/sandbox/go/http.go b/sdks/sandbox/go/http.go index ca09e6eba..754fca297 100644 --- a/sdks/sandbox/go/http.go +++ b/sdks/sandbox/go/http.go @@ -166,12 +166,14 @@ func (c *Client) doRequestOnce(ctx context.Context, method, path string, body an // No content (e.g. 
204) if resp.StatusCode == http.StatusNoContent || result == nil { + io.Copy(io.Discard, resp.Body) return nil } if err := json.NewDecoder(resp.Body).Decode(result); err != nil { return fmt.Errorf("opensandbox: decode response: %w", err) } + io.Copy(io.Discard, resp.Body) return nil } diff --git a/sdks/sandbox/go/manager.go b/sdks/sandbox/go/manager.go index e5f561caa..a46ca0a38 100644 --- a/sdks/sandbox/go/manager.go +++ b/sdks/sandbox/go/manager.go @@ -83,4 +83,4 @@ func (m *SandboxManager) DeleteSnapshot(ctx context.Context, snapshotID string) } // Close releases local resources. Currently a no-op placeholder. -func (m *SandboxManager) Close() {} +func (m *SandboxManager) Close() error { return nil } diff --git a/sdks/sandbox/go/retry_test.go b/sdks/sandbox/go/retry_test.go index 677cb3421..ce53ce6f2 100644 --- a/sdks/sandbox/go/retry_test.go +++ b/sdks/sandbox/go/retry_test.go @@ -22,7 +22,6 @@ import ( "crypto/rsa" "crypto/tls" "crypto/x509" - "encoding/json" "fmt" "math/big" "net/http" @@ -637,6 +636,3 @@ func TestRetry_CustomRetryableStatusCodes(t *testing.T) { require.Equal(t, "sbx-500-retried", got.ID) require.Equal(t, int32(2), attempts.Load()) } - -// suppress unused import warning -var _ = json.Marshal diff --git a/sdks/sandbox/go/sandbox.go b/sdks/sandbox/go/sandbox.go index 56aacbe40..a54346b6c 100644 --- a/sdks/sandbox/go/sandbox.go +++ b/sdks/sandbox/go/sandbox.go @@ -227,9 +227,10 @@ func (s *Sandbox) Kill(ctx context.Context) error { } // Close releases local HTTP resources. Does NOT terminate the sandbox. -func (s *Sandbox) Close() { +func (s *Sandbox) Close() error { // No-op for now — Go's http.Client doesn't need explicit close. // Placeholder for future transport pooling. + return nil } // Pause pauses the sandbox while preserving its state. 
diff --git a/sdks/sandbox/go/sandbox_test.go b/sdks/sandbox/go/sandbox_test.go new file mode 100644 index 000000000..2af2b1de0 --- /dev/null +++ b/sdks/sandbox/go/sandbox_test.go @@ -0,0 +1,234 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package opensandbox + +import ( + "context" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" + "time" +) + +func TestSandbox_Close(t *testing.T) { + sb := &Sandbox{id: "sbx-close"} + require.NoError(t, sb.Close(), "Close should return nil") +} + +func TestSandboxManager_Close(t *testing.T) { + mgr := &SandboxManager{} + require.NoError(t, mgr.Close(), "Close should return nil") +} + +func TestSandbox_Kill(t *testing.T) { + var ( + gotMethod string + gotPath string + ) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + gotMethod = r.Method + gotPath = r.URL.Path + w.WriteHeader(http.StatusNoContent) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-kill-test", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + } + + require.NoError(t, sb.Kill(context.Background())) + if gotMethod != http.MethodDelete { + assert.Fail(t, fmt.Sprintf("method = %q, want DELETE", gotMethod)) + } + if gotPath != "/sandboxes/sbx-kill-test" { + assert.Fail(t, fmt.Sprintf("path = %q, want /sandboxes/sbx-kill-test", gotPath)) + } +} + +func TestSandbox_GetInfo(t *testing.T) { + want := SandboxInfo{ + ID: "sbx-info", + 
Status: SandboxStatus{State: StateRunning}, + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + jsonResponse(w, http.StatusOK, want) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-info", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + } + + got, err := sb.GetInfo(context.Background()) + require.NoErrorf(t, err, "GetInfo") + if got.ID != want.ID { + assert.Fail(t, fmt.Sprintf("ID = %q, want %q", got.ID, want.ID)) + } + if got.Status.State != StateRunning { + assert.Fail(t, fmt.Sprintf("State = %q, want %q", got.Status.State, StateRunning)) + } +} + +func TestSandbox_Ping_ExecdNil(t *testing.T) { + sb := &Sandbox{id: "sbx-no-execd"} + err := sb.Ping(context.Background()) + require.Error(t, err) + if !strings.Contains(err.Error(), "execd client not initialized") { + assert.Fail(t, fmt.Sprintf("error = %q, want contains 'execd client not initialized'", err.Error())) + } +} + +func TestSandbox_Ping_OK(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-ping-ok", + execd: NewExecdClient(srv.URL, "tok"), + } + + require.NoError(t, sb.Ping(context.Background())) +} + +func TestSandbox_IsHealthy_ExecdNil(t *testing.T) { + sb := &Sandbox{id: "sbx-no-execd"} + if sb.IsHealthy(context.Background()) { + assert.Fail(t, "IsHealthy should return false when execd is nil") + } +} + +func TestSandbox_IsHealthy_True(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-healthy", + execd: NewExecdClient(srv.URL, "tok"), + } + + if !sb.IsHealthy(context.Background()) { + assert.Fail(t, "IsHealthy should return true when execd /ping succeeds") + } +} + +func TestSandbox_Renew(t *testing.T) { + expiresAt := 
time.Now().UTC().Add(time.Hour).Truncate(time.Second) + want := RenewExpirationResponse{ + ExpiresAt: expiresAt, + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + assert.Fail(t, fmt.Sprintf("expected POST, got %s", r.Method)) + } + if !strings.HasSuffix(r.URL.Path, "/renew-expiration") { + assert.Fail(t, fmt.Sprintf("expected /renew-expiration suffix in path %s", r.URL.Path)) + } + jsonResponse(w, http.StatusOK, want) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-renew", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + } + + got, err := sb.Renew(context.Background(), time.Hour) + require.NoErrorf(t, err, "Renew") + if got.ExpiresAt.Truncate(time.Second).Equal(expiresAt) { + return + } + assert.Fail(t, fmt.Sprintf("ExpiresAt = %v, want ~%v", got.ExpiresAt, expiresAt)) +} + +func TestSandbox_Pause(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + assert.Fail(t, fmt.Sprintf("expected POST, got %s", r.Method)) + } + if r.URL.Path != "/sandboxes/sbx-pause/pause" { + assert.Fail(t, fmt.Sprintf("path = %q, want /sandboxes/sbx-pause/pause", r.URL.Path)) + } + w.WriteHeader(http.StatusNoContent) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-pause", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + } + + require.NoError(t, sb.Pause(context.Background())) +} + +func TestSandbox_CreateSnapshot(t *testing.T) { + want := SnapshotInfo{ + ID: "snap-1", + SandboxID: "sbx-snap", + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + assert.Fail(t, fmt.Sprintf("expected POST, got %s", r.Method)) + } + if r.URL.Path != "/sandboxes/sbx-snap/snapshots" { + assert.Fail(t, fmt.Sprintf("path = %q, want /sandboxes/sbx-snap/snapshots", r.URL.Path)) + } + jsonResponse(w, http.StatusCreated, want) + })) + 
defer srv.Close() + + sb := &Sandbox{ + id: "sbx-snap", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + } + + got, err := sb.CreateSnapshot(context.Background(), CreateSnapshotRequest{}) + require.NoErrorf(t, err, "CreateSnapshot") + if got.ID != "snap-1" { + assert.Fail(t, fmt.Sprintf("ID = %q, want snap-1", got.ID)) + } +} + +func TestSandbox_GetEndpoint(t *testing.T) { + want := Endpoint{ + Endpoint: "https://sbx-test.example.com:8080", + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !strings.Contains(r.URL.Path, "/sandboxes/sbx-endpoint/endpoints/8080") { + assert.Fail(t, fmt.Sprintf("expected path containing /sandboxes/sbx-endpoint/endpoints/8080, got %s", r.URL.Path)) + } + jsonResponse(w, http.StatusOK, want) + })) + defer srv.Close() + + sb := &Sandbox{ + id: "sbx-endpoint", + lifecycle: NewLifecycleClient(srv.URL, "test-key"), + config: &ConnectionConfig{}, + } + + got, err := sb.GetEndpoint(context.Background(), 8080) + require.NoErrorf(t, err, "GetEndpoint") + if got.Endpoint != want.Endpoint { + assert.Fail(t, fmt.Sprintf("Endpoint = %q, want %q", got.Endpoint, want.Endpoint)) + } +} From 29a7279af3c9ad52d593e3778ca81cf1da4ef6f4 Mon Sep 17 00:00:00 2001 From: yoogo Date: Sat, 9 May 2026 17:57:27 +0800 Subject: [PATCH 03/58] feat(k8s): add containerd-socket-path to controller --- .../opensandbox-controller/templates/deployment.yaml | 3 +++ kubernetes/charts/opensandbox-controller/values.yaml | 2 ++ kubernetes/cmd/controller/main.go | 4 ++++ .../internal/controller/sandboxsnapshot_controller.go | 3 +++ .../internal/controller/sandboxsnapshot_lifecycle.go | 11 +++++++++-- 5 files changed, 21 insertions(+), 2 deletions(-) diff --git a/kubernetes/charts/opensandbox-controller/templates/deployment.yaml b/kubernetes/charts/opensandbox-controller/templates/deployment.yaml index a94ecbe45..bbbe7a4cc 100644 --- a/kubernetes/charts/opensandbox-controller/templates/deployment.yaml +++ 
b/kubernetes/charts/opensandbox-controller/templates/deployment.yaml @@ -61,6 +61,9 @@ spec: {{- if .Values.controller.snapshot.imageCommitterImage }} - --image-committer-image={{ .Values.controller.snapshot.imageCommitterImage }} {{- end }} + {{- if .Values.controller.snapshot.containerdSocketPath }} + - --containerd-socket-path={{ .Values.controller.snapshot.containerdSocketPath }} + {{- end }} {{- if .Values.controller.snapshot.commitJobTimeout }} - --commit-job-timeout={{ .Values.controller.snapshot.commitJobTimeout }} {{- end }} diff --git a/kubernetes/charts/opensandbox-controller/values.yaml b/kubernetes/charts/opensandbox-controller/values.yaml index dd969a298..8d9b7c168 100644 --- a/kubernetes/charts/opensandbox-controller/values.yaml +++ b/kubernetes/charts/opensandbox-controller/values.yaml @@ -48,6 +48,8 @@ controller: snapshot: # -- Image used for commit operations (must contain nerdctl tool) imageCommitterImage: "image-committer:dev" + # -- Containerd socket path of host + containerdSocketPath: "/var/run/containerd/containerd.sock" # -- Timeout duration for commit jobs commitJobTimeout: "10m" # -- OCI registry prefix used for snapshot images. 
diff --git a/kubernetes/cmd/controller/main.go b/kubernetes/cmd/controller/main.go index b82234e83..5f402eb5d 100644 --- a/kubernetes/cmd/controller/main.go +++ b/kubernetes/cmd/controller/main.go @@ -198,6 +198,9 @@ func main() { var imageCommitterImage string flag.StringVar(&imageCommitterImage, "image-committer-image", "image-committer:dev", "The image used for commit operations (contains nerdctl tool).") + var containerdSocketPath string + flag.StringVar(&containerdSocketPath, "containerd-socket-path", controller.ContainerdSocketPath, "Containerd socket path") + // Commit job timeout var commitJobTimeout time.Duration flag.DurationVar(&commitJobTimeout, "commit-job-timeout", 10*time.Minute, "The timeout duration for commit jobs.") @@ -443,6 +446,7 @@ func main() { Scheme: mgr.GetScheme(), Recorder: mgr.GetEventRecorderFor("sandboxsnapshot-controller"), ImageCommitterImage: imageCommitterImage, + ContainerdSocketPath: containerdSocketPath, CommitJobTimeout: commitJobTimeout, SnapshotRegistry: snapshotRegistry, SnapshotRegistryInsecure: snapshotRegistryInsecure, diff --git a/kubernetes/internal/controller/sandboxsnapshot_controller.go b/kubernetes/internal/controller/sandboxsnapshot_controller.go index 2c6b6d6f7..d1e5288b2 100644 --- a/kubernetes/internal/controller/sandboxsnapshot_controller.go +++ b/kubernetes/internal/controller/sandboxsnapshot_controller.go @@ -67,6 +67,9 @@ type SandboxSnapshotReconciler struct { // ImageCommitterImage is the image for image-committer (uses nerdctl to commit/push container images) ImageCommitterImage string + // ContainerdSocketPath is containerd socket path for image-committer (nerdctl --address) + ContainerdSocketPath string + // CommitJobTimeout is the timeout for commit jobs (default: 10 minutes) CommitJobTimeout time.Duration diff --git a/kubernetes/internal/controller/sandboxsnapshot_lifecycle.go b/kubernetes/internal/controller/sandboxsnapshot_lifecycle.go index 26e34ecc2..31e9c6421 100644 --- 
a/kubernetes/internal/controller/sandboxsnapshot_lifecycle.go +++ b/kubernetes/internal/controller/sandboxsnapshot_lifecycle.go @@ -305,6 +305,13 @@ func (r *SandboxSnapshotReconciler) imageCommitterImage() string { return "image-committer:dev" } +func (r *SandboxSnapshotReconciler) containerdSocketPath() string { + if r.ContainerdSocketPath != "" { + return r.ContainerdSocketPath + } + return ContainerdSocketPath +} + func commitJobSecurityContext() *corev1.SecurityContext { return &corev1.SecurityContext{ RunAsUser: ptrToInt64(0), @@ -326,7 +333,7 @@ func (r *SandboxSnapshotReconciler) buildCommitJob(snapshot *sandboxv1alpha1.San { Name: "containerd-sock", VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: ContainerdSocketPath}, + HostPath: &corev1.HostPathVolumeSource{Path: r.containerdSocketPath()}, }, }, } @@ -462,7 +469,7 @@ func (r *SandboxSnapshotReconciler) buildUnpauseJob(snapshot *sandboxv1alpha1.Sa { Name: "containerd-sock", VolumeSource: corev1.VolumeSource{ - HostPath: &corev1.HostPathVolumeSource{Path: ContainerdSocketPath}, + HostPath: &corev1.HostPathVolumeSource{Path: r.containerdSocketPath()}, }, }, }, From 7f57bedb9b7609486d7f678da5cac77d58a637be Mon Sep 17 00:00:00 2001 From: Generalwin <52099674+Generalwin@users.noreply.github.com> Date: Wed, 13 May 2026 21:56:34 +0800 Subject: [PATCH 04/58] feat(k8s): add dockur windows pool exmaple (#878) * feat(k8s): add dockur windows pool exmaple * feat(server): fix windows reboot --- examples/README.md | 1 + examples/windows/README.md | 182 +++++++++++++ examples/windows/main.py | 61 +++++ examples/windows/main_fix_net.py | 92 +++++++ examples/windows/main_use_pool.py | 58 ++++ examples/windows/pool-win-example.yaml | 92 +++++++ .../services/k8s/batchsandbox_provider.py | 7 +- .../services/k8s/provider_common.py | 3 + .../services/k8s/windows_profile.py | 66 ++++- .../tests/k8s/test_batchsandbox_provider.py | 82 +++++- server/tests/k8s/test_k8s_windows_profile.py | 249 
++++++++++++++++++ 11 files changed, 888 insertions(+), 5 deletions(-) create mode 100644 examples/windows/README.md create mode 100644 examples/windows/main.py create mode 100644 examples/windows/main_fix_net.py create mode 100644 examples/windows/main_use_pool.py create mode 100644 examples/windows/pool-win-example.yaml create mode 100644 server/tests/k8s/test_k8s_windows_profile.py diff --git a/examples/README.md b/examples/README.md index 59fd8087f..42e4eb8c3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -21,6 +21,7 @@ Examples for common OpenSandbox use cases. Each subdirectory contains runnable c - 🦞 [**nullclaw**](nullclaw): Launch a Nullclaw Gateway inside a sandbox - 🦞 [**openclaw**](openclaw): Run an OpenClaw Gateway inside a sandbox - 🖥️ [**desktop**](desktop): Launch VNC desktop (Xvfb + x11vnc) for VNC client connections +- 🪟 [**windows**](windows): Run a Windows guest VM via KVM/QEMU with RDP and web console access - Playwright [**playwright**](playwright): Launch headless browser (Playwright + Chromium) to scrape web content - VS Code [**vscode**](vscode): Launch code-server (VS Code Web) to provide browser access - Google Chrome [**chrome**](chrome): Launch headless Chromium with DevTools port exposed for remote debugging diff --git a/examples/windows/README.md b/examples/windows/README.md new file mode 100644 index 000000000..05afc0d51 --- /dev/null +++ b/examples/windows/README.md @@ -0,0 +1,182 @@ +# Windows Sandbox Example + +Run a Windows guest in an OpenSandbox sandbox via KVM/QEMU using the [`dockur/windows`](https://github.com/dockur/windows) image. + +## How it works + +OpenSandbox creates a Linux container running KVM/QEMU, which boots a Windows guest OS inside it. The Windows profile (`platform.os=windows`) automatically configures the required devices, capabilities, OEM scripts, and port mappings — you only need to specify `platform` and `resource` in the SDK call. + +## Prerequisites + +- OpenSandbox server running (e.g. 
`http://localhost:8080`) +- Host with `/dev/kvm` and `/dev/net/tun` present +- Server `storage.allowed_host_paths` configured for any host bind mounts + +## Start OpenSandbox server [local] + +```shell +uv pip install opensandbox-server +opensandbox-server init-config ~/.sandbox.toml --example docker +opensandbox-server +``` + +## Run the example + +```shell +uv pip install opensandbox +python main.py +``` + +The script will: + +1. Create a Windows sandbox with `dockurr/windows:latest` and Windows 11 +2. Wait until the sandbox is healthy (first boot can take several minutes) +3. Print the execd, RDP (3389), and web console (8006) endpoints +4. Execute a test command and print the output + +## Environment Variables + +- `SANDBOX_DOMAIN`: Sandbox service address (default: `localhost:8080`) +- `SANDBOX_API_KEY`: API key if your server requires authentication (optional for local) + +## Customization + +### Resource limits + +The Windows profile enforces minimum resources: **cpu >= 2, memory >= 4G, disk >= 64G**. The example uses 4 CPU, 8G RAM, and 64G disk. You can adjust these in the `main.py` `resource` dict. 
+ +### Persistent storage + +Bind a host directory to `/storage` for a persistent system disk (add to the `SandboxSync.create` call): + +```python +from opensandbox.models.sandboxes import Host, Volume + +volumes = [ + Volume( + name="win-storage", + host=Host(path="/data/opensandbox/windows-storage"), + mount_path="/storage", + read_only=False, + ), +] +``` + +### Local ISO + +Bind a Windows install ISO to `/boot.iso` to avoid repeated downloads: + +```python +volumes = [ + Volume( + name="win-iso", + host=Host(path="/data/iso/Win11_23H2.iso"), + mount_path="/boot.iso", + read_only=True, + ), +] +``` + +### Windows guest configuration + +Pass [dockur/windows environment variables](https://github.com/dockur/windows) through the `env` parameter: + +```python +env = { + "VERSION": "11l", + "USERNAME": "Docker", + "PASSWORD": "your-secure-password", + "LANGUAGE": "Chinese", + "REGION": "zh-CN", + "KEYBOARD": "zh-CN", +} +``` + +Do not manually set `CPU_CORES`, `RAM_SIZE`, or `DISK_SIZE` — they are derived from `resourceLimits` automatically. + +## Exposed ports + +| Port | Service | +|------|---------| +| 44772 | execd (sandbox execution API) | +| 8080 | HTTP service | +| 3389 | RDP (native Remote Desktop) | +| 8006 | Web console (noVNC) | + +## Troubleshooting + +- **`Unsupported platform.os 'windows'`**: Server build has no Windows profile; upgrade OpenSandbox server. +- **`INVALID_PARAMETER` for resourceLimits**: Ensure cpu >= 2, memory >= 4G, disk >= 64G. +- **Stays Pending a long time**: First Windows install is slow; check host resources and `/storage` space, increase `ready_timeout`. +- **Status Running but endpoint unreachable**: Verify endpoint resolution returns a valid address; check `USER_PORTS` if you need additional ports forwarded. + +### ENI CNI network issue (Alibaba Cloud ACK) + +On clusters using ENI-based CNIs (e.g. 
Alibaba Cloud ACK Terway in ENI mode), dockur/windows fails at startup with: + +``` +❯ ERROR: This container does not support host mode networking! +``` + +or: + +``` +❯ ERROR: Status 1 while: ethtool -i "$VM_NET_DEV" +``` + +**Root cause**: The image's `network.sh` uses `ethtool -i` to check the network interface. ENI interfaces have real PCI bus-info, which triggers a false "host mode" detection. Standard veth-based CNIs (Calico, Flannel, Cilium) do NOT have this problem. + +**Solution**: Use the provided `main_fix_net.py` example, which patches the script at runtime and sets `NETWORK=slirp` for QEMU user-mode NAT: + +```shell +python main_fix_net.py +``` + +See [`main_fix_net.py`](./main_fix_net.py) for the full implementation. + +**How it works**: + +1. `sed` replaces three lines in `/run/network.sh` with empty variable assignments (`result=""`, `nic=""`, `bus=""`), preventing the ethtool check from aborting the script. +2. `NETWORK=slirp` tells the script to use QEMU's SLIRP networking (user-mode NAT), which doesn't require a real NIC. +3. `exec /usr/bin/tini -s /run/entry.sh` launches the original image entrypoint after patching. + +This approach keeps the Pod's independent IP and requires no image rebuild or `hostNetwork`. + +## Windows Sandbox from pool + +Use a pre-warmed K8s pool for faster Windows sandbox startup. + +### 1. Create the pool + +Apply the pool manifest (the image, resources, device mounts, and OEM scripts are pre-configured): + +```shell +kubectl apply -f pool-win-example.yaml +``` + +### 2. Start the OpenSandbox server [k8s] + +```shell +uv pip install opensandbox-server +opensandbox-server init-config ~/.sandbox.toml --example k8s +opensandbox-server +``` + +### 3. Run the pool example + +```shell +uv pip install opensandbox +python main_use_pool.py +``` + +The script acquires a sandbox from `pool-win-example`, prints endpoints, and runs a command. 
+ +### Environment variables (pool) + +- `SANDBOX_DOMAIN`: Sandbox service address (default: `localhost:8080`) +- `SANDBOX_API_KEY`: API key if your server requires authentication + +## References + +- [Windows sandbox guide](../../docs/windows-sandbox.md) +- [dockur/windows](https://github.com/dockur/windows) diff --git a/examples/windows/main.py b/examples/windows/main.py new file mode 100644 index 000000000..e5075aadf --- /dev/null +++ b/examples/windows/main.py @@ -0,0 +1,61 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Minimal Windows sandbox example using dockur/windows.""" + +import os +from datetime import timedelta + +from opensandbox import SandboxSync +from opensandbox.config import ConnectionConfigSync +from opensandbox.models.sandboxes import PlatformSpec + + +def main() -> None: + cfg = ConnectionConfigSync( + domain=os.getenv("SANDBOX_DOMAIN", "localhost:8080"), + api_key=os.getenv("SANDBOX_API_KEY") or None, + request_timeout=timedelta(minutes=3), + use_server_proxy=True, + ) + + sbx = SandboxSync.create( + image="dockurr/windows:latest", + timeout=timedelta(hours=12), + ready_timeout=timedelta(minutes=30), # the first Windows install is slow + resource={ + "cpu": "4", + "memory": "8G", + "disk": "64G", + }, + env={"VERSION": "11"}, # dockur/windows guest configuration + platform=PlatformSpec(os="windows", arch="amd64"), + connection_config=cfg, + ) + + try: + print(f"Created: {sbx.id}") + print(f"execd: {sbx.get_endpoint(44772).endpoint}") + print(f"RDP: {sbx.get_endpoint(3389).endpoint}") + print(f"Web: {sbx.get_endpoint(8006).endpoint}") + + result = sbx.commands.run("cmd /c echo Hello from Windows sandbox") + print(f"Command output: {result.logs.stdout[0].text}") + finally: + sbx.kill() + sbx.close() + + +if __name__ == "__main__": + main() diff --git a/examples/windows/main_fix_net.py b/examples/windows/main_fix_net.py new file mode 100644 index 000000000..179433cda --- /dev/null +++ b/examples/windows/main_fix_net.py @@ -0,0 +1,92 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Windows sandbox example with ENI CNI network fix. + +Use this example on clusters with ENI-based CNIs (e.g. Alibaba Cloud ACK +Terway in ENI mode) where dockur/windows fails with: + + ERROR: This container does not support host mode networking! + +or: + + ERROR: Status 1 while: ethtool -i "$VM_NET_DEV" + +The fix patches /run/network.sh at container startup to bypass the +ethtool/bus-info check, then uses NETWORK=slirp for QEMU user-mode NAT. +Standard veth-based CNIs (Calico, Flannel, Cilium) do NOT need this fix. +""" + +import os +from datetime import timedelta + +from opensandbox import SandboxSync +from opensandbox.config import ConnectionConfigSync +from opensandbox.models.sandboxes import PlatformSpec + +# sed command to bypass the ethtool/grep checks in network.sh. +# Replaces three lines with empty variable assignments so that: +# - ethtool -i (would fail on ENI with real PCI bus-info) is skipped +# - grep on empty result (would fail with pipefail) is skipped +_NETWORK_PATCH_CMD = ( + "sed -i" + " -e 's/result=$(ethtool -i \"$VM_NET_DEV\")/result=\"\"/'" + " -e '/grep.*driver:/s/.*/ nic=\"\"/'" + " -e '/grep.*bus-info:/s/.*/ bus=\"\"/'" + " /run/network.sh" +) + +# Original dockur/windows ENTRYPOINT +_WINDOWS_ENTRYPOINT = "/usr/bin/tini -s /run/entry.sh" + + +def main() -> None: + cfg = ConnectionConfigSync( + domain=os.getenv("SANDBOX_DOMAIN", "localhost:8080"), + api_key=os.getenv("SANDBOX_API_KEY") or None, + request_timeout=timedelta(minutes=3), + use_server_proxy=True, + ) + + sbx = SandboxSync.create( + image="dockurr/windows:latest", + timeout=timedelta(hours=12), + ready_timeout=timedelta(minutes=120), + resource={"cpu": "8", "memory": "16G", "disk": "64G"}, + env={ + "VERSION": "11", + "NETWORK": "slirp", # Use QEMU built-in user-mode NAT + }, + # Patch network.sh then exec the original entrypoint + entrypoint=["/bin/sh", "-c", f"{_NETWORK_PATCH_CMD} && exec {_WINDOWS_ENTRYPOINT}"], + platform=PlatformSpec(os="windows", arch="amd64"), + 
connection_config=cfg, + ) + + try: + print(f"Created: {sbx.id}") + print(f"execd: {sbx.get_endpoint(44772).endpoint}") + print(f"RDP: {sbx.get_endpoint(3389).endpoint}") + print(f"Web: {sbx.get_endpoint(8006).endpoint}") + + result = sbx.commands.run("cmd /c echo Hello from Windows sandbox") + print(f"Command output: {result.logs.stdout[0].text}") + finally: + sbx.kill() + sbx.close() + + +if __name__ == "__main__": + main() diff --git a/examples/windows/main_use_pool.py b/examples/windows/main_use_pool.py new file mode 100644 index 000000000..8956ad114 --- /dev/null +++ b/examples/windows/main_use_pool.py @@ -0,0 +1,58 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Windows sandbox example using a pre-warmed K8s pool.""" + +import os +from datetime import timedelta + +from opensandbox import SandboxSync +from opensandbox.config import ConnectionConfigSync + + +def main() -> None: + cfg = ConnectionConfigSync( + domain=os.getenv("SANDBOX_DOMAIN", "localhost:8080"), + api_key=os.getenv("SANDBOX_API_KEY") or None, + request_timeout=timedelta(minutes=3), + use_server_proxy=True, + ) + + # Note: do NOT set entrypoint or env for Windows pool sandboxes. + # The pool template already configures the Windows guest (VERSION, + # CPU_CORES, etc.). Setting entrypoint or env would inject a + # taskTemplate that overrides the pool's pod spec, preventing + # dockur/windows from booting correctly. 
+ sbx = SandboxSync.create( + image="dockurr/windows:latest", + timeout=timedelta(hours=1), + extensions={"poolRef": "pool-win-example"}, # Pool declared in pool-win-example.yaml + connection_config=cfg, + ) + + try: + print(f"Created: {sbx.id}") + print(f"execd: {sbx.get_endpoint(44772).endpoint}") + print(f"RDP: {sbx.get_endpoint(3389).endpoint}") + print(f"Web: {sbx.get_endpoint(8006).endpoint}") + + result = sbx.commands.run("cmd /c echo Hello from Windows sandbox") + print(f"Command output: {result.logs.stdout[0].text}") + finally: + sbx.kill() + sbx.close() + + +if __name__ == "__main__": + main() diff --git a/examples/windows/pool-win-example.yaml b/examples/windows/pool-win-example.yaml new file mode 100644 index 000000000..bc9345107 --- /dev/null +++ b/examples/windows/pool-win-example.yaml @@ -0,0 +1,92 @@ +apiVersion: sandbox.opensandbox.io/v1alpha1 +kind: Pool +metadata: + labels: + app.kubernetes.io/name: sandbox-k8s + app.kubernetes.io/managed-by: kustomize + name: pool-win-example + namespace: opensandbox +spec: + template: + metadata: + labels: + app: example + spec: + containers: + - env: + - name: VERSION + value: "11" + - name: CPU_CORES + value: "8" + - name: RAM_SIZE + value: 16G + - name: DISK_SIZE + value: 64G + - name: USER_PORTS + value: 44772,8080,3389,8006 + image: dockurr/windows:latest + imagePullPolicy: IfNotPresent + name: sandbox + resources: + limits: + cpu: "8" + memory: 18Gi + requests: + cpu: "8" + memory: 18Gi + securityContext: + capabilities: + add: + - NET_ADMIN + - NET_RAW + privileged: true + volumeMounts: + - mountPath: /opt/opensandbox/bin + name: opensandbox-bin + - mountPath: /oem + name: opensandbox-win-oem + - mountPath: /dev/kvm + name: opensandbox-win-kvm + - mountPath: /dev/net/tun + name: opensandbox-win-tun + - mountPath: /storage + name: opensandbox-win-storage + initContainers: + - args: + - cp ./install.bat /oem/install.bat && cp ./execd.exe /oem/execd.exe && chmod + 0644 /oem/install.bat /oem/execd.exe + command: + - /bin/sh + - -c + image: 
sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15 + name: execd-installer + volumeMounts: + - mountPath: /opt/opensandbox/bin + name: opensandbox-bin + - mountPath: /oem + name: opensandbox-win-oem + nodeSelector: + kubernetes.io/arch: amd64 + restartPolicy: Always + tolerations: + - operator: Exists + volumes: + - emptyDir: {} + name: opensandbox-bin + - emptyDir: {} + name: opensandbox-win-oem + - hostPath: + path: /dev/kvm + type: CharDevice + name: opensandbox-win-kvm + - hostPath: + path: /dev/net/tun + type: CharDevice + name: opensandbox-win-tun + - emptyDir: {} + name: opensandbox-win-storage + capacitySpec: + bufferMax: 3 + bufferMin: 1 + poolMax: 5 + poolMin: 0 \ No newline at end of file diff --git a/server/opensandbox_server/services/k8s/batchsandbox_provider.py b/server/opensandbox_server/services/k8s/batchsandbox_provider.py index fba4f6d1e..d9efea549 100644 --- a/server/opensandbox_server/services/k8s/batchsandbox_provider.py +++ b/server/opensandbox_server/services/k8s/batchsandbox_provider.py @@ -40,6 +40,7 @@ apply_egress_to_spec, ) from opensandbox_server.services.k8s.provider_common import ( + DEFAULT_ENTRYPOINT, _build_execd_init_container, _build_main_container, _container_to_dict, @@ -317,7 +318,7 @@ def _apply_platform_node_selector( template_spec=template_spec if isinstance(template_spec, dict) else {}, platform=platform, ) - + def _create_workload_from_pool( self, batchsandbox_name: str, @@ -333,8 +334,10 @@ def _create_workload_from_pool( spec: Dict[str, Any] = { "replicas": 1, "poolRef": pool_ref, - "taskTemplate": self._build_task_template(entrypoint, env), } + needs_task_template = env or entrypoint != DEFAULT_ENTRYPOINT + if needs_task_template: + spec["taskTemplate"] = self._build_task_template(entrypoint, env) if expires_at is not None: spec["expireTime"] = expires_at.isoformat() runtime_manifest = { diff --git a/server/opensandbox_server/services/k8s/provider_common.py 
b/server/opensandbox_server/services/k8s/provider_common.py index 38d5bddf8..645056a57 100644 --- a/server/opensandbox_server/services/k8s/provider_common.py +++ b/server/opensandbox_server/services/k8s/provider_common.py @@ -36,6 +36,9 @@ serialize_security_context_to_dict, ) +# Default entrypoint auto-filled by the SDK when user does not provide one. +DEFAULT_ENTRYPOINT = ["tail", "-f", "/dev/null"] + _GPU_RESOURCE_LIMIT_KEY = "gpu" # Canonical extended-resource name advertised by the NVIDIA device plugin. # Hardcoded for parity with the Docker runtime fix (#775), which targets diff --git a/server/opensandbox_server/services/k8s/windows_profile.py b/server/opensandbox_server/services/k8s/windows_profile.py index ccd65cac2..e7b6e1bc9 100644 --- a/server/opensandbox_server/services/k8s/windows_profile.py +++ b/server/opensandbox_server/services/k8s/windows_profile.py @@ -14,9 +14,12 @@ from __future__ import annotations +import math +import re from typing import Any, Dict, List, Optional from opensandbox_server.api.schema import PlatformSpec +from opensandbox_server.services.k8s.provider_common import DEFAULT_ENTRYPOINT from opensandbox_server.services.windows_common import ( inject_windows_resource_limits_env, inject_windows_user_ports, @@ -26,7 +29,11 @@ WINDOWS_OEM_VOLUME_NAME = "opensandbox-win-oem" WINDOWS_KVM_VOLUME_NAME = "opensandbox-win-kvm" WINDOWS_TUN_VOLUME_NAME = "opensandbox-win-tun" +WINDOWS_STORAGE_VOLUME_NAME = "opensandbox-win-storage" WINDOWS_PROFILE_DEFAULT_USER_PORTS = ["44772", "8080", "3389/tcp", "3389/udp", "8006/tcp"] +# Extra memory overhead (in Gi) reserved for QEMU process on top of guest RAM. 
+WINDOWS_QEMU_MEMORY_OVERHEAD_GI = 2 +_SIZE_PATTERN = re.compile(r"^\s*(\d+)\s*([a-zA-Z]*)\s*$") def is_windows_profile(platform: Optional[PlatformSpec]) -> bool: @@ -90,10 +97,36 @@ def apply_windows_profile_overrides( ) main_container = containers[0] - main_container["command"] = list(entrypoint) + # Entrypoint handling for Windows profile: + # - If user provides a custom entrypoint, use it as container command + # (e.g. for ENI network hack or other custom startup logic). + # - If no entrypoint or the SDK default, remove command to use image + # ENTRYPOINT (dockur/windows starts QEMU via /run/entry.sh). + if entrypoint and entrypoint != DEFAULT_ENTRYPOINT: + main_container["command"] = entrypoint + else: + main_container.pop("command", None) + main_container.pop("args", None) main_container["env"] = windows_env if windows_env else None - main_container.pop("resources", None) + # Set pod resources from resource_limits for proper K8s scheduling. + # Memory includes overhead for the QEMU process itself. 
+ if resource_limits: + limits: Dict[str, str] = {} + if resource_limits.get("cpu"): + limits["cpu"] = resource_limits["cpu"] + if resource_limits.get("memory"): + limits["memory"] = _memory_with_qemu_overhead(resource_limits["memory"]) + if limits: + main_container["resources"] = { + "limits": limits, + "requests": dict(limits), + } + else: + main_container.pop("resources", None) + else: + main_container.pop("resources", None) security_context = main_container.setdefault("securityContext", {}) + security_context["privileged"] = True capabilities = security_context.setdefault("capabilities", {}) drop = capabilities.get("drop") if isinstance(drop, list): @@ -110,6 +143,7 @@ def apply_windows_profile_overrides( {"name": WINDOWS_OEM_VOLUME_NAME, "mountPath": "/oem"}, {"name": WINDOWS_KVM_VOLUME_NAME, "mountPath": "/dev/kvm"}, {"name": WINDOWS_TUN_VOLUME_NAME, "mountPath": "/dev/net/tun"}, + {"name": WINDOWS_STORAGE_VOLUME_NAME, "mountPath": "/storage"}, ], ) @@ -125,9 +159,14 @@ def apply_windows_profile_overrides( "name": WINDOWS_TUN_VOLUME_NAME, "hostPath": {"path": "/dev/net/tun", "type": "CharDevice"}, }, + {"name": WINDOWS_STORAGE_VOLUME_NAME, "emptyDir": {}}, ], ) + # dockur/windows relies on container restart to complete multi-phase + # installation (first boot installs from ISO, second boot runs from disk). + pod_spec["restartPolicy"] = "Always" + def apply_windows_profile_arch_selector( pod_spec: Dict[str, Any], @@ -183,6 +222,29 @@ def _merge_volume_mounts(container: Dict[str, Any], mounts_to_add: List[Dict[str existing_names.add(name) +def _memory_with_qemu_overhead(memory_value: str) -> str: + """Add QEMU process overhead to guest memory for K8s pod resource limits. + + Parses the guest RAM value (e.g. '8G', '16Gi') and adds + WINDOWS_QEMU_MEMORY_OVERHEAD_GI. Returns a Gi-suffixed string suitable + for Kubernetes resource quantities. 
+ """ + match = _SIZE_PATTERN.match(memory_value) + if not match: + return memory_value + amount = int(match.group(1)) + unit = (match.group(2) or "").lower() + if unit in {"g", "gi", "gb"}: + total_gi = amount + WINDOWS_QEMU_MEMORY_OVERHEAD_GI + elif unit in {"m", "mi", "mb"}: + total_gi = math.ceil(amount / 1024) + WINDOWS_QEMU_MEMORY_OVERHEAD_GI + elif unit in {"t", "ti", "tb"}: + total_gi = amount * 1024 + WINDOWS_QEMU_MEMORY_OVERHEAD_GI + else: + return memory_value + return f"{total_gi}Gi" + + def _merge_volumes(pod_spec: Dict[str, Any], volumes_to_add: List[Dict[str, Any]]) -> None: volumes = pod_spec.setdefault("volumes", []) if not isinstance(volumes, list): diff --git a/server/tests/k8s/test_batchsandbox_provider.py b/server/tests/k8s/test_batchsandbox_provider.py index 452d76ab6..39150fcc1 100644 --- a/server/tests/k8s/test_batchsandbox_provider.py +++ b/server/tests/k8s/test_batchsandbox_provider.py @@ -198,7 +198,9 @@ def test_create_workload_windows_profile_uses_windows_runtime_shape(self, mock_k main_container = pod_spec["containers"][0] assert main_container["command"] == ["cmd", "/c", "echo hello"] - assert "resources" not in main_container + # Resources include QEMU memory overhead (8G + 2Gi overhead = 10Gi) + assert main_container["resources"]["limits"]["cpu"] == "4" + assert main_container["resources"]["limits"]["memory"] == "10Gi" env_dict = {item["name"]: item["value"] for item in main_container.get("env", [])} assert env_dict["VERSION"] == "11" @@ -212,6 +214,33 @@ def test_create_workload_windows_profile_uses_windows_runtime_shape(self, mock_k assert "opensandbox-win-kvm" in volume_names assert "opensandbox-win-tun" in volume_names + def test_create_workload_windows_profile_default_entrypoint_uses_image_entrypoint(self, mock_k8s_client): + """When entrypoint is the SDK default, command is removed so image ENTRYPOINT runs.""" + provider = BatchSandboxProvider(mock_k8s_client) + mock_k8s_client.create_custom_object.return_value = { + 
"metadata": {"name": "test-id", "uid": "test-uid"} + } + + provider.create_workload( + sandbox_id="test-id", + namespace="test-ns", + image_spec=ImageSpec(uri="dockurr/windows:latest"), + entrypoint=["tail", "-f", "/dev/null"], + env={"VERSION": "11"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + labels={"opensandbox.io/id": "test-id"}, + expires_at=None, + execd_image="execd:latest", + platform=PlatformSpec(os="windows", arch="amd64"), + ) + + body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] + pod_spec = body["spec"]["template"]["spec"] + main_container = pod_spec["containers"][0] + # No command set - image default ENTRYPOINT will be used + assert "command" not in main_container + assert "args" not in main_container + def test_create_workload_windows_profile_merges_user_ports(self, mock_k8s_client): provider = BatchSandboxProvider(mock_k8s_client) mock_k8s_client.create_custom_object.return_value = { @@ -1514,6 +1543,57 @@ def test_create_workload_poolref_builds_correct_manifest(self, mock_k8s_client): # Verify no template field (pool-based doesn't use template) assert "template" not in body["spec"] + def test_create_workload_poolref_default_entrypoint_no_env_omits_task_template(self, mock_k8s_client): + """When entrypoint is SDK default and env is empty, taskTemplate is omitted.""" + provider = BatchSandboxProvider(mock_k8s_client) + mock_k8s_client.create_custom_object.return_value = { + "metadata": {"name": "test-id", "uid": "test-uid"} + } + + provider.create_workload( + sandbox_id="test-id", + namespace="test-ns", + image_spec=ImageSpec(uri="dockurr/windows:latest"), + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={}, + labels={}, + expires_at=None, + execd_image="execd:latest", + extensions={"poolRef": "my-pool"}, + ) + + body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] + assert body["spec"]["poolRef"] == "my-pool" + assert "taskTemplate" not in body["spec"] + + def 
test_create_workload_poolref_default_entrypoint_with_env_includes_task_template(self, mock_k8s_client): + """When entrypoint is SDK default but env is non-empty, taskTemplate is generated.""" + provider = BatchSandboxProvider(mock_k8s_client) + mock_k8s_client.create_custom_object.return_value = { + "metadata": {"name": "test-id", "uid": "test-uid"} + } + + provider.create_workload( + sandbox_id="test-id", + namespace="test-ns", + image_spec=ImageSpec(uri="dockurr/windows:latest"), + entrypoint=["tail", "-f", "/dev/null"], + env={"VERSION": "11"}, + resource_limits={}, + labels={}, + expires_at=None, + execd_image="execd:latest", + extensions={"poolRef": "my-pool"}, + ) + + body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] + assert body["spec"]["poolRef"] == "my-pool" + assert "taskTemplate" in body["spec"] + task_template = body["spec"]["taskTemplate"] + assert task_template["spec"]["process"]["env"] == [{"name": "VERSION", "value": "11"}] + + class TestBatchSandboxProviderEgress: """BatchSandboxProvider egress sidecar tests""" diff --git a/server/tests/k8s/test_k8s_windows_profile.py b/server/tests/k8s/test_k8s_windows_profile.py new file mode 100644 index 000000000..c16ee108f --- /dev/null +++ b/server/tests/k8s/test_k8s_windows_profile.py @@ -0,0 +1,249 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Unit tests for K8s windows_profile module.""" + +import pytest + +from opensandbox_server.services.k8s.windows_profile import ( + _memory_with_qemu_overhead, + apply_windows_profile_overrides, + build_windows_profile_env, +) + + +class TestMemoryWithQemuOverhead: + """Tests for _memory_with_qemu_overhead helper.""" + + @pytest.mark.parametrize( + ("input_value", "expected"), + [ + ("8G", "10Gi"), + ("16G", "18Gi"), + ("4Gi", "6Gi"), + ("8Gb", "10Gi"), + ], + ) + def test_gigabyte_units(self, input_value, expected): + assert _memory_with_qemu_overhead(input_value) == expected + + @pytest.mark.parametrize( + ("input_value", "expected"), + [ + ("8192M", "10Gi"), # 8192/1024 = 8, + 2 = 10 + ("8192Mi", "10Gi"), + ("4096Mb", "6Gi"), # 4096/1024 = 4, + 2 = 6 + ("1000Mi", "3Gi"), # ceil(1000/1024) = 1, + 2 = 3 + ], + ) + def test_megabyte_units(self, input_value, expected): + assert _memory_with_qemu_overhead(input_value) == expected + + @pytest.mark.parametrize( + ("input_value", "expected"), + [ + ("1T", "1026Gi"), # 1*1024 + 2 = 1026 + ("1Ti", "1026Gi"), + ], + ) + def test_terabyte_units(self, input_value, expected): + assert _memory_with_qemu_overhead(input_value) == expected + + def test_unrecognized_unit_returns_original(self): + assert _memory_with_qemu_overhead("8K") == "8K" + assert _memory_with_qemu_overhead("8Ki") == "8Ki" + + def test_unparseable_value_returns_original(self): + assert _memory_with_qemu_overhead("invalid") == "invalid" + assert _memory_with_qemu_overhead("") == "" + + def test_whitespace_tolerance(self): + assert _memory_with_qemu_overhead(" 8 G ") == "10Gi" + + +class TestBuildWindowsProfileEnv: + """Tests for build_windows_profile_env.""" + + def test_does_not_inject_kvm_n_by_default(self): + result = build_windows_profile_env( + env={"VERSION": "11"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + env_dict = {item["name"]: item["value"] for item in result} + assert "KVM" not in env_dict + + def 
test_preserves_user_kvm_override(self): + result = build_windows_profile_env( + env={"VERSION": "11", "KVM": "N"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + env_dict = {item["name"]: item["value"] for item in result} + assert env_dict["KVM"] == "N" + + def test_includes_user_env_and_resource_derived_env(self): + result = build_windows_profile_env( + env={"VERSION": "11", "LANGUAGE": "Chinese"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + env_dict = {item["name"]: item["value"] for item in result} + assert env_dict["VERSION"] == "11" + assert env_dict["LANGUAGE"] == "Chinese" + assert env_dict["CPU_CORES"] == "4" + assert env_dict["RAM_SIZE"] == "8G" + assert env_dict["DISK_SIZE"] == "64G" + + +class TestApplyWindowsProfileOverrides: + """Tests for apply_windows_profile_overrides entrypoint and resource handling.""" + + def _make_pod_spec(self): + return { + "initContainers": [ + { + "name": "execd-installer", + "image": "execd:test", + "command": ["/bin/sh", "-c"], + "args": ["cp ./execd /opt/opensandbox/bin/execd"], + "volumeMounts": [ + {"name": "opensandbox-bin", "mountPath": "/opt/opensandbox/bin"} + ], + } + ], + "containers": [ + { + "name": "sandbox", + "image": "dockurr/windows:latest", + "command": ["/opt/opensandbox/bin/bootstrap.sh", "tail", "-f", "/dev/null"], + "env": [{"name": "EXECD", "value": "/opt/opensandbox/bin/execd"}], + "volumeMounts": [ + {"name": "opensandbox-bin", "mountPath": "/opt/opensandbox/bin"} + ], + } + ], + "volumes": [{"name": "opensandbox-bin", "emptyDir": {}}], + } + + def test_custom_entrypoint_sets_command(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["/bin/sh", "-c", "patch && exec /run/entry.sh"], + env={"VERSION": "11"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + main = pod_spec["containers"][0] + assert main["command"] == ["/bin/sh", "-c", "patch && exec /run/entry.sh"] + assert 
"args" not in main + + def test_default_entrypoint_removes_command(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={"VERSION": "11"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + main = pod_spec["containers"][0] + assert "command" not in main + assert "args" not in main + + def test_empty_entrypoint_removes_command(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=[], + env={"VERSION": "11"}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + main = pod_spec["containers"][0] + assert "command" not in main + + def test_resource_limits_sets_resources_with_overhead(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={"cpu": "4", "memory": "8G", "disk": "64G"}, + ) + main = pod_spec["containers"][0] + assert main["resources"]["limits"]["cpu"] == "4" + assert main["resources"]["limits"]["memory"] == "10Gi" + assert main["resources"]["requests"]["cpu"] == "4" + assert main["resources"]["requests"]["memory"] == "10Gi" + + def test_empty_resource_limits_removes_resources(self): + pod_spec = self._make_pod_spec() + pod_spec["containers"][0]["resources"] = {"limits": {"cpu": "1"}} + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={}, + ) + main = pod_spec["containers"][0] + assert "resources" not in main + + def test_resource_limits_with_only_disk_removes_resources(self): + """disk is not a K8s resource, so if only disk is present, no limits are set.""" + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={"disk": "64G"}, + ) + main = pod_spec["containers"][0] + assert "resources" 
not in main + + def test_sets_privileged_true(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={"cpu": "4", "memory": "8G"}, + ) + main = pod_spec["containers"][0] + assert main["securityContext"]["privileged"] is True + + def test_sets_restart_policy_always(self): + pod_spec = self._make_pod_spec() + pod_spec["restartPolicy"] = "Never" + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={"cpu": "4", "memory": "8G"}, + ) + assert pod_spec["restartPolicy"] == "Always" + + def test_adds_storage_volume_and_mount(self): + pod_spec = self._make_pod_spec() + apply_windows_profile_overrides( + pod_spec=pod_spec, + entrypoint=["tail", "-f", "/dev/null"], + env={}, + resource_limits={"cpu": "4", "memory": "8G"}, + ) + volume_names = [v["name"] for v in pod_spec["volumes"]] + assert "opensandbox-win-storage" in volume_names + storage_vol = next(v for v in pod_spec["volumes"] if v["name"] == "opensandbox-win-storage") + assert storage_vol == {"name": "opensandbox-win-storage", "emptyDir": {}} + + main = pod_spec["containers"][0] + mount_names = [m["name"] for m in main["volumeMounts"]] + assert "opensandbox-win-storage" in mount_names + storage_mount = next(m for m in main["volumeMounts"] if m["name"] == "opensandbox-win-storage") + assert storage_mount["mountPath"] == "/storage" From 5446bb42ec490cb1c551a7a7e58b9f677c989e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Thu, 14 May 2026 10:39:13 +0800 Subject: [PATCH 05/58] fix(execd): use merged CA bundle for REQUESTS_CA_BUNDLE and SSL_CERT_FILE --- components/execd/bootstrap.sh | 73 +++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 17 deletions(-) diff --git a/components/execd/bootstrap.sh b/components/execd/bootstrap.sh index 9a6ec23c9..2c809e129 100755 --- a/components/execd/bootstrap.sh +++ 
b/components/execd/bootstrap.sh @@ -42,27 +42,56 @@ _sudo() { fi } -# Install mitm egress CA into the system trust store (no extra env vars). -# - Debian/Ubuntu/Alpine: update-ca-certificates + /usr/local/share/ca-certificates/ -# - RHEL/CentOS/Fedora/Alma/Rocky: update-ca-trust + /etc/pki/ca-trust/source/anchors/ +# Install mitm CA into the system trust store and set OPENSANDBOX_MERGED_CA +# to a PEM bundle containing system roots + mitm CA (for env vars like +# REQUESTS_CA_BUNDLE that replace rather than append to the default roots). +OPENSANDBOX_MERGED_CA="" trust_mitm_ca() { cert="$1" + merged="/opt/opensandbox/merged-ca-certificates.pem" + installed=false + if command -v update-ca-certificates >/dev/null 2>&1; then - _sudo mkdir -p /usr/local/share/ca-certificates - _sudo cp "$cert" /usr/local/share/ca-certificates/opensandbox-mitmproxy-ca.crt - _sudo update-ca-certificates - return 0 - fi - if command -v update-ca-trust >/dev/null 2>&1; then - _sudo mkdir -p /etc/pki/ca-trust/source/anchors - _sudo cp "$cert" /etc/pki/ca-trust/source/anchors/opensandbox-mitmproxy-ca.pem - if ! 
_sudo update-ca-trust extract; then - _sudo update-ca-trust + if _sudo mkdir -p /usr/local/share/ca-certificates \ + && _sudo cp "$cert" /usr/local/share/ca-certificates/opensandbox-mitmproxy-ca.crt \ + && _sudo update-ca-certificates; then + installed=true + if [ -f /etc/ssl/certs/ca-certificates.crt ] && [ -s /etc/ssl/certs/ca-certificates.crt ]; then + OPENSANDBOX_MERGED_CA="/etc/ssl/certs/ca-certificates.crt" + return 0 + fi + fi + elif command -v update-ca-trust >/dev/null 2>&1; then + if _sudo mkdir -p /etc/pki/ca-trust/source/anchors \ + && _sudo cp "$cert" /etc/pki/ca-trust/source/anchors/opensandbox-mitmproxy-ca.pem \ + && { _sudo update-ca-trust extract || _sudo update-ca-trust; }; then + installed=true + if [ -f /etc/pki/tls/certs/ca-bundle.crt ] && [ -s /etc/pki/tls/certs/ca-bundle.crt ]; then + OPENSANDBOX_MERGED_CA="/etc/pki/tls/certs/ca-bundle.crt" + return 0 + fi fi - return 0 fi - echo "warning: cannot install mitm CA (need update-ca-certificates or update-ca-trust)" >&2 + # System trust-store update unavailable or failed — build merged bundle manually. 
+ if [ "$installed" = false ]; then + echo "warning: cannot install mitm CA into system trust store; building merged bundle manually" >&2 + else + echo "warning: system trust-store updated but consolidated bundle not found; building merged bundle manually" >&2 + fi + for candidate in \ + /etc/ssl/certs/ca-certificates.crt \ + /etc/pki/tls/certs/ca-bundle.crt \ + /etc/ssl/cert.pem \ + /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem; do + if [ -f "$candidate" ] && [ -s "$candidate" ]; then + cat "$candidate" "$cert" > "$merged" + OPENSANDBOX_MERGED_CA="$merged" + return 0 + fi + done + + echo "warning: could not locate system CA bundle to merge with mitm CA" >&2 return 0 } @@ -117,8 +146,18 @@ if is_truthy "${OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT:-}"; then if [ -f "$MITM_CA" ] && [ -s "$MITM_CA" ]; then trust_mitm_ca_nss "$MITM_CA" || true - export NODE_EXTRA_CA_CERTS="$MITM_CA" - export REQUESTS_CA_BUNDLE="$MITM_CA" + export NODE_EXTRA_CA_CERTS="$MITM_CA" # additive — Node appends to built-in roots + + # REQUESTS_CA_BUNDLE and SSL_CERT_FILE replace the default bundle, + # so use merged roots (system CA + mitm CA). 
+ if [ -n "$OPENSANDBOX_MERGED_CA" ] && [ -f "$OPENSANDBOX_MERGED_CA" ]; then + export REQUESTS_CA_BUNDLE="$OPENSANDBOX_MERGED_CA" + export SSL_CERT_FILE="$OPENSANDBOX_MERGED_CA" + else + echo "warning: merged CA bundle not available; REQUESTS_CA_BUNDLE/SSL_CERT_FILE will only contain the mitm CA" >&2 + export REQUESTS_CA_BUNDLE="$MITM_CA" + export SSL_CERT_FILE="$MITM_CA" + fi fi fi From 16a4ef7988105c7af7158e3cb875079a158256b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Thu, 14 May 2026 12:36:10 +0800 Subject: [PATCH 06/58] fix(execd): use merged CA bundle for REQUESTS_CA_BUNDLE and SSL_CERT_FILE --- components/execd/bootstrap.sh | 54 +++++++++++++++++------------------ 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/components/execd/bootstrap.sh b/components/execd/bootstrap.sh index 2c809e129..b39076cfa 100755 --- a/components/execd/bootstrap.sh +++ b/components/execd/bootstrap.sh @@ -42,56 +42,54 @@ _sudo() { fi } -# Install mitm CA into the system trust store and set OPENSANDBOX_MERGED_CA -# to a PEM bundle containing system roots + mitm CA (for env vars like -# REQUESTS_CA_BUNDLE that replace rather than append to the default roots). +# Install mitm CA into the system trust store (for non-Python programs) +# and set OPENSANDBOX_MERGED_CA to a PEM bundle containing a full root +# set + mitm CA (for env vars like REQUESTS_CA_BUNDLE that *replace* +# rather than append to the default roots). OPENSANDBOX_MERGED_CA="" trust_mitm_ca() { cert="$1" merged="/opt/opensandbox/merged-ca-certificates.pem" - installed=false + # 1) Try to install into the system trust store (best-effort). 
if command -v update-ca-certificates >/dev/null 2>&1; then - if _sudo mkdir -p /usr/local/share/ca-certificates \ + _sudo mkdir -p /usr/local/share/ca-certificates \ && _sudo cp "$cert" /usr/local/share/ca-certificates/opensandbox-mitmproxy-ca.crt \ - && _sudo update-ca-certificates; then - installed=true - if [ -f /etc/ssl/certs/ca-certificates.crt ] && [ -s /etc/ssl/certs/ca-certificates.crt ]; then - OPENSANDBOX_MERGED_CA="/etc/ssl/certs/ca-certificates.crt" - return 0 - fi - fi + && _sudo update-ca-certificates \ + || echo "warning: update-ca-certificates failed; system trust store may not include mitm CA" >&2 elif command -v update-ca-trust >/dev/null 2>&1; then - if _sudo mkdir -p /etc/pki/ca-trust/source/anchors \ + _sudo mkdir -p /etc/pki/ca-trust/source/anchors \ && _sudo cp "$cert" /etc/pki/ca-trust/source/anchors/opensandbox-mitmproxy-ca.pem \ - && { _sudo update-ca-trust extract || _sudo update-ca-trust; }; then - installed=true - if [ -f /etc/pki/tls/certs/ca-bundle.crt ] && [ -s /etc/pki/tls/certs/ca-bundle.crt ]; then - OPENSANDBOX_MERGED_CA="/etc/pki/tls/certs/ca-bundle.crt" - return 0 - fi - fi + && { _sudo update-ca-trust extract || _sudo update-ca-trust; } \ + || echo "warning: update-ca-trust failed; system trust store may not include mitm CA" >&2 + else + echo "warning: no system trust-store tooling found (need update-ca-certificates or update-ca-trust)" >&2 fi - # System trust-store update unavailable or failed — build merged bundle manually. - if [ "$installed" = false ]; then - echo "warning: cannot install mitm CA into system trust store; building merged bundle manually" >&2 - else - echo "warning: system trust-store updated but consolidated bundle not found; building merged bundle manually" >&2 + # 2) Build a merged bundle (complete root set + mitm CA). + # Prefer certifi (full Mozilla root set) over system bundles which + # may be incomplete in minimal Docker images. 
+ certifi_ca="" + if command -v python3 >/dev/null 2>&1; then + certifi_ca="$(python3 -c 'import certifi; print(certifi.where())' 2>/dev/null)" || certifi_ca="" + elif command -v python >/dev/null 2>&1; then + certifi_ca="$(python -c 'import certifi; print(certifi.where())' 2>/dev/null)" || certifi_ca="" fi + for candidate in \ + "$certifi_ca" \ /etc/ssl/certs/ca-certificates.crt \ /etc/pki/tls/certs/ca-bundle.crt \ /etc/ssl/cert.pem \ /etc/pki/ca-trust/extracted/pem/tls-ca-bundle.pem; do - if [ -f "$candidate" ] && [ -s "$candidate" ]; then + if [ -n "$candidate" ] && [ -f "$candidate" ] && [ -s "$candidate" ]; then cat "$candidate" "$cert" > "$merged" OPENSANDBOX_MERGED_CA="$merged" return 0 fi done - echo "warning: could not locate system CA bundle to merge with mitm CA" >&2 + echo "warning: could not locate any CA bundle to merge with mitm CA" >&2 return 0 } @@ -149,7 +147,7 @@ if is_truthy "${OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT:-}"; then export NODE_EXTRA_CA_CERTS="$MITM_CA" # additive — Node appends to built-in roots # REQUESTS_CA_BUNDLE and SSL_CERT_FILE replace the default bundle, - # so use merged roots (system CA + mitm CA). + # so use merged roots (certifi/system CA + mitm CA). 
if [ -n "$OPENSANDBOX_MERGED_CA" ] && [ -f "$OPENSANDBOX_MERGED_CA" ]; then export REQUESTS_CA_BUNDLE="$OPENSANDBOX_MERGED_CA" export SSL_CERT_FILE="$OPENSANDBOX_MERGED_CA" From 976536503855331f546574e57028d96305b3c6f3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 14 May 2026 05:48:05 +0000 Subject: [PATCH 07/58] chore: bump execd to v1.0.16 --- examples/agent-sandbox/README.md | 2 +- examples/code-interpreter/README.md | 2 +- examples/windows/pool-win-example.yaml | 2 +- kubernetes/charts/opensandbox-server/values.yaml | 2 +- kubernetes/config/samples/sandbox_v1alpha1_pool.yaml | 2 +- .../config/samples/sandbox_v1alpha1_pool_restart.yaml | 2 +- oseps/0004-secure-container-runtime.md | 6 +++--- oseps/0007-fast-sandbox-runtime-support.md | 2 +- server/DEVELOPMENT.md | 2 +- server/docker-compose.example.yaml | 4 ++-- .../examples/example.config.k8s.toml | 2 +- .../examples/example.config.k8s.zh.toml | 2 +- server/opensandbox_server/examples/example.config.toml | 2 +- .../opensandbox_server/examples/example.config.zh.toml | 2 +- server/tests/test_docker_service.py | 10 +++++----- 15 files changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/agent-sandbox/README.md b/examples/agent-sandbox/README.md index 8b6eb0d64..4797cbf22 100644 --- a/examples/agent-sandbox/README.md +++ b/examples/agent-sandbox/README.md @@ -23,7 +23,7 @@ opensandbox-server init-config ~/.sandbox.toml --example docker ```toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [kubernetes] namespace = "default" diff --git a/examples/code-interpreter/README.md b/examples/code-interpreter/README.md index 7e2fed488..0562d2f6c 100644 --- a/examples/code-interpreter/README.md +++ b/examples/code-interpreter/README.md @@ -104,7 +104,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15 + 
image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 command: [ "/bin/sh", "-c" ] args: - | diff --git a/examples/windows/pool-win-example.yaml b/examples/windows/pool-win-example.yaml index bc9345107..511e84b0b 100644 --- a/examples/windows/pool-win-example.yaml +++ b/examples/windows/pool-win-example.yaml @@ -58,7 +58,7 @@ spec: command: - /bin/sh - -c - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index 58a90bc4f..8f5d74064 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -83,7 +83,7 @@ configToml: | [runtime] type = "kubernetes" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" [kubernetes] kubeconfig_path = "" diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml index b4d8ebb7a..8e2985f37 100644 --- a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml +++ b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml @@ -31,7 +31,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 command: [ "/bin/sh", "-c" ] args: - | diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml index dc303b84c..07c09c298 100644 --- a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml +++ 
b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml @@ -56,7 +56,7 @@ spec: command: - /bin/sh - -c - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/oseps/0004-secure-container-runtime.md b/oseps/0004-secure-container-runtime.md index 9161fb0cd..ce031ac94 100644 --- a/oseps/0004-secure-container-runtime.md +++ b/oseps/0004-secure-container-runtime.md @@ -180,7 +180,7 @@ Extension to `~/.sandbox.toml`. A single `[secure_runtime]` section configures t ```toml [runtime] type = "docker" # or "kubernetes" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" # Secure container runtime configuration. # When enabled, ALL sandboxes on this server use the specified runtime. @@ -210,7 +210,7 @@ Example 1 — gVisor on Docker: # ~/.sandbox.toml [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [secure_runtime] type = "gvisor" @@ -224,7 +224,7 @@ Example 2 — Kata Containers (QEMU) on Kubernetes: # ~/.sandbox.toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [secure_runtime] type = "kata" diff --git a/oseps/0007-fast-sandbox-runtime-support.md b/oseps/0007-fast-sandbox-runtime-support.md index 10db7327e..451ce7f5e 100644 --- a/oseps/0007-fast-sandbox-runtime-support.md +++ b/oseps/0007-fast-sandbox-runtime-support.md @@ -611,7 +611,7 @@ api_key = "your-secret-key" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [kubernetes] namespace = "default" diff --git a/server/DEVELOPMENT.md b/server/DEVELOPMENT.md index cbc9ec891..be1ca5557 100644 --- a/server/DEVELOPMENT.md +++ b/server/DEVELOPMENT.md @@ -61,7 +61,7 @@ This guide provides comprehensive 
information for developers working on OpenSand [runtime] type = "docker" - execd_image = "opensandbox/execd:v1.0.15" + execd_image = "opensandbox/execd:v1.0.16" [docker] network_mode = "host" diff --git a/server/docker-compose.example.yaml b/server/docker-compose.example.yaml index 5b7c792d0..500571521 100644 --- a/server/docker-compose.example.yaml +++ b/server/docker-compose.example.yaml @@ -10,8 +10,8 @@ configs: [runtime] type = "docker" - # execd_image = "opensandbox/execd:v1.0.15" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15" + # execd_image = "opensandbox/execd:v1.0.16" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" [egress] image = "opensandbox/egress:v1.0.11" diff --git a/server/opensandbox_server/examples/example.config.k8s.toml b/server/opensandbox_server/examples/example.config.k8s.toml index 5329c7381..230e9dfb9 100644 --- a/server/opensandbox_server/examples/example.config.k8s.toml +++ b/server/opensandbox_server/examples/example.config.k8s.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.k8s.zh.toml b/server/opensandbox_server/examples/example.config.k8s.zh.toml index afad730ab..034927741 100644 --- a/server/opensandbox_server/examples/example.config.k8s.zh.toml +++ b/server/opensandbox_server/examples/example.config.k8s.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" [storage] # 允许进行 bind mount 的宿主机路径前缀白名单。 diff --git a/server/opensandbox_server/examples/example.config.toml b/server/opensandbox_server/examples/example.config.toml index 251f8d534..ba1d2d06b 100644 --- a/server/opensandbox_server/examples/example.config.toml +++ b/server/opensandbox_server/examples/example.config.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.15" +execd_image = "opensandbox/execd:v1.0.16" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.zh.toml b/server/opensandbox_server/examples/example.config.zh.toml index 3585895ab..d8b6aabf5 100644 --- a/server/opensandbox_server/examples/example.config.zh.toml +++ b/server/opensandbox_server/examples/example.config.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.15" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" [storage] allowed_host_paths = [] diff --git a/server/tests/test_docker_service.py b/server/tests/test_docker_service.py index 5344f3fa5..5ee050957 100644 --- a/server/tests/test_docker_service.py +++ b/server/tests/test_docker_service.py @@ -1433,7 +1433,7 @@ async def test_create_sandbox_windows_profile_injects_runtime_defaults(mock_dock mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.15" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1516,7 +1516,7 @@ async def test_create_sandbox_windows_profile_rejects_missing_runtime_devices(mo mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.15" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1555,7 +1555,7 @@ async def test_create_sandbox_windows_profile_rejects_below_minimum_resource_lim mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.15" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( 
@@ -1592,7 +1592,7 @@ async def test_create_sandbox_windows_profile_accepts_dockur_demo_like_request(m mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.15" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1646,7 +1646,7 @@ async def test_create_sandbox_windows_profile_with_network_policy_maps_windows_p mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.15" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" cfg.docker.network_mode = "bridge" cfg.egress = EgressConfig(image="opensandbox/egress:latest") service = DockerSandboxService(config=cfg) From 5bcb721252f31b7dea4c6b81790cbb995d8a855f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Fri, 15 May 2026 00:00:59 +0800 Subject: [PATCH 08/58] chore(README): osb-cli apikey ops for quick start --- README.md | 1 + cli/README.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 8381e37aa..84f59a3e7 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,7 @@ Quick start: osb config init osb config set connection.domain localhost:8080 osb config set connection.protocol http +osb config set connection.api_key osb sandbox create --image python:3.12 --timeout 30m -o json osb command run -o raw -- python -c "print(1 + 1)" ``` diff --git a/cli/README.md b/cli/README.md index 928d1523a..28647264f 100644 --- a/cli/README.md +++ b/cli/README.md @@ -50,6 +50,7 @@ opensandbox-server osb config init osb config set connection.domain localhost:8080 osb config set connection.protocol http +osb config set connection.api_key osb config show -o json ``` From 7e0a53b492e4b803c91c6b33d4f20a6ebdffeaf8 Mon Sep 17 00:00:00 2001 From: junxin Date: Fri, 15 May 2026 14:30:37 +0800 Subject: [PATCH 09/58] 
chore(chart): bump opensandbox-controller chart version to 0.2.0 --- kubernetes/charts/opensandbox-controller/Chart.yaml | 4 ++-- kubernetes/charts/opensandbox/Chart.yaml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kubernetes/charts/opensandbox-controller/Chart.yaml b/kubernetes/charts/opensandbox-controller/Chart.yaml index cbb32533c..b3711fa4e 100644 --- a/kubernetes/charts/opensandbox-controller/Chart.yaml +++ b/kubernetes/charts/opensandbox-controller/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: opensandbox-controller description: A Kubernetes operator for managing sandbox environments with resource pooling and batch delivery type: application -version: 0.1.0 -appVersion: "0.1.0" +version: 0.2.0 +appVersion: "0.2.0" keywords: - sandbox diff --git a/kubernetes/charts/opensandbox/Chart.yaml b/kubernetes/charts/opensandbox/Chart.yaml index 77842db5b..714d85699 100644 --- a/kubernetes/charts/opensandbox/Chart.yaml +++ b/kubernetes/charts/opensandbox/Chart.yaml @@ -16,8 +16,8 @@ apiVersion: v2 name: opensandbox description: All-in-one Helm chart for deploying OpenSandbox controller and server type: application -version: 0.1.0 -appVersion: "0.1.0" +version: 0.2.0 +appVersion: "0.2.0" keywords: - sandbox @@ -40,7 +40,7 @@ kubeVersion: ">=1.21.1-0" dependencies: - name: opensandbox-controller - version: "0.1.0" + version: "0.2.0" repository: "file://../opensandbox-controller" - name: opensandbox-server version: "0.1.0" From f7943dcc745803e50ecfe09d9cb56d4d026b446f Mon Sep 17 00:00:00 2001 From: junxin Date: Fri, 15 May 2026 14:40:13 +0800 Subject: [PATCH 10/58] chore(chart): regenerate Chart.lock for opensandbox-controller 0.2.0 --- kubernetes/charts/opensandbox/Chart.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kubernetes/charts/opensandbox/Chart.lock b/kubernetes/charts/opensandbox/Chart.lock index c455f5efc..eb1c59e50 100644 --- a/kubernetes/charts/opensandbox/Chart.lock +++ 
b/kubernetes/charts/opensandbox/Chart.lock @@ -1,9 +1,9 @@ dependencies: - name: opensandbox-controller repository: file://../opensandbox-controller - version: 0.1.0 + version: 0.2.0 - name: opensandbox-server repository: file://../opensandbox-server version: 0.1.0 -digest: sha256:c66976fab3f4eea75ec3004c1842079d754387ea430c56cce5514e4f457ee40c -generated: "2026-03-04T17:56:49.467373+08:00" +digest: sha256:b88aa0bfffb5e30aa46163794cb74aa25157ee4b4a437c22867df19633b2a89f +generated: "2026-05-15T14:39:18.571067+08:00" From 0e13e9920d4282d91b478e9fb3c415b0e909842e Mon Sep 17 00:00:00 2001 From: ninan-nn Date: Fri, 15 May 2026 15:40:52 +0800 Subject: [PATCH 11/58] ci: fix publish (#897) --- .github/workflows/publish-js-sdks.yml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/publish-js-sdks.yml b/.github/workflows/publish-js-sdks.yml index d051b0a1a..bfd3173b5 100644 --- a/.github/workflows/publish-js-sdks.yml +++ b/.github/workflows/publish-js-sdks.yml @@ -42,14 +42,13 @@ jobs: - name: Set up pnpm uses: pnpm/action-setup@v4 with: - version: latest - - - name: Enable corepack - run: corepack enable + version: 9.15.0 + run_install: false - name: Get pnpm store path id: pnpm-store - run: echo "STORE_PATH=$(corepack pnpm store path)" >> "$GITHUB_OUTPUT" + working-directory: sdks + run: echo "STORE_PATH=$(pnpm store path)" >> "$GITHUB_OUTPUT" - name: Cache pnpm store uses: actions/cache@v5 @@ -60,11 +59,11 @@ jobs: - name: Install workspace dependencies working-directory: sdks - run: corepack pnpm install --frozen-lockfile + run: pnpm install --frozen-lockfile - name: Build SDK working-directory: sdks - run: corepack pnpm --filter ${{ matrix.sdk.packageName }}... --sort run build + run: pnpm --filter ${{ matrix.sdk.packageName }}... 
--sort run build - name: Pack SDK if: startsWith(github.ref, format('refs/tags/js/{0}/v', matrix.sdk.tagPrefix)) @@ -74,7 +73,7 @@ jobs: set -euo pipefail PACK_DIR="${GITHUB_WORKSPACE}/dist/npm/${{ matrix.sdk.name }}" mkdir -p "$PACK_DIR" - corepack pnpm pack --pack-destination "$PACK_DIR" + pnpm pack --pack-destination "$PACK_DIR" PACKAGE_TARBALL="$(find "$PACK_DIR" -maxdepth 1 -name '*.tgz' -print -quit)" if [[ -z "$PACKAGE_TARBALL" ]]; then echo "No package tarball was produced in $PACK_DIR" >&2 @@ -93,4 +92,4 @@ jobs: env: NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} run: | - corepack pnpm publish "${{ steps.pack.outputs.tarball }}" --access public --no-git-checks + pnpm publish "${{ steps.pack.outputs.tarball }}" --access public --no-git-checks From 5203befe9f2ef6e031f75c30f30a3b38866b429a Mon Sep 17 00:00:00 2001 From: LuoBo <95574295+longsuizhi@users.noreply.github.com> Date: Sun, 17 May 2026 13:51:19 +0800 Subject: [PATCH 12/58] fix(server): make resourceLimits/image/entrypoint optional when poolRef is set (#883) * fix(server): make resourceLimits/image/entrypoint optional when poolRef is set When creating a sandbox from a pre-configured Pool (via extensions.poolRef), the image, entrypoint, and resourceLimits are all defined in the Pool CRD template. Requiring callers to provide dummy values for these fields is unnecessary and error-prone. 
Changes: - Make resource_limits Optional with None default in CreateSandboxRequest - Skip image/snapshotId/entrypoint validation when poolRef is present - Add explicit resourceLimits required check for non-pool requests - Guard against None resource_limits in Docker provider code paths * fix(server): address review feedback for pool mode optional fields - Skip image/entrypoint resolution in K8s service layer when poolRef is set - Reject poolRef on Docker provider (unsupported) - Reject snapshotId when poolRef is set (conflicting fields) - Update specs/sandbox-lifecycle.yml: remove required constraint on resourceLimits, document pool mode behavior - All 1038 tests pass * fix: guard _ensure_image_auth_support against None image, align spec docs - Fix AttributeError when image is None in pool mode (P1) - Clarify in spec that snapshotId is rejected (not optional) with poolRef * test: add pool mode validation tests for poolRef, snapshotId rejection, Docker guard, and image auth - Schema: poolRef-only happy path, poolRef+snapshotId rejection, resourceLimits still required without poolRef, blank poolRef ignored - Docker: rejects poolRef with SANDBOX::UNSUPPORTED_POOL_REF - K8s: pool mode skips image/entrypoint validation, image auth guard handles None image without AttributeError All 1046 tests pass (8 new). * fix: normalize blank snapshotId to None in pool mode When poolRef is set and snapshotId is whitespace-only (e.g. ' '), the validator now clears it to None before returning. This prevents downstream code from treating a truthy whitespace string as a real snapshot ID (e.g. writing an invalid Kubernetes label). Adds test_pool_mode_normalizes_blank_snapshot_id to cover this edge case. 
--------- Co-authored-by: longsuizhi --- server/opensandbox_server/api/schema.py | 22 ++++++- .../services/docker/container_ops.py | 2 +- .../services/docker/docker_service.py | 12 +++- .../services/k8s/kubernetes_service.py | 9 ++- server/tests/k8s/test_kubernetes_service.py | 61 +++++++++++++++++++ server/tests/test_docker_service.py | 23 +++++++ server/tests/test_schema.py | 58 ++++++++++++++++++ specs/sandbox-lifecycle.yml | 15 ++++- 8 files changed, 190 insertions(+), 12 deletions(-) diff --git a/server/opensandbox_server/api/schema.py b/server/opensandbox_server/api/schema.py index a6987c738..f3be288e0 100644 --- a/server/opensandbox_server/api/schema.py +++ b/server/opensandbox_server/api/schema.py @@ -403,10 +403,10 @@ class CreateSandboxRequest(BaseModel): "null timeout when the workload provider does not support non-expiring sandboxes." ), ) - resource_limits: ResourceLimits = Field( - ..., + resource_limits: Optional[ResourceLimits] = Field( + None, alias="resourceLimits", - description="Runtime resource constraints for the sandbox instance", + description="Runtime resource constraints for the sandbox instance. Optional when poolRef is provided.", ) env: Optional[Dict[str, Optional[str]]] = Field( None, @@ -457,6 +457,19 @@ class CreateSandboxRequest(BaseModel): @model_validator(mode="after") def validate_source_and_entrypoint(self) -> "CreateSandboxRequest": + # When poolRef is set, image/snapshotId/entrypoint/resourceLimits are + # all defined in the Pool CRD and not required from the caller. + has_pool_ref = bool((self.extensions or {}).get("poolRef", "").strip()) + if has_pool_ref: + # Reject conflicting fields that would be ignored in pool mode + if bool((self.snapshot_id or "").strip()): + raise ValueError("snapshotId cannot be used together with poolRef.") + # Normalize blank snapshotId so downstream code won't see + # a truthy whitespace string (e.g. " ") as a real value. 
+ if self.snapshot_id is not None and not self.snapshot_id.strip(): + self.snapshot_id = None + return self + has_image = self.image is not None and bool(self.image.uri.strip()) has_snapshot = bool((self.snapshot_id or "").strip()) @@ -472,6 +485,9 @@ def validate_source_and_entrypoint(self) -> "CreateSandboxRequest": if self.snapshot_id is not None and not has_snapshot: self.snapshot_id = None + if self.resource_limits is None: + raise ValueError("resourceLimits is required when poolRef is not provided.") + return self class Config: diff --git a/server/opensandbox_server/services/docker/container_ops.py b/server/opensandbox_server/services/docker/container_ops.py index 88fc4e7bd..d1da88a5b 100644 --- a/server/opensandbox_server/services/docker/container_ops.py +++ b/server/opensandbox_server/services/docker/container_ops.py @@ -329,7 +329,7 @@ def _resolve_image_auth( def _resolve_resource_limits( self, request: CreateSandboxRequest ) -> tuple[Optional[int], Optional[int], Optional[int]]: - resource_limits = request.resource_limits.root or {} + resource_limits = (request.resource_limits.root if request.resource_limits else None) or {} mem_limit = parse_memory_limit(resource_limits.get("memory")) nano_cpus = parse_nano_cpus(resource_limits.get("cpu")) gpu_count = parse_gpu_request(resource_limits.get("gpu")) diff --git a/server/opensandbox_server/services/docker/docker_service.py b/server/opensandbox_server/services/docker/docker_service.py index c02264e5c..6a7c8a05f 100644 --- a/server/opensandbox_server/services/docker/docker_service.py +++ b/server/opensandbox_server/services/docker/docker_service.py @@ -607,6 +607,14 @@ async def create_sandbox(self, request: CreateSandboxRequest) -> CreateSandboxRe Raises: HTTPException: If sandbox creation fails """ + if (request.extensions or {}).get("poolRef", "").strip(): + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "code": "SANDBOX::UNSUPPORTED_POOL_REF", + "message": "poolRef is not 
supported by the Docker provider. Use Kubernetes BatchSandbox provider instead.", + }, + ) request = resolve_sandbox_image_from_request(request) ensure_entrypoint(request.entrypoint or []) ensure_metadata_labels(request.metadata) @@ -761,7 +769,7 @@ def _provision_sandbox( requested_windows_profile = is_windows_platform(request.platform) if requested_windows_profile: - validate_windows_resource_limits(request.resource_limits.root or {}) + validate_windows_resource_limits((request.resource_limits.root if request.resource_limits else None) or {}) validate_windows_runtime_prerequisites() # Prepare OSSFS mounts first so binds can reference mounted host paths. @@ -855,7 +863,7 @@ def _provision_sandbox( ) environment = inject_windows_resource_limits_env( environment, - request.resource_limits.root or {}, + (request.resource_limits.root if request.resource_limits else None) or {}, ) environment = inject_windows_user_ports(environment, exposed_ports) diff --git a/server/opensandbox_server/services/k8s/kubernetes_service.py b/server/opensandbox_server/services/k8s/kubernetes_service.py index 28a9fd361..f5958c644 100644 --- a/server/opensandbox_server/services/k8s/kubernetes_service.py +++ b/server/opensandbox_server/services/k8s/kubernetes_service.py @@ -264,7 +264,7 @@ def _ensure_image_auth_support(self, request: CreateSandboxRequest) -> None: Raises HTTP 400 if the provider does not support per-request image auth. 
""" - if request.image.auth is None: + if request.image is None or request.image.auth is None: return if self.workload_provider.supports_image_auth(): return @@ -404,8 +404,11 @@ async def create_sandbox(self, request: CreateSandboxRequest) -> CreateSandboxRe Raises: HTTPException: If creation fails, timeout, or invalid parameters """ - request = resolve_sandbox_image_from_request(request) - ensure_entrypoint(request.entrypoint or []) + has_pool_ref = bool((request.extensions or {}).get("poolRef", "").strip()) + + if not has_pool_ref: + request = resolve_sandbox_image_from_request(request) + ensure_entrypoint(request.entrypoint or []) ensure_metadata_labels(request.metadata) ensure_platform_valid(request.platform) ensure_timeout_within_limit( diff --git a/server/tests/k8s/test_kubernetes_service.py b/server/tests/k8s/test_kubernetes_service.py index 553b93311..3aa6add8b 100644 --- a/server/tests/k8s/test_kubernetes_service.py +++ b/server/tests/k8s/test_kubernetes_service.py @@ -525,6 +525,67 @@ async def test_create_sandbox_rejects_timeout_above_configured_maximum( assert "configured maximum of 3600s" in exc_info.value.detail["message"] k8s_service.workload_provider.create_workload.assert_not_called() + @pytest.mark.asyncio + async def test_create_sandbox_pool_mode_skips_image_and_entrypoint_validation( + self, k8s_service, mock_workload + ): + """Pool mode: poolRef only, no image/entrypoint/resourceLimits — should succeed.""" + from opensandbox_server.api.schema import CreateSandboxRequest + + pool_request = CreateSandboxRequest( + extensions={"poolRef": "my-pool"}, + ) + + k8s_service.workload_provider.create_workload.return_value = { + "name": "test-sandbox-pool", + "uid": "pool-123", + } + k8s_service.workload_provider.get_workload.return_value = mock_workload + k8s_service.workload_provider.get_status.return_value = { + "state": "Running", + "reason": "", + "message": "Pod is running", + "last_transition_at": datetime.now(timezone.utc), + } + 
k8s_service.workload_provider.get_endpoint_info.return_value = "10.244.0.5:8080" + k8s_service.workload_provider.get_expiration.return_value = datetime.now(timezone.utc) + timedelta(hours=1) + + response = await k8s_service.create_sandbox(pool_request) + + assert response.id is not None + assert response.status.state == "Running" + k8s_service.workload_provider.create_workload.assert_called_once() + + @pytest.mark.asyncio + async def test_create_sandbox_pool_mode_image_auth_guard_no_error( + self, k8s_service, mock_workload + ): + """Pool mode with image=None should not raise AttributeError in _ensure_image_auth_support.""" + from opensandbox_server.api.schema import CreateSandboxRequest + + pool_request = CreateSandboxRequest( + extensions={"poolRef": "my-pool"}, + ) + assert pool_request.image is None + + k8s_service.workload_provider.create_workload.return_value = { + "name": "test-sandbox-pool2", + "uid": "pool-456", + } + k8s_service.workload_provider.get_workload.return_value = mock_workload + k8s_service.workload_provider.get_status.return_value = { + "state": "Running", + "reason": "", + "message": "Pod is running", + "last_transition_at": datetime.now(timezone.utc), + } + k8s_service.workload_provider.get_endpoint_info.return_value = "10.244.0.6:8080" + k8s_service.workload_provider.get_expiration.return_value = datetime.now(timezone.utc) + timedelta(hours=1) + + # Should not raise AttributeError on None.auth + response = await k8s_service.create_sandbox(pool_request) + assert response.id is not None + class TestWaitForSandboxReady: """_wait_for_sandbox_ready method tests""" diff --git a/server/tests/test_docker_service.py b/server/tests/test_docker_service.py index 5ee050957..435e7047b 100644 --- a/server/tests/test_docker_service.py +++ b/server/tests/test_docker_service.py @@ -330,6 +330,29 @@ async def test_create_sandbox_rejects_invalid_metadata(mock_docker): assert exc.value.detail["code"] == SandboxErrorCodes.INVALID_METADATA_LABEL 
mock_client.containers.create.assert_not_called() +@pytest.mark.asyncio +@patch("opensandbox_server.services.docker.docker_service.docker") +async def test_create_sandbox_rejects_pool_ref_on_docker(mock_docker): + mock_client = MagicMock() + mock_client.containers.list.return_value = [] + mock_docker.from_env.return_value = mock_client + + service = DockerSandboxService(config=_app_config()) + + request = CreateSandboxRequest( + image=ImageSpec(uri="python:3.11"), + entrypoint=["python"], + resourceLimits=ResourceLimits(root={}), + extensions={"poolRef": "my-pool"}, + ) + + with pytest.raises(HTTPException) as exc: + await service.create_sandbox(request) + + assert exc.value.status_code == status.HTTP_400_BAD_REQUEST + assert exc.value.detail["code"] == "SANDBOX::UNSUPPORTED_POOL_REF" + mock_client.containers.create.assert_not_called() + @pytest.mark.asyncio @patch("opensandbox_server.services.docker.docker_service.docker") async def test_create_sandbox_rejects_timeout_above_configured_maximum(mock_docker): diff --git a/server/tests/test_schema.py b/server/tests/test_schema.py index 1676b11f4..867f373f2 100644 --- a/server/tests/test_schema.py +++ b/server/tests/test_schema.py @@ -601,3 +601,61 @@ def test_request_allows_timeout_above_previous_hardcoded_limit(self): assert request.timeout == 172800 +class TestCreateSandboxRequestPoolMode: + """Tests for pool mode (extensions.poolRef) validation.""" + + def test_pool_mode_accepts_only_pool_ref(self): + """Happy path: poolRef only, no image/entrypoint/resourceLimits required.""" + request = CreateSandboxRequest( + extensions={"poolRef": "my-pool"}, + ) + assert request.image is None + assert request.entrypoint is None + assert request.resource_limits is None + assert request.extensions["poolRef"] == "my-pool" + + def test_pool_mode_accepts_pool_ref_with_optional_fields(self): + """poolRef with optional env/metadata/timeout should be valid.""" + request = CreateSandboxRequest( + extensions={"poolRef": "my-pool"}, + 
env={"KEY": "value"}, + metadata={"team": "test"}, + timeout=600, + ) + assert request.extensions["poolRef"] == "my-pool" + assert request.env == {"KEY": "value"} + + def test_pool_mode_rejects_snapshot_id_with_pool_ref(self): + """snapshotId and poolRef cannot be used together.""" + with pytest.raises(ValidationError) as exc_info: + CreateSandboxRequest( + snapshotId="snap-001", + extensions={"poolRef": "my-pool"}, + ) + errors = exc_info.value.errors() + assert any("snapshotId" in str(e) and "poolRef" in str(e) for e in errors) + + def test_resource_limits_required_without_pool_ref(self): + """Without poolRef, resourceLimits is still required (image mode).""" + with pytest.raises(ValidationError): + CreateSandboxRequest( + image=ImageSpec(uri="python:3.11"), + entrypoint=["python"], + ) + + def test_pool_mode_normalizes_blank_snapshot_id(self): + """Blank snapshotId (e.g. whitespace) should be normalized to None in pool mode.""" + req = CreateSandboxRequest( + extensions={"poolRef": "my-pool"}, + snapshotId=" ", + ) + assert req.snapshot_id is None + + def test_pool_mode_ignores_blank_pool_ref(self): + """Blank poolRef should not trigger pool mode.""" + with pytest.raises(ValidationError): + CreateSandboxRequest( + extensions={"poolRef": " "}, + ) + + diff --git a/specs/sandbox-lifecycle.yml b/specs/sandbox-lifecycle.yml index 52d06e8b8..9809d7d99 100644 --- a/specs/sandbox-lifecycle.yml +++ b/specs/sandbox-lifecycle.yml @@ -1155,15 +1155,22 @@ components: CreateSandboxRequest: type: object - required: [resourceLimits] description: | - Request to create a new sandbox from either a container image or a snapshot. - Exactly one of `image` or `snapshotId` must be provided. + Request to create a new sandbox from either a container image, a snapshot, + or a pre-configured pool (via `extensions.poolRef`). + + **Standard mode**: Exactly one of `image` or `snapshotId` must be provided, + and `resourceLimits` is required. When `image` is provided, `entrypoint` is required. 
When `snapshotId` is provided, `entrypoint` is optional. If omitted, the server defaults the sandbox entrypoint to `["tail", "-f", "/dev/null"]`. + **Pool mode**: When `extensions.poolRef` is set, the sandbox is created from + a pre-configured pool. In this case `image`, `entrypoint`, and + `resourceLimits` are all optional (defined by the Pool CRD template). + `snapshotId` must not be provided together with `poolRef`. + **Note**: API Key authentication is required via the `OPEN-SANDBOX-API-KEY` header. properties: image: @@ -1204,6 +1211,8 @@ components: $ref: '#/components/schemas/ResourceLimits' description: | Runtime resource constraints for the sandbox instance. + Required when `extensions.poolRef` is not set. + Optional when using pool mode (resource limits are defined by the Pool CRD template). SDK clients should provide sensible defaults (e.g., cpu: "500m", memory: "512Mi"). env: From 8539764f2c6811dd96f140b858ec34c17b65b539 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 17 May 2026 14:03:49 +0800 Subject: [PATCH 13/58] fix(sdk-go): forward all GetEndpoint headers on subsequent requests Mirror the Python SDK behavior where every header returned by the lifecycle GetEndpoint call (auth tokens, routing hints, sticky-session keys, etc.) is forwarded as-is on every subsequent execd/egress request. The previous code extracted only the X-EXECD-ACCESS-TOKEN / OPENSANDBOX-EGRESS-AUTH header and dropped the rest, which broke routing when the server added new headers. 
Co-Authored-By: Claude Opus 4.7 --- sdks/sandbox/go/config.go | 18 +++--- sdks/sandbox/go/egress.go | 5 +- sdks/sandbox/go/opensandbox_test.go | 86 +++++++++++++++++++++++++++++ sdks/sandbox/go/sandbox.go | 40 ++++++-------- 4 files changed, 118 insertions(+), 31 deletions(-) diff --git a/sdks/sandbox/go/config.go b/sdks/sandbox/go/config.go index 6e8718068..67a1bd837 100644 --- a/sdks/sandbox/go/config.go +++ b/sdks/sandbox/go/config.go @@ -177,22 +177,24 @@ func (c *ConnectionConfig) lifecycleClient() *LifecycleClient { return NewLifecycleClient(c.GetBaseURL()+"/"+APIVersion, c.GetAPIKey(), c.clientOpts(true)...) } -// execdClient creates an ExecdClient for a resolved endpoint. -// endpointHeaders are additional headers from the endpoint resolution (e.g. routing headers). -func (c *ConnectionConfig) execdClient(endpointURL, token string, endpointHeaders map[string]string) *ExecdClient { +// execdClient creates an ExecdClient for a resolved endpoint. All headers +// returned by the lifecycle GetEndpoint call (auth tokens, routing hints, +// sticky-session keys, etc.) are forwarded as-is on every subsequent request. +func (c *ConnectionConfig) execdClient(endpointURL string, endpointHeaders map[string]string) *ExecdClient { opts := c.clientOpts(true) if len(endpointHeaders) > 0 { opts = append(opts, WithHeaders(endpointHeaders)) } - return NewExecdClient(endpointURL, token, opts...) + return NewExecdClient(endpointURL, "", opts...) } -// egressClient creates an EgressClient for a resolved endpoint. -// endpointHeaders are additional headers from the endpoint resolution (e.g. routing headers). -func (c *ConnectionConfig) egressClient(endpointURL, token string, endpointHeaders map[string]string) *EgressClient { +// egressClient creates an EgressClient for a resolved endpoint. All headers +// returned by the lifecycle GetEndpoint call are forwarded as-is on every +// subsequent request. 
+func (c *ConnectionConfig) egressClient(endpointURL string, endpointHeaders map[string]string) *EgressClient { opts := c.clientOpts(false) if len(endpointHeaders) > 0 { opts = append(opts, WithHeaders(endpointHeaders)) } - return NewEgressClient(endpointURL, token, opts...) + return NewEgressClient(endpointURL, "", opts...) } diff --git a/sdks/sandbox/go/egress.go b/sdks/sandbox/go/egress.go index eabe14567..6a5ef1cf4 100644 --- a/sdks/sandbox/go/egress.go +++ b/sdks/sandbox/go/egress.go @@ -22,6 +22,9 @@ type EgressClient struct { *Client } +// egressAuthHeader is the authentication header used by the Egress sidecar API. +const egressAuthHeader = "OPENSANDBOX-EGRESS-AUTH" + // NewEgressClient creates a new EgressClient. // baseURL is the sandbox-specific egress sidecar endpoint // (e.g. "http://localhost:18080"). @@ -29,7 +32,7 @@ type EgressClient struct { // if the sidecar does not require authentication. func NewEgressClient(baseURL, authToken string, opts ...Option) *EgressClient { return &EgressClient{ - Client: NewClient(baseURL, authToken, "OPENSANDBOX-EGRESS-AUTH", opts...), + Client: NewClient(baseURL, authToken, egressAuthHeader, opts...), } } diff --git a/sdks/sandbox/go/opensandbox_test.go b/sdks/sandbox/go/opensandbox_test.go index ffec48ff7..6a7a2ebbe 100644 --- a/sdks/sandbox/go/opensandbox_test.go +++ b/sdks/sandbox/go/opensandbox_test.go @@ -1022,6 +1022,92 @@ func TestExecdAuthHeader(t *testing.T) { require.NoErrorf(t, err, "Ping") } +// TestResolveExecdForwardsAllEndpointHeaders verifies that every header +// returned by GetEndpoint (auth tokens, routing hints, sticky-session keys, +// etc.) is forwarded as-is on subsequent execd requests, mirroring the +// Python SDK behavior. 
+func TestResolveExecdForwardsAllEndpointHeaders(t *testing.T) { + endpointHeaders := map[string]string{ + "X-EXECD-ACCESS-TOKEN": "execd-tok", + "X-Route-Hint": "vip-pool", + "X-Sticky-Session": "sess-abc", + } + + execdSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + for k, want := range endpointHeaders { + if got := r.Header.Get(k); got != want { + assert.Fail(t, fmt.Sprintf("header %s = %q, want %q", k, got, want)) + } + } + w.WriteHeader(http.StatusOK) + })) + defer execdSrv.Close() + + lifecycleSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.Contains(r.URL.Path, "/endpoints/") { + jsonResponse(w, http.StatusOK, Endpoint{ + Endpoint: execdSrv.URL, + Headers: endpointHeaders, + }) + return + } + w.WriteHeader(http.StatusNotFound) + })) + defer lifecycleSrv.Close() + + config := ConnectionConfig{Domain: lifecycleSrv.URL} + sb := &Sandbox{ + id: "sbx-headers", + config: &config, + lifecycle: config.lifecycleClient(), + } + + require.NoErrorf(t, sb.resolveExecd(context.Background()), "resolveExecd") + require.NoErrorf(t, sb.execd.Ping(context.Background()), "Ping") +} + +// TestResolveEgressForwardsAllEndpointHeaders verifies the same forwarding +// behavior for the egress sidecar client. 
+func TestResolveEgressForwardsAllEndpointHeaders(t *testing.T) { + endpointHeaders := map[string]string{ + "OPENSANDBOX-EGRESS-AUTH": "egress-tok", + "X-Route-Hint": "egress-vip", + } + + egressSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + for k, want := range endpointHeaders { + if got := r.Header.Get(k); got != want { + assert.Fail(t, fmt.Sprintf("header %s = %q, want %q", k, got, want)) + } + } + jsonResponse(w, http.StatusOK, PolicyStatusResponse{Status: "ok"}) + })) + defer egressSrv.Close() + + lifecycleSrv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method == http.MethodGet && strings.Contains(r.URL.Path, "/endpoints/") { + jsonResponse(w, http.StatusOK, Endpoint{ + Endpoint: egressSrv.URL, + Headers: endpointHeaders, + }) + return + } + w.WriteHeader(http.StatusNotFound) + })) + defer lifecycleSrv.Close() + + config := ConnectionConfig{Domain: lifecycleSrv.URL} + sb := &Sandbox{ + id: "sbx-egress-headers", + config: &config, + lifecycle: config.lifecycleClient(), + } + + require.NoErrorf(t, sb.resolveEgress(context.Background()), "resolveEgress") + _, err := sb.egress.GetPolicy(context.Background()) + require.NoErrorf(t, err, "GetPolicy") +} + func TestSandboxManager_ListFilter(t *testing.T) { now := time.Now().UTC().Truncate(time.Second) want := ListSandboxesResponse{ diff --git a/sdks/sandbox/go/sandbox.go b/sdks/sandbox/go/sandbox.go index 5be59ddf7..f4a457f33 100644 --- a/sdks/sandbox/go/sandbox.go +++ b/sdks/sandbox/go/sandbox.go @@ -410,23 +410,19 @@ func (s *Sandbox) resolveExecd(ctx context.Context) error { execdURL = s.config.GetProtocol() + "://" + execdURL } - token := "" - var extraHeaders map[string]string - if endpoint.Headers != nil { - token = endpoint.Headers["X-EXECD-ACCESS-TOKEN"] - // Preserve all endpoint headers (e.g. 
routing headers) except the auth token - extraHeaders = make(map[string]string, len(endpoint.Headers)) - for k, v := range endpoint.Headers { - if k != "X-EXECD-ACCESS-TOKEN" { - extraHeaders[k] = v + headers := make(map[string]string, len(endpoint.Headers)+1) + for k, v := range endpoint.Headers { + headers[k] = v + } + if s.config.UseServerProxy { + if _, ok := headers[execdAuthHeader]; !ok { + if apiKey := s.config.GetAPIKey(); apiKey != "" { + headers[execdAuthHeader] = apiKey } } } - if s.config.UseServerProxy && token == "" { - token = s.config.GetAPIKey() - } - s.execd = s.config.execdClient(execdURL, token, extraHeaders) + s.execd = s.config.execdClient(execdURL, headers) return nil } @@ -450,18 +446,18 @@ func (s *Sandbox) resolveEgress(ctx context.Context) error { egressURL = s.config.GetProtocol() + "://" + egressURL } - token := "" - var extraHeaders map[string]string - if endpoint.Headers != nil { - token = endpoint.Headers["OPENSANDBOX-EGRESS-AUTH"] - extraHeaders = make(map[string]string, len(endpoint.Headers)) - for k, v := range endpoint.Headers { - if k != "OPENSANDBOX-EGRESS-AUTH" { - extraHeaders[k] = v + headers := make(map[string]string, len(endpoint.Headers)+1) + for k, v := range endpoint.Headers { + headers[k] = v + } + if s.config.UseServerProxy { + if _, ok := headers[egressAuthHeader]; !ok { + if apiKey := s.config.GetAPIKey(); apiKey != "" { + headers[egressAuthHeader] = apiKey } } } - s.egress = s.config.egressClient(egressURL, token, extraHeaders) + s.egress = s.config.egressClient(egressURL, headers) return nil } From a0f66f3e5b67a9e719914006e06bf89881ccf214 Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Sun, 17 May 2026 14:47:47 +0800 Subject: [PATCH 14/58] fix(execd): ensure uploaded files are visible before responding (#895) * fix(execd): ensure uploaded files are visible before responding On weakly-coherent filesystems (virtio-fs, 9pfs), a freshly-written file can be invisible to 
subsequent processes for a brief window after the upload handler returns 200, causing intermittent "file not found" errors when callers immediately invoke /command after /files/upload. - fsync the parent directory after closing the new file so the new dirent is durable and observable. Best-effort: ignore ENOTSUP. - Wrap ChmodFile in a one-shot retry with a short sleep. ChmodFile always invokes chown under the hood, so a freshly-created dirent that has not yet propagated would otherwise surface as ENOENT and turn a recoverable visibility delay into a 500 response. Co-Authored-By: Claude Opus 4.7 * refactor(execd): split UploadFile to reduce cognitive complexity Extract form parsing, metadata parsing, target resolution, file write, and permission application into helpers so UploadFile passes the gocognit threshold (>30). Behavior unchanged; errors flow via *uploadError and funnel through a single RespondError call site. Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- .../pkg/web/controller/filesystem_upload.go | 290 ++++++++++-------- 1 file changed, 164 insertions(+), 126 deletions(-) diff --git a/components/execd/pkg/web/controller/filesystem_upload.go b/components/execd/pkg/web/controller/filesystem_upload.go index 459a9d2d7..eb5397f7c 100644 --- a/components/execd/pkg/web/controller/filesystem_upload.go +++ b/components/execd/pkg/web/controller/filesystem_upload.go @@ -18,173 +18,211 @@ import ( "encoding/json" "fmt" "io" + "mime/multipart" "net/http" "os" "path/filepath" + "time" "github.com/alibaba/opensandbox/execd/pkg/log" "github.com/alibaba/opensandbox/execd/pkg/util/pathutil" "github.com/alibaba/opensandbox/execd/pkg/web/model" ) +type uploadError struct { + status int + code model.ErrorCode + message string +} + +func newUploadError(status int, code model.ErrorCode, message string) *uploadError { + return &uploadError{status: status, code: code, message: message} +} + // UploadFile uploads files with metadata to specified 
paths func (c *FilesystemController) UploadFile() { rec := beginFilesystemMetric("upload") defer rec.Finish(c.basicController) + metadataParts, fileParts, uerr := c.parseUploadForm() + if uerr != nil { + c.RespondError(uerr.status, uerr.code, uerr.message) + return + } + + for i := range metadataParts { + if uerr := c.processUploadPair(metadataParts[i], fileParts[i]); uerr != nil { + c.RespondError(uerr.status, uerr.code, uerr.message) + return + } + } + + rec.MarkSuccess() + c.RespondSuccess(nil) +} + +func (c *FilesystemController) parseUploadForm() ([]*multipart.FileHeader, []*multipart.FileHeader, *uploadError) { form, err := c.ctx.MultipartForm() if err != nil || form == nil { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFile, - "multipart form is empty", - ) - return + return nil, nil, newUploadError(http.StatusBadRequest, model.ErrorCodeInvalidFile, "multipart form is empty") } metadataParts := form.File["metadata"] fileParts := form.File["file"] if len(metadataParts) == 0 { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFileMetadata, - "metadata file is missing", - ) - return + return nil, nil, newUploadError(http.StatusBadRequest, model.ErrorCodeInvalidFileMetadata, "metadata file is missing") } - if len(fileParts) == 0 { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFileContent, - "file is missing", - ) - return + return nil, nil, newUploadError(http.StatusBadRequest, model.ErrorCodeInvalidFileContent, "file is missing") } - if len(metadataParts) != len(fileParts) { - c.RespondError( + return nil, nil, newUploadError( http.StatusBadRequest, model.ErrorCodeInvalidFile, fmt.Sprintf("metadata and file count mismatch: %d vs %d", len(metadataParts), len(fileParts)), ) - return } + return metadataParts, fileParts, nil +} - for i := range metadataParts { - metadataHeader := metadataParts[i] - metadataFile, err := metadataHeader.Open() - if err != nil { - c.RespondError( - http.StatusBadRequest, - 
model.ErrorCodeInvalidFileMetadata, - fmt.Sprintf("error opening metadata file. %v", err), - ) - return - } +func (c *FilesystemController) processUploadPair(metadataHeader, fileHeader *multipart.FileHeader) *uploadError { + meta, uerr := parseUploadMetadata(metadataHeader) + if uerr != nil { + return uerr + } - metaBytes, err := io.ReadAll(metadataFile) - metadataFile.Close() - if err != nil { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFileMetadata, - fmt.Sprintf("error reading metadata content. %v", err), - ) - return - } + resolvedPath, uerr := resolveUploadTarget(meta.Path) + if uerr != nil { + return uerr + } - var meta model.FileMetadata - if err := json.Unmarshal(metaBytes, &meta); err != nil { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFileMetadata, - fmt.Sprintf("invalid metadata format. %v", err), - ) - return - } + if uerr := writeUploadFile(resolvedPath, fileHeader); uerr != nil { + return uerr + } - targetPath := meta.Path - if targetPath == "" { - c.RespondError( - http.StatusBadRequest, - model.ErrorCodeInvalidFileMetadata, - "metadata path is empty", - ) - return - } - resolvedPath, err := pathutil.ExpandPath(targetPath) - if err != nil { - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error resolving target path %s. %v", targetPath, err), - ) - return - } + return applyUploadPermission(resolvedPath, meta.Permission) +} - targetDir := filepath.Dir(resolvedPath) - if err := os.MkdirAll(targetDir, os.ModePerm); err != nil { - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error creating target directory %s. 
%v", targetDir, err), - ) - return - } +func parseUploadMetadata(header *multipart.FileHeader) (*model.FileMetadata, *uploadError) { + metadataFile, err := header.Open() + if err != nil { + return nil, newUploadError( + http.StatusBadRequest, + model.ErrorCodeInvalidFileMetadata, + fmt.Sprintf("error opening metadata file. %v", err), + ) + } + metaBytes, err := io.ReadAll(metadataFile) + metadataFile.Close() + if err != nil { + return nil, newUploadError( + http.StatusBadRequest, + model.ErrorCodeInvalidFileMetadata, + fmt.Sprintf("error reading metadata content. %v", err), + ) + } - fileHeader := fileParts[i] - file, err := fileHeader.Open() - if err != nil { - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error opening file %s. %v", fileHeader.Filename, err), - ) - return - } + var meta model.FileMetadata + if err := json.Unmarshal(metaBytes, &meta); err != nil { + return nil, newUploadError( + http.StatusBadRequest, + model.ErrorCodeInvalidFileMetadata, + fmt.Sprintf("invalid metadata format. %v", err), + ) + } + if meta.Path == "" { + return nil, newUploadError(http.StatusBadRequest, model.ErrorCodeInvalidFileMetadata, "metadata path is empty") + } + return &meta, nil +} - dst, err := os.OpenFile(resolvedPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) - if err != nil { - file.Close() - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error opening destination file %s. %v", resolvedPath, err), - ) - return - } +func resolveUploadTarget(targetPath string) (string, *uploadError) { + resolvedPath, err := pathutil.ExpandPath(targetPath) + if err != nil { + return "", newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error resolving target path %s. 
%v", targetPath, err), + ) + } + targetDir := filepath.Dir(resolvedPath) + if err := os.MkdirAll(targetDir, os.ModePerm); err != nil { + return "", newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error creating target directory %s. %v", targetDir, err), + ) + } + return resolvedPath, nil +} - if _, err := io.Copy(dst, file); err != nil { - dst.Close() - file.Close() - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error copying file %s. %v", resolvedPath, err), - ) - return - } +func writeUploadFile(resolvedPath string, fileHeader *multipart.FileHeader) *uploadError { + file, err := fileHeader.Open() + if err != nil { + return newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error opening file %s. %v", fileHeader.Filename, err), + ) + } + defer file.Close() + + dst, err := os.OpenFile(resolvedPath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, os.ModePerm) + if err != nil { + return newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error opening destination file %s. %v", resolvedPath, err), + ) + } - if err := dst.Sync(); err != nil { - log.Error("failed to sync target file: %v", err) - } - if err := dst.Close(); err != nil { - log.Error("failed to close target file: %v", err) - } - file.Close() - - if err := ChmodFile(resolvedPath, meta.Permission); err != nil { - c.RespondError( - http.StatusInternalServerError, - model.ErrorCodeRuntimeError, - fmt.Sprintf("error chmoding file %s. %v", resolvedPath, err), - ) - return + if _, err := io.Copy(dst, file); err != nil { + dst.Close() + return newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error copying file %s. 
%v", resolvedPath, err), + ) + } + + if err := dst.Sync(); err != nil { + log.Error("failed to sync target file: %v", err) + } + if err := dst.Close(); err != nil { + log.Error("failed to close target file: %v", err) + } + + // fsync parent directory so the new dirent is durable and visible on + // weakly-coherent filesystems (virtio-fs, 9pfs, etc.). Best-effort: + // some filesystems return ENOTSUP for directory fsync. + targetDir := filepath.Dir(resolvedPath) + if d, err := os.Open(targetDir); err == nil { + if err := d.Sync(); err != nil { + log.Warning("failed to sync parent dir %s: %v", targetDir, err) } + _ = d.Close() } + return nil +} - rec.MarkSuccess() - c.RespondSuccess(nil) +// applyUploadPermission applies the metadata permission with one retry to +// absorb metadata-propagation delay on weakly-coherent filesystems +// (virtio-fs, 9pfs). ChmodFile always invokes chown under the hood, so a +// freshly-created dirent that has not yet propagated will surface as ENOENT +// here even though the file is fully written and synced. +func applyUploadPermission(resolvedPath string, permission model.Permission) *uploadError { + chmodErr := ChmodFile(resolvedPath, permission) + if chmodErr != nil { + time.Sleep(20 * time.Millisecond) + chmodErr = ChmodFile(resolvedPath, permission) + } + if chmodErr != nil { + return newUploadError( + http.StatusInternalServerError, + model.ErrorCodeRuntimeError, + fmt.Sprintf("error chmoding file %s. %v", resolvedPath, chmodErr), + ) + } + return nil } From 745c1945aef846b10d8ac4cc2c4a83b00e68f805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 17 May 2026 15:06:09 +0800 Subject: [PATCH 15/58] perf(server): expose uvicorn worker/concurrency knobs Add ServerConfig fields to make uvicorn process count, concurrency limits, socket backlog, and event-loop/HTTP parser implementation configurable. 
Defaults preserve current behavior (workers=1) while enabling operators to scale a single pod across multiple Python processes when apiserver capacity allows. - pyproject.toml: switch to uvicorn[standard] for uvloop/httptools/watchfiles - config.py: ServerConfig.workers, limit_concurrency, backlog, loop, http - cli.py: thread new fields into uvicorn.run; force workers=1 under --reload - main.py: pass loop/http to dev __main__ entry - examples + configuration.md: document tunables and apiserver tradeoff Co-Authored-By: Claude Opus 4.7 --- server/configuration.md | 5 + server/opensandbox_server/cli.py | 18 ++- server/opensandbox_server/config.py | 39 ++++++ server/opensandbox_server/main.py | 2 + server/pyproject.toml | 2 +- server/tests/test_config.py | 49 +++++++ server/uv.lock | 205 +++++++++++++++++++++++++++- 7 files changed, 314 insertions(+), 6 deletions(-) diff --git a/server/configuration.md b/server/configuration.md index a178eebbf..26b8aed21 100644 --- a/server/configuration.md +++ b/server/configuration.md @@ -66,6 +66,11 @@ Example files in this repository: | `eip` | string \| omitted | `null` | Public IP or hostname used as the **host part** when the server returns sandbox endpoint URLs (notably Docker runtime). | | `max_sandbox_timeout_seconds` | integer \| omitted | `null` | Upper bound on sandbox TTL in seconds for **create** requests that specify `timeout`. Must be ≥ **60** if set. Omit to disable the server-side cap. | | `timeout_keep_alive` | integer | `30` | Idle keep-alive timeout (seconds) passed to uvicorn. | +| `workers` | integer | `1` | Number of uvicorn worker processes. Each worker is a separate Python process with its own event loop and (under the Kubernetes runtime) its own informer watch streams to the apiserver. Default `1` keeps apiserver pressure predictable; bump to 2–8 based on CPU quota and apiserver capacity. Ignored when `--reload` is set. 
| +| `limit_concurrency` | integer \| omitted | `1024` | Maximum concurrent connections per worker before returning 503. Provides backpressure protection under burst load. Omit to disable. | +| `backlog` | integer | `2048` | Socket listen backlog passed to uvicorn. | +| `loop` | `"auto"` \| `"uvloop"` \| `"asyncio"` | `"auto"` | Event loop implementation. `auto` prefers uvloop and falls back to asyncio. | +| `http` | `"auto"` \| `"httptools"` \| `"h11"` | `"auto"` | HTTP protocol parser. `auto` prefers httptools and falls back to h11. | --- diff --git a/server/opensandbox_server/cli.py b/server/opensandbox_server/cli.py index 2b029ced0..a542b6bff 100644 --- a/server/opensandbox_server/cli.py +++ b/server/opensandbox_server/cli.py @@ -286,13 +286,25 @@ def main() -> None: from opensandbox_server import main as server_main # local import after env is set + server_cfg = server_main.app_config.server + workers = 1 if args.reload else server_cfg.workers + if args.reload and server_cfg.workers > 1: + print( + f"--reload set; ignoring workers={server_cfg.workers}, using 1\n" + ) + uvicorn.run( "opensandbox_server.main:app", - host=server_main.app_config.server.host, - port=server_main.app_config.server.port, + host=server_cfg.host, + port=server_cfg.port, reload=args.reload, log_config=server_main._log_config, - timeout_keep_alive=server_main.app_config.server.timeout_keep_alive, + timeout_keep_alive=server_cfg.timeout_keep_alive, + workers=workers, + limit_concurrency=server_cfg.limit_concurrency, + backlog=server_cfg.backlog, + loop=server_cfg.loop, + http=server_cfg.http, ) diff --git a/server/opensandbox_server/config.py b/server/opensandbox_server/config.py index 5a1cedda1..54ad9729b 100644 --- a/server/opensandbox_server/config.py +++ b/server/opensandbox_server/config.py @@ -453,6 +453,45 @@ class ServerConfig(BaseModel): "Connections idle longer than this may be closed by the server." 
), ) + workers: int = Field( + default=1, + ge=1, + description=( + "Number of uvicorn worker processes. Each worker is a separate " + "Python process with its own event loop and (under the Kubernetes " + "runtime) its own informer watch streams to the apiserver. " + "Default 1 to keep apiserver pressure predictable; bump to 2-8 " + "based on CPU quota and apiserver capacity. Ignored when " + "--reload is set." + ), + ) + limit_concurrency: Optional[int] = Field( + default=1024, + ge=1, + description=( + "Maximum concurrent connections per worker before returning 503. " + "Set null to disable. Provides backpressure protection under burst load." + ), + ) + backlog: int = Field( + default=2048, + ge=1, + description="Socket listen backlog passed to uvicorn.", + ) + loop: Literal["auto", "uvloop", "asyncio"] = Field( + default="auto", + description=( + "Event loop implementation. 'auto' uses uvloop when available and " + "falls back to asyncio. 'asyncio' forces the stdlib loop." + ), + ) + http: Literal["auto", "httptools", "h11"] = Field( + default="auto", + description=( + "HTTP protocol parser. 'auto' uses httptools when available and " + "falls back to h11." 
+ ), + ) api_key: Optional[str] = Field( default=None, description="Global API key for authenticating incoming lifecycle API calls.", diff --git a/server/opensandbox_server/main.py b/server/opensandbox_server/main.py index ba0004f67..f9d0e7f40 100644 --- a/server/opensandbox_server/main.py +++ b/server/opensandbox_server/main.py @@ -204,4 +204,6 @@ async def health_check(): reload=True, log_config=_log_config, timeout_keep_alive=app_config.server.timeout_keep_alive, + loop=app_config.server.loop, + http=app_config.server.http, ) diff --git a/server/pyproject.toml b/server/pyproject.toml index 5bf44ab97..36b6f523f 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -51,7 +51,7 @@ dependencies = [ "pydantic-settings", "pyyaml", "tomli; python_version < \"3.11\"", - "uvicorn", + "uvicorn[standard]", "websockets>=14.0", ] diff --git a/server/tests/test_config.py b/server/tests/test_config.py index e45821670..578c5bd15 100644 --- a/server/tests/test_config.py +++ b/server/tests/test_config.py @@ -164,6 +164,55 @@ def test_server_config_defaults_include_max_sandbox_timeout(): assert server_cfg.max_sandbox_timeout_seconds is None +def test_server_config_uvicorn_tuning_defaults(): + """ServerConfig exposes uvicorn worker/concurrency knobs with sensible defaults.""" + server_cfg = ServerConfig() + assert server_cfg.workers == 1 + assert server_cfg.limit_concurrency == 1024 + assert server_cfg.backlog == 2048 + assert server_cfg.loop == "auto" + assert server_cfg.http == "auto" + + +def test_server_config_uvicorn_tuning_overrides(): + server_cfg = ServerConfig( + workers=8, + limit_concurrency=256, + backlog=4096, + loop="uvloop", + http="httptools", + ) + assert server_cfg.workers == 8 + assert server_cfg.limit_concurrency == 256 + assert server_cfg.backlog == 4096 + assert server_cfg.loop == "uvloop" + assert server_cfg.http == "httptools" + + +def test_server_config_workers_must_be_positive(): + with pytest.raises(ValidationError): + ServerConfig(workers=0) 
+ + +def test_server_config_limit_concurrency_must_be_positive_when_set(): + with pytest.raises(ValidationError): + ServerConfig(limit_concurrency=0) + cfg = ServerConfig(limit_concurrency=None) + assert cfg.limit_concurrency is None + + +def test_server_config_backlog_must_be_positive(): + with pytest.raises(ValidationError): + ServerConfig(backlog=0) + + +def test_server_config_loop_and_http_reject_unknown_values(): + with pytest.raises(ValidationError): + ServerConfig(loop="trio") # type: ignore[arg-type] + with pytest.raises(ValidationError): + ServerConfig(http="hyper") # type: ignore[arg-type] + + def test_store_defaults_to_sqlite(): cfg = StoreConfig() assert cfg.type == "sqlite" diff --git a/server/uv.lock b/server/uv.lock index 0dc237ed0..34c7ea8f1 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -347,6 +347,49 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, ] +[[package]] +name = "httptools" +version = "0.7.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b5/46/120a669232c7bdedb9d52d4aeae7e6c7dfe151e99dc70802e2fc7a5e1993/httptools-0.7.1.tar.gz", hash = "sha256:abd72556974f8e7c74a259655924a717a2365b236c882c3f6f8a45fe94703ac9", size = 258961, upload-time = "2025-10-10T03:55:08.559Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c7/e5/c07e0bcf4ec8db8164e9f6738c048b2e66aabf30e7506f440c4cc6953f60/httptools-0.7.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:11d01b0ff1fe02c4c32d60af61a4d613b74fad069e47e06e9067758c01e9ac78", size = 204531, upload-time = "2025-10-10T03:54:20.887Z" }, + { url = 
"https://files.pythonhosted.org/packages/7e/4f/35e3a63f863a659f92ffd92bef131f3e81cf849af26e6435b49bd9f6f751/httptools-0.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84d86c1e5afdc479a6fdabf570be0d3eb791df0ae727e8dbc0259ed1249998d4", size = 109408, upload-time = "2025-10-10T03:54:22.455Z" }, + { url = "https://files.pythonhosted.org/packages/f5/71/b0a9193641d9e2471ac541d3b1b869538a5fb6419d52fd2669fa9c79e4b8/httptools-0.7.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c8c751014e13d88d2be5f5f14fc8b89612fcfa92a9cc480f2bc1598357a23a05", size = 440889, upload-time = "2025-10-10T03:54:23.753Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d9/2e34811397b76718750fea44658cb0205b84566e895192115252e008b152/httptools-0.7.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:654968cb6b6c77e37b832a9be3d3ecabb243bbe7a0b8f65fbc5b6b04c8fcabed", size = 440460, upload-time = "2025-10-10T03:54:25.313Z" }, + { url = "https://files.pythonhosted.org/packages/01/3f/a04626ebeacc489866bb4d82362c0657b2262bef381d68310134be7f40bb/httptools-0.7.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b580968316348b474b020edf3988eecd5d6eec4634ee6561e72ae3a2a0e00a8a", size = 425267, upload-time = "2025-10-10T03:54:26.81Z" }, + { url = "https://files.pythonhosted.org/packages/a5/99/adcd4f66614db627b587627c8ad6f4c55f18881549bab10ecf180562e7b9/httptools-0.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d496e2f5245319da9d764296e86c5bb6fcf0cf7a8806d3d000717a889c8c0b7b", size = 424429, upload-time = "2025-10-10T03:54:28.174Z" }, + { url = "https://files.pythonhosted.org/packages/d5/72/ec8fc904a8fd30ba022dfa85f3bbc64c3c7cd75b669e24242c0658e22f3c/httptools-0.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:cbf8317bfccf0fed3b5680c559d3459cccf1abe9039bfa159e62e391c7270568", size = 86173, upload-time = "2025-10-10T03:54:29.5Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/08/17e07e8d89ab8f343c134616d72eebfe03798835058e2ab579dcc8353c06/httptools-0.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:474d3b7ab469fefcca3697a10d11a32ee2b9573250206ba1e50d5980910da657", size = 206521, upload-time = "2025-10-10T03:54:31.002Z" }, + { url = "https://files.pythonhosted.org/packages/aa/06/c9c1b41ff52f16aee526fd10fbda99fa4787938aa776858ddc4a1ea825ec/httptools-0.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3c3b7366bb6c7b96bd72d0dbe7f7d5eead261361f013be5f6d9590465ea1c70", size = 110375, upload-time = "2025-10-10T03:54:31.941Z" }, + { url = "https://files.pythonhosted.org/packages/cc/cc/10935db22fda0ee34c76f047590ca0a8bd9de531406a3ccb10a90e12ea21/httptools-0.7.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:379b479408b8747f47f3b253326183d7c009a3936518cdb70db58cffd369d9df", size = 456621, upload-time = "2025-10-10T03:54:33.176Z" }, + { url = "https://files.pythonhosted.org/packages/0e/84/875382b10d271b0c11aa5d414b44f92f8dd53e9b658aec338a79164fa548/httptools-0.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cad6b591a682dcc6cf1397c3900527f9affef1e55a06c4547264796bbd17cf5e", size = 454954, upload-time = "2025-10-10T03:54:34.226Z" }, + { url = "https://files.pythonhosted.org/packages/30/e1/44f89b280f7e46c0b1b2ccee5737d46b3bb13136383958f20b580a821ca0/httptools-0.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eb844698d11433d2139bbeeb56499102143beb582bd6c194e3ba69c22f25c274", size = 440175, upload-time = "2025-10-10T03:54:35.942Z" }, + { url = "https://files.pythonhosted.org/packages/6f/7e/b9287763159e700e335028bc1824359dc736fa9b829dacedace91a39b37e/httptools-0.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f65744d7a8bdb4bda5e1fa23e4ba16832860606fcc09d674d56e425e991539ec", size = 440310, upload-time = "2025-10-10T03:54:37.1Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/07/5b614f592868e07f5c94b1f301b5e14a21df4e8076215a3bccb830a687d8/httptools-0.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:135fbe974b3718eada677229312e97f3b31f8a9c8ffa3ae6f565bf808d5b6bcb", size = 86875, upload-time = "2025-10-10T03:54:38.421Z" }, + { url = "https://files.pythonhosted.org/packages/53/7f/403e5d787dc4942316e515e949b0c8a013d84078a915910e9f391ba9b3ed/httptools-0.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:38e0c83a2ea9746ebbd643bdfb521b9aa4a91703e2cd705c20443405d2fd16a5", size = 206280, upload-time = "2025-10-10T03:54:39.274Z" }, + { url = "https://files.pythonhosted.org/packages/2a/0d/7f3fd28e2ce311ccc998c388dd1c53b18120fda3b70ebb022b135dc9839b/httptools-0.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f25bbaf1235e27704f1a7b86cd3304eabc04f569c828101d94a0e605ef7205a5", size = 110004, upload-time = "2025-10-10T03:54:40.403Z" }, + { url = "https://files.pythonhosted.org/packages/84/a6/b3965e1e146ef5762870bbe76117876ceba51a201e18cc31f5703e454596/httptools-0.7.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2c15f37ef679ab9ecc06bfc4e6e8628c32a8e4b305459de7cf6785acd57e4d03", size = 517655, upload-time = "2025-10-10T03:54:41.347Z" }, + { url = "https://files.pythonhosted.org/packages/11/7d/71fee6f1844e6fa378f2eddde6c3e41ce3a1fb4b2d81118dd544e3441ec0/httptools-0.7.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7fe6e96090df46b36ccfaf746f03034e5ab723162bc51b0a4cf58305324036f2", size = 511440, upload-time = "2025-10-10T03:54:42.452Z" }, + { url = "https://files.pythonhosted.org/packages/22/a5/079d216712a4f3ffa24af4a0381b108aa9c45b7a5cc6eb141f81726b1823/httptools-0.7.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f72fdbae2dbc6e68b8239defb48e6a5937b12218e6ffc2c7846cc37befa84362", size = 495186, upload-time = "2025-10-10T03:54:43.937Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/9e/025ad7b65278745dee3bd0ebf9314934c4592560878308a6121f7f812084/httptools-0.7.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:e99c7b90a29fd82fea9ef57943d501a16f3404d7b9ee81799d41639bdaae412c", size = 499192, upload-time = "2025-10-10T03:54:45.003Z" }, + { url = "https://files.pythonhosted.org/packages/6d/de/40a8f202b987d43afc4d54689600ff03ce65680ede2f31df348d7f368b8f/httptools-0.7.1-cp312-cp312-win_amd64.whl", hash = "sha256:3e14f530fefa7499334a79b0cf7e7cd2992870eb893526fb097d51b4f2d0f321", size = 86694, upload-time = "2025-10-10T03:54:45.923Z" }, + { url = "https://files.pythonhosted.org/packages/09/8f/c77b1fcbfd262d422f12da02feb0d218fa228d52485b77b953832105bb90/httptools-0.7.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:6babce6cfa2a99545c60bfef8bee0cc0545413cb0018f617c8059a30ad985de3", size = 202889, upload-time = "2025-10-10T03:54:47.089Z" }, + { url = "https://files.pythonhosted.org/packages/0a/1a/22887f53602feaa066354867bc49a68fc295c2293433177ee90870a7d517/httptools-0.7.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:601b7628de7504077dd3dcb3791c6b8694bbd967148a6d1f01806509254fb1ca", size = 108180, upload-time = "2025-10-10T03:54:48.052Z" }, + { url = "https://files.pythonhosted.org/packages/32/6a/6aaa91937f0010d288d3d124ca2946d48d60c3a5ee7ca62afe870e3ea011/httptools-0.7.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:04c6c0e6c5fb0739c5b8a9eb046d298650a0ff38cf42537fc372b28dc7e4472c", size = 478596, upload-time = "2025-10-10T03:54:48.919Z" }, + { url = "https://files.pythonhosted.org/packages/6d/70/023d7ce117993107be88d2cbca566a7c1323ccbaf0af7eabf2064fe356f6/httptools-0.7.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:69d4f9705c405ae3ee83d6a12283dc9feba8cc6aaec671b412917e644ab4fa66", size = 473268, upload-time = "2025-10-10T03:54:49.993Z" }, + { url = 
"https://files.pythonhosted.org/packages/32/4d/9dd616c38da088e3f436e9a616e1d0cc66544b8cdac405cc4e81c8679fc7/httptools-0.7.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:44c8f4347d4b31269c8a9205d8a5ee2df5322b09bbbd30f8f862185bb6b05346", size = 455517, upload-time = "2025-10-10T03:54:51.066Z" }, + { url = "https://files.pythonhosted.org/packages/1d/3a/a6c595c310b7df958e739aae88724e24f9246a514d909547778d776799be/httptools-0.7.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:465275d76db4d554918aba40bf1cbebe324670f3dfc979eaffaa5d108e2ed650", size = 458337, upload-time = "2025-10-10T03:54:52.196Z" }, + { url = "https://files.pythonhosted.org/packages/fd/82/88e8d6d2c51edc1cc391b6e044c6c435b6aebe97b1abc33db1b0b24cd582/httptools-0.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:322d00c2068d125bd570f7bf78b2d367dad02b919d8581d7476d8b75b294e3e6", size = 85743, upload-time = "2025-10-10T03:54:53.448Z" }, + { url = "https://files.pythonhosted.org/packages/34/50/9d095fcbb6de2d523e027a2f304d4551855c2f46e0b82befd718b8b20056/httptools-0.7.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:c08fe65728b8d70b6923ce31e3956f859d5e1e8548e6f22ec520a962c6757270", size = 203619, upload-time = "2025-10-10T03:54:54.321Z" }, + { url = "https://files.pythonhosted.org/packages/07/f0/89720dc5139ae54b03f861b5e2c55a37dba9a5da7d51e1e824a1f343627f/httptools-0.7.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7aea2e3c3953521c3c51106ee11487a910d45586e351202474d45472db7d72d3", size = 108714, upload-time = "2025-10-10T03:54:55.163Z" }, + { url = "https://files.pythonhosted.org/packages/b3/cb/eea88506f191fb552c11787c23f9a405f4c7b0c5799bf73f2249cd4f5228/httptools-0.7.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0e68b8582f4ea9166be62926077a3334064d422cf08ab87d8b74664f8e9058e1", size = 472909, upload-time = "2025-10-10T03:54:56.056Z" }, + { url = 
"https://files.pythonhosted.org/packages/e0/4a/a548bdfae6369c0d078bab5769f7b66f17f1bfaa6fa28f81d6be6959066b/httptools-0.7.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df091cf961a3be783d6aebae963cc9b71e00d57fa6f149025075217bc6a55a7b", size = 470831, upload-time = "2025-10-10T03:54:57.219Z" }, + { url = "https://files.pythonhosted.org/packages/4d/31/14df99e1c43bd132eec921c2e7e11cda7852f65619bc0fc5bdc2d0cb126c/httptools-0.7.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:f084813239e1eb403ddacd06a30de3d3e09a9b76e7894dcda2b22f8a726e9c60", size = 452631, upload-time = "2025-10-10T03:54:58.219Z" }, + { url = "https://files.pythonhosted.org/packages/22/d2/b7e131f7be8d854d48cb6d048113c30f9a46dca0c9a8b08fcb3fcd588cdc/httptools-0.7.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7347714368fb2b335e9063bc2b96f2f87a9ceffcd9758ac295f8bbcd3ffbc0ca", size = 452910, upload-time = "2025-10-10T03:54:59.366Z" }, + { url = "https://files.pythonhosted.org/packages/53/cf/878f3b91e4e6e011eff6d1fa9ca39f7eb17d19c9d7971b04873734112f30/httptools-0.7.1-cp314-cp314-win_amd64.whl", hash = "sha256:cfabda2a5bb85aa2a904ce06d974a3f30fb36cc63d7feaddec05d2050acede96", size = 88205, upload-time = "2025-10-10T03:55:00.389Z" }, +] + [[package]] name = "httpx" version = "0.28.1" @@ -436,7 +479,7 @@ dependencies = [ { name = "pyyaml" }, { name = "redis" }, { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "uvicorn" }, + { name = "uvicorn", extra = ["standard"] }, { name = "websockets" }, ] @@ -460,7 +503,7 @@ requires-dist = [ { name = "pyyaml" }, { name = "redis", specifier = ">=5" }, { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "uvicorn" }, + { name = "uvicorn", extras = ["standard"] }, { name = "websockets", specifier = ">=14.0" }, ] @@ -1003,6 +1046,164 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/ee/d9/d88e73ca598f4f6ff671fb5fde8a32925c2e08a637303a1d12883c7305fa/uvicorn-0.38.0-py3-none-any.whl", hash = "sha256:48c0afd214ceb59340075b4a052ea1ee91c16fbc2a9b1469cca0e54566977b02", size = 68109, upload-time = "2025-10-18T13:46:42.958Z" }, ] +[package.optional-dependencies] +standard = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "httptools" }, + { name = "python-dotenv" }, + { name = "pyyaml" }, + { name = "uvloop", marker = "platform_python_implementation != 'PyPy' and sys_platform != 'cygwin' and sys_platform != 'win32'" }, + { name = "watchfiles" }, + { name = "websockets" }, +] + +[[package]] +name = "uvloop" +version = "0.22.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/06/f0/18d39dbd1971d6d62c4629cc7fa67f74821b0dc1f5a77af43719de7936a7/uvloop-0.22.1.tar.gz", hash = "sha256:6c84bae345b9147082b17371e3dd5d42775bddce91f885499017f4607fdaf39f", size = 2443250, upload-time = "2025-10-16T22:17:19.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/eb/14/ecceb239b65adaaf7fde510aa8bd534075695d1e5f8dadfa32b5723d9cfb/uvloop-0.22.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ef6f0d4cc8a9fa1f6a910230cd53545d9a14479311e87e3cb225495952eb672c", size = 1343335, upload-time = "2025-10-16T22:16:11.43Z" }, + { url = "https://files.pythonhosted.org/packages/ba/ae/6f6f9af7f590b319c94532b9567409ba11f4fa71af1148cab1bf48a07048/uvloop-0.22.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7cd375a12b71d33d46af85a3343b35d98e8116134ba404bd657b3b1d15988792", size = 742903, upload-time = "2025-10-16T22:16:12.979Z" }, + { url = "https://files.pythonhosted.org/packages/09/bd/3667151ad0702282a1f4d5d29288fce8a13c8b6858bf0978c219cd52b231/uvloop-0.22.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac33ed96229b7790eb729702751c0e93ac5bc3bcf52ae9eccbff30da09194b86", size = 
3648499, upload-time = "2025-10-16T22:16:14.451Z" }, + { url = "https://files.pythonhosted.org/packages/b3/f6/21657bb3beb5f8c57ce8be3b83f653dd7933c2fd00545ed1b092d464799a/uvloop-0.22.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:481c990a7abe2c6f4fc3d98781cc9426ebd7f03a9aaa7eb03d3bfc68ac2a46bd", size = 3700133, upload-time = "2025-10-16T22:16:16.272Z" }, + { url = "https://files.pythonhosted.org/packages/09/e0/604f61d004ded805f24974c87ddd8374ef675644f476f01f1df90e4cdf72/uvloop-0.22.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a592b043a47ad17911add5fbd087c76716d7c9ccc1d64ec9249ceafd735f03c2", size = 3512681, upload-time = "2025-10-16T22:16:18.07Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ce/8491fd370b0230deb5eac69c7aae35b3be527e25a911c0acdffb922dc1cd/uvloop-0.22.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1489cf791aa7b6e8c8be1c5a080bae3a672791fcb4e9e12249b05862a2ca9cec", size = 3615261, upload-time = "2025-10-16T22:16:19.596Z" }, + { url = "https://files.pythonhosted.org/packages/c7/d5/69900f7883235562f1f50d8184bb7dd84a2fb61e9ec63f3782546fdbd057/uvloop-0.22.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c60ebcd36f7b240b30788554b6f0782454826a0ed765d8430652621b5de674b9", size = 1352420, upload-time = "2025-10-16T22:16:21.187Z" }, + { url = "https://files.pythonhosted.org/packages/a8/73/c4e271b3bce59724e291465cc936c37758886a4868787da0278b3b56b905/uvloop-0.22.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b7f102bf3cb1995cfeaee9321105e8f5da76fdb104cdad8986f85461a1b7b77", size = 748677, upload-time = "2025-10-16T22:16:22.558Z" }, + { url = "https://files.pythonhosted.org/packages/86/94/9fb7fad2f824d25f8ecac0d70b94d0d48107ad5ece03769a9c543444f78a/uvloop-0.22.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53c85520781d84a4b8b230e24a5af5b0778efdb39142b424990ff1ef7c48ba21", size = 3753819, upload-time = 
"2025-10-16T22:16:23.903Z" }, + { url = "https://files.pythonhosted.org/packages/74/4f/256aca690709e9b008b7108bc85fba619a2bc37c6d80743d18abad16ee09/uvloop-0.22.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:56a2d1fae65fd82197cb8c53c367310b3eabe1bbb9fb5a04d28e3e3520e4f702", size = 3804529, upload-time = "2025-10-16T22:16:25.246Z" }, + { url = "https://files.pythonhosted.org/packages/7f/74/03c05ae4737e871923d21a76fe28b6aad57f5c03b6e6bfcfa5ad616013e4/uvloop-0.22.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:40631b049d5972c6755b06d0bfe8233b1bd9a8a6392d9d1c45c10b6f9e9b2733", size = 3621267, upload-time = "2025-10-16T22:16:26.819Z" }, + { url = "https://files.pythonhosted.org/packages/75/be/f8e590fe61d18b4a92070905497aec4c0e64ae1761498cad09023f3f4b3e/uvloop-0.22.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:535cc37b3a04f6cd2c1ef65fa1d370c9a35b6695df735fcff5427323f2cd5473", size = 3723105, upload-time = "2025-10-16T22:16:28.252Z" }, + { url = "https://files.pythonhosted.org/packages/3d/ff/7f72e8170be527b4977b033239a83a68d5c881cc4775fca255c677f7ac5d/uvloop-0.22.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:fe94b4564e865d968414598eea1a6de60adba0c040ba4ed05ac1300de402cd42", size = 1359936, upload-time = "2025-10-16T22:16:29.436Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c6/e5d433f88fd54d81ef4be58b2b7b0cea13c442454a1db703a1eea0db1a59/uvloop-0.22.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:51eb9bd88391483410daad430813d982010f9c9c89512321f5b60e2cddbdddd6", size = 752769, upload-time = "2025-10-16T22:16:30.493Z" }, + { url = "https://files.pythonhosted.org/packages/24/68/a6ac446820273e71aa762fa21cdcc09861edd3536ff47c5cd3b7afb10eeb/uvloop-0.22.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:700e674a166ca5778255e0e1dc4e9d79ab2acc57b9171b79e65feba7184b3370", size = 4317413, upload-time = "2025-10-16T22:16:31.644Z" 
}, + { url = "https://files.pythonhosted.org/packages/5f/6f/e62b4dfc7ad6518e7eff2516f680d02a0f6eb62c0c212e152ca708a0085e/uvloop-0.22.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7b5b1ac819a3f946d3b2ee07f09149578ae76066d70b44df3fa990add49a82e4", size = 4426307, upload-time = "2025-10-16T22:16:32.917Z" }, + { url = "https://files.pythonhosted.org/packages/90/60/97362554ac21e20e81bcef1150cb2a7e4ffdaf8ea1e5b2e8bf7a053caa18/uvloop-0.22.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e047cc068570bac9866237739607d1313b9253c3051ad84738cbb095be0537b2", size = 4131970, upload-time = "2025-10-16T22:16:34.015Z" }, + { url = "https://files.pythonhosted.org/packages/99/39/6b3f7d234ba3964c428a6e40006340f53ba37993f46ed6e111c6e9141d18/uvloop-0.22.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:512fec6815e2dd45161054592441ef76c830eddaad55c8aa30952e6fe1ed07c0", size = 4296343, upload-time = "2025-10-16T22:16:35.149Z" }, + { url = "https://files.pythonhosted.org/packages/89/8c/182a2a593195bfd39842ea68ebc084e20c850806117213f5a299dfc513d9/uvloop-0.22.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:561577354eb94200d75aca23fbde86ee11be36b00e52a4eaf8f50fb0c86b7705", size = 1358611, upload-time = "2025-10-16T22:16:36.833Z" }, + { url = "https://files.pythonhosted.org/packages/d2/14/e301ee96a6dc95224b6f1162cd3312f6d1217be3907b79173b06785f2fe7/uvloop-0.22.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1cdf5192ab3e674ca26da2eada35b288d2fa49fdd0f357a19f0e7c4e7d5077c8", size = 751811, upload-time = "2025-10-16T22:16:38.275Z" }, + { url = "https://files.pythonhosted.org/packages/b7/02/654426ce265ac19e2980bfd9ea6590ca96a56f10c76e63801a2df01c0486/uvloop-0.22.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e2ea3d6190a2968f4a14a23019d3b16870dd2190cd69c8180f7c632d21de68d", size = 4288562, upload-time = "2025-10-16T22:16:39.375Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/c0/0be24758891ef825f2065cd5db8741aaddabe3e248ee6acc5e8a80f04005/uvloop-0.22.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0530a5fbad9c9e4ee3f2b33b148c6a64d47bbad8000ea63704fa8260f4cf728e", size = 4366890, upload-time = "2025-10-16T22:16:40.547Z" }, + { url = "https://files.pythonhosted.org/packages/d2/53/8369e5219a5855869bcee5f4d317f6da0e2c669aecf0ef7d371e3d084449/uvloop-0.22.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bc5ef13bbc10b5335792360623cc378d52d7e62c2de64660616478c32cd0598e", size = 4119472, upload-time = "2025-10-16T22:16:41.694Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ba/d69adbe699b768f6b29a5eec7b47dd610bd17a69de51b251126a801369ea/uvloop-0.22.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:1f38ec5e3f18c8a10ded09742f7fb8de0108796eb673f30ce7762ce1b8550cad", size = 4239051, upload-time = "2025-10-16T22:16:43.224Z" }, + { url = "https://files.pythonhosted.org/packages/90/cd/b62bdeaa429758aee8de8b00ac0dd26593a9de93d302bff3d21439e9791d/uvloop-0.22.1-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:3879b88423ec7e97cd4eba2a443aa26ed4e59b45e6b76aabf13fe2f27023a142", size = 1362067, upload-time = "2025-10-16T22:16:44.503Z" }, + { url = "https://files.pythonhosted.org/packages/0d/f8/a132124dfda0777e489ca86732e85e69afcd1ff7686647000050ba670689/uvloop-0.22.1-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:4baa86acedf1d62115c1dc6ad1e17134476688f08c6efd8a2ab076e815665c74", size = 752423, upload-time = "2025-10-16T22:16:45.968Z" }, + { url = "https://files.pythonhosted.org/packages/a3/94/94af78c156f88da4b3a733773ad5ba0b164393e357cc4bd0ab2e2677a7d6/uvloop-0.22.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:297c27d8003520596236bdb2335e6b3f649480bd09e00d1e3a99144b691d2a35", size = 4272437, upload-time = "2025-10-16T22:16:47.451Z" }, + { url = 
"https://files.pythonhosted.org/packages/b5/35/60249e9fd07b32c665192cec7af29e06c7cd96fa1d08b84f012a56a0b38e/uvloop-0.22.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c1955d5a1dd43198244d47664a5858082a3239766a839b2102a269aaff7a4e25", size = 4292101, upload-time = "2025-10-16T22:16:49.318Z" }, + { url = "https://files.pythonhosted.org/packages/02/62/67d382dfcb25d0a98ce73c11ed1a6fba5037a1a1d533dcbb7cab033a2636/uvloop-0.22.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b31dc2fccbd42adc73bc4e7cdbae4fc5086cf378979e53ca5d0301838c5682c6", size = 4114158, upload-time = "2025-10-16T22:16:50.517Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/f1171b4a882a5d13c8b7576f348acfe6074d72eaf52cccef752f748d4a9f/uvloop-0.22.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:93f617675b2d03af4e72a5333ef89450dfaa5321303ede6e67ba9c9d26878079", size = 4177360, upload-time = "2025-10-16T22:16:52.646Z" }, + { url = "https://files.pythonhosted.org/packages/79/7b/b01414f31546caf0919da80ad57cbfe24c56b151d12af68cee1b04922ca8/uvloop-0.22.1-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:37554f70528f60cad66945b885eb01f1bb514f132d92b6eeed1c90fd54ed6289", size = 1454790, upload-time = "2025-10-16T22:16:54.355Z" }, + { url = "https://files.pythonhosted.org/packages/d4/31/0bb232318dd838cad3fa8fb0c68c8b40e1145b32025581975e18b11fab40/uvloop-0.22.1-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b76324e2dc033a0b2f435f33eb88ff9913c156ef78e153fb210e03c13da746b3", size = 796783, upload-time = "2025-10-16T22:16:55.906Z" }, + { url = "https://files.pythonhosted.org/packages/42/38/c9b09f3271a7a723a5de69f8e237ab8e7803183131bc57c890db0b6bb872/uvloop-0.22.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:badb4d8e58ee08dad957002027830d5c3b06aea446a6a3744483c2b3b745345c", size = 4647548, upload-time = "2025-10-16T22:16:57.008Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/37/945b4ca0ac27e3dc4952642d4c900edd030b3da6c9634875af6e13ae80e5/uvloop-0.22.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b91328c72635f6f9e0282e4a57da7470c7350ab1c9f48546c0f2866205349d21", size = 4467065, upload-time = "2025-10-16T22:16:58.206Z" }, + { url = "https://files.pythonhosted.org/packages/97/cc/48d232f33d60e2e2e0b42f4e73455b146b76ebe216487e862700457fbf3c/uvloop-0.22.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:daf620c2995d193449393d6c62131b3fbd40a63bf7b307a1527856ace637fe88", size = 4328384, upload-time = "2025-10-16T22:16:59.36Z" }, + { url = "https://files.pythonhosted.org/packages/e4/16/c1fd27e9549f3c4baf1dc9c20c456cd2f822dbf8de9f463824b0c0357e06/uvloop-0.22.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6cde23eeda1a25c75b2e07d39970f3374105d5eafbaab2a4482be82f272d5a5e", size = 4296730, upload-time = "2025-10-16T22:17:00.744Z" }, +] + +[[package]] +name = "watchfiles" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c2/c9/8869df9b2a2d6c59d79220a4db37679e74f807c559ffe5265e08b227a210/watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2", size = 94440, upload-time = "2025-10-14T15:06:21.08Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/1a/206e8cf2dd86fddf939165a57b4df61607a1e0add2785f170a3f616b7d9f/watchfiles-1.1.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:eef58232d32daf2ac67f42dea51a2c80f0d03379075d44a587051e63cc2e368c", size = 407318, upload-time = "2025-10-14T15:04:18.753Z" }, + { url = "https://files.pythonhosted.org/packages/b3/0f/abaf5262b9c496b5dad4ed3c0e799cbecb1f8ea512ecb6ddd46646a9fca3/watchfiles-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03fa0f5237118a0c5e496185cafa92878568b652a2e9a9382a5151b1a0380a43", 
size = 394478, upload-time = "2025-10-14T15:04:20.297Z" }, + { url = "https://files.pythonhosted.org/packages/b1/04/9cc0ba88697b34b755371f5ace8d3a4d9a15719c07bdc7bd13d7d8c6a341/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ca65483439f9c791897f7db49202301deb6e15fe9f8fe2fed555bf986d10c31", size = 449894, upload-time = "2025-10-14T15:04:21.527Z" }, + { url = "https://files.pythonhosted.org/packages/d2/9c/eda4615863cd8621e89aed4df680d8c3ec3da6a4cf1da113c17decd87c7f/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f0ab1c1af0cb38e3f598244c17919fb1a84d1629cc08355b0074b6d7f53138ac", size = 459065, upload-time = "2025-10-14T15:04:22.795Z" }, + { url = "https://files.pythonhosted.org/packages/84/13/f28b3f340157d03cbc8197629bc109d1098764abe1e60874622a0be5c112/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bc570d6c01c206c46deb6e935a260be44f186a2f05179f52f7fcd2be086a94d", size = 488377, upload-time = "2025-10-14T15:04:24.138Z" }, + { url = "https://files.pythonhosted.org/packages/86/93/cfa597fa9389e122488f7ffdbd6db505b3b915ca7435ecd7542e855898c2/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e84087b432b6ac94778de547e08611266f1f8ffad28c0ee4c82e028b0fc5966d", size = 595837, upload-time = "2025-10-14T15:04:25.057Z" }, + { url = "https://files.pythonhosted.org/packages/57/1e/68c1ed5652b48d89fc24d6af905d88ee4f82fa8bc491e2666004e307ded1/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:620bae625f4cb18427b1bb1a2d9426dc0dd5a5ba74c7c2cdb9de405f7b129863", size = 473456, upload-time = "2025-10-14T15:04:26.497Z" }, + { url = "https://files.pythonhosted.org/packages/d5/dc/1a680b7458ffa3b14bb64878112aefc8f2e4f73c5af763cbf0bd43100658/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:544364b2b51a9b0c7000a4b4b02f90e9423d97fbbf7e06689236443ebcad81ab", size = 455614, upload-time = "2025-10-14T15:04:27.539Z" }, + { url = "https://files.pythonhosted.org/packages/61/a5/3d782a666512e01eaa6541a72ebac1d3aae191ff4a31274a66b8dd85760c/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bbe1ef33d45bc71cf21364df962af171f96ecaeca06bd9e3d0b583efb12aec82", size = 630690, upload-time = "2025-10-14T15:04:28.495Z" }, + { url = "https://files.pythonhosted.org/packages/9b/73/bb5f38590e34687b2a9c47a244aa4dd50c56a825969c92c9c5fc7387cea1/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1a0bb430adb19ef49389e1ad368450193a90038b5b752f4ac089ec6942c4dff4", size = 622459, upload-time = "2025-10-14T15:04:29.491Z" }, + { url = "https://files.pythonhosted.org/packages/f1/ac/c9bb0ec696e07a20bd58af5399aeadaef195fb2c73d26baf55180fe4a942/watchfiles-1.1.1-cp310-cp310-win32.whl", hash = "sha256:3f6d37644155fb5beca5378feb8c1708d5783145f2a0f1c4d5a061a210254844", size = 272663, upload-time = "2025-10-14T15:04:30.435Z" }, + { url = "https://files.pythonhosted.org/packages/11/a0/a60c5a7c2ec59fa062d9a9c61d02e3b6abd94d32aac2d8344c4bdd033326/watchfiles-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:a36d8efe0f290835fd0f33da35042a1bb5dc0e83cbc092dcf69bce442579e88e", size = 287453, upload-time = "2025-10-14T15:04:31.53Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f8/2c5f479fb531ce2f0564eda479faecf253d886b1ab3630a39b7bf7362d46/watchfiles-1.1.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f57b396167a2565a4e8b5e56a5a1c537571733992b226f4f1197d79e94cf0ae5", size = 406529, upload-time = "2025-10-14T15:04:32.899Z" }, + { url = "https://files.pythonhosted.org/packages/fe/cd/f515660b1f32f65df671ddf6f85bfaca621aee177712874dc30a97397977/watchfiles-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:421e29339983e1bebc281fab40d812742268ad057db4aee8c4d2bce0af43b741", size = 394384, upload-time = "2025-10-14T15:04:33.761Z" }, + { 
url = "https://files.pythonhosted.org/packages/7b/c3/28b7dc99733eab43fca2d10f55c86e03bd6ab11ca31b802abac26b23d161/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e43d39a741e972bab5d8100b5cdacf69db64e34eb19b6e9af162bccf63c5cc6", size = 448789, upload-time = "2025-10-14T15:04:34.679Z" }, + { url = "https://files.pythonhosted.org/packages/4a/24/33e71113b320030011c8e4316ccca04194bf0cbbaeee207f00cbc7d6b9f5/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f537afb3276d12814082a2e9b242bdcf416c2e8fd9f799a737990a1dbe906e5b", size = 460521, upload-time = "2025-10-14T15:04:35.963Z" }, + { url = "https://files.pythonhosted.org/packages/f4/c3/3c9a55f255aa57b91579ae9e98c88704955fa9dac3e5614fb378291155df/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2cd9e04277e756a2e2d2543d65d1e2166d6fd4c9b183f8808634fda23f17b14", size = 488722, upload-time = "2025-10-14T15:04:37.091Z" }, + { url = "https://files.pythonhosted.org/packages/49/36/506447b73eb46c120169dc1717fe2eff07c234bb3232a7200b5f5bd816e9/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3f58818dc0b07f7d9aa7fe9eb1037aecb9700e63e1f6acfed13e9fef648f5d", size = 596088, upload-time = "2025-10-14T15:04:38.39Z" }, + { url = "https://files.pythonhosted.org/packages/82/ab/5f39e752a9838ec4d52e9b87c1e80f1ee3ccdbe92e183c15b6577ab9de16/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb9f66367023ae783551042d31b1d7fd422e8289eedd91f26754a66f44d5cff", size = 472923, upload-time = "2025-10-14T15:04:39.666Z" }, + { url = "https://files.pythonhosted.org/packages/af/b9/a419292f05e302dea372fa7e6fda5178a92998411f8581b9830d28fb9edb/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aebfd0861a83e6c3d1110b78ad54704486555246e542be3e2bb94195eabb2606", size = 456080, upload-time = 
"2025-10-14T15:04:40.643Z" }, + { url = "https://files.pythonhosted.org/packages/b0/c3/d5932fd62bde1a30c36e10c409dc5d54506726f08cb3e1d8d0ba5e2bc8db/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5fac835b4ab3c6487b5dbad78c4b3724e26bcc468e886f8ba8cc4306f68f6701", size = 629432, upload-time = "2025-10-14T15:04:41.789Z" }, + { url = "https://files.pythonhosted.org/packages/f7/77/16bddd9779fafb795f1a94319dc965209c5641db5bf1edbbccace6d1b3c0/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:399600947b170270e80134ac854e21b3ccdefa11a9529a3decc1327088180f10", size = 623046, upload-time = "2025-10-14T15:04:42.718Z" }, + { url = "https://files.pythonhosted.org/packages/46/ef/f2ecb9a0f342b4bfad13a2787155c6ee7ce792140eac63a34676a2feeef2/watchfiles-1.1.1-cp311-cp311-win32.whl", hash = "sha256:de6da501c883f58ad50db3a32ad397b09ad29865b5f26f64c24d3e3281685849", size = 271473, upload-time = "2025-10-14T15:04:43.624Z" }, + { url = "https://files.pythonhosted.org/packages/94/bc/f42d71125f19731ea435c3948cad148d31a64fccde3867e5ba4edee901f9/watchfiles-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:35c53bd62a0b885bf653ebf6b700d1bf05debb78ad9292cf2a942b23513dc4c4", size = 287598, upload-time = "2025-10-14T15:04:44.516Z" }, + { url = "https://files.pythonhosted.org/packages/57/c9/a30f897351f95bbbfb6abcadafbaca711ce1162f4db95fc908c98a9165f3/watchfiles-1.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:57ca5281a8b5e27593cb7d82c2ac927ad88a96ed406aa446f6344e4328208e9e", size = 277210, upload-time = "2025-10-14T15:04:45.883Z" }, + { url = "https://files.pythonhosted.org/packages/74/d5/f039e7e3c639d9b1d09b07ea412a6806d38123f0508e5f9b48a87b0a76cc/watchfiles-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8c89f9f2f740a6b7dcc753140dd5e1ab9215966f7a3530d0c0705c83b401bd7d", size = 404745, upload-time = "2025-10-14T15:04:46.731Z" }, + { url = 
"https://files.pythonhosted.org/packages/a5/96/a881a13aa1349827490dab2d363c8039527060cfcc2c92cc6d13d1b1049e/watchfiles-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bd404be08018c37350f0d6e34676bd1e2889990117a2b90070b3007f172d0610", size = 391769, upload-time = "2025-10-14T15:04:48.003Z" }, + { url = "https://files.pythonhosted.org/packages/4b/5b/d3b460364aeb8da471c1989238ea0e56bec24b6042a68046adf3d9ddb01c/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8526e8f916bb5b9a0a777c8317c23ce65de259422bba5b31325a6fa6029d33af", size = 449374, upload-time = "2025-10-14T15:04:49.179Z" }, + { url = "https://files.pythonhosted.org/packages/b9/44/5769cb62d4ed055cb17417c0a109a92f007114a4e07f30812a73a4efdb11/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2edc3553362b1c38d9f06242416a5d8e9fe235c204a4072e988ce2e5bb1f69f6", size = 459485, upload-time = "2025-10-14T15:04:50.155Z" }, + { url = "https://files.pythonhosted.org/packages/19/0c/286b6301ded2eccd4ffd0041a1b726afda999926cf720aab63adb68a1e36/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30f7da3fb3f2844259cba4720c3fc7138eb0f7b659c38f3bfa65084c7fc7abce", size = 488813, upload-time = "2025-10-14T15:04:51.059Z" }, + { url = "https://files.pythonhosted.org/packages/c7/2b/8530ed41112dd4a22f4dcfdb5ccf6a1baad1ff6eed8dc5a5f09e7e8c41c7/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f8979280bdafff686ba5e4d8f97840f929a87ed9cdf133cbbd42f7766774d2aa", size = 594816, upload-time = "2025-10-14T15:04:52.031Z" }, + { url = "https://files.pythonhosted.org/packages/ce/d2/f5f9fb49489f184f18470d4f99f4e862a4b3e9ac2865688eb2099e3d837a/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dcc5c24523771db3a294c77d94771abcfcb82a0e0ee8efd910c37c59ec1b31bb", size = 475186, upload-time = "2025-10-14T15:04:53.064Z" }, + { url = 
"https://files.pythonhosted.org/packages/cf/68/5707da262a119fb06fbe214d82dd1fe4a6f4af32d2d14de368d0349eb52a/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1db5d7ae38ff20153d542460752ff397fcf5c96090c1230803713cf3147a6803", size = 456812, upload-time = "2025-10-14T15:04:55.174Z" }, + { url = "https://files.pythonhosted.org/packages/66/ab/3cbb8756323e8f9b6f9acb9ef4ec26d42b2109bce830cc1f3468df20511d/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:28475ddbde92df1874b6c5c8aaeb24ad5be47a11f87cde5a28ef3835932e3e94", size = 630196, upload-time = "2025-10-14T15:04:56.22Z" }, + { url = "https://files.pythonhosted.org/packages/78/46/7152ec29b8335f80167928944a94955015a345440f524d2dfe63fc2f437b/watchfiles-1.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:36193ed342f5b9842edd3532729a2ad55c4160ffcfa3700e0d54be496b70dd43", size = 622657, upload-time = "2025-10-14T15:04:57.521Z" }, + { url = "https://files.pythonhosted.org/packages/0a/bf/95895e78dd75efe9a7f31733607f384b42eb5feb54bd2eb6ed57cc2e94f4/watchfiles-1.1.1-cp312-cp312-win32.whl", hash = "sha256:859e43a1951717cc8de7f4c77674a6d389b106361585951d9e69572823f311d9", size = 272042, upload-time = "2025-10-14T15:04:59.046Z" }, + { url = "https://files.pythonhosted.org/packages/87/0a/90eb755f568de2688cb220171c4191df932232c20946966c27a59c400850/watchfiles-1.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:91d4c9a823a8c987cce8fa2690923b069966dabb196dd8d137ea2cede885fde9", size = 288410, upload-time = "2025-10-14T15:05:00.081Z" }, + { url = "https://files.pythonhosted.org/packages/36/76/f322701530586922fbd6723c4f91ace21364924822a8772c549483abed13/watchfiles-1.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:a625815d4a2bdca61953dbba5a39d60164451ef34c88d751f6c368c3ea73d404", size = 278209, upload-time = "2025-10-14T15:05:01.168Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/f4/f750b29225fe77139f7ae5de89d4949f5a99f934c65a1f1c0b248f26f747/watchfiles-1.1.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:130e4876309e8686a5e37dba7d5e9bc77e6ed908266996ca26572437a5271e18", size = 404321, upload-time = "2025-10-14T15:05:02.063Z" }, + { url = "https://files.pythonhosted.org/packages/2b/f9/f07a295cde762644aa4c4bb0f88921d2d141af45e735b965fb2e87858328/watchfiles-1.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f3bde70f157f84ece3765b42b4a52c6ac1a50334903c6eaf765362f6ccca88a", size = 391783, upload-time = "2025-10-14T15:05:03.052Z" }, + { url = "https://files.pythonhosted.org/packages/bc/11/fc2502457e0bea39a5c958d86d2cb69e407a4d00b85735ca724bfa6e0d1a/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14e0b1fe858430fc0251737ef3824c54027bedb8c37c38114488b8e131cf8219", size = 449279, upload-time = "2025-10-14T15:05:04.004Z" }, + { url = "https://files.pythonhosted.org/packages/e3/1f/d66bc15ea0b728df3ed96a539c777acfcad0eb78555ad9efcaa1274688f0/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f27db948078f3823a6bb3b465180db8ebecf26dd5dae6f6180bd87383b6b4428", size = 459405, upload-time = "2025-10-14T15:05:04.942Z" }, + { url = "https://files.pythonhosted.org/packages/be/90/9f4a65c0aec3ccf032703e6db02d89a157462fbb2cf20dd415128251cac0/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:059098c3a429f62fc98e8ec62b982230ef2c8df68c79e826e37b895bc359a9c0", size = 488976, upload-time = "2025-10-14T15:05:05.905Z" }, + { url = "https://files.pythonhosted.org/packages/37/57/ee347af605d867f712be7029bb94c8c071732a4b44792e3176fa3c612d39/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bfb5862016acc9b869bb57284e6cb35fdf8e22fe59f7548858e2f971d045f150", size = 595506, upload-time = "2025-10-14T15:05:06.906Z" }, + { url = 
"https://files.pythonhosted.org/packages/a8/78/cc5ab0b86c122047f75e8fc471c67a04dee395daf847d3e59381996c8707/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:319b27255aacd9923b8a276bb14d21a5f7ff82564c744235fc5eae58d95422ae", size = 474936, upload-time = "2025-10-14T15:05:07.906Z" }, + { url = "https://files.pythonhosted.org/packages/62/da/def65b170a3815af7bd40a3e7010bf6ab53089ef1b75d05dd5385b87cf08/watchfiles-1.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c755367e51db90e75b19454b680903631d41f9e3607fbd941d296a020c2d752d", size = 456147, upload-time = "2025-10-14T15:05:09.138Z" }, + { url = "https://files.pythonhosted.org/packages/57/99/da6573ba71166e82d288d4df0839128004c67d2778d3b566c138695f5c0b/watchfiles-1.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:c22c776292a23bfc7237a98f791b9ad3144b02116ff10d820829ce62dff46d0b", size = 630007, upload-time = "2025-10-14T15:05:10.117Z" }, + { url = "https://files.pythonhosted.org/packages/a8/51/7439c4dd39511368849eb1e53279cd3454b4a4dbace80bab88feeb83c6b5/watchfiles-1.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:3a476189be23c3686bc2f4321dd501cb329c0a0469e77b7b534ee10129ae6374", size = 622280, upload-time = "2025-10-14T15:05:11.146Z" }, + { url = "https://files.pythonhosted.org/packages/95/9c/8ed97d4bba5db6fdcdb2b298d3898f2dd5c20f6b73aee04eabe56c59677e/watchfiles-1.1.1-cp313-cp313-win32.whl", hash = "sha256:bf0a91bfb5574a2f7fc223cf95eeea79abfefa404bf1ea5e339c0c1560ae99a0", size = 272056, upload-time = "2025-10-14T15:05:12.156Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f3/c14e28429f744a260d8ceae18bf58c1d5fa56b50d006a7a9f80e1882cb0d/watchfiles-1.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:52e06553899e11e8074503c8e716d574adeeb7e68913115c4b3653c53f9bae42", size = 288162, upload-time = "2025-10-14T15:05:13.208Z" }, + { url = 
"https://files.pythonhosted.org/packages/dc/61/fe0e56c40d5cd29523e398d31153218718c5786b5e636d9ae8ae79453d27/watchfiles-1.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:ac3cc5759570cd02662b15fbcd9d917f7ecd47efe0d6b40474eafd246f91ea18", size = 277909, upload-time = "2025-10-14T15:05:14.49Z" }, + { url = "https://files.pythonhosted.org/packages/79/42/e0a7d749626f1e28c7108a99fb9bf524b501bbbeb9b261ceecde644d5a07/watchfiles-1.1.1-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:563b116874a9a7ce6f96f87cd0b94f7faf92d08d0021e837796f0a14318ef8da", size = 403389, upload-time = "2025-10-14T15:05:15.777Z" }, + { url = "https://files.pythonhosted.org/packages/15/49/08732f90ce0fbbc13913f9f215c689cfc9ced345fb1bcd8829a50007cc8d/watchfiles-1.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:3ad9fe1dae4ab4212d8c91e80b832425e24f421703b5a42ef2e4a1e215aff051", size = 389964, upload-time = "2025-10-14T15:05:16.85Z" }, + { url = "https://files.pythonhosted.org/packages/27/0d/7c315d4bd5f2538910491a0393c56bf70d333d51bc5b34bee8e68e8cea19/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce70f96a46b894b36eba678f153f052967a0d06d5b5a19b336ab0dbbd029f73e", size = 448114, upload-time = "2025-10-14T15:05:17.876Z" }, + { url = "https://files.pythonhosted.org/packages/c3/24/9e096de47a4d11bc4df41e9d1e61776393eac4cb6eb11b3e23315b78b2cc/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cb467c999c2eff23a6417e58d75e5828716f42ed8289fe6b77a7e5a91036ca70", size = 460264, upload-time = "2025-10-14T15:05:18.962Z" }, + { url = "https://files.pythonhosted.org/packages/cc/0f/e8dea6375f1d3ba5fcb0b3583e2b493e77379834c74fd5a22d66d85d6540/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:836398932192dae4146c8f6f737d74baeac8b70ce14831a239bdb1ca882fc261", size = 487877, upload-time = "2025-10-14T15:05:20.094Z" }, + { url = 
"https://files.pythonhosted.org/packages/ac/5b/df24cfc6424a12deb41503b64d42fbea6b8cb357ec62ca84a5a3476f654a/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:743185e7372b7bc7c389e1badcc606931a827112fbbd37f14c537320fca08620", size = 595176, upload-time = "2025-10-14T15:05:21.134Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b5/853b6757f7347de4e9b37e8cc3289283fb983cba1ab4d2d7144694871d9c/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afaeff7696e0ad9f02cbb8f56365ff4686ab205fcf9c4c5b6fdfaaa16549dd04", size = 473577, upload-time = "2025-10-14T15:05:22.306Z" }, + { url = "https://files.pythonhosted.org/packages/e1/f7/0a4467be0a56e80447c8529c9fce5b38eab4f513cb3d9bf82e7392a5696b/watchfiles-1.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f7eb7da0eb23aa2ba036d4f616d46906013a68caf61b7fdbe42fc8b25132e77", size = 455425, upload-time = "2025-10-14T15:05:23.348Z" }, + { url = "https://files.pythonhosted.org/packages/8e/e0/82583485ea00137ddf69bc84a2db88bd92ab4a6e3c405e5fb878ead8d0e7/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:831a62658609f0e5c64178211c942ace999517f5770fe9436be4c2faeba0c0ef", size = 628826, upload-time = "2025-10-14T15:05:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/28/9a/a785356fccf9fae84c0cc90570f11702ae9571036fb25932f1242c82191c/watchfiles-1.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:f9a2ae5c91cecc9edd47e041a930490c31c3afb1f5e6d71de3dc671bfaca02bf", size = 622208, upload-time = "2025-10-14T15:05:25.45Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f4/0872229324ef69b2c3edec35e84bd57a1289e7d3fe74588048ed8947a323/watchfiles-1.1.1-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:d1715143123baeeaeadec0528bb7441103979a1d5f6fd0e1f915383fea7ea6d5", size = 404315, upload-time = "2025-10-14T15:05:26.501Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/22/16d5331eaed1cb107b873f6ae1b69e9ced582fcf0c59a50cd84f403b1c32/watchfiles-1.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:39574d6370c4579d7f5d0ad940ce5b20db0e4117444e39b6d8f99db5676c52fd", size = 390869, upload-time = "2025-10-14T15:05:27.649Z" }, + { url = "https://files.pythonhosted.org/packages/b2/7e/5643bfff5acb6539b18483128fdc0ef2cccc94a5b8fbda130c823e8ed636/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7365b92c2e69ee952902e8f70f3ba6360d0d596d9299d55d7d386df84b6941fb", size = 449919, upload-time = "2025-10-14T15:05:28.701Z" }, + { url = "https://files.pythonhosted.org/packages/51/2e/c410993ba5025a9f9357c376f48976ef0e1b1aefb73b97a5ae01a5972755/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bfff9740c69c0e4ed32416f013f3c45e2ae42ccedd1167ef2d805c000b6c71a5", size = 460845, upload-time = "2025-10-14T15:05:30.064Z" }, + { url = "https://files.pythonhosted.org/packages/8e/a4/2df3b404469122e8680f0fcd06079317e48db58a2da2950fb45020947734/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b27cf2eb1dda37b2089e3907d8ea92922b673c0c427886d4edc6b94d8dfe5db3", size = 489027, upload-time = "2025-10-14T15:05:31.064Z" }, + { url = "https://files.pythonhosted.org/packages/ea/84/4587ba5b1f267167ee715b7f66e6382cca6938e0a4b870adad93e44747e6/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:526e86aced14a65a5b0ec50827c745597c782ff46b571dbfe46192ab9e0b3c33", size = 595615, upload-time = "2025-10-14T15:05:32.074Z" }, + { url = "https://files.pythonhosted.org/packages/6a/0f/c6988c91d06e93cd0bb3d4a808bcf32375ca1904609835c3031799e3ecae/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04e78dd0b6352db95507fd8cb46f39d185cf8c74e4cf1e4fbad1d3df96faf510", size = 474836, upload-time = "2025-10-14T15:05:33.209Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/36/ded8aebea91919485b7bbabbd14f5f359326cb5ec218cd67074d1e426d74/watchfiles-1.1.1-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c85794a4cfa094714fb9c08d4a218375b2b95b8ed1666e8677c349906246c05", size = 455099, upload-time = "2025-10-14T15:05:34.189Z" }, + { url = "https://files.pythonhosted.org/packages/98/e0/8c9bdba88af756a2fce230dd365fab2baf927ba42cd47521ee7498fd5211/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:74d5012b7630714b66be7b7b7a78855ef7ad58e8650c73afc4c076a1f480a8d6", size = 630626, upload-time = "2025-10-14T15:05:35.216Z" }, + { url = "https://files.pythonhosted.org/packages/2a/84/a95db05354bf2d19e438520d92a8ca475e578c647f78f53197f5a2f17aaf/watchfiles-1.1.1-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:8fbe85cb3201c7d380d3d0b90e63d520f15d6afe217165d7f98c9c649654db81", size = 622519, upload-time = "2025-10-14T15:05:36.259Z" }, + { url = "https://files.pythonhosted.org/packages/1d/ce/d8acdc8de545de995c339be67711e474c77d643555a9bb74a9334252bd55/watchfiles-1.1.1-cp314-cp314-win32.whl", hash = "sha256:3fa0b59c92278b5a7800d3ee7733da9d096d4aabcfabb9a928918bd276ef9b9b", size = 272078, upload-time = "2025-10-14T15:05:37.63Z" }, + { url = "https://files.pythonhosted.org/packages/c4/c9/a74487f72d0451524be827e8edec251da0cc1fcf111646a511ae752e1a3d/watchfiles-1.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:c2047d0b6cea13b3316bdbafbfa0c4228ae593d995030fda39089d36e64fc03a", size = 287664, upload-time = "2025-10-14T15:05:38.95Z" }, + { url = "https://files.pythonhosted.org/packages/df/b8/8ac000702cdd496cdce998c6f4ee0ca1f15977bba51bdf07d872ebdfc34c/watchfiles-1.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:842178b126593addc05acf6fce960d28bc5fae7afbaa2c6c1b3a7b9460e5be02", size = 277154, upload-time = "2025-10-14T15:05:39.954Z" }, + { url = 
"https://files.pythonhosted.org/packages/47/a8/e3af2184707c29f0f14b1963c0aace6529f9d1b8582d5b99f31bbf42f59e/watchfiles-1.1.1-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:88863fbbc1a7312972f1c511f202eb30866370ebb8493aef2812b9ff28156a21", size = 403820, upload-time = "2025-10-14T15:05:40.932Z" }, + { url = "https://files.pythonhosted.org/packages/c0/ec/e47e307c2f4bd75f9f9e8afbe3876679b18e1bcec449beca132a1c5ffb2d/watchfiles-1.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:55c7475190662e202c08c6c0f4d9e345a29367438cf8e8037f3155e10a88d5a5", size = 390510, upload-time = "2025-10-14T15:05:41.945Z" }, + { url = "https://files.pythonhosted.org/packages/d5/a0/ad235642118090f66e7b2f18fd5c42082418404a79205cdfca50b6309c13/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f53fa183d53a1d7a8852277c92b967ae99c2d4dcee2bfacff8868e6e30b15f7", size = 448408, upload-time = "2025-10-14T15:05:43.385Z" }, + { url = "https://files.pythonhosted.org/packages/df/85/97fa10fd5ff3332ae17e7e40e20784e419e28521549780869f1413742e9d/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6aae418a8b323732fa89721d86f39ec8f092fc2af67f4217a2b07fd3e93c6101", size = 458968, upload-time = "2025-10-14T15:05:44.404Z" }, + { url = "https://files.pythonhosted.org/packages/47/c2/9059c2e8966ea5ce678166617a7f75ecba6164375f3b288e50a40dc6d489/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f096076119da54a6080e8920cbdaac3dbee667eb91dcc5e5b78840b87415bd44", size = 488096, upload-time = "2025-10-14T15:05:45.398Z" }, + { url = "https://files.pythonhosted.org/packages/94/44/d90a9ec8ac309bc26db808a13e7bfc0e4e78b6fc051078a554e132e80160/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:00485f441d183717038ed2e887a7c868154f216877653121068107b227a2f64c", size = 596040, upload-time = "2025-10-14T15:05:46.502Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/68/4e3479b20ca305cfc561db3ed207a8a1c745ee32bf24f2026a129d0ddb6e/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a55f3e9e493158d7bfdb60a1165035f1cf7d320914e7b7ea83fe22c6023b58fc", size = 473847, upload-time = "2025-10-14T15:05:47.484Z" }, + { url = "https://files.pythonhosted.org/packages/4f/55/2af26693fd15165c4ff7857e38330e1b61ab8c37d15dc79118cdba115b7a/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c91ed27800188c2ae96d16e3149f199d62f86c7af5f5f4d2c61a3ed8cd3666c", size = 455072, upload-time = "2025-10-14T15:05:48.928Z" }, + { url = "https://files.pythonhosted.org/packages/66/1d/d0d200b10c9311ec25d2273f8aad8c3ef7cc7ea11808022501811208a750/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:311ff15a0bae3714ffb603e6ba6dbfba4065ab60865d15a6ec544133bdb21099", size = 629104, upload-time = "2025-10-14T15:05:49.908Z" }, + { url = "https://files.pythonhosted.org/packages/e3/bd/fa9bb053192491b3867ba07d2343d9f2252e00811567d30ae8d0f78136fe/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:a916a2932da8f8ab582f242c065f5c81bed3462849ca79ee357dd9551b0e9b01", size = 622112, upload-time = "2025-10-14T15:05:50.941Z" }, + { url = "https://files.pythonhosted.org/packages/ba/4c/a888c91e2e326872fa4705095d64acd8aa2fb9c1f7b9bd0588f33850516c/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:17ef139237dfced9da49fb7f2232c86ca9421f666d78c264c7ffca6601d154c3", size = 409611, upload-time = "2025-10-14T15:06:05.809Z" }, + { url = "https://files.pythonhosted.org/packages/1e/c7/5420d1943c8e3ce1a21c0a9330bcf7edafb6aa65d26b21dbb3267c9e8112/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:672b8adf25b1a0d35c96b5888b7b18699d27d4194bac8beeae75be4b7a3fc9b2", size = 396889, upload-time = "2025-10-14T15:06:07.035Z" }, + { url = 
"https://files.pythonhosted.org/packages/0c/e5/0072cef3804ce8d3aaddbfe7788aadff6b3d3f98a286fdbee9fd74ca59a7/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a13aea58bc2b90173bc69f2a90de8e282648939a00a602e1dc4ee23e26b66d", size = 451616, upload-time = "2025-10-14T15:06:08.072Z" }, + { url = "https://files.pythonhosted.org/packages/83/4e/b87b71cbdfad81ad7e83358b3e447fedd281b880a03d64a760fe0a11fc2e/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b495de0bb386df6a12b18335a0285dda90260f51bdb505503c02bcd1ce27a8b", size = 458413, upload-time = "2025-10-14T15:06:09.209Z" }, + { url = "https://files.pythonhosted.org/packages/d3/8e/e500f8b0b77be4ff753ac94dc06b33d8f0d839377fee1b78e8c8d8f031bf/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:db476ab59b6765134de1d4fe96a1a9c96ddf091683599be0f26147ea1b2e4b88", size = 408250, upload-time = "2025-10-14T15:06:10.264Z" }, + { url = "https://files.pythonhosted.org/packages/bd/95/615e72cd27b85b61eec764a5ca51bd94d40b5adea5ff47567d9ebc4d275a/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89eef07eee5e9d1fda06e38822ad167a044153457e6fd997f8a858ab7564a336", size = 396117, upload-time = "2025-10-14T15:06:11.28Z" }, + { url = "https://files.pythonhosted.org/packages/c9/81/e7fe958ce8a7fb5c73cc9fb07f5aeaf755e6aa72498c57d760af760c91f8/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce19e06cbda693e9e7686358af9cd6f5d61312ab8b00488bc36f5aabbaf77e24", size = 450493, upload-time = "2025-10-14T15:06:12.321Z" }, + { url = "https://files.pythonhosted.org/packages/6e/d4/ed38dd3b1767193de971e694aa544356e63353c33a85d948166b5ff58b9e/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e6f39af2eab0118338902798b5aa6664f46ff66bc0280de76fca67a7f262a49", size = 457546, upload-time = 
"2025-10-14T15:06:13.372Z" }, +] + [[package]] name = "websocket-client" version = "1.9.0" From 773278802afafaf7ca424e475017fb0a6ec2a33e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 17 May 2026 15:17:20 +0800 Subject: [PATCH 16/58] perf(server): unblock event loop by running blocking routes in threadpool Sandbox/snapshot/pool route handlers were async def but called synchronous service methods that issue blocking Kubernetes/Docker API requests (50-200 ms each). Each in-flight call stalled the entire event loop, serializing every concurrent request. Convert blocking-only handlers to sync def so FastAPI offloads them to the anyio threadpool, letting concurrent requests run in parallel. create_sandbox stays async (its service is async with cooperative polling). - api/lifecycle.py: 12 handlers async -> sync; drop manual to_thread in create_snapshot now that the route itself runs in the threadpool; drop unused asyncio import - api/pool.py: 5 pool handlers async -> sync - tests/test_routes_list_sandboxes.py: regression locks in threadpool parallelism (8 x 200 ms calls finish in ~250 ms, not ~1.6 s) Co-Authored-By: Claude Opus 4.7 --- server/opensandbox_server/api/lifecycle.py | 31 ++++++-------- server/opensandbox_server/api/pool.py | 10 ++--- server/tests/test_routes_list_sandboxes.py | 48 ++++++++++++++++++++++ 3 files changed, 66 insertions(+), 23 deletions(-) diff --git a/server/opensandbox_server/api/lifecycle.py b/server/opensandbox_server/api/lifecycle.py index 576e0fb42..39bbc62b8 100644 --- a/server/opensandbox_server/api/lifecycle.py +++ b/server/opensandbox_server/api/lifecycle.py @@ -19,7 +19,6 @@ All business logic is delegated to the service layer that backs each operation. 
""" -import asyncio from typing import List, Optional from fastapi import APIRouter, Body, Header, Query, Request, status @@ -111,7 +110,7 @@ async def create_sandbox( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def list_sandboxes( +def list_sandboxes( state: Optional[List[str]] = Query(None, description="Filter by lifecycle state. Pass multiple times for OR logic."), metadata: Optional[str] = Query(None, description="Arbitrary metadata key-value pairs for filtering (URL encoded)."), page: int = Query(1, ge=1, description="Page number for pagination"), @@ -176,7 +175,7 @@ async def list_sandboxes( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def get_sandbox( +def get_sandbox( sandbox_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Sandbox: @@ -214,7 +213,7 @@ async def get_sandbox( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def patch_sandbox_metadata( +def patch_sandbox_metadata( sandbox_id: str, patch: PatchSandboxMetadataRequest = Body(...), x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), @@ -239,7 +238,7 @@ async def patch_sandbox_metadata( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def delete_sandbox( +def delete_sandbox( sandbox_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Response: @@ -279,7 +278,7 @@ async def delete_sandbox( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def pause_sandbox( +def pause_sandbox( sandbox_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Response: @@ 
-316,7 +315,7 @@ async def pause_sandbox( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def resume_sandbox( +def resume_sandbox( sandbox_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Response: @@ -355,7 +354,7 @@ async def resume_sandbox( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def renew_sandbox_expiration( +def renew_sandbox_expiration( sandbox_id: str, request: RenewSandboxExpirationRequest, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), @@ -402,7 +401,7 @@ async def renew_sandbox_expiration( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def create_snapshot( +def create_snapshot( sandbox_id: str, response: Response, request: Optional[CreateSnapshotRequest] = None, @@ -412,11 +411,7 @@ async def create_snapshot( Create a persistent point-in-time snapshot from a sandbox. """ create_request = request or CreateSnapshotRequest() - snapshot = await asyncio.to_thread( - snapshot_service.create_snapshot, - sandbox_id, - create_request, - ) + snapshot = snapshot_service.create_snapshot(sandbox_id, create_request) response.headers["Location"] = f"/v1/snapshots/{snapshot.id}" return snapshot @@ -433,7 +428,7 @@ async def create_snapshot( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def list_snapshots( +def list_snapshots( sandbox_id: Optional[str] = Query(None, alias="sandboxId", description="Filter snapshots by source sandbox identifier"), state: Optional[List[str]] = Query(None, description="Filter by snapshot lifecycle state. 
Pass multiple times for OR logic."), page: int = Query(1, ge=1, description="Page number for pagination"), @@ -464,7 +459,7 @@ async def list_snapshots( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def get_snapshot( +def get_snapshot( snapshot_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Snapshot: @@ -488,7 +483,7 @@ async def get_snapshot( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def delete_snapshot( +def delete_snapshot( snapshot_id: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID", description="Unique request identifier for tracing"), ) -> Response: @@ -515,7 +510,7 @@ async def delete_snapshot( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def get_sandbox_endpoint( +def get_sandbox_endpoint( request: Request, sandbox_id: str, port: int, diff --git a/server/opensandbox_server/api/pool.py b/server/opensandbox_server/api/pool.py index 4f33f6bdc..11231bc1e 100644 --- a/server/opensandbox_server/api/pool.py +++ b/server/opensandbox_server/api/pool.py @@ -88,7 +88,7 @@ def _get_pool_service(): 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def create_pool( +def create_pool( request: CreatePoolRequest, x_request_id: Optional[str] = Header(None, alias="X-Request-ID"), ) -> PoolResponse: @@ -121,7 +121,7 @@ async def create_pool( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def list_pools( +def list_pools( x_request_id: Optional[str] = Header(None, alias="X-Request-ID"), ) -> ListPoolsResponse: """ @@ -151,7 +151,7 @@ async def list_pools( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def get_pool( +def get_pool( pool_name: str, x_request_id: Optional[str] = 
Header(None, alias="X-Request-ID"), ) -> PoolResponse: @@ -182,7 +182,7 @@ async def get_pool( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def update_pool( +def update_pool( pool_name: str, request: UpdatePoolRequest, x_request_id: Optional[str] = Header(None, alias="X-Request-ID"), @@ -217,7 +217,7 @@ async def update_pool( 500: {"model": ErrorResponse, "description": "An unexpected server error occurred"}, }, ) -async def delete_pool( +def delete_pool( pool_name: str, x_request_id: Optional[str] = Header(None, alias="X-Request-ID"), ) -> Response: diff --git a/server/tests/test_routes_list_sandboxes.py b/server/tests/test_routes_list_sandboxes.py index 0474ffee9..753addeb3 100644 --- a/server/tests/test_routes_list_sandboxes.py +++ b/server/tests/test_routes_list_sandboxes.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timedelta, timezone from fastapi.testclient import TestClient @@ -207,3 +209,49 @@ def test_list_sandboxes_requires_api_key(client: TestClient) -> None: assert response.status_code == 401 assert response.json()["code"] == "MISSING_API_KEY" + + +def test_list_sandboxes_runs_in_threadpool_for_concurrency( + client: TestClient, + auth_headers: dict, + monkeypatch, +) -> None: + """Blocking list calls must run in the threadpool so concurrent requests + do not serialize on the event loop. With sync def routes, FastAPI offloads + the handler to anyio's threadpool; 8 calls each sleeping 200ms should + complete well under the 1.6s serial bound. 
+ """ + sleep_seconds = 0.2 + concurrency = 8 + + class SlowService: + @staticmethod + def list_sandboxes(_request) -> ListSandboxesResponse: + time.sleep(sleep_seconds) + return ListSandboxesResponse( + items=[], + pagination=PaginationInfo( + page=1, + pageSize=20, + totalItems=0, + totalPages=0, + hasNextPage=False, + ), + ) + + monkeypatch.setattr(lifecycle, "sandbox_service", SlowService()) + + def call() -> int: + return client.get("/v1/sandboxes", headers=auth_headers).status_code + + started = time.monotonic() + with ThreadPoolExecutor(max_workers=concurrency) as pool: + statuses = list(pool.map(lambda _: call(), range(concurrency))) + elapsed = time.monotonic() - started + + assert statuses == [200] * concurrency + serial_floor = sleep_seconds * concurrency + assert elapsed < serial_floor * 0.6, ( + f"list_sandboxes serialized: elapsed={elapsed:.2f}s, " + f"serial floor={serial_floor:.2f}s (threadpool offload broken)" + ) From 35d9bf470e47dba62b2bd74746f564869bde730d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 17 May 2026 15:29:45 +0800 Subject: [PATCH 17/58] perf(server): serve list_custom_objects from informer cache list_custom_objects always issued a direct apiserver call, even though the informer is already watching the same namespace and serves get_custom_object from cache. Under multi-worker deployments the list QPS scales with workers x replicas and pressures the apiserver unnecessarily. Prefer the informer cache when synced and the label selector falls within the supported in-memory grammar (empty, bare key existence, key=value, comma-joined AND). Anything else falls back to the existing direct API path, preserving today's behavior. 
- services/k8s/label_selector.py: minimal parser/matcher for the subset of selectors callers in this repo actually emit - services/k8s/informer.py: WorkloadInformer.list() snapshot helper - services/k8s/client.py: list_custom_objects consults the cache first - tests/k8s: cover label_selector grammar + cache-hit/miss/fallback behavior on the client Co-Authored-By: Claude Opus 4.7 --- .../opensandbox_server/services/k8s/client.py | 21 ++++- .../services/k8s/informer.py | 7 +- .../services/k8s/label_selector.py | 93 ++++++++++++++++++ server/tests/k8s/test_k8s_client.py | 94 +++++++++++++++++++ server/tests/k8s/test_label_selector.py | 80 ++++++++++++++++ 5 files changed, 293 insertions(+), 2 deletions(-) create mode 100644 server/opensandbox_server/services/k8s/label_selector.py create mode 100644 server/tests/k8s/test_label_selector.py diff --git a/server/opensandbox_server/services/k8s/client.py b/server/opensandbox_server/services/k8s/client.py index ad9774927..99a763b0f 100644 --- a/server/opensandbox_server/services/k8s/client.py +++ b/server/opensandbox_server/services/k8s/client.py @@ -27,6 +27,7 @@ from opensandbox_server.config import KubernetesRuntimeConfig from opensandbox_server.services.k8s.informer import WorkloadInformer +from opensandbox_server.services.k8s.label_selector import matches, parse_selector from opensandbox_server.services.k8s.rate_limiter import TokenBucketRateLimiter logger = logging.getLogger(__name__) @@ -183,7 +184,25 @@ def list_custom_objects( plural: str, label_selector: str = "", ) -> List[Dict[str, Any]]: - """List namespaced custom resources, returning the items list.""" + """List namespaced custom resources, returning the items list. + + Tries the informer cache first when available, synced, and the label + selector falls within the supported in-memory grammar. Falls back to + a direct API call (with rate limiting) otherwise. 
+ """ + informer = self._get_informer(group, version, plural, namespace) + if informer and informer.has_synced: + terms = parse_selector(label_selector) + if terms is not None: + cached = informer.list() + if not terms: + return cached + return [ + obj + for obj in cached + if matches(obj.get("metadata", {}).get("labels") or {}, terms) + ] + if self._read_limiter: self._read_limiter.acquire() try: diff --git a/server/opensandbox_server/services/k8s/informer.py b/server/opensandbox_server/services/k8s/informer.py index 5eec36349..fa95e39d9 100644 --- a/server/opensandbox_server/services/k8s/informer.py +++ b/server/opensandbox_server/services/k8s/informer.py @@ -16,7 +16,7 @@ import logging import threading -from typing import Any, Callable, Dict, Optional +from typing import Any, Callable, Dict, List, Optional from kubernetes import watch from kubernetes.client import ApiException @@ -85,6 +85,11 @@ def get(self, name: str) -> Optional[Dict[str, Any]]: with self._lock: return self._cache.get(name) + def list(self) -> List[Dict[str, Any]]: + """Return a snapshot of every cached object.""" + with self._lock: + return list(self._cache.values()) + def update_cache(self, obj: Dict[str, Any]) -> None: """Upsert a single object into the cache. diff --git a/server/opensandbox_server/services/k8s/label_selector.py b/server/opensandbox_server/services/k8s/label_selector.py new file mode 100644 index 000000000..437bf9fc8 --- /dev/null +++ b/server/opensandbox_server/services/k8s/label_selector.py @@ -0,0 +1,93 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Minimal Kubernetes label selector parser for in-memory matching. + +Supports only the subset that callers in this codebase actually emit: + +- empty string ............ matches every object +- ``key`` ................. key existence +- ``key=value`` ........... equality (``==`` accepted as alias) +- ``a=1,b=2`` ............. comma-joined AND of the above + +When the selector contains anything outside this grammar (set-based ops +like ``in``, ``notin``, ``!key``), :func:`parse_selector` returns ``None`` +so the caller falls back to issuing a real Kubernetes API list request. +""" + +from __future__ import annotations + +from typing import List, Literal, Mapping, Optional, Tuple + +Op = Literal["exists", "eq"] +Term = Tuple[str, Op, Optional[str]] + + +_LABEL_KEY_CHARS = set( + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_./" +) + + +def _is_valid_key(key: str) -> bool: + if not key: + return False + return all(c in _LABEL_KEY_CHARS for c in key) + + +def parse_selector(selector: str) -> Optional[List[Term]]: + """Parse a label selector into a list of AND terms. + + Returns ``None`` when the selector uses syntax beyond what this minimal + parser supports. The empty selector parses to ``[]`` (match-all). 
+ """ + selector = (selector or "").strip() + if not selector: + return [] + + terms: List[Term] = [] + for raw in selector.split(","): + clause = raw.strip() + if not clause: + return None + + if "==" in clause: + key, _, value = clause.partition("==") + elif "=" in clause: + key, _, value = clause.partition("=") + else: + key, value = clause, None + + key = key.strip() + if not _is_valid_key(key): + return None + if value is None: + terms.append((key, "exists", None)) + else: + terms.append((key, "eq", value.strip())) + + return terms + + +def matches(labels: Mapping[str, str], terms: List[Term]) -> bool: + """Return True if ``labels`` satisfy every AND term.""" + for key, op, expected in terms: + if op == "exists": + if key not in labels: + return False + elif op == "eq": + if labels.get(key) != expected: + return False + else: # pragma: no cover - exhaustive on Op + return False + return True diff --git a/server/tests/k8s/test_k8s_client.py b/server/tests/k8s/test_k8s_client.py index 3c8d36dbc..10333cec5 100644 --- a/server/tests/k8s/test_k8s_client.py +++ b/server/tests/k8s/test_k8s_client.py @@ -233,6 +233,100 @@ def test_list_custom_objects_reraises_non_404(self, k8s_runtime_config): with pytest.raises(ApiException): c.list_custom_objects("g", "v1", "ns", "foos") + def _attach_synced_informer(self, c, items): + fake_informer = MagicMock() + fake_informer.has_synced = True + fake_informer.list.return_value = list(items) + c._informers[("g", "v1", "foos", "ns")] = fake_informer + c.config = MagicMock( + informer_enabled=True, + informer_resync_seconds=300, + informer_watch_timeout_seconds=60, + read_qps=0.0, + write_qps=0.0, + ) + return fake_informer + + def test_list_custom_objects_returns_cached_when_synced(self, k8s_runtime_config): + """When the informer is synced, list_custom_objects serves from cache.""" + c = self._make_client(k8s_runtime_config) + items = [ + {"metadata": {"name": "a", "labels": {"opensandbox.io/id": "a"}}}, + {"metadata": {"name": 
"b", "labels": {"opensandbox.io/id": "b"}}}, + ] + self._attach_synced_informer(c, items) + result = c.list_custom_objects("g", "v1", "ns", "foos") + assert result == items + c._custom_objects_api.list_namespaced_custom_object.assert_not_called() + + def test_list_custom_objects_filters_cached_by_label_existence( + self, k8s_runtime_config + ): + """Bare-key selector filters cached items in memory without an API call.""" + c = self._make_client(k8s_runtime_config) + items = [ + {"metadata": {"name": "with-id", "labels": {"opensandbox.io/id": "x"}}}, + {"metadata": {"name": "no-id", "labels": {"other": "y"}}}, + ] + self._attach_synced_informer(c, items) + result = c.list_custom_objects( + "g", "v1", "ns", "foos", label_selector="opensandbox.io/id" + ) + assert [obj["metadata"]["name"] for obj in result] == ["with-id"] + c._custom_objects_api.list_namespaced_custom_object.assert_not_called() + + def test_list_custom_objects_filters_cached_by_equality(self, k8s_runtime_config): + """key=value selector filters cached items in memory without an API call.""" + c = self._make_client(k8s_runtime_config) + items = [ + {"metadata": {"name": "alpha", "labels": {"team": "infra"}}}, + {"metadata": {"name": "beta", "labels": {"team": "data"}}}, + ] + self._attach_synced_informer(c, items) + result = c.list_custom_objects( + "g", "v1", "ns", "foos", label_selector="team=infra" + ) + assert [obj["metadata"]["name"] for obj in result] == ["alpha"] + c._custom_objects_api.list_namespaced_custom_object.assert_not_called() + + def test_list_custom_objects_falls_back_when_informer_unsynced( + self, k8s_runtime_config + ): + """Cache miss when has_synced=False routes to direct API.""" + c = self._make_client(k8s_runtime_config) + fake_informer = MagicMock() + fake_informer.has_synced = False + c._informers[("g", "v1", "foos", "ns")] = fake_informer + c.config = MagicMock( + informer_enabled=True, + informer_resync_seconds=300, + informer_watch_timeout_seconds=60, + read_qps=0.0, + 
write_qps=0.0, + ) + c._custom_objects_api.list_namespaced_custom_object.return_value = { + "items": [{"metadata": {"name": "z"}}] + } + result = c.list_custom_objects("g", "v1", "ns", "foos") + assert [obj["metadata"]["name"] for obj in result] == ["z"] + fake_informer.list.assert_not_called() + c._custom_objects_api.list_namespaced_custom_object.assert_called_once() + + def test_list_custom_objects_falls_back_on_unsupported_selector( + self, k8s_runtime_config + ): + """Set-based selectors (in/notin) bypass the cache parser and hit the API.""" + c = self._make_client(k8s_runtime_config) + self._attach_synced_informer(c, [{"metadata": {"name": "x"}}]) + c._custom_objects_api.list_namespaced_custom_object.return_value = { + "items": [{"metadata": {"name": "from-api"}}] + } + result = c.list_custom_objects( + "g", "v1", "ns", "foos", label_selector="env in (prod, staging)" + ) + assert [obj["metadata"]["name"] for obj in result] == ["from-api"] + c._custom_objects_api.list_namespaced_custom_object.assert_called_once() + def test_delete_custom_object_delegates_to_api(self, k8s_runtime_config): """delete_custom_object forwards arguments to the raw API.""" c = self._make_client(k8s_runtime_config) diff --git a/server/tests/k8s/test_label_selector.py b/server/tests/k8s/test_label_selector.py new file mode 100644 index 000000000..a7e51acfc --- /dev/null +++ b/server/tests/k8s/test_label_selector.py @@ -0,0 +1,80 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from opensandbox_server.services.k8s.label_selector import ( + matches, + parse_selector, +) + + +class TestParseSelector: + def test_empty_selector_returns_match_all_terms(self): + assert parse_selector("") == [] + assert parse_selector(" ") == [] + + def test_bare_key_parses_as_existence_term(self): + assert parse_selector("opensandbox.io/id") == [ + ("opensandbox.io/id", "exists", None) + ] + + def test_equality_parses_as_eq_term(self): + assert parse_selector("team=infra") == [("team", "eq", "infra")] + + def test_double_equals_parses_as_eq_term(self): + assert parse_selector("team==infra") == [("team", "eq", "infra")] + + def test_comma_joined_clauses_parse_as_and(self): + assert parse_selector("team=infra,project") == [ + ("team", "eq", "infra"), + ("project", "exists", None), + ] + + def test_whitespace_around_clauses_is_tolerated(self): + assert parse_selector(" team = infra , project ") == [ + ("team", "eq", "infra"), + ("project", "exists", None), + ] + + def test_set_based_operator_returns_none(self): + assert parse_selector("env in (prod, staging)") is None + + def test_negation_returns_none(self): + assert parse_selector("!retired") is None + + def test_inequality_returns_none(self): + assert parse_selector("team!=infra") is None + + def test_empty_clause_returns_none(self): + assert parse_selector("team=infra,") is None + assert parse_selector(",team=infra") is None + + +class TestMatches: + @pytest.mark.parametrize( + "labels,terms,expected", + [ + ({"a": "1"}, [], True), + ({}, [("a", "exists", None)], False), + ({"a": ""}, [("a", "exists", None)], True), + ({"a": "1"}, [("a", "exists", None)], True), + ({"a": "1"}, [("a", "eq", "1")], True), + ({"a": "2"}, [("a", "eq", "1")], False), + ({"a": "1", "b": "x"}, [("a", "eq", "1"), ("b", "exists", None)], True), + ({"a": "1"}, [("a", "eq", "1"), ("b", "exists", None)], False), + 
], + ) + def test_matches(self, labels, terms, expected): + assert matches(labels, terms) is expected From 225e5ececb72ea4b9696a26126626d1b20a1ce95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Sun, 17 May 2026 15:40:06 +0800 Subject: [PATCH 18/58] perf(server): grow anyio threadpool, unblock create path The previous round moved blocking list/get/delete handlers onto sync def routes so FastAPI offloads them to anyio's default threadpool. Two follow-up bottlenecks remain: 1. anyio's default threadpool is 40 tokens; bursts of concurrent sandbox CRUD requests start queueing once that ceiling is hit. 2. lifecycle.create_sandbox is async and the Kubernetes service body still issues sync K8s calls (_ensure_pvc_volumes, workload_provider create/get/delete) directly on the event loop. Each 50-200 ms round-trip stalls every other in-flight request, and the rate limiter's time.sleep makes it worse when read/write QPS is set. Add a configurable thread_pool_size (default 200) applied at lifespan startup, and wrap the blocking K8s calls inside the create path with asyncio.to_thread so the event loop stays responsive. 
- config.py: ServerConfig.thread_pool_size - main.py: lifespan sets anyio current_default_thread_limiter total_tokens - services/k8s/kubernetes_service.py: to_thread wraps the four sync K8s calls in create_sandbox / _wait_for_sandbox_ready - configuration.md, tests/test_config.py: doc and field tests Co-Authored-By: Claude Opus 4.7 --- server/configuration.md | 1 + server/opensandbox_server/config.py | 10 ++++++++++ server/opensandbox_server/main.py | 4 ++++ .../services/k8s/kubernetes_service.py | 14 ++++++++++---- server/tests/test_config.py | 8 ++++++++ 5 files changed, 33 insertions(+), 4 deletions(-) diff --git a/server/configuration.md b/server/configuration.md index 26b8aed21..4069fbf80 100644 --- a/server/configuration.md +++ b/server/configuration.md @@ -69,6 +69,7 @@ Example files in this repository: | `workers` | integer | `1` | Number of uvicorn worker processes. Each worker is a separate Python process with its own event loop and (under the Kubernetes runtime) its own informer watch streams to the apiserver. Default `1` keeps apiserver pressure predictable; bump to 2–8 based on CPU quota and apiserver capacity. Ignored when `--reload` is set. | | `limit_concurrency` | integer \| omitted | `1024` | Maximum concurrent connections per worker before returning 503. Provides backpressure protection under burst load. Omit to disable. | | `backlog` | integer | `2048` | Socket listen backlog passed to uvicorn. | +| `thread_pool_size` | integer | `200` | Maximum size of the anyio default threadpool used by FastAPI to run sync route handlers. The anyio default of 40 throttles bursts of blocking sandbox list/get/delete operations under high concurrency. | | `loop` | `"auto"` \| `"uvloop"` \| `"asyncio"` | `"auto"` | Event loop implementation. `auto` prefers uvloop and falls back to asyncio. | | `http` | `"auto"` \| `"httptools"` \| `"h11"` | `"auto"` | HTTP protocol parser. `auto` prefers httptools and falls back to h11. 
| diff --git a/server/opensandbox_server/config.py b/server/opensandbox_server/config.py index 54ad9729b..9da6cc35a 100644 --- a/server/opensandbox_server/config.py +++ b/server/opensandbox_server/config.py @@ -478,6 +478,16 @@ class ServerConfig(BaseModel): ge=1, description="Socket listen backlog passed to uvicorn.", ) + thread_pool_size: int = Field( + default=200, + ge=1, + description=( + "Maximum size of the anyio default threadpool used by FastAPI " + "to run sync route handlers. Default anyio limit is 40, which " + "throttles bursts of blocking sandbox list/get/delete operations " + "under high concurrency." + ), + ) loop: Literal["auto", "uvloop", "asyncio"] = Field( default="auto", description=( diff --git a/server/opensandbox_server/main.py b/server/opensandbox_server/main.py index f9d0e7f40..f33633ed7 100644 --- a/server/opensandbox_server/main.py +++ b/server/opensandbox_server/main.py @@ -61,6 +61,10 @@ async def lifespan(app: FastAPI): logger.error("API key startup confirmation failed: %s", exc) os._exit(1) + from anyio.to_thread import current_default_thread_limiter + + current_default_thread_limiter().total_tokens = app_config.server.thread_pool_size + app.state.http_client = httpx.AsyncClient(timeout=180.0) # Validate secure runtime configuration at startup diff --git a/server/opensandbox_server/services/k8s/kubernetes_service.py b/server/opensandbox_server/services/k8s/kubernetes_service.py index 28a9fd361..196fd8600 100644 --- a/server/opensandbox_server/services/k8s/kubernetes_service.py +++ b/server/opensandbox_server/services/k8s/kubernetes_service.py @@ -191,7 +191,8 @@ async def _wait_for_sandbox_ready( while time.time() - start_time < timeout_seconds: try: - workload = self.workload_provider.get_workload( + workload = await asyncio.to_thread( + self.workload_provider.get_workload, sandbox_id=sandbox_id, namespace=self.namespace, ) @@ -440,10 +441,11 @@ async def create_sandbox(self, request: CreateSandboxRequest) -> CreateSandboxRe # 
Auto-create PVCs that don't exist yet if request.volumes: - self._ensure_pvc_volumes(request.volumes) + await asyncio.to_thread(self._ensure_pvc_volumes, request.volumes) # Create workload - workload_info = self.workload_provider.create_workload( + workload_info = await asyncio.to_thread( + self.workload_provider.create_workload, sandbox_id=sandbox_id, namespace=self.namespace, image_spec=request.image, @@ -499,7 +501,11 @@ async def create_sandbox(self, request: CreateSandboxRequest) -> CreateSandboxRe except HTTPException as e: try: logger.error(f"Creation failed, cleaning up sandbox {sandbox_id}: {e}") - self.workload_provider.delete_workload(sandbox_id, self.namespace) + await asyncio.to_thread( + self.workload_provider.delete_workload, + sandbox_id, + self.namespace, + ) except Exception as cleanup_ex: logger.error(f"Failed to cleanup sandbox {sandbox_id}", exc_info=cleanup_ex) raise diff --git a/server/tests/test_config.py b/server/tests/test_config.py index 578c5bd15..7a7de58bb 100644 --- a/server/tests/test_config.py +++ b/server/tests/test_config.py @@ -170,6 +170,7 @@ def test_server_config_uvicorn_tuning_defaults(): assert server_cfg.workers == 1 assert server_cfg.limit_concurrency == 1024 assert server_cfg.backlog == 2048 + assert server_cfg.thread_pool_size == 200 assert server_cfg.loop == "auto" assert server_cfg.http == "auto" @@ -206,6 +207,13 @@ def test_server_config_backlog_must_be_positive(): ServerConfig(backlog=0) +def test_server_config_thread_pool_size_must_be_positive(): + with pytest.raises(ValidationError): + ServerConfig(thread_pool_size=0) + cfg = ServerConfig(thread_pool_size=512) + assert cfg.thread_pool_size == 512 + + def test_server_config_loop_and_http_reject_unknown_values(): with pytest.raises(ValidationError): ServerConfig(loop="trio") # type: ignore[arg-type] From 7e5743899eeaef0a86cab5a904dbcc5e941466df Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Sun, 17 May 2026 16:24:59 +0800 
Subject: [PATCH 19/58] fix(server): k8s patch_sandbox_metadata correctly deletes keys and returns post-patch state (#899) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(server): k8s patch_sandbox_metadata correctly deletes keys and returns post-patch state Two bugs in KubernetesSandboxService.patch_sandbox_metadata caused the nightly k8s mini E2E test_02_metadata_filter_and_logic to fail: 1. JSON merge patch (RFC 7396) on metadata.labels merges keys recursively — keys absent from the patch body are kept. The previous code computed the desired final labels dict (with deleted keys removed) and sent it, so deleted keys were never actually removed on the API server. 2. After the PATCH, the code re-fetched the workload via _get_workload_or_404, which goes through K8sClient.get_custom_object that prefers the informer cache. The informer is eventually consistent, so the read could land before the watch event arrived and return the pre-patch labels. Fix both by: - Building the merge-patch body with explicit None for deleted keys. - Using the API server's PATCH response (returned from patch_labels) directly, instead of re-reading via the cache. WorkloadProvider.patch_labels now accepts Dict[str, Optional[str]] and returns the patched workload dict. Co-Authored-By: Claude Opus 4.7 * chore(ci): touch trigger comment to run k8s mini E2E on this PR The k8s-nightly-build workflow only runs on PRs that touch one of its trigger paths. This PR fixes server-side k8s logic but does not modify those paths, so add a date stamp to the existing trigger comment to include this PR in the matrix. 
Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- scripts/python-k8s-e2e.sh | 2 +- .../services/k8s/kubernetes_service.py | 18 +++--- .../services/k8s/workload_provider.py | 12 +++- server/tests/k8s/test_kubernetes_service.py | 59 +++++++++++++++++++ 4 files changed, 80 insertions(+), 11 deletions(-) diff --git a/scripts/python-k8s-e2e.sh b/scripts/python-k8s-e2e.sh index bfcb09c42..9cc220b1e 100644 --- a/scripts/python-k8s-e2e.sh +++ b/scripts/python-k8s-e2e.sh @@ -1,5 +1,5 @@ #!/bin/bash -# trigger k8s e2e +# trigger k8s e2e (2026-05-17) # Copyright 2026 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/server/opensandbox_server/services/k8s/kubernetes_service.py b/server/opensandbox_server/services/k8s/kubernetes_service.py index f5958c644..505895d34 100644 --- a/server/opensandbox_server/services/k8s/kubernetes_service.py +++ b/server/opensandbox_server/services/k8s/kubernetes_service.py @@ -791,21 +791,25 @@ def patch_sandbox_metadata(self, sandbox_id: str, patch: PatchSandboxMetadataReq new_labels = self._apply_metadata_patch(labels, patch) + # JSON merge patch (RFC 7396) on metadata.labels treats keys absent + # from the body as kept. To delete a label we must send the key with + # an explicit null. Build the merge body from the desired final labels + # plus null markers for keys removed by this patch. 
+ label_patch: Dict[str, Optional[str]] = dict(new_labels) + for key, value in patch.items(): + if value is None: + label_patch[key] = None + try: - self.workload_provider.patch_labels( + updated = self.workload_provider.patch_labels( name=name, namespace=self.namespace, - labels=new_labels, + labels=label_patch, ) except Exception as e: logger.error("Error patching labels for sandbox %s: %s", sandbox_id, e) raise _build_k8s_api_error("patch sandbox labels", e) from e - updated = _get_workload_or_404( - self.workload_provider, - self.namespace, - sandbox_id, - ) return _build_sandbox_from_workload(updated, self.workload_provider) def get_endpoint( diff --git a/server/opensandbox_server/services/k8s/workload_provider.py b/server/opensandbox_server/services/k8s/workload_provider.py index c85b8f567..40bfdba6d 100644 --- a/server/opensandbox_server/services/k8s/workload_provider.py +++ b/server/opensandbox_server/services/k8s/workload_provider.py @@ -202,10 +202,16 @@ def resume_sandbox(self, sandbox_id: str, namespace: str) -> None: """ raise NotImplementedError("Resume is not supported by this provider") - def patch_labels(self, name: str, namespace: str, labels: Dict[str, str]) -> None: - """Patch workload metadata.labels via JSON merge patch.""" + def patch_labels( + self, name: str, namespace: str, labels: Dict[str, Optional[str]] + ) -> Dict[str, Any]: + """Patch workload metadata.labels via JSON merge patch. + + A None value for a label key deletes that label per RFC 7396. + Returns the API server response (the patched workload). 
+ """ body = {"metadata": {"labels": labels}} - self.k8s_client.patch_custom_object( + return self.k8s_client.patch_custom_object( group=self.group, version=self.version, namespace=namespace, diff --git a/server/tests/k8s/test_kubernetes_service.py b/server/tests/k8s/test_kubernetes_service.py index 3aa6add8b..96ed3162f 100644 --- a/server/tests/k8s/test_kubernetes_service.py +++ b/server/tests/k8s/test_kubernetes_service.py @@ -1297,3 +1297,62 @@ def test_signed_endpoint_different_expires_produces_different_endpoints(self, k8 ep2 = k8s_service.get_endpoint("sbx-001", 8080, expires=2000000500) assert ep1.endpoint != ep2.endpoint + + +class TestPatchSandboxMetadata: + """Verify patch_sandbox_metadata builds the JSON merge-patch body correctly + and uses the API server's PATCH response (not a cache-prone re-fetch).""" + + @staticmethod + def _workload(labels: dict) -> dict: + return { + "metadata": { + "name": "sandbox-sbx-001", + "labels": dict(labels), + "creationTimestamp": datetime(2026, 1, 1, tzinfo=timezone.utc), + }, + "spec": {}, + "status": {"conditions": []}, + } + + @staticmethod + def _stub_provider_status(k8s_service) -> None: + k8s_service.workload_provider.get_status.return_value = { + "state": "Running", + "reason": None, + "message": None, + "last_transition_at": None, + } + k8s_service.workload_provider.get_expiration.return_value = None + + def test_patch_body_sends_null_for_deleted_keys(self, k8s_service): + initial = {"opensandbox.io/id": "sbx-001", "team": "infra", "env": "dev"} + patched = {"opensandbox.io/id": "sbx-001", "env": "stage"} + + k8s_service.workload_provider.get_workload.return_value = self._workload(initial) + k8s_service.workload_provider.patch_labels.return_value = self._workload(patched) + self._stub_provider_status(k8s_service) + + k8s_service.patch_sandbox_metadata("sbx-001", {"env": "stage", "team": None}) + + k8s_service.workload_provider.patch_labels.assert_called_once() + body_labels = 
k8s_service.workload_provider.patch_labels.call_args.kwargs["labels"] + assert body_labels["env"] == "stage" + assert body_labels["team"] is None + assert body_labels["opensandbox.io/id"] == "sbx-001" + + def test_returns_sandbox_from_patch_response(self, k8s_service): + """The PATCH response is authoritative; re-reading via get_workload + could hit a stale informer cache.""" + initial = {"opensandbox.io/id": "sbx-001", "env": "dev"} + patched = {"opensandbox.io/id": "sbx-001", "env": "stage"} + + k8s_service.workload_provider.get_workload.return_value = self._workload(initial) + k8s_service.workload_provider.patch_labels.return_value = self._workload(patched) + self._stub_provider_status(k8s_service) + + sandbox = k8s_service.patch_sandbox_metadata("sbx-001", {"env": "stage"}) + + assert sandbox.metadata == {"env": "stage"} + # Pre-patch read only; no second get_workload after patch_labels. + assert k8s_service.workload_provider.get_workload.call_count == 1 From d2b9b2fa75a1c83136885223400a7cf78d7d4166 Mon Sep 17 00:00:00 2001 From: immanuwell Date: Sun, 17 May 2026 20:03:26 +0400 Subject: [PATCH 20/58] docs: fix server example config links --- examples/host-volume-mount/README.md | 2 +- examples/host-volume-mount/README_zh.md | 2 +- server/README.md | 4 ++-- server/configuration.md | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/host-volume-mount/README.md b/examples/host-volume-mount/README.md index 60fbaeb85..a6ad88372 100644 --- a/examples/host-volume-mount/README.md +++ b/examples/host-volume-mount/README.md @@ -224,4 +224,4 @@ Sandbox sandbox = Sandbox.builder() - [OSEP-0003: Volume and VolumeBinding Support](../../oseps/0003-volume-and-volumebinding-support.md) — Design proposal - [Sandbox Lifecycle API Spec](../../specs/sandbox-lifecycle.yml) — OpenAPI schema for volume definitions -- [Server Configuration](../../server/example.config.toml) — `[storage]` section for `allowed_host_paths` +- [Server 
Configuration](../../server/opensandbox_server/examples/example.config.toml) — `[storage]` section for `allowed_host_paths` diff --git a/examples/host-volume-mount/README_zh.md b/examples/host-volume-mount/README_zh.md index 86d7bf414..fce9ccb68 100644 --- a/examples/host-volume-mount/README_zh.md +++ b/examples/host-volume-mount/README_zh.md @@ -233,4 +233,4 @@ Sandbox sandbox = Sandbox.builder() - [OSEP-0003: Volume 与 VolumeBinding 支持](../../oseps/0003-volume-and-volumebinding-support.md) — 设计提案 - [Sandbox Lifecycle API 规范](../../specs/sandbox-lifecycle.yml) — Volume 定义的 OpenAPI 规范 -- [服务端配置示例](../../server/example.config.zh.toml) — `[storage]` 段中的 `allowed_host_paths` 配置 +- [服务端配置示例](../../server/opensandbox_server/examples/example.config.zh.toml) — `[storage]` 段中的 `allowed_host_paths` 配置 diff --git a/server/README.md b/server/README.md index a7c4af441..f764a859a 100644 --- a/server/README.md +++ b/server/README.md @@ -221,11 +221,11 @@ Single source of truth for TOML: **[configuration.md](configuration.md)** (inclu ## Experimental features -Optional **🧪 experimental** behavior; **off by default** in [`example.config.toml`](example.config.toml) (and mirrored copies under `opensandbox_server/examples/`). See release notes before production. +Optional **🧪 experimental** behavior; **off by default** in [`example.config.toml`](opensandbox_server/examples/example.config.toml). See release notes before production. ### Auto-renew on access -Extends sandbox TTL when traffic is observed (lifecycle **proxy** and/or **ingress** + optional **Redis** queue). Design and operations: **[OSEP-0009](../oseps/0009-auto-renew-sandbox-on-ingress-access.md)**. TOML keys (`[renew_intent]`, including nested `redis.*`): see **[configuration.md](configuration.md)** and [`example.config.toml`](example.config.toml). +Extends sandbox TTL when traffic is observed (lifecycle **proxy** and/or **ingress** + optional **Redis** queue). 
Design and operations: **[OSEP-0009](../oseps/0009-auto-renew-sandbox-on-ingress-access.md)**. TOML keys (`[renew_intent]`, including nested `redis.*`): see **[configuration.md](configuration.md)** and [`example.config.toml`](opensandbox_server/examples/example.config.toml). Per-sandbox: on **create**, set `extensions["access.renew.extend.seconds"]` (string integer **300**–**86400**). Clients using the server proxy: request endpoints with `use_server_proxy=true` (REST) or SDK `ConnectionConfig(..., use_server_proxy=True)` — details in OSEP-0009. diff --git a/server/configuration.md b/server/configuration.md index a178eebbf..52f0cfc34 100644 --- a/server/configuration.md +++ b/server/configuration.md @@ -10,10 +10,10 @@ Example files in this repository: | File | Purpose | |------|---------| -| [`example.config.toml`](example.config.toml) | Docker runtime (English) | -| [`example.config.zh.toml`](example.config.zh.toml) | Docker runtime (中文) | -| [`example.config.k8s.toml`](example.config.k8s.toml) | Kubernetes runtime (English) | -| [`example.config.k8s.zh.toml`](example.config.k8s.zh.toml) | Kubernetes runtime (中文) | +| [`example.config.toml`](opensandbox_server/examples/example.config.toml) | Docker runtime (English) | +| [`example.config.zh.toml`](opensandbox_server/examples/example.config.zh.toml) | Docker runtime (中文) | +| [`example.config.k8s.toml`](opensandbox_server/examples/example.config.k8s.toml) | Kubernetes runtime (English) | +| [`example.config.k8s.zh.toml`](opensandbox_server/examples/example.config.k8s.zh.toml) | Kubernetes runtime (中文) | --- From f34900c123c2cd1f6e83d534264128808cded3a0 Mon Sep 17 00:00:00 2001 From: gujishh Date: Mon, 18 May 2026 11:20:28 +0900 Subject: [PATCH 21/58] fix(js-sdk): accept empty ping responses --- .../javascript/src/adapters/healthAdapter.ts | 2 +- sdks/sandbox/javascript/tests/health.test.mjs | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 
sdks/sandbox/javascript/tests/health.test.mjs diff --git a/sdks/sandbox/javascript/src/adapters/healthAdapter.ts b/sdks/sandbox/javascript/src/adapters/healthAdapter.ts index a2ffce224..836f46a9b 100644 --- a/sdks/sandbox/javascript/src/adapters/healthAdapter.ts +++ b/sdks/sandbox/javascript/src/adapters/healthAdapter.ts @@ -20,7 +20,7 @@ export class HealthAdapter implements ExecdHealth { constructor(private readonly client: ExecdClient) {} async ping(): Promise { - const { error, response } = await this.client.GET("/ping"); + const { error, response } = await this.client.GET("/ping", { parseAs: "text" }); throwOnOpenApiFetchError({ error, response }, "Execd ping failed"); return true; } diff --git a/sdks/sandbox/javascript/tests/health.test.mjs b/sdks/sandbox/javascript/tests/health.test.mjs new file mode 100644 index 000000000..7f892cdfa --- /dev/null +++ b/sdks/sandbox/javascript/tests/health.test.mjs @@ -0,0 +1,42 @@ +// Copyright 2026 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { HealthAdapter, createExecdClient } from "../dist/internal.js"; +import { SandboxApiException } from "../dist/index.js"; + +test("HealthAdapter treats empty 200 ping responses as healthy", async () => { + const health = new HealthAdapter(createExecdClient({ + baseUrl: "http://execd.test", + async fetch(request) { + assert.equal(new URL(request.url).pathname, "/ping"); + return new Response("", { status: 200 }); + }, + })); + + assert.equal(await health.ping(), true); +}); + +test("HealthAdapter still maps ping API errors", async () => { + const health = new HealthAdapter(createExecdClient({ + baseUrl: "http://execd.test", + async fetch() { + return Response.json({ code: "UNAVAILABLE", message: "not ready" }, { status: 503 }); + }, + })); + + await assert.rejects(() => health.ping(), SandboxApiException); +}); From 73973afd7a47fe490ae6b188eef1cab904517aea Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Mon, 18 May 2026 11:05:09 +0800 Subject: [PATCH 22/58] fix(egress): unblock SSE/chunked streaming through mitmproxy (#898) * fix(egress): unblock SSE/chunked streaming through mitmproxy The mitmdump --set stream_large_bodies=1m option (added to prevent OOM on large downloads) buffers any response under 1 MiB before forwarding, which stalls LLM SSE / chunked streams of small chunks until the stream ends or the threshold is hit, producing perceptible stutter downstream. Introduce a bundled system addon that is always loaded by the egress launcher and forces flow.response.stream = True for responses with content-type: text/event-stream or transfer-encoding: chunked. Large- body OOM protection from stream_large_bodies stays intact for everything else. The previous example script add_header.py is renamed to system.py and repurposed as the always-on system addon (wire-transparent: no headers added or altered). 
User-supplied addons via OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT are still loaded after the system addon and may observe or override its hooks. Refs: https://project.aone.alibaba-inc.com/v2/project/2135082/req/82131871 Co-Authored-By: Claude Opus 4.7 * fix(egress): case-insensitive header match and correct system addon docs Normalize content-type and transfer-encoding to lowercase before substring matching in the system addon, so legal mixed-case values like Transfer-Encoding: Chunked or Content-Type: Text/Event-Stream still trigger streaming instead of getting buffered by stream_large_bodies=1m. Also fix the transparent-mode doc, which referenced a non-existent OPENSANDBOX_EGRESS_MITMPROXY_SYSTEM_SCRIPT env var as a way to disable or swap the system addon. The launcher always appends the bundled system addon unconditionally; document that users override behavior via OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT (loaded after the system addon). Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- .../egress/docs/mitmproxy-transparent.md | 24 ++++++++----- components/egress/mitmscripts/add_header.py | 12 ------- components/egress/mitmscripts/system.py | 36 +++++++++++++++++++ components/egress/pkg/mitmproxy/launch.go | 11 ++++-- 4 files changed, 61 insertions(+), 22 deletions(-) delete mode 100644 components/egress/mitmscripts/add_header.py create mode 100644 components/egress/mitmscripts/system.py diff --git a/components/egress/docs/mitmproxy-transparent.md b/components/egress/docs/mitmproxy-transparent.md index 1253a1745..3df101527 100644 --- a/components/egress/docs/mitmproxy-transparent.md +++ b/components/egress/docs/mitmproxy-transparent.md @@ -30,8 +30,8 @@ By default, mitmproxy listens on `18081` and transparent redirect rules are set # Optional: change listening port (default: 18081) export OPENSANDBOX_EGRESS_MITMPROXY_PORT=18081 -# Optional: enable mitm addon script (e.g., inject request headers) -export 
OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT=/opt/opensandbox/mitmscripts/add_header.py +# Optional: load an additional user-defined mitm addon (loaded after the system addon) +export OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT=/path/to/your/addon.py # Optional: bypass decryption for selected domains (semicolon-separated regex list) export OPENSANDBOX_EGRESS_MITMPROXY_IGNORE_HOSTS='.*\.log\.aliyuncs\.com;.*\.example\.internal' @@ -43,7 +43,7 @@ export OPENSANDBOX_EGRESS_MITMPROXY_IGNORE_HOSTS='.*\.log\.aliyuncs\.com;.*\.exa |------|----------|------|--------| | `OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT` | Yes | Enable transparent mitmproxy (`1/true/on`, etc.) | Disabled | | `OPENSANDBOX_EGRESS_MITMPROXY_PORT` | No | mitmdump listen port; `iptables` redirects `80/443` here | `18081` | -| `OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT` | No | mitm addon script path (`-s`) | Empty | +| `OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT` | No | Additional user mitm addon script path (`-s`); loaded after the system addon | Empty | | `OPENSANDBOX_EGRESS_MITMPROXY_IGNORE_HOSTS` | No | Host/IP regex list for TLS pass-through (`;` separated) | Empty | | `OPENSANDBOX_EGRESS_MITMPROXY_CONFDIR` | No | mitm config and CA directory (passed as `--set confdir=`, also used as `HOME`) | Default directory under `/var/lib/mitmproxy` | | `OPENSANDBOX_EGRESS_MITMPROXY_UPSTREAM_TRUST_DIR` | No | Trust directory for upstream TLS verification (OpenSSL style) | `/etc/ssl/certs` | @@ -62,23 +62,31 @@ Notes: export OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT=true ``` -### 2) Enable with Header Injection +### 2) System Addon (Always On) + +The bundled system addon at `/var/egress/mitmscripts/system.py` is shipped in the egress image and loaded automatically whenever transparent mode is enabled. 
It stays wire-transparent (no headers added or altered) and currently provides: + +- Forces streaming (`flow.response.stream = True`) for SSE (`text/event-stream`) and chunked responses, so each chunk is forwarded immediately instead of being buffered up to the `stream_large_bodies=1m` threshold (critical for LLM streaming UX). + +The system addon is always loaded and cannot be disabled via configuration. To override its behavior, supply a user addon via `OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT`; user addons are loaded after the system addon and may observe or override its hooks. + +### 3) Add a User Addon Alongside the System Addon ```bash export OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT=true -export OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT=/opt/opensandbox/mitmscripts/add_header.py +export OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT=/path/to/your/addon.py ``` -Built-in example script: `/opt/opensandbox/mitmscripts/add_header.py` (adds `X-OpenSandbox-Egress: 1`). +The user addon is loaded after the system addon (`-s system.py -s user.py`), so user hooks observe and may override system behavior. -### 3) Bypass Decryption for Specific Domains (e.g. log upload) +### 4) Bypass Decryption for Specific Domains (e.g. log upload) ```bash export OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT=true export OPENSANDBOX_EGRESS_MITMPROXY_IGNORE_HOSTS='.*\.log\.aliyuncs\.com' ``` -### 4) Use a Fixed CA (consistent fingerprint across replicas) +### 5) Use a Fixed CA (consistent fingerprint across replicas) If CA files already exist in `confdir`, mitmproxy reuses them instead of regenerating on each startup. Typical paths: diff --git a/components/egress/mitmscripts/add_header.py b/components/egress/mitmscripts/add_header.py deleted file mode 100644 index c3a3430b2..000000000 --- a/components/egress/mitmscripts/add_header.py +++ /dev/null @@ -1,12 +0,0 @@ -# Example mitmproxy addon: add a static header to every request. 
-# Use: OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT=/opt/opensandbox/mitmscripts/add_header.py -# Optional addon: OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT can point to this file. -from mitmproxy import http - -HEADER_NAME = "X-OpenSandbox-Egress" -HEADER_VALUE = "1" - - -def request(flow: http.HTTPFlow) -> None: - if flow.request: - flow.request.headers[HEADER_NAME] = HEADER_VALUE diff --git a/components/egress/mitmscripts/system.py b/components/egress/mitmscripts/system.py new file mode 100644 index 000000000..71a988896 --- /dev/null +++ b/components/egress/mitmscripts/system.py @@ -0,0 +1,36 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# OpenSandbox egress system addon. +# +# Always loaded by the egress mitmproxy launcher. Stays transparent on the +# wire (does not add or alter headers that would reveal the proxy to peers). +# +# Behavior: +# Forces streaming for SSE / chunked responses so each chunk is forwarded +# immediately, bypassing the stream_large_bodies=1m buffer set in launch.go +# (which otherwise stalls LLM-style small-chunk streams). +# +# User-defined addons can be loaded alongside this script via +# OPENSANDBOX_EGRESS_MITMPROXY_SCRIPT. 
+from mitmproxy import http + + +def responseheaders(flow: http.HTTPFlow) -> None: + if flow.response is None: + return + content_type = flow.response.headers.get("content-type", "").lower() + transfer_encoding = flow.response.headers.get("transfer-encoding", "").lower() + if "text/event-stream" in content_type or "chunked" in transfer_encoding: + flow.response.stream = True diff --git a/components/egress/pkg/mitmproxy/launch.go b/components/egress/pkg/mitmproxy/launch.go index 0065d672e..2a1b4e205 100644 --- a/components/egress/pkg/mitmproxy/launch.go +++ b/components/egress/pkg/mitmproxy/launch.go @@ -34,11 +34,16 @@ const RunAsUser = "mitmproxy" // Loopback: transparent mode receives via REDIRECT; do not listen on 0.0.0.0 in the netns. const listenHostLoopback = "127.0.0.1" +// systemScriptPath: bundled system addon shipped via the egress Dockerfile +// (COPY components/egress/mitmscripts /var/egress/mitmscripts). Always loaded. +const systemScriptPath = "/var/egress/mitmscripts/system.py" + // Config: mitmdump --mode transparent; UserName must match iptables ! --uid-owner, ConfDir is mitm state/CA. type Config struct { ListenPort int UserName string ConfDir string + // ScriptPath is an optional user-supplied addon, loaded after the system addon. ScriptPath string // OnExit is called (if non-nil) when mitmdump exits. Called from a background goroutine. OnExit func(error) @@ -120,8 +125,10 @@ func Launch(cfg Config) (*Running, error) { args = append(args, "--set", "confdir="+cd) homeEnv = cd } - if strings.TrimSpace(cfg.ScriptPath) != "" { - args = append(args, "-s", strings.TrimSpace(cfg.ScriptPath)) + // Load the system addon first so user addons can observe / override its hooks. + args = append(args, "-s", systemScriptPath) + if user := strings.TrimSpace(cfg.ScriptPath); user != "" { + args = append(args, "-s", user) } // Upstream passthrough: each pattern becomes --set ignore_hosts= (regex; IP ranges are practical in transparent mode). 
From abadbf1fa7e7dcb7ebd21d7a325b29a4aa1e7f21 Mon Sep 17 00:00:00 2001 From: "pingshan.wj" Date: Mon, 18 May 2026 11:30:58 +0800 Subject: [PATCH 23/58] feat(k8s): Fix pool scale expecation stuck --- kubernetes/internal/controller/pool_controller.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kubernetes/internal/controller/pool_controller.go b/kubernetes/internal/controller/pool_controller.go index f7c6ad4e7..f56a7939c 100644 --- a/kubernetes/internal/controller/pool_controller.go +++ b/kubernetes/internal/controller/pool_controller.go @@ -674,8 +674,14 @@ func (r *PoolReconciler) scalePool(ctx context.Context, pool *sandboxv1alpha1.Po errs := make([]error, 0) pods := args.pods if satisfied, unsatisfiedDuration, dirtyPods := PoolScaleExpectations.SatisfiedExpectations(controllerutils.GetControllerKey(pool)); !satisfied { - log.Info("Pool scale is not ready, requeue", "unsatisfiedDuration", unsatisfiedDuration, "dirtyPods", dirtyPods) - return fmt.Errorf("pool scale is not ready, %v", pool.Name) + if unsatisfiedDuration >= expectations.ExpectationTimeout { + log.Info("Pool scale expectations timed out, clearing stale expectations", + "unsatisfiedDuration", unsatisfiedDuration, "dirtyPods", dirtyPods) + PoolScaleExpectations.DeleteExpectations(controllerutils.GetControllerKey(pool)) + } else { + log.Info("Pool scale is not ready, requeue", "unsatisfiedDuration", unsatisfiedDuration, "dirtyPods", dirtyPods) + return fmt.Errorf("pool scale is not ready, %v", pool.Name) + } } schedulableCnt := int32(len(args.pods)) totalPodCnt := args.totalPodCnt From d53463669c5f4889cba58d5af75d97ccdea78b4a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 18 May 2026 03:40:09 +0000 Subject: [PATCH 24/58] chore: bump egress to v1.0.12 --- .../charts/opensandbox-server/values.yaml | 2 +- server/docker-compose.example.yaml | 4 +-- .../examples/example.config.k8s.toml | 2 +- .../examples/example.config.k8s.zh.toml | 2 +- 
.../examples/example.config.toml | 2 +- .../examples/example.config.zh.toml | 2 +- .../tests/k8s/test_agent_sandbox_provider.py | 16 +++++----- .../tests/k8s/test_batchsandbox_provider.py | 20 ++++++------- server/tests/k8s/test_egress_helper.py | 30 +++++++++---------- server/tests/k8s/test_kubernetes_service.py | 4 +-- 10 files changed, 42 insertions(+), 42 deletions(-) diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index 8f5d74064..c1891dcc1 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -96,5 +96,5 @@ configToml: | batchsandbox_template_file = "/etc/opensandbox/example.batchsandbox-template.yaml" [egress] - image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.11" + image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.12" mode = "dns+nft" diff --git a/server/docker-compose.example.yaml b/server/docker-compose.example.yaml index 500571521..4263b5c93 100644 --- a/server/docker-compose.example.yaml +++ b/server/docker-compose.example.yaml @@ -14,8 +14,8 @@ configs: execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" [egress] - image = "opensandbox/egress:v1.0.11" - # image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.11" + image = "opensandbox/egress:v1.0.12" + # image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.12" [docker] network_mode = "bridge" diff --git a/server/opensandbox_server/examples/example.config.k8s.toml b/server/opensandbox_server/examples/example.config.k8s.toml index 230e9dfb9..3d35d1bc7 100644 --- a/server/opensandbox_server/examples/example.config.k8s.toml +++ b/server/opensandbox_server/examples/example.config.k8s.toml @@ -75,7 +75,7 @@ batchsandbox_template_file = "~/batchsandbox-template.yaml" mode = "direct" [egress] -image = 
"opensandbox/egress:v1.0.11" +image = "opensandbox/egress:v1.0.12" mode = "dns" # Default is true (recommended for dual-stack CNI). Set false only if you need IPv6 in the netns (see server/configuration.md). # disable_ipv6 = false diff --git a/server/opensandbox_server/examples/example.config.k8s.zh.toml b/server/opensandbox_server/examples/example.config.k8s.zh.toml index 034927741..7df9f4dc2 100644 --- a/server/opensandbox_server/examples/example.config.k8s.zh.toml +++ b/server/opensandbox_server/examples/example.config.k8s.zh.toml @@ -76,7 +76,7 @@ batchsandbox_template_file = "~/batchsandbox-template.yaml" mode = "direct" [egress] -image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.11" +image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.12" mode = "dns" # Default is true (recommended for dual-stack CNI). Set false only if you need IPv6 in the netns (see server/configuration.md). # disable_ipv6 = false diff --git a/server/opensandbox_server/examples/example.config.toml b/server/opensandbox_server/examples/example.config.toml index ba1d2d06b..4fe707308 100644 --- a/server/opensandbox_server/examples/example.config.toml +++ b/server/opensandbox_server/examples/example.config.toml @@ -63,7 +63,7 @@ seccomp_profile = "" mode = "direct" [egress] -image = "opensandbox/egress:v1.0.11" +image = "opensandbox/egress:v1.0.12" mode = "dns" # 🧪 [EXPERIMENTAL] Renew-on-access. Off by default — see server/README.md. 
diff --git a/server/opensandbox_server/examples/example.config.zh.toml b/server/opensandbox_server/examples/example.config.zh.toml index d8b6aabf5..3645d9586 100644 --- a/server/opensandbox_server/examples/example.config.zh.toml +++ b/server/opensandbox_server/examples/example.config.zh.toml @@ -61,7 +61,7 @@ seccomp_profile = "" mode = "direct" [egress] -image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.11" +image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/egress:v1.0.12" mode = "dns" # 🧪 [EXPERIMENTAL] 按访问续期。默认关闭 — 见 server/README_zh.md。 diff --git a/server/tests/k8s/test_agent_sandbox_provider.py b/server/tests/k8s/test_agent_sandbox_provider.py index bfc138f2a..5f69b7e33 100644 --- a/server/tests/k8s/test_agent_sandbox_provider.py +++ b/server/tests/k8s/test_agent_sandbox_provider.py @@ -772,7 +772,7 @@ def test_create_workload_with_network_policy_adds_sidecar(self, mock_k8s_client) expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -785,7 +785,7 @@ def test_create_workload_with_network_policy_adds_sidecar(self, mock_k8s_client) # Find sidecar container sidecar = next((c for c in containers if c["name"] == "egress"), None) assert sidecar is not None - assert sidecar["image"] == "opensandbox/egress:v1.0.11" + assert sidecar["image"] == "opensandbox/egress:v1.0.12" # Verify sidecar has environment variable env_vars = {e["name"]: e["value"] for e in sidecar.get("env", [])} @@ -822,7 +822,7 @@ def test_create_workload_with_network_policy_persists_annotation_and_sidecar_tok expires_at=None, execd_image="execd:latest", network_policy=NetworkPolicy(default_action="deny", egress=[]), - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", annotations={SANDBOX_EGRESS_AUTH_TOKEN_METADATA_KEY: 
"egress-token"}, egress_auth_token="egress-token", ) @@ -854,7 +854,7 @@ def test_create_workload_with_egress_mode_dns_nft(self, mock_k8s_client): expires_at=None, execd_image="execd:latest", network_policy=NetworkPolicy(default_action="deny", egress=[]), - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", egress_mode=EGRESS_MODE_DNS_NFT, ) @@ -891,7 +891,7 @@ def test_create_workload_with_network_policy_does_not_add_pod_ipv6_sysctls(self, expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -931,7 +931,7 @@ def test_create_workload_with_egress_skips_ipv6_disable_when_not_configured(self expires_at=None, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -964,7 +964,7 @@ def test_create_workload_with_network_policy_drops_net_admin_from_main_container expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1041,7 +1041,7 @@ def test_egress_sidecar_contains_network_policy_in_env(self, mock_k8s_client): expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] diff --git a/server/tests/k8s/test_batchsandbox_provider.py b/server/tests/k8s/test_batchsandbox_provider.py index 39150fcc1..15aa51a34 100644 --- a/server/tests/k8s/test_batchsandbox_provider.py +++ b/server/tests/k8s/test_batchsandbox_provider.py 
@@ -1655,7 +1655,7 @@ def test_create_workload_with_network_policy_adds_sidecar(self, mock_k8s_client) expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1668,7 +1668,7 @@ def test_create_workload_with_network_policy_adds_sidecar(self, mock_k8s_client) # Find sidecar container sidecar = next((c for c in containers if c["name"] == "egress"), None) assert sidecar is not None - assert sidecar["image"] == "opensandbox/egress:v1.0.11" + assert sidecar["image"] == "opensandbox/egress:v1.0.12" # Verify sidecar has environment variable env_vars = {e["name"]: e["value"] for e in sidecar.get("env", [])} @@ -1709,7 +1709,7 @@ def test_create_workload_windows_profile_with_network_policy_keeps_ipv6_disable( execd_image="execd:latest", platform=PlatformSpec(os="windows", arch="amd64"), network_policy=NetworkPolicy(default_action="deny", egress=[]), - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1746,7 +1746,7 @@ def test_create_workload_with_network_policy_persists_annotation_and_sidecar_tok expires_at=None, execd_image="execd:latest", network_policy=NetworkPolicy(default_action="deny", egress=[]), - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", annotations={SANDBOX_EGRESS_AUTH_TOKEN_METADATA_KEY: "egress-token"}, egress_auth_token="egress-token", ) @@ -1778,7 +1778,7 @@ def test_create_workload_with_egress_mode_dns_nft(self, mock_k8s_client): expires_at=None, execd_image="execd:latest", network_policy=NetworkPolicy(default_action="deny", egress=[]), - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", egress_mode=EGRESS_MODE_DNS_NFT, ) @@ -1816,7 +1816,7 @@ def 
test_create_workload_with_network_policy_does_not_add_pod_ipv6_sysctls(self, expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1856,7 +1856,7 @@ def test_create_workload_with_egress_skips_ipv6_disable_when_not_configured(self expires_at=None, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1889,7 +1889,7 @@ def test_create_workload_with_network_policy_drops_net_admin_from_main_container expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -1966,7 +1966,7 @@ def test_egress_sidecar_contains_network_policy_in_env(self, mock_k8s_client): expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] @@ -2051,7 +2051,7 @@ def test_create_workload_with_network_policy_works_with_template(self, mock_k8s_ expires_at=expires_at, execd_image="execd:latest", network_policy=network_policy, - egress_image="opensandbox/egress:v1.0.11", + egress_image="opensandbox/egress:v1.0.12", ) body = mock_k8s_client.create_custom_object.call_args.kwargs["body"] diff --git a/server/tests/k8s/test_egress_helper.py b/server/tests/k8s/test_egress_helper.py index ead2ccead..29aba8a12 100644 --- a/server/tests/k8s/test_egress_helper.py +++ b/server/tests/k8s/test_egress_helper.py @@ -47,7 +47,7 @@ class TestEgressSidecarViaApply: def 
test_builds_container_with_basic_config(self): """Test that container is built with correct basic configuration.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[ @@ -64,7 +64,7 @@ def test_builds_container_with_basic_config(self): def test_contains_egress_rules_environment_variable(self): """Test that container includes OPENSANDBOX_EGRESS_RULES environment variable.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], @@ -80,7 +80,7 @@ def test_contains_egress_rules_environment_variable(self): assert env_vars[1]["value"] == EGRESS_MODE_DNS def test_contains_egress_token_when_provided(self): - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], @@ -97,7 +97,7 @@ def test_contains_egress_token_when_provided(self): assert env_vars[EGRESS_MODE_ENV] == EGRESS_MODE_DNS def test_egress_mode_dns_nft(self): - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], @@ -114,7 +114,7 @@ def test_egress_mode_dns_nft(self): def test_serializes_network_policy_correctly(self): """Test that network policy is correctly serialized to JSON.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[ @@ -139,7 +139,7 @@ def test_serializes_network_policy_correctly(self): def test_handles_empty_egress_rules(self): """Test that empty egress rules are handled correctly.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image 
= "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="allow", egress=[], @@ -155,7 +155,7 @@ def test_handles_empty_egress_rules(self): def test_handles_missing_default_action(self): """Test that missing default_action is handled (exclude_none=True).""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( egress=[NetworkRule(action="allow", target="example.com")], ) @@ -170,7 +170,7 @@ def test_handles_missing_default_action(self): def test_security_context_adds_net_admin_not_privileged(self): """Egress sidecar uses NET_ADMIN only (IPv6 is disabled in execd init when egress is on).""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[], @@ -184,14 +184,14 @@ def test_security_context_adds_net_admin_not_privileged(self): def test_no_command_uses_image_entrypoint(self): container = _egress_container( - "opensandbox/egress:v1.0.11", + "opensandbox/egress:v1.0.12", NetworkPolicy(default_action="deny", egress=[]), ) assert "command" not in container def test_container_spec_is_valid_kubernetes_format(self): """Test that returned container spec is in valid Kubernetes format.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], @@ -212,7 +212,7 @@ def test_container_spec_is_valid_kubernetes_format(self): def test_handles_wildcard_domains(self): """Test that wildcard domains in egress rules are handled correctly.""" - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" network_policy = NetworkPolicy( default_action="deny", egress=[ @@ -254,7 +254,7 @@ def test_adds_egress_sidecar_container(self): default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], ) - 
egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" apply_egress_to_spec( containers, @@ -273,7 +273,7 @@ def test_does_not_touch_unrelated_pod_state(self): default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], ) - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" apply_egress_to_spec( containers, @@ -298,7 +298,7 @@ def test_preserves_existing_pod_sysctls_when_not_passed_in(self): default_action="deny", egress=[NetworkRule(action="allow", target="example.com")], ) - egress_image = "opensandbox/egress:v1.0.11" + egress_image = "opensandbox/egress:v1.0.12" apply_egress_to_spec( containers, @@ -320,7 +320,7 @@ def test_no_op_when_no_network_policy(self): apply_egress_to_spec( containers, None, - "opensandbox/egress:v1.0.11", + "opensandbox/egress:v1.0.12", ) assert len(containers) == 0 diff --git a/server/tests/k8s/test_kubernetes_service.py b/server/tests/k8s/test_kubernetes_service.py index 96ed3162f..3f5f583fe 100644 --- a/server/tests/k8s/test_kubernetes_service.py +++ b/server/tests/k8s/test_kubernetes_service.py @@ -224,7 +224,7 @@ async def test_create_sandbox_with_network_policy_passes_egress_token_and_annota self, k8s_service, create_sandbox_request ): create_sandbox_request.network_policy = NetworkPolicy(default_action="deny", egress=[]) - k8s_service.app_config.egress = EgressConfig(image="opensandbox/egress:v1.0.11") + k8s_service.app_config.egress = EgressConfig(image="opensandbox/egress:v1.0.12") k8s_service.workload_provider.create_workload.return_value = { "name": "test-id", "uid": "uid-1" } @@ -298,7 +298,7 @@ async def test_create_sandbox_with_network_policy_passes_egress_mode_dns_nft_fro ): create_sandbox_request.network_policy = NetworkPolicy(default_action="deny", egress=[]) k8s_service.app_config.egress = EgressConfig( - image="opensandbox/egress:v1.0.11", + image="opensandbox/egress:v1.0.12", mode=EGRESS_MODE_DNS_NFT, ) 
k8s_service.workload_provider.create_workload.return_value = { From f3763f4f16d5aa68aab9ad92ee0430da9cfc8d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 18 May 2026 12:35:16 +0800 Subject: [PATCH 25/58] chore: trigger kubernetes mini e2e test --- scripts/python-k8s-e2e.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/python-k8s-e2e.sh b/scripts/python-k8s-e2e.sh index 9cc220b1e..76801410d 100644 --- a/scripts/python-k8s-e2e.sh +++ b/scripts/python-k8s-e2e.sh @@ -1,5 +1,5 @@ #!/bin/bash -# trigger k8s e2e (2026-05-17) +# trigger k8s e2e (2026-05-18) # Copyright 2026 Alibaba Group Holding Ltd. # # Licensed under the Apache License, Version 2.0 (the "License"); From 9cfd2725ae2d7d48611b2bf54cb90ed2fffae29b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 18 May 2026 13:40:50 +0800 Subject: [PATCH 26/58] fix(server): defer importing main until after uvicorn worker setup Importing opensandbox_server.main in the CLI eagerly constructed sandbox_service, restoring containers and starting expiration Timer threads in the supervisor process before uvicorn.run was called. With [server].workers > 1 that left orphan timers in the supervisor and (on spawn) duplicated them across workers. Read config and logging directly in the CLI so only worker processes initialize the service graph. 
Co-Authored-By: Claude Opus 4.7 --- server/opensandbox_server/cli.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/server/opensandbox_server/cli.py b/server/opensandbox_server/cli.py index a542b6bff..ce45f973c 100644 --- a/server/opensandbox_server/cli.py +++ b/server/opensandbox_server/cli.py @@ -37,7 +37,9 @@ RuntimeConfig, ServerConfig, StorageConfig, + load_config, ) +from opensandbox_server.logging_config import configure_logging def _strip_optional(annotation: Any) -> Any: @@ -284,9 +286,14 @@ def main() -> None: if args.config: os.environ[CONFIG_ENV_VAR] = args.config - from opensandbox_server import main as server_main # local import after env is set + # Load config + logging without importing opensandbox_server.main: importing + # main eagerly constructs sandbox_service (restoring containers and starting + # expiration timers) in this process. With workers > 1 that leaks timers to + # the uvicorn supervisor and (on spawn) duplicates them across workers. 
+ app_config = load_config() + log_config = configure_logging(app_config.log) + server_cfg = app_config.server - server_cfg = server_main.app_config.server workers = 1 if args.reload else server_cfg.workers if args.reload and server_cfg.workers > 1: print( @@ -298,7 +305,7 @@ def main() -> None: host=server_cfg.host, port=server_cfg.port, reload=args.reload, - log_config=server_main._log_config, + log_config=log_config, timeout_keep_alive=server_cfg.timeout_keep_alive, workers=workers, limit_concurrency=server_cfg.limit_concurrency, From 145555c178c833fb4d9409d11d07b3ea49bc7939 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 18 May 2026 13:40:59 +0800 Subject: [PATCH 27/58] fix(server/k8s): invalidate informer cache on custom object writes list_custom_objects returns the informer cache snapshot once synced, but create/patch/delete previously left the cache untouched, so a list immediately after a write could include the old or freshly-deleted object until the watch event arrived. Add delete_from_cache to the informer and have the K8sClient write paths upsert or evict cache entries through a non-creating informer lookup. Co-Authored-By: Claude Opus 4.7 --- .../opensandbox_server/services/k8s/client.py | 25 ++++++++++- .../services/k8s/informer.py | 5 +++ server/tests/k8s/test_k8s_client.py | 44 +++++++++++++++++++ 3 files changed, 72 insertions(+), 2 deletions(-) diff --git a/server/opensandbox_server/services/k8s/client.py b/server/opensandbox_server/services/k8s/client.py index 99a763b0f..72f39a686 100644 --- a/server/opensandbox_server/services/k8s/client.py +++ b/server/opensandbox_server/services/k8s/client.py @@ -88,6 +88,16 @@ def get_node_v1_api(self) -> NodeV1Api: return self._node_v1_api + def _lookup_informer(self, group: str, version: str, plural: str, namespace: str) -> Optional[WorkloadInformer]: + """Return an existing informer without starting one. 
Used by write paths + to invalidate cache entries; never auto-create on writes since list paths + own the lazy-start contract.""" + if not self.config.informer_enabled: + return None + key: _InformerKey = (group, version, plural, namespace) + with self._informers_lock: + return self._informers.get(key) + def _get_informer(self, group: str, version: str, plural: str, namespace: str) -> Optional[WorkloadInformer]: """Return the informer for this resource+namespace, starting it lazily.""" if not self.config.informer_enabled: @@ -131,13 +141,17 @@ def create_custom_object( """Create a namespaced custom resource.""" if self._write_limiter: self._write_limiter.acquire() - return self.get_custom_objects_api().create_namespaced_custom_object( + obj = self.get_custom_objects_api().create_namespaced_custom_object( group=group, version=version, namespace=namespace, plural=plural, body=body, ) + informer = self._lookup_informer(group, version, plural, namespace) + if informer: + informer.update_cache(obj) + return obj def get_custom_object( self, @@ -239,6 +253,9 @@ def delete_custom_object( name=name, grace_period_seconds=grace_period_seconds, ) + informer = self._lookup_informer(group, version, plural, namespace) + if informer: + informer.delete_from_cache(name) def patch_custom_object( self, @@ -252,7 +269,7 @@ def patch_custom_object( """Patch a namespaced custom resource.""" if self._write_limiter: self._write_limiter.acquire() - return self.get_custom_objects_api().patch_namespaced_custom_object( + obj = self.get_custom_objects_api().patch_namespaced_custom_object( group=group, version=version, namespace=namespace, @@ -260,6 +277,10 @@ def patch_custom_object( name=name, body=body, ) + informer = self._lookup_informer(group, version, plural, namespace) + if informer: + informer.update_cache(obj) + return obj # ------------------------------------------------------------------ # PersistentVolumeClaim operations diff --git 
a/server/opensandbox_server/services/k8s/informer.py b/server/opensandbox_server/services/k8s/informer.py index fa95e39d9..38b440ac6 100644 --- a/server/opensandbox_server/services/k8s/informer.py +++ b/server/opensandbox_server/services/k8s/informer.py @@ -105,6 +105,11 @@ def update_cache(self, obj: Dict[str, Any]) -> None: self._cache[name] = obj self._advance_resource_version(metadata.get("resourceVersion")) + def delete_from_cache(self, name: str) -> None: + """Evict a single object from the cache by name.""" + with self._lock: + self._cache.pop(name, None) + def _advance_resource_version(self, rv: Optional[str]) -> None: """Advance ``_resource_version`` only when *rv* is strictly newer. diff --git a/server/tests/k8s/test_k8s_client.py b/server/tests/k8s/test_k8s_client.py index 10333cec5..dadf3e773 100644 --- a/server/tests/k8s/test_k8s_client.py +++ b/server/tests/k8s/test_k8s_client.py @@ -144,6 +144,50 @@ def test_create_custom_object_delegates_to_api(self, k8s_runtime_config): group="g", version="v1", namespace="ns", plural="foos", body=body ) + def test_create_custom_object_updates_informer_cache(self, k8s_runtime_config): + """create_custom_object upserts the new object into an existing informer cache.""" + c = self._make_client(k8s_runtime_config) + created = {"metadata": {"name": "foo-1", "resourceVersion": "11"}} + c._custom_objects_api.create_namespaced_custom_object.return_value = created + fake_informer = MagicMock() + c._informers[("g", "v1", "foos", "ns")] = fake_informer + c.config = MagicMock(informer_enabled=True, read_qps=0.0, write_qps=0.0) + result = c.create_custom_object("g", "v1", "ns", "foos", {"metadata": {"name": "foo-1"}}) + assert result == created + fake_informer.update_cache.assert_called_once_with(created) + + def test_patch_custom_object_updates_informer_cache(self, k8s_runtime_config): + """patch_custom_object upserts the patched object into an existing informer cache.""" + c = self._make_client(k8s_runtime_config) + patched = 
{"metadata": {"name": "foo-1", "resourceVersion": "12"}} + c._custom_objects_api.patch_namespaced_custom_object.return_value = patched + fake_informer = MagicMock() + c._informers[("g", "v1", "foos", "ns")] = fake_informer + c.config = MagicMock(informer_enabled=True, read_qps=0.0, write_qps=0.0) + result = c.patch_custom_object("g", "v1", "ns", "foos", "foo-1", {"spec": {"x": 1}}) + assert result == patched + fake_informer.update_cache.assert_called_once_with(patched) + + def test_delete_custom_object_evicts_informer_cache(self, k8s_runtime_config): + """delete_custom_object removes the object from an existing informer cache.""" + c = self._make_client(k8s_runtime_config) + fake_informer = MagicMock() + c._informers[("g", "v1", "foos", "ns")] = fake_informer + c.config = MagicMock(informer_enabled=True, read_qps=0.0, write_qps=0.0) + c.delete_custom_object("g", "v1", "ns", "foos", "foo-1") + fake_informer.delete_from_cache.assert_called_once_with("foo-1") + + def test_write_paths_skip_cache_when_no_informer(self, k8s_runtime_config): + """Write paths must not crash when no informer has been started yet.""" + c = self._make_client(k8s_runtime_config) + c._custom_objects_api.create_namespaced_custom_object.return_value = {"metadata": {"name": "x"}} + c._custom_objects_api.patch_namespaced_custom_object.return_value = {"metadata": {"name": "x"}} + c.config = MagicMock(informer_enabled=True, read_qps=0.0, write_qps=0.0) + # No informers registered → _lookup_informer returns None + c.create_custom_object("g", "v1", "ns", "foos", {"metadata": {"name": "x"}}) + c.patch_custom_object("g", "v1", "ns", "foos", "x", {}) + c.delete_custom_object("g", "v1", "ns", "foos", "x") + def test_get_custom_object_returns_none_on_404(self, k8s_runtime_config): """get_custom_object returns None when the API raises a 404.""" c = self._make_client(k8s_runtime_config) From 458348c8c8d8980f384c97170f5ef682677da133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 18 
May 2026 14:16:22 +0800 Subject: [PATCH 28/58] fix(server): reject docker runtime with workers > 1 Docker expiration timers live in process-local state on DockerSandboxService, so each uvicorn worker schedules its own threading.Timer per sandbox. A renewal handled by one worker only updates that process's _sandbox_expirations, leaving other workers to fire stale timers at the pre-renewal time and remove the sandbox. Reject the combination at AppConfig validation until the Docker runtime grows shared expiration state. Kubernetes is unaffected. Co-Authored-By: Claude Opus 4.7 --- server/configuration.md | 2 +- server/opensandbox_server/config.py | 18 +++++++++++++++++- server/tests/test_config.py | 24 ++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/server/configuration.md b/server/configuration.md index ff6116f9b..8ed8382e7 100644 --- a/server/configuration.md +++ b/server/configuration.md @@ -66,7 +66,7 @@ Example files in this repository: | `eip` | string \| omitted | `null` | Public IP or hostname used as the **host part** when the server returns sandbox endpoint URLs (notably Docker runtime). | | `max_sandbox_timeout_seconds` | integer \| omitted | `null` | Upper bound on sandbox TTL in seconds for **create** requests that specify `timeout`. Must be ≥ **60** if set. Omit to disable the server-side cap. | | `timeout_keep_alive` | integer | `30` | Idle keep-alive timeout (seconds) passed to uvicorn. | -| `workers` | integer | `1` | Number of uvicorn worker processes. Each worker is a separate Python process with its own event loop and (under the Kubernetes runtime) its own informer watch streams to the apiserver. Default `1` keeps apiserver pressure predictable; bump to 2–8 based on CPU quota and apiserver capacity. Ignored when `--reload` is set. | +| `workers` | integer | `1` | Number of uvicorn worker processes. 
Each worker is a separate Python process with its own event loop and (under the Kubernetes runtime) its own informer watch streams to the apiserver. Default `1` keeps apiserver pressure predictable; bump to 2–8 based on CPU quota and apiserver capacity. Ignored when `--reload` is set. **Must be `1` when `runtime.type = "docker"`** — Docker expiration timers are per-process, so multiple workers race on renewals and can expire renewed sandboxes early. | | `limit_concurrency` | integer \| omitted | `1024` | Maximum concurrent connections per worker before returning 503. Provides backpressure protection under burst load. Omit to disable. | | `backlog` | integer | `2048` | Socket listen backlog passed to uvicorn. | | `thread_pool_size` | integer | `200` | Maximum size of the anyio default threadpool used by FastAPI to run sync route handlers. The anyio default of 40 throttles bursts of blocking sandbox list/get/delete operations under high concurrency. | diff --git a/server/opensandbox_server/config.py b/server/opensandbox_server/config.py index 9da6cc35a..6feb8a2c5 100644 --- a/server/opensandbox_server/config.py +++ b/server/opensandbox_server/config.py @@ -462,7 +462,8 @@ class ServerConfig(BaseModel): "runtime) its own informer watch streams to the apiserver. " "Default 1 to keep apiserver pressure predictable; bump to 2-8 " "based on CPU quota and apiserver capacity. Ignored when " - "--reload is set." + "--reload is set. Must be 1 when runtime.type = 'docker' " + "because Docker expiration timers live in process-local state." 
), ) limit_concurrency: Optional[int] = Field( @@ -896,6 +897,21 @@ def validate_runtime_blocks(self) -> "AppConfig": raise ValueError("ingress.mode must be 'direct' when runtime.type = 'docker'.") if self.secure_runtime is not None and self.secure_runtime.type == "firecracker": raise ValueError( "secure_runtime.type 'firecracker' is only compatible with runtime.type='kubernetes'.") + # The Docker service tracks sandbox expirations with in-process + # threading.Timer objects keyed by sandbox_id. Each uvicorn worker + # would build an independent DockerSandboxService with its own + # timers, so a renewal handled by one worker would not cancel the + # stale timer in another worker — that worker would still expire + # the sandbox at the pre-renewal time. Refuse to start until the + # operator either drops back to a single worker or the Docker + # runtime grows shared expiration state. + if self.server.workers > 1: + raise ValueError( + "server.workers must be 1 when runtime.type = 'docker'; " + "Docker expiration timers are per-process, so multiple " + "workers can race on renew_expiration and expire renewed " + "sandboxes early." + ) elif self.runtime.type == "kubernetes": if self.kubernetes is None: self.kubernetes = KubernetesRuntimeConfig() diff --git a/server/tests/test_config.py b/server/tests/test_config.py index 7a7de58bb..5aa667c0c 100644 --- a/server/tests/test_config.py +++ b/server/tests/test_config.py @@ -159,6 +159,30 @@ def test_docker_runtime_disallows_kubernetes_block(): AppConfig(server=server_cfg, runtime=runtime_cfg, kubernetes=kubernetes_cfg) +def test_docker_runtime_rejects_multiple_workers(): + """Docker runtime keeps expiration timers in process-local state, so + workers > 1 would race on renew_expiration and expire renewed sandboxes + early. 
Reject the combination at config validation time.""" + server_cfg = ServerConfig(workers=2) + runtime_cfg = RuntimeConfig(type="docker", execd_image="busybox:latest") + with pytest.raises(ValueError, match="server.workers must be 1"): + AppConfig(server=server_cfg, runtime=runtime_cfg) + + +def test_docker_runtime_allows_single_worker(): + server_cfg = ServerConfig(workers=1) + runtime_cfg = RuntimeConfig(type="docker", execd_image="busybox:latest") + cfg = AppConfig(server=server_cfg, runtime=runtime_cfg) + assert cfg.server.workers == 1 + + +def test_kubernetes_runtime_allows_multiple_workers(): + server_cfg = ServerConfig(workers=4) + runtime_cfg = RuntimeConfig(type="kubernetes", execd_image="busybox:latest") + cfg = AppConfig(server=server_cfg, runtime=runtime_cfg) + assert cfg.server.workers == 4 + + def test_server_config_defaults_include_max_sandbox_timeout(): server_cfg = ServerConfig() assert server_cfg.max_sandbox_timeout_seconds is None From ae74253a66ba15538f8f9e93f9af96063d89df34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 18 May 2026 14:30:55 +0800 Subject: [PATCH 29/58] refactor(server): drop workers knob, make limit_concurrency disable-able from TOML Remove the [server].workers field. Multi-worker mode exposed too many foot-guns (per-process Docker expiration timers racing on renew, k8s informer cache divergence, import-time side effects in the supervisor) and the supported way to scale on Kubernetes is replica count, not in-process worker fan-out. uvicorn now runs single-process; the deferred-import comment in cli.py is kept for the reload supervisor. Fix [server].limit_concurrency so the documented disable path actually works from TOML. TOML has no null literal, so Optional[int] could not be set to None: the field now accepts 0 as a sentinel and a field_validator collapses it to None before uvicorn sees it. Default 1024 is unchanged. 
Co-Authored-By: Claude Opus 4.7 --- server/configuration.md | 3 +- server/opensandbox_server/cli.py | 11 ++----- server/opensandbox_server/config.py | 43 ++++++++------------------ server/tests/test_config.py | 48 +++++++++-------------------- 4 files changed, 29 insertions(+), 76 deletions(-) diff --git a/server/configuration.md b/server/configuration.md index 8ed8382e7..e2531bcf3 100644 --- a/server/configuration.md +++ b/server/configuration.md @@ -66,8 +66,7 @@ Example files in this repository: | `eip` | string \| omitted | `null` | Public IP or hostname used as the **host part** when the server returns sandbox endpoint URLs (notably Docker runtime). | | `max_sandbox_timeout_seconds` | integer \| omitted | `null` | Upper bound on sandbox TTL in seconds for **create** requests that specify `timeout`. Must be ≥ **60** if set. Omit to disable the server-side cap. | | `timeout_keep_alive` | integer | `30` | Idle keep-alive timeout (seconds) passed to uvicorn. | -| `workers` | integer | `1` | Number of uvicorn worker processes. Each worker is a separate Python process with its own event loop and (under the Kubernetes runtime) its own informer watch streams to the apiserver. Default `1` keeps apiserver pressure predictable; bump to 2–8 based on CPU quota and apiserver capacity. Ignored when `--reload` is set. **Must be `1` when `runtime.type = "docker"`** — Docker expiration timers are per-process, so multiple workers race on renewals and can expire renewed sandboxes early. | -| `limit_concurrency` | integer \| omitted | `1024` | Maximum concurrent connections per worker before returning 503. Provides backpressure protection under burst load. Omit to disable. | +| `limit_concurrency` | integer | `1024` | Maximum concurrent connections before returning 503. Provides backpressure protection under burst load. Set to `0` to disable the cap (TOML cannot express `null`). | | `backlog` | integer | `2048` | Socket listen backlog passed to uvicorn. 
| | `thread_pool_size` | integer | `200` | Maximum size of the anyio default threadpool used by FastAPI to run sync route handlers. The anyio default of 40 throttles bursts of blocking sandbox list/get/delete operations under high concurrency. | | `loop` | `"auto"` \| `"uvloop"` \| `"asyncio"` | `"auto"` | Event loop implementation. `auto` prefers uvloop and falls back to asyncio. | diff --git a/server/opensandbox_server/cli.py b/server/opensandbox_server/cli.py index ce45f973c..e2a7a7b5d 100644 --- a/server/opensandbox_server/cli.py +++ b/server/opensandbox_server/cli.py @@ -288,18 +288,12 @@ def main() -> None: # Load config + logging without importing opensandbox_server.main: importing # main eagerly constructs sandbox_service (restoring containers and starting - # expiration timers) in this process. With workers > 1 that leaks timers to - # the uvicorn supervisor and (on spawn) duplicates them across workers. + # expiration timers), which we defer to the actual worker process so the + # uvicorn reloader supervisor does not run them. app_config = load_config() log_config = configure_logging(app_config.log) server_cfg = app_config.server - workers = 1 if args.reload else server_cfg.workers - if args.reload and server_cfg.workers > 1: - print( - f"--reload set; ignoring workers={server_cfg.workers}, using 1\n" - ) - uvicorn.run( "opensandbox_server.main:app", host=server_cfg.host, @@ -307,7 +301,6 @@ def main() -> None: reload=args.reload, log_config=log_config, timeout_keep_alive=server_cfg.timeout_keep_alive, - workers=workers, limit_concurrency=server_cfg.limit_concurrency, backlog=server_cfg.backlog, loop=server_cfg.loop, diff --git a/server/opensandbox_server/config.py b/server/opensandbox_server/config.py index 6feb8a2c5..369c716d7 100644 --- a/server/opensandbox_server/config.py +++ b/server/opensandbox_server/config.py @@ -453,27 +453,23 @@ class ServerConfig(BaseModel): "Connections idle longer than this may be closed by the server." 
), ) - workers: int = Field( - default=1, - ge=1, - description=( - "Number of uvicorn worker processes. Each worker is a separate " - "Python process with its own event loop and (under the Kubernetes " - "runtime) its own informer watch streams to the apiserver. " - "Default 1 to keep apiserver pressure predictable; bump to 2-8 " - "based on CPU quota and apiserver capacity. Ignored when " - "--reload is set. Must be 1 when runtime.type = 'docker' " - "because Docker expiration timers live in process-local state." - ), - ) limit_concurrency: Optional[int] = Field( default=1024, - ge=1, + ge=0, description=( - "Maximum concurrent connections per worker before returning 503. " - "Set null to disable. Provides backpressure protection under burst load." + "Maximum concurrent connections before returning 503. " + "Set to 0 to disable (TOML cannot express null). " + "Provides backpressure protection under burst load." ), ) + + @field_validator("limit_concurrency", mode="after") + @classmethod + def _zero_disables_limit_concurrency(cls, value: Optional[int]) -> Optional[int]: + # Translate the TOML-friendly sentinel 0 into None so uvicorn applies + # no concurrency cap. TOML has no null literal, so 0 is the only way + # to disable the limit from the config file. + return None if value == 0 else value backlog: int = Field( default=2048, ge=1, @@ -897,21 +893,6 @@ def validate_runtime_blocks(self) -> "AppConfig": raise ValueError("ingress.mode must be 'direct' when runtime.type = 'docker'.") if self.secure_runtime is not None and self.secure_runtime.type == "firecracker": raise ValueError( "secure_runtime.type 'firecracker' is only compatible with runtime.type='kubernetes'.") - # The Docker service tracks sandbox expirations with in-process - # threading.Timer objects keyed by sandbox_id. 
Each uvicorn worker - # would build an independent DockerSandboxService with its own - # timers, so a renewal handled by one worker would not cancel the - # stale timer in another worker — that worker would still expire - # the sandbox at the pre-renewal time. Refuse to start until the - # operator either drops back to a single worker or the Docker - # runtime grows shared expiration state. - if self.server.workers > 1: - raise ValueError( - "server.workers must be 1 when runtime.type = 'docker'; " - "Docker expiration timers are per-process, so multiple " - "workers can race on renew_expiration and expire renewed " - "sandboxes early." - ) elif self.runtime.type == "kubernetes": if self.kubernetes is None: self.kubernetes = KubernetesRuntimeConfig() diff --git a/server/tests/test_config.py b/server/tests/test_config.py index 5aa667c0c..2dd6e6e79 100644 --- a/server/tests/test_config.py +++ b/server/tests/test_config.py @@ -159,39 +159,14 @@ def test_docker_runtime_disallows_kubernetes_block(): AppConfig(server=server_cfg, runtime=runtime_cfg, kubernetes=kubernetes_cfg) -def test_docker_runtime_rejects_multiple_workers(): - """Docker runtime keeps expiration timers in process-local state, so - workers > 1 would race on renew_expiration and expire renewed sandboxes - early. 
Reject the combination at config validation time.""" - server_cfg = ServerConfig(workers=2) - runtime_cfg = RuntimeConfig(type="docker", execd_image="busybox:latest") - with pytest.raises(ValueError, match="server.workers must be 1"): - AppConfig(server=server_cfg, runtime=runtime_cfg) - - -def test_docker_runtime_allows_single_worker(): - server_cfg = ServerConfig(workers=1) - runtime_cfg = RuntimeConfig(type="docker", execd_image="busybox:latest") - cfg = AppConfig(server=server_cfg, runtime=runtime_cfg) - assert cfg.server.workers == 1 - - -def test_kubernetes_runtime_allows_multiple_workers(): - server_cfg = ServerConfig(workers=4) - runtime_cfg = RuntimeConfig(type="kubernetes", execd_image="busybox:latest") - cfg = AppConfig(server=server_cfg, runtime=runtime_cfg) - assert cfg.server.workers == 4 - - def test_server_config_defaults_include_max_sandbox_timeout(): server_cfg = ServerConfig() assert server_cfg.max_sandbox_timeout_seconds is None def test_server_config_uvicorn_tuning_defaults(): - """ServerConfig exposes uvicorn worker/concurrency knobs with sensible defaults.""" + """ServerConfig exposes uvicorn concurrency knobs with sensible defaults.""" server_cfg = ServerConfig() - assert server_cfg.workers == 1 assert server_cfg.limit_concurrency == 1024 assert server_cfg.backlog == 2048 assert server_cfg.thread_pool_size == 200 @@ -201,29 +176,34 @@ def test_server_config_uvicorn_tuning_defaults(): def test_server_config_uvicorn_tuning_overrides(): server_cfg = ServerConfig( - workers=8, limit_concurrency=256, backlog=4096, loop="uvloop", http="httptools", ) - assert server_cfg.workers == 8 assert server_cfg.limit_concurrency == 256 assert server_cfg.backlog == 4096 assert server_cfg.loop == "uvloop" assert server_cfg.http == "httptools" -def test_server_config_workers_must_be_positive(): - with pytest.raises(ValidationError): - ServerConfig(workers=0) +def test_server_config_limit_concurrency_zero_disables_cap(): + """0 is the TOML-friendly disable 
sentinel and must collapse to None so + uvicorn applies no concurrency limit.""" + cfg = ServerConfig(limit_concurrency=0) + assert cfg.limit_concurrency is None -def test_server_config_limit_concurrency_must_be_positive_when_set(): - with pytest.raises(ValidationError): - ServerConfig(limit_concurrency=0) +def test_server_config_limit_concurrency_accepts_none_and_positive(): cfg = ServerConfig(limit_concurrency=None) assert cfg.limit_concurrency is None + cfg = ServerConfig(limit_concurrency=512) + assert cfg.limit_concurrency == 512 + + +def test_server_config_limit_concurrency_rejects_negative(): + with pytest.raises(ValidationError): + ServerConfig(limit_concurrency=-1) def test_server_config_backlog_must_be_positive(): From e462eb1ee5870d2346230cfc409803e90ee23143 Mon Sep 17 00:00:00 2001 From: "panqingyu.5" Date: Mon, 18 May 2026 17:50:35 +0800 Subject: [PATCH 30/58] fix(server): handle null spec.template in pool-mode BatchSandbox MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pool mode (extensions.poolRef), BatchSandbox CRs have spec.poolRef and spec.taskTemplate but no spec.template. The K8s API returns the CR with spec.template: null because the CRD declares it as an optional preserve-unknown-fields object. _extract_platform_from_workload did spec.get("template", {}).get("spec"), which only returns {} when the key is absent — not when its value is None — so the second .get() crashes with 'NoneType' object has no attribute 'get'. The sandbox is actually created and reaches Running, but the server fails to build the response and returns HTTP 500 to the client. Fix: treat null and missing as the same case when reading template / podTemplate. Same pattern is already used correctly in _build_sandbox_from_workload (workload_mapper.py:50). Add regression tests covering pool mode (null template, missing template), template-mode platform extraction, podTemplate alias, null spec, and empty workload. 
--- .../services/k8s/workload_mapper.py | 8 +- server/tests/k8s/test_workload_mapper.py | 105 ++++++++++++++++++ 2 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 server/tests/k8s/test_workload_mapper.py diff --git a/server/opensandbox_server/services/k8s/workload_mapper.py b/server/opensandbox_server/services/k8s/workload_mapper.py index 6a7297c3d..40e1310cf 100644 --- a/server/opensandbox_server/services/k8s/workload_mapper.py +++ b/server/opensandbox_server/services/k8s/workload_mapper.py @@ -84,10 +84,12 @@ def _build_sandbox_from_workload(workload: Any, workload_provider: Any) -> Sandb def _extract_platform_from_workload(workload: Any) -> Optional[PlatformSpec]: if isinstance(workload, dict): - spec = workload.get("spec", {}) + spec = workload.get("spec") or {} + template = spec.get("template") or {} + pod_template = spec.get("podTemplate") or {} pod_spec = ( - spec.get("template", {}).get("spec") - or spec.get("podTemplate", {}).get("spec") + (template.get("spec") if isinstance(template, dict) else None) + or (pod_template.get("spec") if isinstance(pod_template, dict) else None) or {} ) else: diff --git a/server/tests/k8s/test_workload_mapper.py b/server/tests/k8s/test_workload_mapper.py new file mode 100644 index 000000000..657086516 --- /dev/null +++ b/server/tests/k8s/test_workload_mapper.py @@ -0,0 +1,105 @@ +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from opensandbox_server.services.k8s.workload_mapper import ( + _extract_platform_from_workload, +) + + +class TestExtractPlatformFromWorkload: + """Regression tests for _extract_platform_from_workload. + + The BatchSandbox CRD declares spec.template as an optional preserve-unknown-fields + object. In pool mode, the BatchSandbox CR is created with only ``poolRef`` and + ``taskTemplate`` under spec; the Kubernetes API server may then return the object + with ``spec.template`` explicitly set to ``None`` (because the field is part of the + schema but unset). Earlier code did ``spec.get("template", {}).get("spec")`` which + crashed in that case because the default ``{}`` is only returned when the key is + absent, not when its value is ``None``. + """ + + def test_pool_mode_workload_with_null_template_returns_none(self): + """Pool-mode BatchSandbox CR has spec.template == None; must not crash.""" + workload = { + "metadata": {"name": "sb-1", "namespace": "opensandbox-system"}, + "spec": { + "replicas": 1, + "poolRef": "pool-runc", + "template": None, # <-- this used to crash + "taskTemplate": {}, + }, + "status": {"replicas": 1, "ready": 1, "allocated": 1}, + } + # Should return None (no platform info), not raise. 
+ assert _extract_platform_from_workload(workload) is None + + def test_pool_mode_workload_without_template_key_returns_none(self): + """Pool-mode BatchSandbox CR may also omit spec.template entirely.""" + workload = { + "metadata": {"name": "sb-1"}, + "spec": { + "replicas": 1, + "poolRef": "pool-runc", + }, + } + assert _extract_platform_from_workload(workload) is None + + def test_template_mode_with_full_platform_returns_platform(self): + """Template-mode workload with nodeSelector returns the declared platform.""" + workload = { + "metadata": {"name": "sb-1"}, + "spec": { + "replicas": 1, + "template": { + "spec": { + "nodeSelector": { + "kubernetes.io/os": "linux", + "kubernetes.io/arch": "amd64", + }, + }, + }, + }, + } + platform = _extract_platform_from_workload(workload) + assert platform is not None + assert platform.os == "linux" + assert platform.arch == "amd64" + + def test_pod_template_alias_still_works(self): + """Some workload types use ``podTemplate`` instead of ``template``.""" + workload = { + "spec": { + "podTemplate": { + "spec": { + "nodeSelector": { + "kubernetes.io/os": "linux", + "kubernetes.io/arch": "arm64", + }, + }, + }, + }, + } + platform = _extract_platform_from_workload(workload) + assert platform is not None + assert platform.os == "linux" + assert platform.arch == "arm64" + + def test_null_spec_returns_none(self): + """spec itself being None must not crash.""" + workload = {"metadata": {"name": "sb-1"}, "spec": None} + assert _extract_platform_from_workload(workload) is None + + def test_empty_workload_returns_none(self): + workload = {} + assert _extract_platform_from_workload(workload) is None From de4414aa5a8dda139282cde67cda8a07c0bad374 Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Mon, 18 May 2026 18:38:11 +0800 Subject: [PATCH 31/58] fix(deps): pin fast-uri to 3.1.2 to close GHSA-q3j6-qgpj-74h6 / GHSA-v39h-62p7-jpjc (#909) Add a pnpm override so the transitive `fast-uri` (pulled in 
via `openapi-typescript > @redocly/openapi-core > @redocly/ajv`) resolves to 3.1.2. The previously locked 3.1.0 is affected by two high-severity advisories: - CVE-2026-6321 (GHSA-q3j6-qgpj-74h6): path traversal via percent-encoded dot segments in `normalize()` / `equal()`. - CVE-2026-6322 (GHSA-v39h-62p7-jpjc): host confusion via percent-encoded authority delimiters. `pnpm audit` now reports 0 vulnerabilities for the sdks workspace. `docs/` and `tests/javascript/` audits are already clean. Co-authored-by: Claude Opus 4.7 --- sdks/package.json | 3 ++- sdks/pnpm-lock.yaml | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sdks/package.json b/sdks/package.json index a7732a545..110323d80 100644 --- a/sdks/package.json +++ b/sdks/package.json @@ -17,7 +17,8 @@ "picomatch@^4.0.0": "4.0.4", "brace-expansion@^1.0.0": "1.1.13", "brace-expansion@^2.0.0": "2.0.3", - "flatted@^3.0.0": "3.4.2" + "flatted@^3.0.0": "3.4.2", + "fast-uri@^3.0.0": "3.1.2" } }, "devDependencies": { diff --git a/sdks/pnpm-lock.yaml b/sdks/pnpm-lock.yaml index a9434f93a..43612f5fc 100644 --- a/sdks/pnpm-lock.yaml +++ b/sdks/pnpm-lock.yaml @@ -11,6 +11,7 @@ overrides: brace-expansion@^1.0.0: 1.1.13 brace-expansion@^2.0.0: 2.0.3 flatted@^3.0.0: 3.4.2 + fast-uri@^3.0.0: 3.1.2 importers: @@ -695,8 +696,8 @@ packages: fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} - fast-uri@3.1.0: - resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==} + fast-uri@3.1.2: + resolution: {integrity: sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==} fdir@6.5.0: resolution: {integrity: sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==} @@ -1263,7 +1264,7 @@ snapshots: '@redocly/ajv@8.17.1': dependencies: fast-deep-equal: 3.1.3 - fast-uri: 3.1.0 + 
fast-uri: 3.1.2 json-schema-traverse: 1.0.0 require-from-string: 2.0.2 @@ -1650,7 +1651,7 @@ snapshots: fast-levenshtein@2.0.6: {} - fast-uri@3.1.0: {} + fast-uri@3.1.2: {} fdir@6.5.0(picomatch@4.0.4): optionalDependencies: From a78ca3c7a6b091a26c5e9761efe082955547a856 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 18 May 2026 14:12:30 +0000 Subject: [PATCH 32/58] chore(chart): bump opensandbox-server image to v0.1.14 --- kubernetes/charts/opensandbox-server/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index c1891dcc1..c832d5153 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -25,7 +25,7 @@ server: # -- Server image configuration image: repository: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/server - tag: "v0.1.13" + tag: "v0.1.14" # -- Number of server replicas replicaCount: 2 From 4f7926824d38f2c65812f93f89162e4a3305ce6a Mon Sep 17 00:00:00 2001 From: yoogo Date: Mon, 18 May 2026 15:41:52 +0800 Subject: [PATCH 33/58] feat(chart): support volumes and volumeMounts in opensandbox-server --- kubernetes/charts/opensandbox-server/templates/server.yaml | 6 ++++++ kubernetes/charts/opensandbox-server/values.yaml | 2 ++ 2 files changed, 8 insertions(+) diff --git a/kubernetes/charts/opensandbox-server/templates/server.yaml b/kubernetes/charts/opensandbox-server/templates/server.yaml index b2b5def22..2dcf1d7fa 100644 --- a/kubernetes/charts/opensandbox-server/templates/server.yaml +++ b/kubernetes/charts/opensandbox-server/templates/server.yaml @@ -114,6 +114,9 @@ spec: mountPath: /etc/opensandbox/config.toml subPath: config.toml readOnly: true + {{- with .Values.server.volumeMounts }} + {{- toYaml . 
| nindent 12 }} + {{- end }} livenessProbe: httpGet: path: /health @@ -134,6 +137,9 @@ spec: - name: config configMap: name: {{ include "opensandbox-server.fullname" . }}-config + {{- with .Values.server.volumes }} + {{- toYaml . | nindent 8 }} + {{- end }} {{- with .Values.server.affinity }} affinity: {{- toYaml . | nindent 8 }} diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index 8f5d74064..dd65e9de9 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -41,6 +41,8 @@ server: tolerations: [] affinity: {} + volumeMounts: [] + volumes: [] # Gateway (components/ingress): when enabled, writes config [ingress] and deploys the gateway gateway: From 31484053a532b0be2c17b17e2f50fe95bdf6ba6e Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Tue, 19 May 2026 09:59:13 +0800 Subject: [PATCH 34/58] fix(execd): defer SSE response headers until first event fires (#912) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test(e2e): harden flaky cross-SDK sandbox tests Recurring failures in `real-e2e.yml` traced to three races: 1. Go `TestE2E_FullLifecycle` pings execd via the server proxy the moment sandbox state flips to Running; execd's TCP listener can accept before routes register, surfacing as `read: connection reset by peer` on `/proxy/44772/ping`. Wrap Ping + the first SSE RunCommand in `require.Eventually` retries. 2. Go `TestE2E_PauseResume` and `TestFilesystem_SetPermissions` hit `opensandbox: empty sse stream` immediately after Resume / readiness. Add `runCommandWithRetry` helper in base_e2e_test.go and use it in the affected call sites. 3. 
Java/C# NetworkPolicy tests asserted egress blocking after a fixed `Thread.sleep`/`Task.Delay`; the sidecar accepts the sandbox before iptables/proxy rules apply, so curl occasionally succeeds and `assertNotNull(error)` fails. Replace fixed sleeps with `waitUntilEgressBlocks` / `WaitUntilEgressBlocksAsync` helpers that poll curl until the policy actually blocks (or fail with the last observation). Also retry transient single-line stdout drops on the workingDirectory `pwd` checks (JS, Java) and the C# env-injection baseline — same SSE blank-line / first-event race that has bitten multiple SDKs. Co-Authored-By: Claude Opus 4.7 * fix(execd): defer SSE response headers until first event fires `RunCommand`, `RunCode`, and `RunInSession` previously committed the response as `text/event-stream` (via `setupSSEResponse`) before invoking the runtime. If the runtime returned a synchronous error — for example because `stdLogDescriptor` could not (re)create `/tmp` after a sandbox restart, `pathutil.ExpandPathWithEnv` failed to resolve a working directory under transient env conditions, or `buildCredential` raced with a uid/gid lookup — the handler then called `RespondError`, which set Content-Type to `application/json` and wrote a JSON body on top of the already-committed event-stream response. Clients saw HTTP 200 with `text/event-stream` and a JSON body that no SSE parser could decode, producing zero events; the Go SDK reported `opensandbox: empty sse stream` and JS/Java/C# SDKs surfaced the same race as missing init events or a vanished single-line stdout (e.g. `pwd` with `workingDirectory: "/tmp"` returning empty `stdout[0]`). Make `setupSSEResponse` idempotent (guarded by `sync.Once`) and call it lazily from `writeSingleEvent`, so headers commit only once an event is actually being written. Drop the eager `setupSSEResponse` calls in the three streaming endpoints. 
Pre-execution synchronous errors now flow through `RespondError` cleanly with `application/json`, and successful runs still emit `text/event-stream` on the first event. Add three regression tests: - `TestRunCodeSyncErrorEmitsJSONNotSSE` - `TestRunInSessionSyncErrorEmitsJSONNotSSE` - `TestRunCodeSuccessStillEmitsSSE` Co-Authored-By: Claude Opus 4.7 * test(e2e): drop SDK-side retries now that execd SSE bug is fixed The execd "empty sse stream" / dropped-event race is fixed at the source in the previous commit (lazy SSE headers so synchronous runtime errors return JSON, not a half-formed event-stream). Real SDK clients do not retry these failures, so the e2e suite shouldn't either — the retries were masking the bug, not exercising production behaviour. Revert the SDK-side workarounds: - JS `pwd workingDirectory:/tmp` retry loop - Java echo + pwd retry loops - C# env-injection `RunWithRetryAsync` wrapper - Go `runCommandWithRetry` helper and its callers in `TestE2E_PauseResume` and `TestFilesystem_SetPermissions` Keep the targeted polls that cover other races not addressed by the execd fix: - Go `TestE2E_FullLifecycle` execd-Ping `Eventually` — bypasses the high-level SDK and pings the server-side proxy directly; the proxy drops the very first connection in the gap between sandbox state Running and execd routes registering. Real users go through `CreateSandbox`/`WaitUntilReady`, which already handles this; the low-level test does not. - Java/C# `waitUntilEgressBlocks` polls — egress sidecar policy is applied asynchronously after the sandbox is marked ready, so a fixed sleep is inherently flaky. This is a separate readiness-gating bug that should be addressed server-side. 
Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- components/execd/pkg/web/controller/basic.go | 4 +- .../pkg/web/controller/codeinterpreting.go | 8 +- .../web/controller/codeinterpreting_test.go | 88 +++++++++++++++++++ .../execd/pkg/web/controller/command.go | 4 +- components/execd/pkg/web/controller/sse.go | 24 +++-- .../OpenSandbox.E2ETests/SandboxE2ETests.cs | 37 +++++++- tests/go/e2e_test.go | 13 ++- .../opensandbox/e2e/SandboxE2ETest.java | 56 ++++++++---- 8 files changed, 200 insertions(+), 34 deletions(-) diff --git a/components/execd/pkg/web/controller/basic.go b/components/execd/pkg/web/controller/basic.go index b33c660b0..2008fd63c 100644 --- a/components/execd/pkg/web/controller/basic.go +++ b/components/execd/pkg/web/controller/basic.go @@ -18,6 +18,7 @@ import ( "encoding/json" "net/http" "strconv" + "sync" "github.com/gin-gonic/gin" @@ -25,7 +26,8 @@ import ( ) type basicController struct { - ctx *gin.Context + ctx *gin.Context + sseSetupOnce sync.Once } func newBasicController(ctx *gin.Context) *basicController { diff --git a/components/execd/pkg/web/controller/codeinterpreting.go b/components/execd/pkg/web/controller/codeinterpreting.go index 93680e3f4..9b369d7d8 100644 --- a/components/execd/pkg/web/controller/codeinterpreting.go +++ b/components/execd/pkg/web/controller/codeinterpreting.go @@ -171,7 +171,9 @@ func (c *CodeInterpretingController) RunCode() { } runCodeRequest.Hooks = eventsHandler - c.setupSSEResponse() + // SSE headers are committed lazily on the first event write + // (see writeSingleEvent), so a synchronous error from Execute below can + // still be surfaced as a structured JSON error response. 
err = codeRunner.Execute(runCodeRequest) if err != nil { recordExecution("failure") @@ -400,7 +402,9 @@ func (c *CodeInterpretingController) RunInSession() { } runReq.Hooks = hooks - c.setupSSEResponse() + // SSE headers are committed lazily on the first event write + // (see writeSingleEvent), so a synchronous error from + // RunInBashSession can still be surfaced as a structured JSON error. err := codeRunner.RunInBashSession(ctx, runReq) if err != nil { recordExecution("failure") diff --git a/components/execd/pkg/web/controller/codeinterpreting_test.go b/components/execd/pkg/web/controller/codeinterpreting_test.go index 2beec41e5..af3c343b2 100644 --- a/components/execd/pkg/web/controller/codeinterpreting_test.go +++ b/components/execd/pkg/web/controller/codeinterpreting_test.go @@ -17,6 +17,7 @@ package controller import ( "context" "encoding/json" + "errors" "net/http" "testing" "time" @@ -333,3 +334,90 @@ func TestRunInSessionReturnsBeforeGracefulShutdownTimeoutAfterImmediateError(t * require.Equal(t, http.StatusOK, w.Code) require.Less(t, elapsed, flag.ApiGracefulShutdownTimeout/2) } + +// TestRunCodeSyncErrorEmitsJSONNotSSE guards against regression of the bug +// where Execute returning a synchronous error after setupSSEResponse caused +// the client to receive a text/event-stream response with a JSON body, which +// SDKs parsed as zero events ("empty sse stream"). Headers must stay +// uncommitted until the first event so RespondError can produce a proper +// application/json error response. 
+func TestRunCodeSyncErrorEmitsJSONNotSSE(t *testing.T) { + previousRunner := codeRunner + codeRunner = &fakeCodeRunner{ + execute: func(_ *runtime.ExecuteCodeRequest) error { + return errors.New("synchronous runtime failure") + }, + } + t.Cleanup(func() { codeRunner = previousRunner }) + + body := []byte(`{"code":"print(1)","context":{"id":"ctx-1","language":"python"}}`) + ctx, w := newTestContext(http.MethodPost, "/code/run", body) + ctrl := NewCodeInterpretingController(ctx) + + ctrl.RunCode() + + require.Equal(t, http.StatusInternalServerError, w.Code) + contentType := w.Header().Get("Content-Type") + require.Contains(t, contentType, "application/json", "should not commit text/event-stream when no event fires") + + var resp model.ErrorResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + require.Equal(t, model.ErrorCodeRuntimeError, resp.Code) + require.Contains(t, resp.Message, "synchronous runtime failure") +} + +// TestRunInSessionSyncErrorEmitsJSONNotSSE — see TestRunCodeSyncErrorEmitsJSONNotSSE. 
+func TestRunInSessionSyncErrorEmitsJSONNotSSE(t *testing.T) { + previousRunner := codeRunner + codeRunner = &fakeCodeRunner{ + runInBashSession: func(_ context.Context, _ *runtime.ExecuteCodeRequest) error { + return errors.New("synchronous session failure") + }, + } + t.Cleanup(func() { codeRunner = previousRunner }) + + body := []byte(`{"command":"echo hi","timeout":0}`) + ctx, w := newTestContext(http.MethodPost, "/sessions/session-1/run", body) + ctx.Params = append(ctx.Params, gin.Param{Key: "sessionId", Value: "session-1"}) + ctrl := NewCodeInterpretingController(ctx) + + ctrl.RunInSession() + + require.Equal(t, http.StatusInternalServerError, w.Code) + contentType := w.Header().Get("Content-Type") + require.Contains(t, contentType, "application/json", "should not commit text/event-stream when no event fires") + + var resp model.ErrorResponse + require.NoError(t, json.Unmarshal(w.Body.Bytes(), &resp)) + require.Equal(t, model.ErrorCodeRuntimeError, resp.Code) + require.Contains(t, resp.Message, "synchronous session failure") +} + +// TestRunCodeSuccessStillEmitsSSE confirms the lazy header path still produces +// a text/event-stream response when at least one event fires. 
+func TestRunCodeSuccessStillEmitsSSE(t *testing.T) { + previousRunner := codeRunner + previousTimeout := flag.ApiGracefulShutdownTimeout + codeRunner = &fakeCodeRunner{ + execute: func(request *runtime.ExecuteCodeRequest) error { + request.Hooks.OnExecuteInit("session-1") + request.Hooks.OnExecuteComplete(time.Millisecond) + return nil + }, + } + flag.ApiGracefulShutdownTimeout = 50 * time.Millisecond + t.Cleanup(func() { + codeRunner = previousRunner + flag.ApiGracefulShutdownTimeout = previousTimeout + }) + + body := []byte(`{"code":"print(1)","context":{"id":"ctx-1","language":"python"}}`) + ctx, w := newTestContext(http.MethodPost, "/code/run", body) + ctrl := NewCodeInterpretingController(ctx) + + ctrl.RunCode() + + require.Equal(t, http.StatusOK, w.Code) + require.Contains(t, w.Header().Get("Content-Type"), "text/event-stream") + require.NotEmpty(t, w.Body.Bytes(), "successful run should write SSE events") +} diff --git a/components/execd/pkg/web/controller/command.go b/components/execd/pkg/web/controller/command.go index f89e570f4..9c21ec6fe 100644 --- a/components/execd/pkg/web/controller/command.go +++ b/components/execd/pkg/web/controller/command.go @@ -80,7 +80,9 @@ func (c *CodeInterpretingController) RunCommand() { } runCodeRequest.Hooks = eventsHandler - c.setupSSEResponse() + // SSE headers are committed lazily on the first event write + // (see writeSingleEvent), so a synchronous error from Execute below can + // still be surfaced as a structured JSON error response. 
err = codeRunner.Execute(runCodeRequest) if err != nil { recordExecution("failure") diff --git a/components/execd/pkg/web/controller/sse.go b/components/execd/pkg/web/controller/sse.go index 573315260..29dddb9d2 100644 --- a/components/execd/pkg/web/controller/sse.go +++ b/components/execd/pkg/web/controller/sse.go @@ -36,13 +36,21 @@ var sseHeaders = map[string]string{ "X-Accel-Buffering": "no", } +// setupSSEResponse is idempotent: once headers are committed, subsequent calls +// no-op. Callers that need the headers up front (e.g. long-running streaming +// endpoints with no early-error path) can call it explicitly. Endpoints that +// may fail synchronously before any event fires should leave header commit to +// the lazy path inside writeSingleEvent so pre-execution errors can return a +// proper JSON body instead of a half-formed text/event-stream response. func (c *basicController) setupSSEResponse() { - for key, value := range sseHeaders { - c.ctx.Writer.Header().Set(key, value) - } - if flusher, ok := c.ctx.Writer.(http.Flusher); ok { - flusher.Flush() - } + c.sseSetupOnce.Do(func() { + for key, value := range sseHeaders { + c.ctx.Writer.Header().Set(key, value) + } + if flusher, ok := c.ctx.Writer.(http.Flusher); ok { + flusher.Flush() + } + }) } // setServerEventsHandler adapts runtime callbacks to SSE events. @@ -167,6 +175,10 @@ func (c *CodeInterpretingController) writeSingleEvent(handler string, data []byt c.chunkWriter.Lock() defer c.chunkWriter.Unlock() + // Lazily commit SSE response headers on the first event. This lets the + // surrounding handler return a proper JSON error via RespondError if the + // runtime fails synchronously before any event fires. 
+ c.setupSSEResponse() defer func() { if flusher, ok := c.ctx.Writer.(http.Flusher); ok { flusher.Flush() diff --git a/tests/csharp/OpenSandbox.E2ETests/SandboxE2ETests.cs b/tests/csharp/OpenSandbox.E2ETests/SandboxE2ETests.cs index 580748a83..b38daf351 100644 --- a/tests/csharp/OpenSandbox.E2ETests/SandboxE2ETests.cs +++ b/tests/csharp/OpenSandbox.E2ETests/SandboxE2ETests.cs @@ -163,7 +163,7 @@ public async Task Sandbox_Create_With_NetworkPolicy_Get_And_Patch_Egress() try { - await Task.Delay(5000); + await WaitUntilEgressBlocksAsync(policySandbox, "https://www.github.com", TimeSpan.FromSeconds(30)); var initialPolicy = await policySandbox.GetEgressPolicyAsync(); Assert.NotNull(initialPolicy); @@ -184,7 +184,7 @@ await policySandbox.PatchEgressRulesAsync(new List new() { Action = NetworkRuleAction.Allow, Target = "www.github.com" }, new() { Action = NetworkRuleAction.Deny, Target = "pypi.org" } }); - await Task.Delay(2000); + await WaitUntilEgressBlocksAsync(policySandbox, "https://pypi.org", TimeSpan.FromSeconds(30)); var patchedPolicy = await policySandbox.GetEgressPolicyAsync(); Assert.NotNull(patchedPolicy.Egress); @@ -233,7 +233,7 @@ public async Task Sandbox_Create_With_NetworkPolicy_Get_And_Patch_Egress_Via_Ser try { - await Task.Delay(5000); + await WaitUntilEgressBlocksAsync(policySandbox, "https://www.github.com", TimeSpan.FromSeconds(30)); var egressEndpoint = await policySandbox.GetEndpointAsync(Constants.DefaultEgressPort); Assert.Contains( @@ -259,7 +259,7 @@ await policySandbox.PatchEgressRulesAsync(new List new() { Action = NetworkRuleAction.Allow, Target = "www.github.com" }, new() { Action = NetworkRuleAction.Deny, Target = "pypi.org" } }); - await Task.Delay(2000); + await WaitUntilEgressBlocksAsync(policySandbox, "https://pypi.org", TimeSpan.FromSeconds(30)); var patchedPolicy = await policySandbox.GetEgressPolicyAsync(); Assert.NotNull(patchedPolicy.Egress); @@ -1112,6 +1112,35 @@ private static async Task RunWithRetryAsync(Sandbox sandbox, 
string c } return result!; } + + /// <summary> + /// Polls curl against <paramref name="url"/> until the egress sidecar blocks + /// it (Execution.Error becomes non-null), or the timeout elapses. NetworkPolicy + /// sidecars sometimes accept connections before iptables/proxy rules apply, + /// so a fixed sleep is flaky. + /// </summary> + private static async Task WaitUntilEgressBlocksAsync(Sandbox sandbox, string url, TimeSpan timeout) + { + var deadline = DateTime.UtcNow + timeout; + Execution? last = null; + while (DateTime.UtcNow < deadline) + { + try + { + last = await sandbox.Commands.RunAsync($"curl -I {url}"); + if (last?.Error != null) + { + return; + } + } + catch + { + // Transient SDK/SSE errors during sidecar warmup — keep polling. + } + await Task.Delay(500); + } + Assert.Fail($"Egress policy did not block {url} within {timeout} (last error={last?.Error?.ToString() ?? "null"})"); + } } public sealed class SandboxE2ETestFixture : IAsyncLifetime diff --git a/tests/go/e2e_test.go b/tests/go/e2e_test.go index bc204b0da..631c571ac 100644 --- a/tests/go/e2e_test.go +++ b/tests/go/e2e_test.go @@ -117,8 +117,16 @@ func TestE2E_FullLifecycle(t *testing.T) { } execClient := opensandbox.NewExecdClient(execdURL, execToken) - err = execClient.Ping(ctx) - require.NoError(t, err) + // This test bypasses the SDK's high-level CreateSandbox helper (which calls + // WaitUntilReady) and pings execd directly through the server-side proxy. + // The state-Running flag is satisfied as soon as the container is up, but + // execd's HTTP routes may register a few ms later and the proxy can drop + // the very first connection it sees ("connection reset by peer"). Poll + // until ping succeeds — real users go through CreateSandbox which already + // handles this. + require.Eventually(t, func() bool { + return execClient.Ping(ctx) == nil + }, 30*time.Second, 500*time.Millisecond, "execd ping never succeeded") t.Log("Execd ping: OK") // 6. 
Test Execd — run a command with SSE streaming @@ -131,7 +139,6 @@ func TestE2E_FullLifecycle(t *testing.T) { return nil }) require.NoError(t, err) - t.Logf("Command raw output (%d bytes): %q", output.Len(), output.String()) // 7. Test Execd — file operations fileInfoMap, err := execClient.GetFileInfo(ctx, "/etc/os-release") diff --git a/tests/java/src/test/java/com/alibaba/opensandbox/e2e/SandboxE2ETest.java b/tests/java/src/test/java/com/alibaba/opensandbox/e2e/SandboxE2ETest.java index 5c51e0d5f..ba0449f27 100644 --- a/tests/java/src/test/java/com/alibaba/opensandbox/e2e/SandboxE2ETest.java +++ b/tests/java/src/test/java/com/alibaba/opensandbox/e2e/SandboxE2ETest.java @@ -270,11 +270,10 @@ void testSandboxCreateWithNetworkPolicy() { .readyTimeout(Duration.ofSeconds(60)) .networkPolicy(networkPolicy) .build(); - // Wait for NetworkPolicy sidecar to be fully initialized - try { - Thread.sleep(2000); - } catch (InterruptedException ignored) { - } + // Wait for NetworkPolicy sidecar to be fully initialized. + // The sidecar may accept the sandbox before iptables/proxy rules apply, + // so poll a denied target until the policy actually blocks it. + waitUntilEgressBlocks(policySandbox, "https://www.github.com", Duration.ofSeconds(30)); try { NetworkPolicy initialPolicy = policySandbox.getEgressPolicy(); @@ -319,10 +318,8 @@ void testSandboxCreateWithNetworkPolicy() { .target("pypi.org") .build())); - try { - Thread.sleep(2000); - } catch (InterruptedException ignored) { - } + // Poll until the patched rule takes effect (pypi now blocked). 
+ waitUntilEgressBlocks(policySandbox, "https://pypi.org", Duration.ofSeconds(30)); NetworkPolicy patchedPolicy = policySandbox.getEgressPolicy(); assertNotNull(patchedPolicy); @@ -393,10 +390,8 @@ void testSandboxCreateWithNetworkPolicyViaServerProxy() { .readyTimeout(Duration.ofSeconds(60)) .networkPolicy(networkPolicy) .build(); - try { - Thread.sleep(2000); - } catch (InterruptedException ignored) { - } + // Wait for NetworkPolicy sidecar/iptables rules to be active. + waitUntilEgressBlocks(policySandbox, "https://www.github.com", Duration.ofSeconds(30)); try { SandboxEndpoint egressEndpoint = policySandbox.getEndpoint(18080); @@ -447,10 +442,8 @@ void testSandboxCreateWithNetworkPolicyViaServerProxy() { .target("pypi.org") .build())); - try { - Thread.sleep(2000); - } catch (InterruptedException ignored) { - } + // Poll until patched rule applied (pypi now blocked). + waitUntilEgressBlocks(policySandbox, "https://pypi.org", Duration.ofSeconds(30)); NetworkPolicy patchedPolicy = policySandbox.getEgressPolicy(); assertNotNull(patchedPolicy.getEgress()); @@ -1597,4 +1590,33 @@ private Execution runWithRetry(Sandbox sandbox, String command, int maxAttempts, } return result; } + + /** + * Polls the sandbox running curl until the given URL is blocked by the + * network policy. Returns once curl reports an error (egress active), or + * fails the test if the timeout elapses. + */ + private void waitUntilEgressBlocks(Sandbox sandbox, String url, Duration timeout) { + long deadline = System.currentTimeMillis() + timeout.toMillis(); + Execution last = null; + while (System.currentTimeMillis() < deadline) { + try { + last = sandbox.commands().run( + RunCommandRequest.builder().command("curl -I " + url).build()); + if (last != null && last.getError() != null) { + return; + } + } catch (Exception ignored) { + // Transient SDK/SSE errors during sidecar warmup — keep polling. 
+ } + try { + Thread.sleep(500); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + } + fail("Egress policy did not block " + url + " within " + timeout + + " (last execution error=" + (last == null ? "null" : last.getError()) + ")"); + } } From c1d19b7c9dbb6ba464559f383b642e9dba5be3f2 Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Tue, 19 May 2026 10:50:01 +0800 Subject: [PATCH 35/58] fix(execd): preserve blank lines in command stdout SSE stream (#902) * fix(execd): preserve blank lines in command stdout SSE stream readFromPos previously skipped consecutive line terminators when the buffer was empty, dropping standalone newline lines from the SSE output. Emit "\n" for blank lines while keeping the existing line-content emit behavior (terminator is still stripped from non-empty lines). \r\n pairs are coalesced via lastWasCR to avoid duplicate blank emits. Adds a unit test covering blank lines, leading blank, and CRLF, and an end-to-end smoke test that runs `printf 'a\n\nb\n\n\nc\n'` and asserts the stdout event sequence preserves the blanks. Co-Authored-By: Claude Opus 4.7 * fix(execd): wait for stdout/stderr tail to flush before complete on Windows Windows runCommand fired OnExecuteComplete immediately after closing the done channel, racing the tail goroutines that emit pending stdout/stderr SSE events. Clients that break on execution_complete then missed final output, e.g. the blank-line smoke assertion saw an empty sequence while the server log showed the events were emitted. Mirror the Linux path: track the tail goroutines with a sync.WaitGroup and wg.Wait() after close(done) so all buffered output drains before the completion event is sent. Also covers the cmd.Start failure path. 
Co-Authored-By: Claude Opus 4.7 * fix(execd): persist CRLF state across tail polls readFromPos previously declared lastWasCR as a per-call local, so the CRLF coalescing only worked when \r and \n were read in the same invocation. When tailStdPipe polls a writer that flushes \r before \n (common on Windows/cmd), the \r and \n can land in separate polls; the second call starts with lastWasCR=false and emits a spurious "\n" blank line for the trailing \n. A bare blank \r\n line split across polls would surface as two blanks. Hoist the state into tailStdPipe and thread it through readFromPos so the CR detection survives between polls. Add regression tests covering split CRLF after content and split blank CRLF. Co-Authored-By: Claude Opus 4.7 * test(execd): make blank-lines smoke command cross-platform Replace POSIX printf with single-quoted format string with a python -c one-liner. cmd /C does not strip single quotes, so the previous command only worked on Windows runners that happened to have Git for Windows in PATH (MSYS2 argv pre-processing strips the quotes); on a bare Windows sandbox the smoke would fail before reaching the filesystem checks. Co-Authored-By: Claude Opus 4.7 * test(execd): branch blank-lines smoke per platform The previous python -c attempt produced a SyntaxError on Windows because Go's syscall.EscapeArg wraps the cmd /C argument in quotes, escaping the inner quotes as \". cmd /C strips the outer quotes (rule 2 of its parser) but leaves the literal \" inside, and MSVCRT's argv parser then treats \" as a literal double-quote character without toggling quote state, so the first embedded space terminates argv[2] and python sees an unterminated string literal. Use a cmd-native echo chain on Windows (no inner quotes, & is sequential) and keep POSIX printf on Linux/macOS. The execd reader collapses CRLF to LF, so both platforms yield the same event sequence. 
Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- .../execd/pkg/runtime/command_common.go | 31 +++++-- components/execd/pkg/runtime/command_test.go | 81 +++++++++++++++++-- .../execd/pkg/runtime/command_windows.go | 8 ++ components/execd/tests/smoke_api.py | 55 +++++++++++++ 4 files changed, 164 insertions(+), 11 deletions(-) diff --git a/components/execd/pkg/runtime/command_common.go b/components/execd/pkg/runtime/command_common.go index 03205e20c..804869b61 100644 --- a/components/execd/pkg/runtime/command_common.go +++ b/components/execd/pkg/runtime/command_common.go @@ -32,13 +32,14 @@ func (c *Controller) tailStdPipe(file string, onExecute func(text string), done defer ticker.Stop() mutex := &sync.Mutex{} + var lastWasCR bool for { select { case <-done: - c.readFromPos(mutex, file, lastPos, onExecute, true) + c.readFromPos(mutex, file, lastPos, onExecute, true, &lastWasCR) return case <-ticker.C: - newPos := c.readFromPos(mutex, file, lastPos, onExecute, false) + newPos := c.readFromPos(mutex, file, lastPos, onExecute, false, &lastWasCR) lastPos = newPos } } @@ -104,7 +105,9 @@ func (c *Controller) combinedOutputFileName(session string) string { } // readFromPos streams new content from a file starting at startPos. -func (c *Controller) readFromPos(mutex *sync.Mutex, filepath string, startPos int64, onExecute func(string), flushIncomplete bool) int64 { +// lastWasCR persists CRLF detection across calls so a \r\n pair split between +// two polls does not surface a spurious blank line for the trailing \n. 
+func (c *Controller) readFromPos(mutex *sync.Mutex, filepath string, startPos int64, onExecute func(string), flushIncomplete bool, lastWasCR *bool) int64 { if !mutex.TryLock() { return -1 } @@ -121,6 +124,15 @@ func (c *Controller) readFromPos(mutex *sync.Mutex, filepath string, startPos in reader := bufio.NewReader(file) var buffer bytes.Buffer var currentPos int64 = startPos + cr := false + if lastWasCR != nil { + cr = *lastWasCR + } + defer func() { + if lastWasCR != nil { + *lastWasCR = cr + } + }() for { b, err := reader.ReadByte() @@ -138,15 +150,22 @@ func (c *Controller) readFromPos(mutex *sync.Mutex, filepath string, startPos in // Check if it's a line terminator (\n or \r) if b == '\n' || b == '\r' { - // If buffer has content, output this line - if buffer.Len() > 0 { + switch { + case buffer.Len() > 0: + // Flush the line content without the terminator onExecute(buffer.String()) buffer.Reset() + case b == '\n' && cr: + // Second half of a \r\n pair; already emitted on \r + default: + // Standalone blank line; surface it so callers see the gap + onExecute("\n") } - // Skip line terminator + cr = (b == '\r') continue } + cr = false buffer.WriteByte(b) } diff --git a/components/execd/pkg/runtime/command_test.go b/components/execd/pkg/runtime/command_test.go index a6207fc3b..3e4ba1da8 100644 --- a/components/execd/pkg/runtime/command_test.go +++ b/components/execd/pkg/runtime/command_test.go @@ -42,7 +42,7 @@ func TestReadFromPos_SplitsOnCRAndLF(t *testing.T) { var got []string c := &Controller{} - nextPos := c.readFromPos(mutex, logFile, 0, func(s string) { got = append(got, s) }, false) + nextPos := c.readFromPos(mutex, logFile, 0, func(s string) { got = append(got, s) }, false, nil) want := []string{"line1", "prog 10%", "prog 20%", "prog 30%", "last"} require.Len(t, got, len(want)) @@ -59,7 +59,7 @@ func TestReadFromPos_SplitsOnCRAndLF(t *testing.T) { _ = f.Close() got = got[:0] - c.readFromPos(mutex, logFile, nextPos, func(s string) { got = append(got, 
s) }, false) + c.readFromPos(mutex, logFile, nextPos, func(s string) { got = append(got, s) }, false, nil) want = []string{"tail1", "tail2"} require.Len(t, got, len(want)) for i := range want { @@ -77,7 +77,7 @@ func TestReadFromPos_LongLine(t *testing.T) { var got []string c := &Controller{} - c.readFromPos(&sync.Mutex{}, logFile, 0, func(s string) { got = append(got, s) }, false) + c.readFromPos(&sync.Mutex{}, logFile, 0, func(s string) { got = append(got, s) }, false, nil) require.Len(t, got, 1, "expected one token") require.Equal(t, strings.TrimSuffix(longLine, "\n"), got[0], "long line mismatch") @@ -98,15 +98,86 @@ func TestReadFromPos_FlushesTrailingLine(t *testing.T) { } // First read: should only get complete lines with newlines - pos := c.readFromPos(mutex, file, 0, onExecute, false) + pos := c.readFromPos(mutex, file, 0, onExecute, false, nil) assert.GreaterOrEqual(t, pos, int64(0)) assert.Equal(t, []string{"line1"}, lines) // Flush at end: should output the last line (without newline) - c.readFromPos(mutex, file, pos, onExecute, true) + c.readFromPos(mutex, file, pos, onExecute, true, nil) assert.Equal(t, []string{"line1", "lastline-without-newline"}, lines) } +func TestReadFromPos_PreservesBlankLines(t *testing.T) { + tmp := t.TempDir() + logFile := filepath.Join(tmp, "stdout.log") + + // Mix of single newlines, consecutive blank lines, leading blank, and CRLF. + initial := "a\n\nb\n\n\nc\n\r\nd\n" + require.NoError(t, os.WriteFile(logFile, []byte(initial), 0o644)) + + var got []string + c := &Controller{} + c.readFromPos(&sync.Mutex{}, logFile, 0, func(s string) { got = append(got, s) }, false, nil) + + want := []string{"a", "\n", "b", "\n", "\n", "c", "\n", "d"} + require.Equal(t, want, got) +} + +// TestReadFromPos_CRLFAcrossPolls ensures a \r\n pair that arrives in two +// successive polls does not emit a spurious blank line for the trailing \n. +// Reproduces the regression on Windows/cmd writers that flush \r before \n. 
+func TestReadFromPos_CRLFAcrossPolls(t *testing.T) { + tmp := t.TempDir() + logFile := filepath.Join(tmp, "stdout.log") + + require.NoError(t, os.WriteFile(logFile, []byte("a\r"), 0o644)) + + var got []string + c := &Controller{} + mutex := &sync.Mutex{} + var lastWasCR bool + pos := c.readFromPos(mutex, logFile, 0, func(s string) { got = append(got, s) }, false, &lastWasCR) + require.Equal(t, []string{"a"}, got) + require.True(t, lastWasCR, "CR state must persist for next poll") + + f, err := os.OpenFile(logFile, os.O_APPEND|os.O_WRONLY, 0o644) + require.NoError(t, err) + _, err = f.WriteString("\nb\n") + require.NoError(t, err) + _ = f.Close() + + got = got[:0] + c.readFromPos(mutex, logFile, pos, func(s string) { got = append(got, s) }, false, &lastWasCR) + require.Equal(t, []string{"b"}, got, "trailing \\n of split CRLF must not emit a blank line") +} + +// TestReadFromPos_BlankCRLFAcrossPolls ensures a blank \r\n line split across +// polls is emitted as a single blank, not duplicated. 
+func TestReadFromPos_BlankCRLFAcrossPolls(t *testing.T) { + tmp := t.TempDir() + logFile := filepath.Join(tmp, "stdout.log") + + require.NoError(t, os.WriteFile(logFile, []byte("\r"), 0o644)) + + var got []string + c := &Controller{} + mutex := &sync.Mutex{} + var lastWasCR bool + pos := c.readFromPos(mutex, logFile, 0, func(s string) { got = append(got, s) }, false, &lastWasCR) + require.Equal(t, []string{"\n"}, got) + require.True(t, lastWasCR) + + f, err := os.OpenFile(logFile, os.O_APPEND|os.O_WRONLY, 0o644) + require.NoError(t, err) + _, err = f.WriteString("\n") + require.NoError(t, err) + _ = f.Close() + + got = got[:0] + c.readFromPos(mutex, logFile, pos, func(s string) { got = append(got, s) }, false, &lastWasCR) + require.Empty(t, got, "trailing \\n of split blank CRLF must not emit a second blank") +} + func TestRunCommand_Echo(t *testing.T) { if goruntime.GOOS == "windows" { t.Skip("bash not available on windows") diff --git a/components/execd/pkg/runtime/command_windows.go b/components/execd/pkg/runtime/command_windows.go index a3e418fb4..5c720dd4a 100644 --- a/components/execd/pkg/runtime/command_windows.go +++ b/components/execd/pkg/runtime/command_windows.go @@ -24,6 +24,7 @@ import ( "os" "os/exec" "strconv" + "sync" "time" "github.com/alibaba/opensandbox/execd/pkg/jupyter/execute" @@ -57,15 +58,21 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest cmd.Env = mergeEnvs(os.Environ(), extraEnv) done := make(chan struct{}, 1) + var wg sync.WaitGroup + wg.Add(2) safego.Go(func() { + defer wg.Done() c.tailStdPipe(c.stdoutFileName(session), request.Hooks.OnExecuteStdout, done) }) safego.Go(func() { + defer wg.Done() c.tailStdPipe(c.stderrFileName(session), request.Hooks.OnExecuteStderr, done) }) err = cmd.Start() if err != nil { + close(done) + wg.Wait() request.Hooks.OnExecuteError(&execute.ErrorOutput{EName: "CommandExecError", EValue: err.Error()}) log.Error("CommandExecError: error starting commands: %v", err) return 
nil @@ -80,6 +87,7 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest err = cmd.Wait() close(done) + wg.Wait() if err != nil { var eName, eValue string var traceback []string diff --git a/components/execd/tests/smoke_api.py b/components/execd/tests/smoke_api.py index df9f96f6e..36a3cdca8 100644 --- a/components/execd/tests/smoke_api.py +++ b/components/execd/tests/smoke_api.py @@ -91,6 +91,58 @@ def fetch_logs(cmd_id: str, cursor: int = 0): return r.text, r.headers.get("EXECD-COMMANDS-TAIL-CURSOR") +def run_command_blank_lines(): + """ + Foreground command whose stdout contains consecutive newlines must surface + blank-line events instead of dropping them. Regression test for the + readFromPos fix that preserves empty lines (a\n\nb -> ["a", "\n", "b"]). + """ + url = f"{BASE_URL}/command" + # Pick a shell-native command per platform so the regression covers both + # POSIX (LF-only) and Windows cmd (CRLF) byte streams without depending on + # Git for Windows / MSYS argv mangling. The execd reader collapses CRLF to + # LF, so both produce ["a", "\n", "b", "\n", "\n", "c"]. + if os.name == "nt": + # cmd /C echo chain: each segment writes "\r\n"; "echo." writes + # a bare "\r\n". Order is deterministic because "&" is sequential. 
+ command = "echo a&echo.&echo b&echo.&echo.&echo c" + else: + # printf emits exact bytes: a\n\nb\n\n\nc\n + command = "printf 'a\\n\\nb\\n\\n\\nc\\n'" + payload = { + "command": command, + "background": False, + } + + stdout_texts = [] + saw_complete = False + with session.post(url, json=payload, stream=True, timeout=15) as resp: + expect(resp.status_code == 200, f"SSE start failed: {resp.status_code} {resp.text}") + for line in resp.iter_lines(): + if not line: + continue + try: + if line.startswith(b"data:"): + data = json.loads(line[len(b"data:") :].decode()) + else: + data = json.loads(line.decode()) + except Exception: + continue + event_type = data.get("type") + if event_type == "stdout": + stdout_texts.append(data.get("text", "")) + elif event_type == "execution_complete": + saw_complete = True + break + + expect(saw_complete, "did not observe execution_complete") + want = ["a", "\n", "b", "\n", "\n", "c"] + expect( + stdout_texts == want, + f"blank-line stdout sequence mismatch: got {stdout_texts!r}, want {want!r}", + ) + + def sse_disconnect_should_stop_ping(): """ Open an SSE stream for a long-running command, receive init, then close the @@ -248,6 +300,9 @@ def main(): sse_disconnect_should_stop_ping() print("[+] SSE disconnect handled") + run_command_blank_lines() + print("[+] run_command preserves blank lines") + cmd_id = sse_get_command_id() print(f"[+] command id: {cmd_id}") From a52cb6326b977d2ec15d274b4fbdcbd73fa0aba7 Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Tue, 19 May 2026 10:50:44 +0800 Subject: [PATCH 36/58] Feat/execd bootstrap pre script (#901) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(execd): support user-defined pre-script in bootstrap Add `BOOTSTRAP_PRE_SCRIPT` env to declare a script that runs before execd starts. 
The script is sourced (POSIX `.`) rather than executed so any variables it `export`s propagate to execd and the chained command — a subprocess would lose those exports on exit. Co-Authored-By: Claude Opus 4.7 * refactor(execd): rename BOOTSTRAP_PRE_SCRIPT to EXECD_BOOTSTRAP_PRE_SCRIPT Prefix with the component name to keep the env namespace consistent with other execd-scoped variables (`EXECD`, `EXECD_ENVS`). Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- components/execd/bootstrap.sh | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/components/execd/bootstrap.sh b/components/execd/bootstrap.sh index b39076cfa..32f1a717b 100755 --- a/components/execd/bootstrap.sh +++ b/components/execd/bootstrap.sh @@ -172,6 +172,27 @@ if ! touch "$EXECD_ENVS" 2>/dev/null; then fi export EXECD_ENVS +# Run a user-defined pre-script before launching execd. The script is sourced +# with POSIX `.` (not executed as a child process) so any variables it +# `export`s propagate to execd and the chained command below — a subprocess +# would lose those exports the moment it exits. +if [ -n "${EXECD_BOOTSTRAP_PRE_SCRIPT:-}" ]; then + if [ -f "$EXECD_BOOTSTRAP_PRE_SCRIPT" ] && [ -r "$EXECD_BOOTSTRAP_PRE_SCRIPT" ]; then + # Force `.` to read the literal path; without a slash it would fall + # back to a PATH search and could load the wrong file. + case "$EXECD_BOOTSTRAP_PRE_SCRIPT" in + */*) _pre_script="$EXECD_BOOTSTRAP_PRE_SCRIPT" ;; + *) _pre_script="./$EXECD_BOOTSTRAP_PRE_SCRIPT" ;; + esac + echo "sourcing pre-script $EXECD_BOOTSTRAP_PRE_SCRIPT" + # shellcheck disable=SC1090 + . "$_pre_script" + unset _pre_script + else + echo "warning: EXECD_BOOTSTRAP_PRE_SCRIPT=$EXECD_BOOTSTRAP_PRE_SCRIPT not found or not readable" >&2 + fi +fi + echo "starting OpenSandbox Execd daemon at $EXECD." 
$EXECD & From ce8c85759df8015b191f4dc8eb77fe1ca1106f61 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 19 May 2026 03:21:17 +0000 Subject: [PATCH 37/58] chore: bump execd to v1.0.17 --- examples/agent-sandbox/README.md | 2 +- examples/code-interpreter/README.md | 2 +- examples/windows/pool-win-example.yaml | 2 +- kubernetes/charts/opensandbox-server/values.yaml | 2 +- kubernetes/config/samples/sandbox_v1alpha1_pool.yaml | 2 +- .../config/samples/sandbox_v1alpha1_pool_restart.yaml | 2 +- oseps/0004-secure-container-runtime.md | 6 +++--- oseps/0007-fast-sandbox-runtime-support.md | 2 +- server/DEVELOPMENT.md | 2 +- server/docker-compose.example.yaml | 4 ++-- .../examples/example.config.k8s.toml | 2 +- .../examples/example.config.k8s.zh.toml | 2 +- server/opensandbox_server/examples/example.config.toml | 2 +- .../opensandbox_server/examples/example.config.zh.toml | 2 +- server/tests/test_docker_service.py | 10 +++++----- 15 files changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/agent-sandbox/README.md b/examples/agent-sandbox/README.md index 4797cbf22..e29ba4dbf 100644 --- a/examples/agent-sandbox/README.md +++ b/examples/agent-sandbox/README.md @@ -23,7 +23,7 @@ opensandbox-server init-config ~/.sandbox.toml --example docker ```toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [kubernetes] namespace = "default" diff --git a/examples/code-interpreter/README.md b/examples/code-interpreter/README.md index 0562d2f6c..d254c1c86 100644 --- a/examples/code-interpreter/README.md +++ b/examples/code-interpreter/README.md @@ -104,7 +104,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 command: [ "/bin/sh", "-c" ] args: - | diff --git 
a/examples/windows/pool-win-example.yaml b/examples/windows/pool-win-example.yaml index 511e84b0b..815d4fd92 100644 --- a/examples/windows/pool-win-example.yaml +++ b/examples/windows/pool-win-example.yaml @@ -58,7 +58,7 @@ spec: command: - /bin/sh - -c - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index d581f0a11..339c31416 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -85,7 +85,7 @@ configToml: | [runtime] type = "kubernetes" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" [kubernetes] kubeconfig_path = "" diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml index 8e2985f37..d34303647 100644 --- a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml +++ b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml @@ -31,7 +31,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 command: [ "/bin/sh", "-c" ] args: - | diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml index 07c09c298..f3bd63ae3 100644 --- a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml +++ b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml @@ -56,7 +56,7 @@ spec: command: - /bin/sh - -c - image: 
sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/oseps/0004-secure-container-runtime.md b/oseps/0004-secure-container-runtime.md index ce031ac94..ca253e528 100644 --- a/oseps/0004-secure-container-runtime.md +++ b/oseps/0004-secure-container-runtime.md @@ -180,7 +180,7 @@ Extension to `~/.sandbox.toml`. A single `[secure_runtime]` section configures t ```toml [runtime] type = "docker" # or "kubernetes" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" # Secure container runtime configuration. # When enabled, ALL sandboxes on this server use the specified runtime. @@ -210,7 +210,7 @@ Example 1 — gVisor on Docker: # ~/.sandbox.toml [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [secure_runtime] type = "gvisor" @@ -224,7 +224,7 @@ Example 2 — Kata Containers (QEMU) on Kubernetes: # ~/.sandbox.toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [secure_runtime] type = "kata" diff --git a/oseps/0007-fast-sandbox-runtime-support.md b/oseps/0007-fast-sandbox-runtime-support.md index 451ce7f5e..fc85cb84c 100644 --- a/oseps/0007-fast-sandbox-runtime-support.md +++ b/oseps/0007-fast-sandbox-runtime-support.md @@ -611,7 +611,7 @@ api_key = "your-secret-key" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [kubernetes] namespace = "default" diff --git a/server/DEVELOPMENT.md b/server/DEVELOPMENT.md index be1ca5557..eec56ba11 100644 --- a/server/DEVELOPMENT.md +++ b/server/DEVELOPMENT.md @@ -61,7 +61,7 @@ This guide provides comprehensive information for developers working on OpenSand [runtime] type = "docker" - execd_image = "opensandbox/execd:v1.0.16" + 
execd_image = "opensandbox/execd:v1.0.17" [docker] network_mode = "host" diff --git a/server/docker-compose.example.yaml b/server/docker-compose.example.yaml index 4263b5c93..28fe252a5 100644 --- a/server/docker-compose.example.yaml +++ b/server/docker-compose.example.yaml @@ -10,8 +10,8 @@ configs: [runtime] type = "docker" - # execd_image = "opensandbox/execd:v1.0.16" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" + # execd_image = "opensandbox/execd:v1.0.17" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" [egress] image = "opensandbox/egress:v1.0.12" diff --git a/server/opensandbox_server/examples/example.config.k8s.toml b/server/opensandbox_server/examples/example.config.k8s.toml index 3d35d1bc7..3cfa2c74a 100644 --- a/server/opensandbox_server/examples/example.config.k8s.toml +++ b/server/opensandbox_server/examples/example.config.k8s.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.k8s.zh.toml b/server/opensandbox_server/examples/example.config.k8s.zh.toml index 7df9f4dc2..fc7dba498 100644 --- a/server/opensandbox_server/examples/example.config.k8s.zh.toml +++ b/server/opensandbox_server/examples/example.config.k8s.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" [storage] # 允许进行 bind mount 的宿主机路径前缀白名单。 diff --git a/server/opensandbox_server/examples/example.config.toml b/server/opensandbox_server/examples/example.config.toml index 4fe707308..fa4c84fe3 100644 --- a/server/opensandbox_server/examples/example.config.toml +++ b/server/opensandbox_server/examples/example.config.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.16" +execd_image = "opensandbox/execd:v1.0.17" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.zh.toml b/server/opensandbox_server/examples/example.config.zh.toml index 3645d9586..0820638c6 100644 --- a/server/opensandbox_server/examples/example.config.zh.toml +++ b/server/opensandbox_server/examples/example.config.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.16" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" [storage] allowed_host_paths = [] diff --git a/server/tests/test_docker_service.py b/server/tests/test_docker_service.py index 435e7047b..fe993fc26 100644 --- a/server/tests/test_docker_service.py +++ b/server/tests/test_docker_service.py @@ -1456,7 +1456,7 @@ async def test_create_sandbox_windows_profile_injects_runtime_defaults(mock_dock mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1539,7 +1539,7 @@ async def test_create_sandbox_windows_profile_rejects_missing_runtime_devices(mo mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1578,7 +1578,7 @@ async def test_create_sandbox_windows_profile_rejects_below_minimum_resource_lim mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( 
@@ -1615,7 +1615,7 @@ async def test_create_sandbox_windows_profile_accepts_dockur_demo_like_request(m mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1669,7 +1669,7 @@ async def test_create_sandbox_windows_profile_with_network_policy_maps_windows_p mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.16" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" cfg.docker.network_mode = "bridge" cfg.egress = EgressConfig(image="opensandbox/egress:latest") service = DockerSandboxService(config=cfg) From b176d99bd4596343c5a02873ef7b93cc595dfd98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=81=95=E5=BE=81?= <469741414@qq.com> Date: Tue, 19 May 2026 18:14:49 +0800 Subject: [PATCH 38/58] Update main.py 1. Repair entrypoint 2. 
Support OPEN_SANDBOX_API_KEY --- examples/openclaw/main.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/openclaw/main.py b/examples/openclaw/main.py index 52ab8b69c..e441d72ef 100644 --- a/examples/openclaw/main.py +++ b/examples/openclaw/main.py @@ -23,7 +23,8 @@ # Configuration defaults - can be overridden via environment variables -DEFAULT_SERVER = os.getenv("OPENCLAW_SERVER", "http://localhost:8080") +DEFAULT_SERVER = os.getenv("OPEN_SANDBOX_SERVER", "http://localhost:8080") +DEFAULT_API_KEY = os.getenv("OPEN_SANDBOX_API_KEY", "") DEFAULT_IMAGE = os.getenv("OPENCLAW_IMAGE", "ghcr.io/openclaw/openclaw:latest") DEFAULT_TIMEOUT = int(os.getenv("OPENCLAW_TIMEOUT", "3600")) DEFAULT_TOKEN = os.getenv("OPENCLAW_TOKEN", "dummy-token-for-sandbox") @@ -64,12 +65,14 @@ def check_openclaw(sbx: SandboxSync, port: int = DEFAULT_PORT) -> bool: def main() -> None: server = DEFAULT_SERVER + api_key = DEFAULT_API_KEY image = DEFAULT_IMAGE timeout_seconds = DEFAULT_TIMEOUT token = os.getenv("OPENCLAW_GATEWAY_TOKEN", DEFAULT_TOKEN) port = DEFAULT_PORT print(f"Creating openclaw sandbox with image={image} on OpenSandbox server {server}...") + print(f" API Key: {api_key[:16]}..." if len(api_key) > 16 else f" API Key: {api_key}") print(f" Token: {token[:16]}..." 
if len(token) > 16 else f" Token: {token}") print(f" Port: {port}") print(f" Timeout: {timeout_seconds}s") @@ -78,8 +81,8 @@ def main() -> None: image=image, timeout=timedelta(seconds=timeout_seconds), metadata={"example": "openclaw"}, - entrypoint=[f"node dist/index.js gateway --bind=lan --port {port} --allow-unconfigured --verbose"], - connection_config=ConnectionConfigSync(domain=server), + entrypoint=["node", "dist/index.js", "gateway", "--bind=lan", "--port", str(port), "--allow-unconfigured", "--verbose"], + connection_config=ConnectionConfigSync(domain=server, api_key=api_key), health_check=lambda sbx: check_openclaw(sbx, port), # env for openclaw env={ @@ -101,4 +104,4 @@ def main() -> None: print(f"Openclaw started finished. Please refer to {endpoint.endpoint}") if __name__ == "__main__": - main() \ No newline at end of file + main() From 17760d076f2a08dee49c057217e3c797f0d26470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Wed, 20 May 2026 09:40:15 +0800 Subject: [PATCH 39/58] feat(ci): publish image-committer image via release workflow Add image-committer as a selectable component in publish-components.yml, wire the kubernetes/build.sh to use Dockerfile.image-committer, and allow the bump script to update image-committer references after release. 
Closes #917 Co-Authored-By: Claude Opus 4.7 --- .github/workflows/publish-components.yml | 4 ++++ kubernetes/build.sh | 11 ++++++++--- scripts/bump-component-version.sh | 5 +++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-components.yml b/.github/workflows/publish-components.yml index 8a6e99398..86090b048 100644 --- a/.github/workflows/publish-components.yml +++ b/.github/workflows/publish-components.yml @@ -22,6 +22,7 @@ on: - egress - controller - task-executor + - image-committer default: 'execd' image_tag: description: 'Docker image tag' @@ -35,6 +36,7 @@ on: - 'docker/egress/**' - 'k8s/controller/**' - 'k8s/task-executor/**' + - 'k8s/image-committer/**' jobs: publish: @@ -117,6 +119,8 @@ jobs: cd kubernetes elif [ "$COMPONENT" == "task-executor" ]; then cd kubernetes + elif [ "$COMPONENT" == "image-committer" ]; then + cd kubernetes else cd sandboxes/$COMPONENT fi diff --git a/kubernetes/build.sh b/kubernetes/build.sh index 741e32a4f..b44d16a0a 100755 --- a/kubernetes/build.sh +++ b/kubernetes/build.sh @@ -37,15 +37,20 @@ DOCKERHUB_REPO="opensandbox" ACR_REPO="sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox" # Component specific settings +DOCKERFILE="Dockerfile" if [ "$COMPONENT" == "controller" ]; then IMAGE_NAME="controller" BUILD_ARG="--build-arg PACKAGE=./cmd/controller" elif [ "$COMPONENT" == "task-executor" ]; then IMAGE_NAME="task-executor" BUILD_ARG="--build-arg PACKAGE=cmd/task-executor/main.go --build-arg USERID=0" +elif [ "$COMPONENT" == "image-committer" ]; then + IMAGE_NAME="image-committer" + BUILD_ARG="" + DOCKERFILE="Dockerfile.image-committer" else echo "Error: Unknown component: $COMPONENT" - echo "Available components: controller, task-executor" + echo "Available components: controller, task-executor, image-committer" exit 1 fi @@ -69,7 +74,7 @@ if [ "$PUSH" == "true" ]; then -t "${ACR_REPO}/${IMAGE_NAME}:${TAG}" \ --metadata-file "${BUILD_METADATA_FILE}" \ --push \ - -f Dockerfile \ 
+ -f "$DOCKERFILE" \ . echo "=========================================" @@ -84,7 +89,7 @@ else $BUILD_ARG \ "${BUILD_ARGS[@]}" \ -t ${IMAGE_NAME}:${TAG} \ - -f Dockerfile \ + -f "$DOCKERFILE" \ --load \ . diff --git a/scripts/bump-component-version.sh b/scripts/bump-component-version.sh index 573fc140c..118262d5f 100755 --- a/scripts/bump-component-version.sh +++ b/scripts/bump-component-version.sh @@ -36,16 +36,17 @@ elif [ $# -eq 2 ]; then COMPONENT="$1" NEW_VERSION="$2" else - echo "Usage: $0 [egress|execd|ingress|code-interpreter] NEW_VERSION" >&2 + echo "Usage: $0 [egress|execd|ingress|code-interpreter|image-committer] NEW_VERSION" >&2 echo " $0 NEW_VERSION # bumps egress" >&2 echo "Example: $0 egress v1.0.2" >&2 echo "Example: $0 execd 1.0.7" >&2 echo "Example: $0 ingress v1.0.6" >&2 + echo "Example: $0 image-committer v0.1.0" >&2 exit 1 fi case "$COMPONENT" in - egress|execd|ingress|code-interpreter) ;; + egress|execd|ingress|code-interpreter|image-committer) ;; *) echo "Error: unsupported component: $COMPONENT" >&2 exit 0 From cc8298eee8e37c84056e852b1cca401333880ee6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A7=90=E6=9E=9D?= Date: Wed, 20 May 2026 09:43:16 +0800 Subject: [PATCH 40/58] feat(sdks/go): add PlatformSpec to CreateSandboxRequest The Lifecycle API (specs/sandbox-lifecycle.yml) and the Python SDK both expose a Platform field for selecting the target OS/arch of a sandbox (e.g. `{"os":"windows","arch":"amd64"}` to provision a Windows guest via the dockur/windows profile). The Go SDK has been missing it, so Go callers cannot drive Windows-guest sandboxes. * Add PlatformSpec{OS, Arch} mirroring the Python model. * Add CreateSandboxRequest.Platform (omitempty) and SandboxInfo.Platform. * Plumb SandboxCreateOptions.Platform through CreateSandbox. * Cover the round trip and the nil-omission case with table-style tests. Refs: docs/windows-sandbox.md. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- sdks/sandbox/go/opensandbox_test.go | 62 +++++++++++++++++++++++++++++ sdks/sandbox/go/sandbox.go | 5 +++ sdks/sandbox/go/types.go | 35 ++++++++++++++++ 3 files changed, 102 insertions(+) diff --git a/sdks/sandbox/go/opensandbox_test.go b/sdks/sandbox/go/opensandbox_test.go index 0fee93b42..2f0ef49a2 100644 --- a/sdks/sandbox/go/opensandbox_test.go +++ b/sdks/sandbox/go/opensandbox_test.go @@ -220,6 +220,68 @@ func TestCreateSandbox_FromSnapshot(t *testing.T) { require.NoErrorf(t, err, "CreateSandbox from snapshot") } +func TestCreateSandbox_Platform(t *testing.T) { + _, client := newLifecycleServer(t, func(w http.ResponseWriter, r *http.Request) { + var req CreateSandboxRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + assert.Fail(t, fmt.Sprintf("decode request: %v", err)) + return + } + require.NotNil(t, req.Platform, "expected Platform to be sent in the request") + require.Equal(t, OSWindows, req.Platform.OS, "Platform.OS") + require.Equal(t, ArchAMD64, req.Platform.Arch, "Platform.Arch") + + jsonResponse(w, http.StatusCreated, SandboxInfo{ + ID: "sbx-windows", + Status: SandboxStatus{State: StatePending}, + Platform: &PlatformSpec{OS: OSWindows, Arch: ArchAMD64}, + CreatedAt: time.Now().UTC().Truncate(time.Second), + }) + }) + + info, err := client.CreateSandbox(context.Background(), CreateSandboxRequest{ + Image: &ImageSpec{URI: "dockurr/windows:latest"}, + Entrypoint: []string{"cmd", "/c", "echo hi"}, + ResourceLimits: ResourceLimits{"cpu": "2", "memory": "4G", "disk": "64G"}, + Platform: &PlatformSpec{OS: OSWindows, Arch: ArchAMD64}, + }) + require.NoErrorf(t, err, "CreateSandbox with Platform") + require.NotNil(t, info.Platform, "response should echo Platform") + require.Equal(t, OSWindows, info.Platform.OS, "echoed Platform.OS") + require.Equal(t, ArchAMD64, info.Platform.Arch, "echoed Platform.Arch") +} + +func TestCreateSandbox_PlatformOmittedWhenNil(t *testing.T) { + _, client := 
newLifecycleServer(t, func(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(r.Body) + if err != nil { + assert.Fail(t, fmt.Sprintf("read request body: %v", err)) + return + } + var raw map[string]json.RawMessage + if err := json.Unmarshal(body, &raw); err != nil { + assert.Fail(t, fmt.Sprintf("unmarshal request body: %v", err)) + return + } + if _, present := raw["platform"]; present { + assert.Fail(t, "platform should be omitted from JSON when nil") + } + + jsonResponse(w, http.StatusCreated, SandboxInfo{ + ID: "sbx-no-platform", + Status: SandboxStatus{State: StatePending}, + CreatedAt: time.Now().UTC().Truncate(time.Second), + }) + }) + + _, err := client.CreateSandbox(context.Background(), CreateSandboxRequest{ + Image: &ImageSpec{URI: "python:3.12"}, + Entrypoint: []string{"/bin/sh"}, + ResourceLimits: ResourceLimits{"cpu": "500m"}, + }) + require.NoErrorf(t, err, "CreateSandbox without Platform") +} + func TestGetSandbox(t *testing.T) { want := SandboxInfo{ ID: "sbx-456", diff --git a/sdks/sandbox/go/sandbox.go b/sdks/sandbox/go/sandbox.go index 56aacbe40..fefbe102d 100644 --- a/sdks/sandbox/go/sandbox.go +++ b/sdks/sandbox/go/sandbox.go @@ -66,6 +66,10 @@ type SandboxCreateOptions struct { // Extensions for provider-specific parameters. Extensions map[string]string + // Platform selects the target OS/arch for the sandbox (e.g. {"os": + // "windows", "arch": "amd64"}). When nil the server applies its default. + Platform *PlatformSpec + // SkipHealthCheck skips the WaitUntilReady call after creation. 
SkipHealthCheck bool @@ -132,6 +136,7 @@ func CreateSandbox(ctx context.Context, config ConnectionConfig, opts SandboxCre NetworkPolicy: opts.NetworkPolicy, Volumes: opts.Volumes, Extensions: opts.Extensions, + Platform: opts.Platform, } if opts.Image != "" { req.Image = &ImageSpec{URI: opts.Image, Auth: opts.ImageAuth} diff --git a/sdks/sandbox/go/types.go b/sdks/sandbox/go/types.go index c2fc4b75e..5a7459cbd 100644 --- a/sdks/sandbox/go/types.go +++ b/sdks/sandbox/go/types.go @@ -55,6 +55,39 @@ type ImageAuth struct { Password string `json:"password"` } +// PlatformOS is the target operating system of a sandbox platform constraint. +// The wire-level enum is enforced server-side; the constants below mirror the +// spec so Go callers can avoid stringly-typed typos. +type PlatformOS string + +const ( + OSLinux PlatformOS = "linux" + OSWindows PlatformOS = "windows" +) + +// PlatformArch is the target CPU architecture of a sandbox platform +// constraint. +type PlatformArch string + +const ( + ArchAMD64 PlatformArch = "amd64" + ArchARM64 PlatformArch = "arm64" +) + +// PlatformSpec is a runtime platform constraint used for scheduling and +// provisioning. It is independent from Image and expresses the expected +// target OS and CPU architecture for sandbox execution. +// +// When omitted, the server applies its own default platform selection +// behavior. When provided, the runtime must satisfy the constraint or the +// request fails. +// +// See specs/sandbox-lifecycle.yml#/components/schemas/PlatformSpec. +type PlatformSpec struct { + OS PlatformOS `json:"os"` + Arch PlatformArch `json:"arch"` +} + // ResourceLimits defines runtime resource constraints as key-value pairs. // Common keys: "cpu" (e.g. "500m"), "memory" (e.g. "512Mi"), "gpu" (e.g. "1"). 
type ResourceLimits map[string]string @@ -120,6 +153,7 @@ type CreateSandboxRequest struct { NetworkPolicy *NetworkPolicy `json:"networkPolicy,omitempty"` Volumes []Volume `json:"volumes,omitempty"` Extensions map[string]string `json:"extensions,omitempty"` + Platform *PlatformSpec `json:"platform,omitempty"` } // SandboxInfo represents a runtime execution environment provisioned from a @@ -133,6 +167,7 @@ type SandboxInfo struct { Entrypoint []string `json:"entrypoint"` ExpiresAt *time.Time `json:"expiresAt,omitempty"` CreatedAt time.Time `json:"createdAt"` + Platform *PlatformSpec `json:"platform,omitempty"` } type SnapshotState string From 6a2edc187cb565f3ef5a3c4a701b6167eb321e01 Mon Sep 17 00:00:00 2001 From: "pingshan.wj" Date: Tue, 19 May 2026 11:19:14 +0800 Subject: [PATCH 41/58] feat(k8s): Add some logs & update batchsandbox status Patch instead of Update --- kubernetes/Dockerfile | 4 +- kubernetes/build.sh | 2 + kubernetes/cmd/controller/main.go | 24 ++++-- kubernetes/internal/controller/allocator.go | 77 ++++++++++++----- .../controller/batchsandbox_controller.go | 30 +++++-- .../controller/batchsandbox_pause_resume.go | 30 ++++--- .../batchsandbox_pause_resume_test.go | 85 +++++++++++-------- .../controller/batchsandbox_status.go | 80 +++++++++++------ .../internal/controller/pool_controller.go | 6 +- .../controller/sandboxsnapshot_controller.go | 6 +- kubernetes/internal/controller/suite_test.go | 8 +- 11 files changed, 241 insertions(+), 111 deletions(-) diff --git a/kubernetes/Dockerfile b/kubernetes/Dockerfile index b2026411c..8ce31232f 100644 --- a/kubernetes/Dockerfile +++ b/kubernetes/Dockerfile @@ -48,6 +48,8 @@ COPY internal/ internal/ # by leaving it empty we can ensure that the container and binary shipped on it will have the same platform. 
RUN echo "Building for $TARGETOS/$TARGETARCH" ARG PACKAGE=./cmd/controller +ARG COMMIT_ID=unknown +ARG BUILD_DATE=unknown RUN if [ -n "${CC}" ]; then export CC; fi; \ if [ -n "${CXX}" ]; then export CXX; fi; \ export CGO_ENABLED="${CGO_ENABLED}" GOOS="${TARGETOS:-linux}" GOARCH="${TARGETARCH}" \ @@ -55,7 +57,7 @@ RUN if [ -n "${CC}" ]; then export CC; fi; \ CGO_CXXFLAGS="${CGO_CXXFLAGS:-${CXXFLAGS}}" \ CGO_LDFLAGS="${CGO_LDFLAGS}"; \ go build ${GOFLAGS} -trimpath -buildvcs=false \ - -ldflags "${LDFLAGS} -buildid= -B none" \ + -ldflags "${LDFLAGS} -buildid= -B none -X main.commitID=${COMMIT_ID} -X main.buildDate=${BUILD_DATE}" \ -o server ${PACKAGE} # Use golang image as base to ensure nsenter (util-linux) is available diff --git a/kubernetes/build.sh b/kubernetes/build.sh index 741e32a4f..747ed93b8 100755 --- a/kubernetes/build.sh +++ b/kubernetes/build.sh @@ -31,6 +31,8 @@ BUILD_ARGS=() for name in GOFLAGS LDFLAGS CGO_ENABLED CC CXX CFLAGS CXXFLAGS CGO_CFLAGS CGO_CXXFLAGS CGO_LDFLAGS; do build_arg_if_set "${name}" done +BUILD_ARGS+=(--build-arg "COMMIT_ID=$(git rev-parse --short HEAD)") +BUILD_ARGS+=(--build-arg "BUILD_DATE=$(date -u +%Y-%m-%dT%H:%M:%SZ)") mkdir -p "$(dirname "${BUILD_METADATA_FILE}")" DOCKERHUB_REPO="opensandbox" diff --git a/kubernetes/cmd/controller/main.go b/kubernetes/cmd/controller/main.go index 5f402eb5d..2eccbaf78 100644 --- a/kubernetes/cmd/controller/main.go +++ b/kubernetes/cmd/controller/main.go @@ -43,11 +43,17 @@ import ( "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/controller" poolassign "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/controller/poolassign" cryptoutil "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/crypto" + "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/expectations" "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/fieldindex" "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/logging" // +kubebuilder:scaffold:imports ) +var ( + commitID = "unknown" + buildDate 
= "unknown" +) + const ( defaultBatchSandboxConcurrency = 32 defaultPoolConcurrency = 16 @@ -237,6 +243,8 @@ func main() { logger := logging.NewLoggerWithZapOptions(logOpts) ctrl.SetLogger(logger) + setupLog.Info("Starting controller", "commitID", commitID, "buildDate", buildDate) + // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancellation and @@ -375,11 +383,12 @@ func main() { } config := ctrl.GetConfigOrDie() + config.UserAgent = "sandbox-k8s-controller/1.0" // Set client rate limiter if specified - if kubeClientQPS > 0 { + if kubeClientQPS != 0 { config.QPS = float32(kubeClientQPS) } - if kubeClientBurst > 0 { + if kubeClientBurst != 0 { config.Burst = kubeClientBurst } @@ -422,11 +431,12 @@ func main() { } if err := (&controller.BatchSandboxReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("batchsandbox-controller"), - ResumePullSecret: resumePullSecret, - ProfileStore: profileStore, + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("batchsandbox-controller"), + ResumePullSecret: resumePullSecret, + ProfileStore: profileStore, + StatusRVExpectation: expectations.NewResourceVersionExpectation(), }).SetupWithManager(mgr, batchSandboxConcurrency); err != nil { setupLog.Error(err, "unable to create controller", "controller", "BatchSandbox") os.Exit(1) diff --git a/kubernetes/internal/controller/allocator.go b/kubernetes/internal/controller/allocator.go index 985b87e2c..008b14669 100644 --- a/kubernetes/internal/controller/allocator.go +++ b/kubernetes/internal/controller/allocator.go @@ -22,6 +22,7 @@ import ( "sync" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" logf 
"sigs.k8s.io/controller-runtime/pkg/log" @@ -268,24 +269,38 @@ func NewAnnoAllocationSyncer(client client.Client) AllocationSyncer { } func (syncer *annoAllocationSyncer) SetAllocation(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, allocation *SandboxAllocation) error { - old, ok := sandbox.DeepCopyObject().(*sandboxv1alpha1.BatchSandbox) - if !ok { - return fmt.Errorf("invalid object") + js, err := json.Marshal(allocation) + if err != nil { + return err } anno := sandbox.GetAnnotations() if anno == nil { anno = make(map[string]string) } - js, err := json.Marshal(allocation) + anno[AnnoAllocStatusKey] = string(js) + sandbox.SetAnnotations(anno) + + needAddFinalizer := !controllerutil.ContainsFinalizer(sandbox, FinalizerPoolAllocation) + if needAddFinalizer { + sandbox.SetFinalizers(append(sandbox.GetFinalizers(), FinalizerPoolAllocation)) + } + + meta := map[string]any{ + "annotations": map[string]string{ + AnnoAllocStatusKey: string(js), + }, + } + if needAddFinalizer { + meta["finalizers"] = sandbox.GetFinalizers() + } + patchData, err := json.Marshal(map[string]any{"metadata": meta}) if err != nil { return err } - anno[AnnoAllocStatusKey] = string(js) - sandbox.SetAnnotations(anno) - // Add finalizer to ensure the sandbox is not deleted before all pods are recycled. 
- controllerutil.AddFinalizer(sandbox, FinalizerPoolAllocation) - patch := client.MergeFrom(old) - return syncer.client.Patch(ctx, sandbox, patch) + obj := &sandboxv1alpha1.BatchSandbox{} + obj.Name = sandbox.Name + obj.Namespace = sandbox.Namespace + return syncer.client.Patch(ctx, obj, client.RawPatch(types.MergePatchType, patchData)) } func (syncer *annoAllocationSyncer) GetAllocation(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox) (*SandboxAllocation, error) { @@ -340,20 +355,18 @@ func (syncer *annoAllocationSyncer) GetReleased(ctx context.Context, sandbox *sa } func (syncer *annoAllocationSyncer) SetReleased(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, released *AllocationReleased) error { - old, ok := sandbox.DeepCopyObject().(*sandboxv1alpha1.BatchSandbox) - if !ok { - return fmt.Errorf("invalid object") + js, err := json.Marshal(released) + if err != nil { + return err } anno := sandbox.GetAnnotations() if anno == nil { anno = make(map[string]string) } - js, err := json.Marshal(released) - if err != nil { - return err - } anno[AnnoAllocReleasedKey] = string(js) sandbox.SetAnnotations(anno) + + needRemoveFinalizer := false // If the sandbox is being deleted and all allocated pods have been released, // remove the finalizer so the sandbox can be garbage collected. 
if !sandbox.DeletionTimestamp.IsZero() { @@ -372,12 +385,34 @@ func (syncer *annoAllocationSyncer) SetReleased(ctx context.Context, sandbox *sa break } } - if allReleased { - controllerutil.RemoveFinalizer(sandbox, FinalizerPoolAllocation) + if allReleased && controllerutil.ContainsFinalizer(sandbox, FinalizerPoolAllocation) { + needRemoveFinalizer = true + filtered := make([]string, 0, len(sandbox.GetFinalizers())) + for _, f := range sandbox.GetFinalizers() { + if f != FinalizerPoolAllocation { + filtered = append(filtered, f) + } + } + sandbox.SetFinalizers(filtered) } } - patch := client.MergeFrom(old) - return syncer.client.Patch(ctx, sandbox, patch) + + meta := map[string]any{ + "annotations": map[string]string{ + AnnoAllocReleasedKey: string(js), + }, + } + if needRemoveFinalizer { + meta["finalizers"] = sandbox.GetFinalizers() + } + patchData, err := json.Marshal(map[string]any{"metadata": meta}) + if err != nil { + return err + } + obj := &sandboxv1alpha1.BatchSandbox{} + obj.Name = sandbox.Name + obj.Namespace = sandbox.Namespace + return syncer.client.Patch(ctx, obj, client.RawPatch(types.MergePatchType, patchData)) } type AllocSpec struct { diff --git a/kubernetes/internal/controller/batchsandbox_controller.go b/kubernetes/internal/controller/batchsandbox_controller.go index 0fe1cf516..63e76c98a 100644 --- a/kubernetes/internal/controller/batchsandbox_controller.go +++ b/kubernetes/internal/controller/batchsandbox_controller.go @@ -66,10 +66,11 @@ type taskScheduleResult struct { // BatchSandboxReconciler reconciles a BatchSandbox object type BatchSandboxReconciler struct { client.Client - Scheme *runtime.Scheme - Recorder record.EventRecorder - ProfileStore *poolassign.ProfileStore - taskSchedulers sync.Map + Scheme *runtime.Scheme + Recorder record.EventRecorder + ProfileStore *poolassign.ProfileStore + taskSchedulers sync.Map + StatusRVExpectation expectations.ResourceVersionExpectation // ResumePullSecret is the K8s Secret name for pulling snapshot 
images during resume. ResumePullSecret string } @@ -90,11 +91,13 @@ type BatchSandboxReconciler struct { // // For more details, check Reconcile and its Result here: // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.21.0/pkg/reconcile -func (r *BatchSandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *BatchSandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { log := logf.FromContext(ctx) + start := time.Now() var aggErrors []error defer func() { _ = DurationStore.Pop(req.String()) + log.Info("Reconcile finished", "duration", time.Since(start).String(), "requeueAfter", result.RequeueAfter.String(), "error", retErr) }() batchSbx := &sandboxv1alpha1.BatchSandbox{} if err := r.Get(ctx, client.ObjectKey{ @@ -192,6 +195,14 @@ func (r *BatchSandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request } runtimeView := buildRuntimeView(batchSbx, pods) + // Ensure PauseObservedGeneration is up-to-date so the status patch ACKs the + // current generation without requiring a dedicated API call. + // Skip during Resuming: a newer generation may carry a queued pause request + // that must remain unacknowledged until resume completes and handlePause runs. + if batchSbx.Status.Phase != sandboxv1alpha1.BatchSandboxPhaseResuming && + runtimeView.status.PauseObservedGeneration < batchSbx.Generation { + runtimeView.status.PauseObservedGeneration = batchSbx.Generation + } if batchSbx.Status.Phase == sandboxv1alpha1.BatchSandboxPhasePaused { r.deleteTaskScheduler(ctx, batchSbx) @@ -210,9 +221,14 @@ func (r *BatchSandboxReconciler) Reconcile(ctx context.Context, req ctrl.Request } } - aggErrors = append(aggErrors, r.persistRuntimeView(ctx, batchSbx, runtimeView)...) + requeue, persistErrors := r.persistRuntimeView(ctx, batchSbx, runtimeView) + aggErrors = append(aggErrors, persistErrors...) 
- return reconcile.Result{RequeueAfter: DurationStore.Pop(req.String())}, gerrors.Join(aggErrors...) + requeueAfter := DurationStore.Pop(req.String()) + if requeue > 0 && (requeueAfter == 0 || requeue < requeueAfter) { + requeueAfter = requeue + } + return reconcile.Result{RequeueAfter: requeueAfter}, gerrors.Join(aggErrors...) } func calPodIndex(poolStrategy strategy.PoolStrategy, batchSbx *sandboxv1alpha1.BatchSandbox, pods []*corev1.Pod) (map[string]int, error) { diff --git a/kubernetes/internal/controller/batchsandbox_pause_resume.go b/kubernetes/internal/controller/batchsandbox_pause_resume.go index 184f185bd..eb357a036 100644 --- a/kubernetes/internal/controller/batchsandbox_pause_resume.go +++ b/kubernetes/internal/controller/batchsandbox_pause_resume.go @@ -209,10 +209,9 @@ func (r *BatchSandboxReconciler) dispatchPauseResume(ctx context.Context, bs *sa result, err := r.handleResume(ctx, bs) return result, true, err } - log.Info("Dispatch: ACK only", "generation", generation, "pauseObservedGeneration", pauseObservedGen) - if err := r.ackPauseGeneration(ctx, bs); err != nil { - return ctrl.Result{}, true, err - } + // No pause intent — skip the dedicated ACK API call. The normal flow's + // persistRuntimeView will update PauseObservedGeneration in its status patch. 
+ log.Info("Dispatch: no pause intent, deferring ACK to status patch", "generation", generation, "pauseObservedGeneration", pauseObservedGen) return ctrl.Result{}, false, nil } @@ -479,8 +478,9 @@ func (r *BatchSandboxReconciler) completePause(ctx context.Context, bs *sandboxv r.deleteTaskScheduler(ctx, bs) + var latest *sandboxv1alpha1.BatchSandbox if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - latest := &sandboxv1alpha1.BatchSandbox{} + latest = &sandboxv1alpha1.BatchSandbox{} if err := r.Get(ctx, types.NamespacedName{Namespace: bs.Namespace, Name: bs.Name}, latest); err != nil { return err } @@ -496,6 +496,7 @@ func (r *BatchSandboxReconciler) completePause(ctx context.Context, bs *sandboxv }); err != nil { return err } + r.StatusRVExpectation.Expect(latest) return nil } @@ -584,8 +585,9 @@ func (r *BatchSandboxReconciler) continueResume(ctx context.Context, bs *sandbox } func (r *BatchSandboxReconciler) ackPauseGeneration(ctx context.Context, bs *sandboxv1alpha1.BatchSandbox) error { + var latest *sandboxv1alpha1.BatchSandbox if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - latest := &sandboxv1alpha1.BatchSandbox{} + latest = &sandboxv1alpha1.BatchSandbox{} if err := r.Get(ctx, types.NamespacedName{Namespace: bs.Namespace, Name: bs.Name}, latest); err != nil { return err } @@ -595,14 +597,16 @@ func (r *BatchSandboxReconciler) ackPauseGeneration(ctx context.Context, bs *san }); err != nil { return err } + r.StatusRVExpectation.Expect(latest) bs.Status.PauseObservedGeneration = bs.Generation applyBatchSandboxPhaseConditions(&bs.Status) return nil } func (r *BatchSandboxReconciler) ackPauseWithPhase(ctx context.Context, bs *sandboxv1alpha1.BatchSandbox, phase sandboxv1alpha1.BatchSandboxPhase, _ string) error { + var latest *sandboxv1alpha1.BatchSandbox if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { - latest := &sandboxv1alpha1.BatchSandbox{} + latest = &sandboxv1alpha1.BatchSandbox{} if err := 
r.Get(ctx, types.NamespacedName{Namespace: bs.Namespace, Name: bs.Name}, latest); err != nil { return err } @@ -613,6 +617,7 @@ func (r *BatchSandboxReconciler) ackPauseWithPhase(ctx context.Context, bs *sand }); err != nil { return err } + r.StatusRVExpectation.Expect(latest) bs.Status.PauseObservedGeneration = bs.Generation bs.Status.Phase = phase applyBatchSandboxPhaseConditions(&bs.Status) @@ -640,8 +645,9 @@ func (r *BatchSandboxReconciler) setCondition( reason string, message string, ) error { - return retry.RetryOnConflict(retry.DefaultBackoff, func() error { - latest := &sandboxv1alpha1.BatchSandbox{} + var latest *sandboxv1alpha1.BatchSandbox + if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { + latest = &sandboxv1alpha1.BatchSandbox{} if err := r.Get(ctx, types.NamespacedName{Namespace: bs.Namespace, Name: bs.Name}, latest); err != nil { return err } @@ -674,5 +680,9 @@ func (r *BatchSandboxReconciler) setCondition( latest.Status.Conditions = conditions return r.Status().Update(ctx, latest) - }) + }); err != nil { + return err + } + r.StatusRVExpectation.Expect(latest) + return nil } diff --git a/kubernetes/internal/controller/batchsandbox_pause_resume_test.go b/kubernetes/internal/controller/batchsandbox_pause_resume_test.go index 3688e14ee..29f793002 100644 --- a/kubernetes/internal/controller/batchsandbox_pause_resume_test.go +++ b/kubernetes/internal/controller/batchsandbox_pause_resume_test.go @@ -36,6 +36,7 @@ import ( sandboxv1alpha1 "github.com/alibaba/OpenSandbox/sandbox-k8s/apis/sandbox/v1alpha1" taskscheduler "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/scheduler" + "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/expectations" "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/fieldindex" taskexecutor "github.com/alibaba/OpenSandbox/sandbox-k8s/pkg/task-executor" ) @@ -52,9 +53,10 @@ func newTestReconciler(objs ...client.Object) *BatchSandboxReconciler { WithObjects(objs...). 
Build() return &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } } @@ -222,7 +224,8 @@ func TestDispatchPauseResume_Case2_PauseFalse(t *testing.T) { } func TestDispatchPauseResume_Case3_PauseNil_ACKOnly(t *testing.T) { - // gen > pauseObservedGen, pause=nil → ACK only, continue normal flow (handled=false) + // gen > pauseObservedGen, pause=nil → no dedicated ACK API call, continue normal flow (handled=false). + // The ACK is deferred to persistRuntimeView in the main reconcile loop. bs := &sandboxv1alpha1.BatchSandbox{ ObjectMeta: metav1.ObjectMeta{ Name: "test-bs", @@ -247,11 +250,10 @@ func TestDispatchPauseResume_Case3_PauseNil_ACKOnly(t *testing.T) { assert.False(t, handled, "ACK only should not block normal flow") assert.Equal(t, ctrl.Result{}, result) - // Verify ACK happened + // Verify ACK is NOT written to server by dispatch (deferred to persistRuntimeView). updated := &sandboxv1alpha1.BatchSandbox{} require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-bs"}, updated)) - assert.Equal(t, int64(2), updated.Status.PauseObservedGeneration) - assert.Equal(t, int64(2), bs.Status.PauseObservedGeneration, "in-memory status should also reflect ACK for the rest of this reconcile") + assert.Equal(t, int64(1), updated.Status.PauseObservedGeneration, "server should not be updated by dispatch; ACK is deferred") } func TestDispatchPauseResume_Case4_GenEqual_PauseSet(t *testing.T) { @@ -1034,9 +1036,10 @@ func TestContinueResume_UsesPatchedTemplateWhenCacheReturnsStaleObject(t *testin }). 
Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } result, err := r.continueResume(context.Background(), bs) @@ -1557,8 +1560,7 @@ func TestPersistRuntimeView_PreservesPauseFailedConditionFromLatestStatus(t *tes } r := newTestReconciler(bs, pod) - stale := bs.DeepCopy() - + // Simulate pause handler writing PauseFailed condition to API server. latest := &sandboxv1alpha1.BatchSandbox{} require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-bs"}, latest)) latest.Status.Conditions = append(latest.Status.Conditions, sandboxv1alpha1.BatchSandboxCondition{ @@ -1570,10 +1572,15 @@ func TestPersistRuntimeView_PreservesPauseFailedConditionFromLatestStatus(t *tes }) require.NoError(t, r.Status().Update(context.Background(), latest)) - view := buildRuntimeView(stale, []*corev1.Pod{pod}) - err := r.persistRuntimeView(context.Background(), stale, view) - require.Empty(t, err) + // Simulate second reconcile: informer has caught up, so we read latest state. + freshBS := &sandboxv1alpha1.BatchSandbox{} + require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-bs"}, freshBS)) + + view := buildRuntimeView(freshBS, []*corev1.Pod{pod}) + _, errs := r.persistRuntimeView(context.Background(), freshBS, view) + require.Empty(t, errs) + // Verify PauseFailed is preserved after reconcile with fresh cache. 
updated := &sandboxv1alpha1.BatchSandbox{} require.NoError(t, r.Get(context.Background(), types.NamespacedName{Namespace: "default", Name: "test-bs"}, updated)) @@ -1586,7 +1593,7 @@ func TestPersistRuntimeView_PreservesPauseFailedConditionFromLatestStatus(t *tes assert.Equal(t, "Commit job failed", cond.Message) } } - assert.True(t, foundPauseFailed, "persistRuntimeView should preserve latest PauseFailed condition") + assert.True(t, foundPauseFailed, "persistRuntimeView should preserve PauseFailed condition once informer cache catches up") } func TestPersistRuntimeView_SkipsStatusUpdateWhenRuntimeStatusUnchanged(t *testing.T) { @@ -1655,13 +1662,14 @@ func TestPersistRuntimeView_SkipsStatusUpdateWhenRuntimeStatusUnchanged(t *testi }). Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } view := buildRuntimeView(bs.DeepCopy(), []*corev1.Pod{pod}) - errs := r.persistRuntimeView(context.Background(), bs.DeepCopy(), view) + _, errs := r.persistRuntimeView(context.Background(), bs.DeepCopy(), view) require.Empty(t, errs) assert.Equal(t, 0, statusUpdates, "unchanged runtime status should not be persisted again") } @@ -1709,7 +1717,7 @@ func TestPersistRuntimeView_RetriesSucceededPauseSnapshotCleanup(t *testing.T) { status := bs.Status view := runtimeView{status: &status} - errs := r.persistRuntimeView(context.Background(), bs.DeepCopy(), view) + _, errs := r.persistRuntimeView(context.Background(), bs.DeepCopy(), view) require.Empty(t, errs) stillPresent := &sandboxv1alpha1.SandboxSnapshot{} @@ -2046,9 +2054,10 @@ func TestCompletePause_DeleteFailureLeavesPhasePausing(t *testing.T) { }). 
Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } err := r.completePause(context.Background(), bs) @@ -2174,9 +2183,10 @@ func TestCompletePause_PooledSandboxDoesNotDeleteSourcePod(t *testing.T) { }). Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } err := r.completePause(context.Background(), bs) @@ -2246,9 +2256,10 @@ func TestCompletePause_PooledSandboxAcknowledgesSpecPatchGeneration(t *testing.T }). Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } require.NoError(t, r.completePause(context.Background(), bs)) @@ -2321,9 +2332,10 @@ func TestCompletePause_DoesNotAcknowledgeQueuedResumeGeneration(t *testing.T) { }). Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } require.NoError(t, r.completePause(context.Background(), bs)) @@ -2490,9 +2502,10 @@ func TestSyncPauseOrClear_SnapshotFailedReturnsStatusUpdateError(t *testing.T) { }). 
Build() r := &BatchSandboxReconciler{ - Client: fakeClient, - Scheme: testscheme, - Recorder: record.NewFakeRecorder(10), + Client: fakeClient, + Scheme: testscheme, + Recorder: record.NewFakeRecorder(10), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), } result, err := r.syncPauseOrClear(context.Background(), bs) diff --git a/kubernetes/internal/controller/batchsandbox_status.go b/kubernetes/internal/controller/batchsandbox_status.go index 6e3088f71..0427df935 100644 --- a/kubernetes/internal/controller/batchsandbox_status.go +++ b/kubernetes/internal/controller/batchsandbox_status.go @@ -18,12 +18,12 @@ import ( "context" "encoding/json" "fmt" + "time" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/equality" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/log" @@ -233,26 +233,47 @@ func applySteadyRuntimePhase(batchSbx *sandboxv1alpha1.BatchSandbox, status *san status.Phase = sandboxv1alpha1.BatchSandboxPhasePending } +// isInitialUnallocatedSandbox returns true when the sandbox has just been created +// and no pods have been allocated yet. In this case we skip writing the initial +// Pending status — the next reconcile after allocation will write Succeed directly. 
+func isInitialUnallocatedSandbox(batchSbx *sandboxv1alpha1.BatchSandbox, view runtimeView) bool { + return view.status.Replicas == 0 && batchSbx.Status.Phase == "" && + batchSbx.Spec.Replicas != nil && *batchSbx.Spec.Replicas > 0 +} + func (r *BatchSandboxReconciler) persistRuntimeView( ctx context.Context, batchSbx *sandboxv1alpha1.BatchSandbox, view runtimeView, -) []error { +) (time.Duration, []error) { var aggErrors []error log := logf.FromContext(ctx) if err := r.patchBatchSandboxEndpoints(ctx, batchSbx, view.endpointIPs); err != nil { aggErrors = append(aggErrors, err) } - statusChanged := !equality.Semantic.DeepEqual(*view.status, batchSbx.Status) - if statusChanged { - log.Info("To update BatchSandbox status", - "replicas", view.status.Replicas, - "allocated", view.status.Allocated, - "ready", view.status.Ready, - ) - if err := r.updateStatus(batchSbx, view.status); err != nil { + if !equality.Semantic.DeepEqual(*view.status, batchSbx.Status) { + if isInitialUnallocatedSandbox(batchSbx, view) { + return 0, aggErrors + } + // Skip redundant status writes caused by informer cache lag: if we recently + // patched status but the informer hasn't seen the new RV yet, the diff is a + // false positive. Allow a 10s safety valve in case the cache never catches up. + if satisfied, dur := r.StatusRVExpectation.IsSatisfied(batchSbx); !satisfied { + if dur < 10*time.Second { + log.Info("Skipping status update: informer cache is stale", "unsatisfiedDuration", dur.String()) + return time.Second, aggErrors + } + log.Info("Proceeding with status update despite stale cache (timeout exceeded)", "unsatisfiedDuration", dur.String()) + // Fetch the latest object so lifecycle conditions (PauseFailed/ResumeFailed) + // written by pause/resume handlers are not overwritten by the stale cache. 
+ latest := &sandboxv1alpha1.BatchSandbox{} + if err := r.Get(ctx, types.NamespacedName{Namespace: batchSbx.Namespace, Name: batchSbx.Name}, latest); err == nil { + batchSbx = latest + } + } + if err := r.updateStatus(ctx, batchSbx, view.status); err != nil { aggErrors = append(aggErrors, err) - return aggErrors + return 0, aggErrors } } @@ -262,7 +283,7 @@ func (r *BatchSandboxReconciler) persistRuntimeView( aggErrors = append(aggErrors, err) } } - return aggErrors + return 0, aggErrors } func (r *BatchSandboxReconciler) patchBatchSandboxEndpoints(ctx context.Context, batchSbx *sandboxv1alpha1.BatchSandbox, endpointIPs []string) error { @@ -270,7 +291,13 @@ func (r *BatchSandboxReconciler) patchBatchSandboxEndpoints(ctx context.Context, if batchSbx.Annotations[AnnotationSandboxEndpoints] == string(raw) { return nil } - + // Skip writing empty endpoints when annotation doesn't exist yet (e.g. sandbox just created, no pods assigned). + // Still allow clearing endpoints when annotation was previously set (e.g. pause scenario). 
+ _, annotationExists := batchSbx.Annotations[AnnotationSandboxEndpoints] + if !annotationExists && string(raw) == "[]" { + return nil + } + log := logf.FromContext(ctx) patchData, _ := json.Marshal(map[string]any{ "metadata": map[string]any{ "annotations": map[string]string{ @@ -278,21 +305,26 @@ func (r *BatchSandboxReconciler) patchBatchSandboxEndpoints(ctx context.Context, }, }, }) + log.Info("Patching BatchSandbox endpoints", "resourceVersion", batchSbx.ResourceVersion, "patchData", string(patchData)) obj := &sandboxv1alpha1.BatchSandbox{ObjectMeta: metav1.ObjectMeta{Namespace: batchSbx.Namespace, Name: batchSbx.Name}} return r.Patch(ctx, obj, client.RawPatch(types.MergePatchType, patchData)) } -func (r *BatchSandboxReconciler) updateStatus(batchSandbox *sandboxv1alpha1.BatchSandbox, newStatus *sandboxv1alpha1.BatchSandboxStatus) error { - return retry.RetryOnConflict(retry.DefaultBackoff, func() error { - clone := &sandboxv1alpha1.BatchSandbox{} - if err := r.Get(context.TODO(), types.NamespacedName{Namespace: batchSandbox.Namespace, Name: batchSandbox.Name}, clone); err != nil { - return err - } - mergedStatus := newStatus.DeepCopy() - mergedStatus.Conditions = mergeLifecycleConditions(mergedStatus.Conditions, clone.Status.Conditions) - clone.Status = *mergedStatus - return r.Status().Update(context.TODO(), clone) - }) +func (r *BatchSandboxReconciler) updateStatus(ctx context.Context, batchSandbox *sandboxv1alpha1.BatchSandbox, newStatus *sandboxv1alpha1.BatchSandboxStatus) error { + log := logf.FromContext(ctx) + mergedStatus := newStatus.DeepCopy() + mergedStatus.Conditions = mergeLifecycleConditions(mergedStatus.Conditions, batchSandbox.Status.Conditions) + patchData, err := json.Marshal(map[string]any{"status": mergedStatus}) + if err != nil { + return fmt.Errorf("failed to marshal status patch: %w", err) + } + log.Info("Patching BatchSandbox status", "resourceVersion", batchSandbox.ResourceVersion, "phase", mergedStatus.Phase, "patchData", 
string(patchData)) + obj := &sandboxv1alpha1.BatchSandbox{ObjectMeta: metav1.ObjectMeta{Namespace: batchSandbox.Namespace, Name: batchSandbox.Name}} + if err := r.Status().Patch(ctx, obj, client.RawPatch(types.MergePatchType, patchData)); err != nil { + return err + } + r.StatusRVExpectation.Expect(obj) + return nil } func mergeLifecycleConditions( diff --git a/kubernetes/internal/controller/pool_controller.go b/kubernetes/internal/controller/pool_controller.go index f7c6ad4e7..1899b3834 100644 --- a/kubernetes/internal/controller/pool_controller.go +++ b/kubernetes/internal/controller/pool_controller.go @@ -114,8 +114,12 @@ type PoolReconciler struct { // +kubebuilder:rbac:groups=core,resources=pods/status,verbs=get;update;patch // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete -func (r *PoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { +func (r *PoolReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { log := logf.FromContext(ctx) + start := time.Now() + defer func() { + log.Info("Reconcile finished", "duration", time.Since(start).String(), "requeueAfter", result.RequeueAfter.String(), "error", retErr) + }() // Fetch the Pool instance pool := &sandboxv1alpha1.Pool{} if err := r.Get(ctx, req.NamespacedName, pool); err != nil { diff --git a/kubernetes/internal/controller/sandboxsnapshot_controller.go b/kubernetes/internal/controller/sandboxsnapshot_controller.go index d1e5288b2..c2daaefe1 100644 --- a/kubernetes/internal/controller/sandboxsnapshot_controller.go +++ b/kubernetes/internal/controller/sandboxsnapshot_controller.go @@ -93,8 +93,12 @@ type SandboxSnapshotReconciler struct { // +kubebuilder:rbac:groups=core,resources=secrets,verbs=get;list;watch // +kubebuilder:rbac:groups=core,resources=events,verbs=get;list;watch;create;update;patch;delete -func (r *SandboxSnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) 
(ctrl.Result, error) { +func (r *SandboxSnapshotReconciler) Reconcile(ctx context.Context, req ctrl.Request) (result ctrl.Result, retErr error) { log := logf.FromContext(ctx) + start := time.Now() + defer func() { + log.Info("Reconcile finished", "duration", time.Since(start).String(), "requeueAfter", result.RequeueAfter.String(), "error", retErr) + }() snapshot := &sandboxv1alpha1.SandboxSnapshot{} if err := r.Get(ctx, req.NamespacedName, snapshot); err != nil { diff --git a/kubernetes/internal/controller/suite_test.go b/kubernetes/internal/controller/suite_test.go index 33459a69e..abdb549af 100644 --- a/kubernetes/internal/controller/suite_test.go +++ b/kubernetes/internal/controller/suite_test.go @@ -34,6 +34,7 @@ import ( . "github.com/onsi/gomega" sandboxv1alpha1 "github.com/alibaba/OpenSandbox/sandbox-k8s/apis/sandbox/v1alpha1" + "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/expectations" "github.com/alibaba/OpenSandbox/sandbox-k8s/internal/utils/fieldindex" // +kubebuilder:scaffold:imports ) @@ -93,9 +94,10 @@ var _ = BeforeSuite(func() { By("setup reconciler") Expect((&BatchSandboxReconciler{ - Client: k8sManager.GetClient(), - Scheme: k8sManager.GetScheme(), - Recorder: k8sManager.GetEventRecorderFor("test-batch-sandbox-controller"), + Client: k8sManager.GetClient(), + Scheme: k8sManager.GetScheme(), + Recorder: k8sManager.GetEventRecorderFor("test-batch-sandbox-controller"), + StatusRVExpectation: expectations.NewResourceVersionExpectation(), }).SetupWithManager(k8sManager, 32)).Should(Succeed()) Expect((&PoolReconciler{ Client: k8sManager.GetClient(), From 0c0e5ecbfb0a199a1c47d224b2808a537ba97b83 Mon Sep 17 00:00:00 2001 From: "yutian.taoyt" Date: Wed, 20 May 2026 21:12:45 +0800 Subject: [PATCH 42/58] fix(sdk): avoid ERROR-level logs for expected file-not-found on read FilesystemAdapter read operations (readFile/readByteArray/readStream) caught every failure and logged it at ERROR with a full stack trace before rethrowing. 
A missing file (server returns HTTP 404 with code FILE_NOT_FOUND) is an expected, business-level outcome rather than a fault, so this floods callers' error logs and monitoring with noise for a normal control-flow case (e.g. polling for a not-yet-created stdout file). Distinguish "file not found" from genuine failures and log it at DEBUG instead of ERROR. The exception is still propagated unchanged. - Add SandboxError.FILE_NOT_FOUND constant. - Add Throwable.isFileNotFound() extension (statusCode 404 / code FILE_NOT_FOUND) so callers can also branch on it cleanly. - Route read-failure logging through logReadFailure() which downgrades not-found to DEBUG. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../domain/exceptions/SandboxException.kt | 3 + .../adapters/converter/ExceptionConverter.kt | 11 ++ .../adapters/service/FilesystemAdapter.kt | 27 +++- .../adapters/service/FilesystemAdapterTest.kt | 128 ++++++++++++++++++ 4 files changed, 166 insertions(+), 3 deletions(-) create mode 100644 sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/exceptions/SandboxException.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/exceptions/SandboxException.kt index c57ce63b1..25092de02 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/exceptions/SandboxException.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/exceptions/SandboxException.kt @@ -180,6 +180,9 @@ data class SandboxError( const val INVALID_ARGUMENT = "INVALID_ARGUMENT" const val UNEXPECTED_RESPONSE = "UNEXPECTED_RESPONSE" + /** The requested file or directory does not exist (server responds with HTTP 404). */ + const val FILE_NOT_FOUND = "FILE_NOT_FOUND" + /** Pool-specific: no idle sandbox and policy is FAIL_FAST. 
*/ const val POOL_EMPTY = "POOL_EMPTY" diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt index 5ea52f04b..c6fa9fd2a 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt @@ -39,6 +39,17 @@ import com.alibaba.opensandbox.sandbox.api.execd.infrastructure.ClientException import com.alibaba.opensandbox.sandbox.api.execd.infrastructure.ServerError as ExecdServerError import com.alibaba.opensandbox.sandbox.api.execd.infrastructure.ServerException as ExecdServerException +/** + * Returns `true` when this throwable represents an expected "file or directory does not exist" + * outcome rather than a genuine failure. + * + * Callers (and the adapters themselves) use this to avoid treating a missing file as an error, + * e.g. logging it at ERROR level with a full stack trace, which is just noise for a perfectly + * normal control-flow case such as polling for a not-yet-created file. 
+ */ +fun Throwable.isFileNotFound(): Boolean = + this is SandboxApiException && (error.code == SandboxError.FILE_NOT_FOUND || statusCode == 404) + fun Exception.toSandboxException(): SandboxException { return when (this) { is SandboxException -> this diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapter.kt index 7fa5404d7..cad154073 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapter.kt @@ -35,6 +35,7 @@ import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.Filesys import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.FilesystemConverter.toApiReplaceFileContentMap import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.FilesystemConverter.toEntryInfo import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.FilesystemConverter.toEntryInfoMap +import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.isFileNotFound import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.parseSandboxError import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.toSandboxException import kotlinx.serialization.json.buildJsonObject @@ -108,7 +109,7 @@ internal class FilesystemAdapter( return response.body?.source()?.readString(charset) ?: "" } } catch (e: Exception) { - logger.error("Failed to read file with encoding $encoding: $path", e) + logReadFailure("Failed to read file with encoding $encoding: $path", e) throw e.toSandboxException() } } @@ -134,7 +135,7 @@ internal class FilesystemAdapter( return response.body?.bytes() ?: 
ByteArray(0) } } catch (e: Exception) { - logger.error("Failed to read file as byte array: $path", e) + logReadFailure("Failed to read file as byte array: $path", e) throw e.toSandboxException() } } @@ -167,7 +168,7 @@ internal class FilesystemAdapter( return response.body?.byteStream() ?: throw IllegalStateException("Response body is null") } catch (e: Exception) { - logger.error("Failed to read file as stream: $path", e) + logReadFailure("Failed to read file as stream: $path", e) throw e.toSandboxException() } } @@ -335,6 +336,26 @@ internal class FilesystemAdapter( } } + /** + * Logs a failed read operation, distinguishing genuine failures from the expected + * "file does not exist" case. + * + * A missing file is a normal control-flow outcome (e.g. polling for a not-yet-created + * file), so it is logged at DEBUG level instead of ERROR to avoid flooding callers' + * error logs and monitoring with stack traces for a non-error condition. The exception + * is still propagated to the caller unchanged. + */ + private fun logReadFailure( + message: String, + e: Exception, + ) { + if (e.isFileNotFound()) { + logger.debug(message, e) + } else { + logger.error(message, e) + } + } + private fun getCharsetFromEncoding(encoding: String): Charset { try { return charset(encoding) diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt new file mode 100644 index 000000000..fb8a8b235 --- /dev/null +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt @@ -0,0 +1,128 @@ +/* + * Copyright 2025 Alibaba Group Holding Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.alibaba.opensandbox.sandbox.infrastructure.adapters.service + +import com.alibaba.opensandbox.sandbox.HttpClientProvider +import com.alibaba.opensandbox.sandbox.config.ConnectionConfig +import com.alibaba.opensandbox.sandbox.domain.exceptions.SandboxApiException +import com.alibaba.opensandbox.sandbox.domain.exceptions.SandboxError +import com.alibaba.opensandbox.sandbox.domain.models.sandboxes.SandboxEndpoint +import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.isFileNotFound +import okhttp3.mockwebserver.MockResponse +import okhttp3.mockwebserver.MockWebServer +import org.junit.jupiter.api.AfterEach +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertFalse +import org.junit.jupiter.api.Assertions.assertTrue +import org.junit.jupiter.api.BeforeEach +import org.junit.jupiter.api.Test +import org.junit.jupiter.api.assertThrows + +class FilesystemAdapterTest { + private lateinit var mockWebServer: MockWebServer + private lateinit var filesystemAdapter: FilesystemAdapter + private lateinit var httpClientProvider: HttpClientProvider + + @BeforeEach + fun setUp() { + mockWebServer = MockWebServer() + mockWebServer.start() + + val host = mockWebServer.hostName + val port = mockWebServer.port + val endpoint = SandboxEndpoint("$host:$port") + + val config = + ConnectionConfig.builder() + .domain("$host:$port") + .protocol("http") + .build() + + httpClientProvider = HttpClientProvider(config) + filesystemAdapter = FilesystemAdapter(httpClientProvider, endpoint) + } + + 
@AfterEach + fun tearDown() { + mockWebServer.shutdown() + httpClientProvider.close() + } + + @Test + fun `readFile surfaces FILE_NOT_FOUND error code on 404 so callers can distinguish it`() { + mockWebServer.enqueue( + MockResponse() + .setResponseCode(404) + .setBody( + """{"code":"FILE_NOT_FOUND","message":"file not found. open /tmp/missing.txt: no such file or directory"}""", + ), + ) + + val exception = + assertThrows { + filesystemAdapter.readFile("/tmp/missing.txt", "UTF-8", null) + } + + assertEquals(404, exception.statusCode) + assertEquals(SandboxError.FILE_NOT_FOUND, exception.error.code) + // The exception itself is recognised as a "not found" condition, which is what the + // adapter relies on to avoid emitting ERROR-level log noise for an expected outcome. + assertTrue(exception.isFileNotFound()) + } + + @Test + fun `readFile returns content on success`() { + mockWebServer.enqueue( + MockResponse() + .setResponseCode(200) + .setBody("hello world"), + ) + + val content = filesystemAdapter.readFile("/tmp/hello.txt", "UTF-8", null) + + assertEquals("hello world", content) + } + + @Test + fun `isFileNotFound is true for FILE_NOT_FOUND error code`() { + val exception = + SandboxApiException( + message = "Failed to read file. 
Status code: 404", + statusCode = 404, + error = SandboxError(SandboxError.FILE_NOT_FOUND), + ) + + assertTrue(exception.isFileNotFound()) + } + + @Test + fun `isFileNotFound is false for other API errors`() { + val exception = + SandboxApiException( + message = "Internal server error", + statusCode = 500, + error = SandboxError(SandboxError.UNEXPECTED_RESPONSE), + ) + + assertFalse(exception.isFileNotFound()) + } + + @Test + fun `isFileNotFound is false for non-sandbox exceptions`() { + assertFalse(RuntimeException("boom").isFileNotFound()) + } +} From 7d8845546341add5fa752acb5de9572ac940f691 Mon Sep 17 00:00:00 2001 From: "yutian.taoyt" Date: Wed, 20 May 2026 21:21:39 +0800 Subject: [PATCH 43/58] style(sdk): apply spotless formatting to isFileNotFound Collapse the expression-body function to a single line as required by the project's spotlessCheck (root ./gradlew spotlessCheck). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../infrastructure/adapters/converter/ExceptionConverter.kt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt index c6fa9fd2a..1dacf1dfa 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt @@ -47,8 +47,7 @@ import com.alibaba.opensandbox.sandbox.api.execd.infrastructure.ServerException * e.g. logging it at ERROR level with a full stack trace, which is just noise for a perfectly * normal control-flow case such as polling for a not-yet-created file. 
*/ -fun Throwable.isFileNotFound(): Boolean = - this is SandboxApiException && (error.code == SandboxError.FILE_NOT_FOUND || statusCode == 404) +fun Throwable.isFileNotFound(): Boolean = this is SandboxApiException && (error.code == SandboxError.FILE_NOT_FOUND || statusCode == 404) fun Exception.toSandboxException(): SandboxException { return when (this) { From 29fab0b6abcf7df251014816a88bf644f8ff50e8 Mon Sep 17 00:00:00 2001 From: "yutian.taoyt" Date: Wed, 20 May 2026 21:23:37 +0800 Subject: [PATCH 44/58] fix(sdk): classify not-found only by explicit FILE_NOT_FOUND code Address review feedback: isFileNotFound() previously treated any HTTP 404 as not-found. A 404 whose body cannot be parsed is mapped to UNEXPECTED_RESPONSE and may signal a real endpoint/routing regression; downgrading those to DEBUG would hide genuine failures. Restrict detection to the explicit SandboxError.FILE_NOT_FOUND code (which the execd server returns for missing files) and add a regression test covering a bare 404 + UNEXPECTED_RESPONSE. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../adapters/converter/ExceptionConverter.kt | 7 ++++++- .../adapters/service/FilesystemAdapterTest.kt | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt index 1dacf1dfa..2c8d287dc 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExceptionConverter.kt @@ -43,11 +43,16 @@ import com.alibaba.opensandbox.sandbox.api.execd.infrastructure.ServerException * Returns `true` when this throwable represents an expected "file or directory does not exist" * outcome rather than a genuine failure. * + * Detection is intentionally restricted to the explicit [SandboxError.FILE_NOT_FOUND] server + * error code rather than a bare HTTP 404. A 404 whose body cannot be parsed is mapped to + * [SandboxError.UNEXPECTED_RESPONSE] and may indicate a real endpoint/routing/configuration + * regression, which must stay loud (ERROR) instead of being silently downgraded. + * * Callers (and the adapters themselves) use this to avoid treating a missing file as an error, * e.g. logging it at ERROR level with a full stack trace, which is just noise for a perfectly * normal control-flow case such as polling for a not-yet-created file. 
*/ -fun Throwable.isFileNotFound(): Boolean = this is SandboxApiException && (error.code == SandboxError.FILE_NOT_FOUND || statusCode == 404) +fun Throwable.isFileNotFound(): Boolean = this is SandboxApiException && error.code == SandboxError.FILE_NOT_FOUND fun Exception.toSandboxException(): SandboxException { return when (this) { diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt index fb8a8b235..7bbb95754 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/FilesystemAdapterTest.kt @@ -121,6 +121,20 @@ class FilesystemAdapterTest { assertFalse(exception.isFileNotFound()) } + @Test + fun `isFileNotFound is false for a 404 without an explicit FILE_NOT_FOUND code`() { + // A 404 whose body could not be parsed is mapped to UNEXPECTED_RESPONSE. It may indicate a + // real endpoint/routing regression, so it must NOT be downgraded to a not-found condition. + val exception = + SandboxApiException( + message = "Failed to read file. 
Status code: 404", + statusCode = 404, + error = SandboxError(SandboxError.UNEXPECTED_RESPONSE), + ) + + assertFalse(exception.isFileNotFound()) + } + @Test fun `isFileNotFound is false for non-sandbox exceptions`() { assertFalse(RuntimeException("boom").isFileNotFound()) From 58509092e22875e6dbb690185d94162f0221ffc1 Mon Sep 17 00:00:00 2001 From: "ninan.nn" Date: Thu, 21 May 2026 11:25:56 +0800 Subject: [PATCH 45/58] fix(sdk): use java duration for Kotlin command timeouts --- sdks/sandbox/kotlin/gradle.properties | 2 +- .../alibaba/opensandbox/sandbox/config/ConnectionConfig.kt | 2 +- .../domain/models/execd/executions/RunCommandRequest.kt | 2 +- .../domain/models/execd/executions/RunInSessionRequest.kt | 2 +- .../alibaba/opensandbox/sandbox/domain/services/Commands.kt | 2 +- .../infrastructure/adapters/converter/ExecutionConverter.kt | 2 +- .../infrastructure/adapters/service/CommandsAdapter.kt | 2 +- .../infrastructure/adapters/service/CommandsAdapterTest.kt | 4 ++-- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/sdks/sandbox/kotlin/gradle.properties b/sdks/sandbox/kotlin/gradle.properties index ad3c8fa62..688c7d3be 100644 --- a/sdks/sandbox/kotlin/gradle.properties +++ b/sdks/sandbox/kotlin/gradle.properties @@ -5,5 +5,5 @@ org.gradle.parallel=true # Project metadata project.group=com.alibaba.opensandbox -project.version=1.0.11 +project.version=1.0.12 project.description=A Kotlin SDK for Open Sandbox API diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/config/ConnectionConfig.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/config/ConnectionConfig.kt index b9a7e6019..64888e54e 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/config/ConnectionConfig.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/config/ConnectionConfig.kt @@ -71,7 +71,7 @@ class ConnectionConfig private constructor( private const val 
ENV_API_KEY = "OPEN_SANDBOX_API_KEY" private const val ENV_DOMAIN = "OPEN_SANDBOX_DOMAIN" - private const val DEFAULT_USER_AGENT = "OpenSandbox-Kotlin-SDK/1.0.11" + private const val DEFAULT_USER_AGENT = "OpenSandbox-Kotlin-SDK/1.0.12" private const val API_VERSION = "v1" @JvmStatic diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt index f080486a6..7a679e306 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt @@ -16,7 +16,7 @@ package com.alibaba.opensandbox.sandbox.domain.models.execd.executions -import kotlin.time.Duration +import java.time.Duration /** * Parameters for command execution. diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt index fb7dd9ae4..860e258a1 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt @@ -16,7 +16,7 @@ package com.alibaba.opensandbox.sandbox.domain.models.execd.executions -import kotlin.time.Duration +import java.time.Duration /** * Request to run a command in an existing bash session. 
diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt index fcf8af170..d0e32824d 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt @@ -21,7 +21,7 @@ import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.CommandSta import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.Execution import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunCommandRequest import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunInSessionRequest -import kotlin.time.Duration +import java.time.Duration /** * Command execution operations for sandbox environments. diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt index 9e64d11ba..00fe67e40 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt @@ -27,7 +27,7 @@ object ExecutionConverter { command = command, background = background, cwd = workingDirectory, - timeout = timeout?.inWholeMilliseconds, + timeout = timeout?.toMillis(), uid = uid, gid = gid, envs = envs, diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt 
b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt index 76d2b249e..0cbb9db3d 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt @@ -196,7 +196,7 @@ internal class CommandsAdapter( RunInSessionRequestApi( command = request.command, cwd = request.workingDirectory, - timeout = request.timeout?.inWholeMilliseconds, + timeout = request.timeout?.toMillis(), ) val runUrl = execdBaseUrl diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt index 0985dbe3b..a906bd77a 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt @@ -39,9 +39,9 @@ import org.junit.jupiter.api.Assertions.assertTrue import org.junit.jupiter.api.BeforeEach import org.junit.jupiter.api.Test import org.junit.jupiter.api.assertThrows +import java.time.Duration import java.util.concurrent.CountDownLatch import java.util.concurrent.TimeUnit -import kotlin.time.Duration.Companion.seconds class CommandsAdapterTest { // CommandsAdapter unit tests @@ -340,7 +340,7 @@ data: {"type":"execution_complete","execution_time":100,"timestamp":167253120100 RunInSessionRequest.builder() .command("echo Hello") .workingDirectory("/workspace") - .timeout(5.seconds) + .timeout(Duration.ofSeconds(5)) .handlers(handlers) .build(), ) From a547672ea7650c98c4a6a1e6258d6a5487d54eb0 Mon Sep 17 
00:00:00 2001 From: 4ek0 <4ek0@users.noreply.github.com> Date: Thu, 21 May 2026 12:59:20 +0800 Subject: [PATCH 46/58] fix(ci): add diagnostic output to license verification script Add timestamp, hostname, and user information to license verification output for CI debugging purposes. --- scripts/verify-license.sh | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/scripts/verify-license.sh b/scripts/verify-license.sh index d2f35b4f1..096d834d2 100755 --- a/scripts/verify-license.sh +++ b/scripts/verify-license.sh @@ -19,6 +19,12 @@ set -euo pipefail +# Print CI diagnostics +echo "License verification started at: $(date -u '+%Y-%m-%dT%H:%M:%SZ')" +echo "Runner: $(hostname) ($(uname -srm))" +echo "User: $(whoami)" +echo "Working directory: $(pwd)" + REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" CURRENT_YEAR="$(date +%Y)" MIN_YEAR="2025" @@ -46,9 +52,6 @@ IGNORED_PATHS=( is_k8s_mock_go() { local file="${1-}" [[ -z "$file" ]] && return 1 - # Skip any Go mocks under kubernetes/internal: - # - filenames ending with _mock.go - # - any file under a /mock/ directory if [[ "$file" != kubernetes/internal/* ]]; then return 1 fi @@ -63,7 +66,6 @@ is_k8s_mock_go() { is_generated_to_skip() { local file="$1" - # Skip common generated files if [[ "$file" == *"deepcopy.go" ]]; then return 0 fi @@ -112,25 +114,20 @@ has_expected_basename() { missing=() while IFS= read -r file; do - # Skip ignored paths if is_ignored "$file"; then continue fi - # Skip kubernetes internal mock go files if is_k8s_mock_go "$file"; then continue fi - # Skip generated files if is_generated_to_skip "$file"; then continue fi - # Only check files with expected extensions or basenames if ! has_expected_extension "$file" && ! has_expected_basename "$file"; then continue fi - # Limit scan to the first 25 lines to allow shebangs/DOCTYPE above the header. header="$(head -n 25 "$file")" if ! 
echo "$header" | grep -Eq "$LICENSE_REGEX"; then missing+=("$file") From e07b9fde000d9a7341cdab7d12e3e3e0d97f05f4 Mon Sep 17 00:00:00 2001 From: "ninan.nn" Date: Thu, 21 May 2026 15:02:30 +0800 Subject: [PATCH 47/58] fix(sdk): add kotlin duration compatibility overloads --- sdks/AGENTS.md | 1 + .../execd/executions/RunCommandRequest.kt | 9 ++++ .../execd/executions/RunInSessionRequest.kt | 9 ++++ .../sandbox/domain/services/Commands.kt | 18 +++++++ .../execd/executions/RunCommandRequestTest.kt | 47 +++++++++++++++++++ .../executions/RunInSessionRequestTest.kt | 47 +++++++++++++++++++ 6 files changed, 131 insertions(+) create mode 100644 sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequestTest.kt create mode 100644 sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequestTest.kt diff --git a/sdks/AGENTS.md b/sdks/AGENTS.md index b0acafe30..b021a68cd 100644 --- a/sdks/AGENTS.md +++ b/sdks/AGENTS.md @@ -120,6 +120,7 @@ Always: - Keep package-local validation fast before widening to multi-language verification. - Match public behavior across languages unless a documented platform constraint prevents it. - Keep wire-format units and public SDK units separate. Public SDK interfaces should expose time durations as language-native duration types where available (`timedelta`, `Duration`) or otherwise as explicitly second-based fields such as `timeoutSeconds`. +- For Kotlin SDK public APIs intended for Java interoperability, do not expose Kotlin value classes such as `kotlin.time.Duration`; they are JVM-name-mangled and can be inaccessible from Java. Prefer `java.time.Duration` or explicit primitive wire units at the public boundary, with deprecated Kotlin-friendly overloads when needed for migration. 
Ask first: diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt index 7a679e306..0ecf2bec5 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt @@ -17,6 +17,7 @@ package com.alibaba.opensandbox.sandbox.domain.models.execd.executions import java.time.Duration +import kotlin.time.toJavaDuration /** * Parameters for command execution. @@ -80,6 +81,14 @@ class RunCommandRequest private constructor( return this } + @Deprecated( + message = "Use java.time.Duration instead.", + replaceWith = ReplaceWith("timeout(timeout?.toJavaDuration())", "kotlin.time.toJavaDuration"), + ) + fun timeout(timeout: kotlin.time.Duration?): Builder { + return timeout(timeout?.toJavaDuration()) + } + fun uid(uid: Int?): Builder { require(uid == null || uid >= 0) { "Uid must be >= 0" } this.uid = uid diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt index 860e258a1..25f0e9575 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt @@ -17,6 +17,7 @@ package com.alibaba.opensandbox.sandbox.domain.models.execd.executions import java.time.Duration +import kotlin.time.toJavaDuration /** * Request to run a command in an existing 
bash session. @@ -59,6 +60,14 @@ class RunInSessionRequest private constructor( return this } + @Deprecated( + message = "Use java.time.Duration instead.", + replaceWith = ReplaceWith("timeout(timeout?.toJavaDuration())", "kotlin.time.toJavaDuration"), + ) + fun timeout(timeout: kotlin.time.Duration?): Builder { + return timeout(timeout?.toJavaDuration()) + } + fun handlers(handlers: ExecutionHandlers?): Builder { this.handlers = handlers return this diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt index d0e32824d..65a3c20b0 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt @@ -22,6 +22,7 @@ import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.Execution import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunCommandRequest import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunInSessionRequest import java.time.Duration +import kotlin.time.toJavaDuration /** * Command execution operations for sandbox environments. @@ -125,6 +126,23 @@ interface Commands { ) } + @Deprecated( + message = "Use java.time.Duration instead.", + replaceWith = + ReplaceWith( + "runInSession(sessionId, command, workingDirectory, timeout.toJavaDuration())", + "kotlin.time.toJavaDuration", + ), + ) + fun runInSession( + sessionId: String, + command: String, + workingDirectory: String? = null, + timeout: kotlin.time.Duration, + ): Execution { + return runInSession(sessionId, command, workingDirectory, timeout.toJavaDuration()) + } + /** * Deletes a bash session and releases resources. 
* diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequestTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequestTest.kt new file mode 100644 index 000000000..c3bfcce05 --- /dev/null +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequestTest.kt @@ -0,0 +1,47 @@ +/* + * Copyright 2025 Alibaba Group Holding Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.alibaba.opensandbox.sandbox.domain.models.execd.executions + +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import java.time.Duration +import kotlin.time.Duration.Companion.seconds + +class RunCommandRequestTest { + @Test + fun `builder accepts java duration for timeout`() { + val request = + RunCommandRequest.builder() + .command("echo hi") + .timeout(Duration.ofSeconds(5)) + .build() + + assertEquals(Duration.ofSeconds(5), request.timeout) + } + + @Suppress("DEPRECATION") + @Test + fun `builder accepts deprecated kotlin duration for timeout`() { + val request = + RunCommandRequest.builder() + .command("echo hi") + .timeout(5.seconds) + .build() + + assertEquals(Duration.ofSeconds(5), request.timeout) + } +} diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequestTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequestTest.kt new file mode 100644 index 000000000..6dd02a907 --- /dev/null +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequestTest.kt @@ -0,0 +1,47 @@ +/* + * Copyright 2025 Alibaba Group Holding Ltd. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.alibaba.opensandbox.sandbox.domain.models.execd.executions + +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import java.time.Duration +import kotlin.time.Duration.Companion.seconds + +class RunInSessionRequestTest { + @Test + fun `builder accepts java duration for timeout`() { + val request = + RunInSessionRequest.builder() + .command("echo hi") + .timeout(Duration.ofSeconds(5)) + .build() + + assertEquals(Duration.ofSeconds(5), request.timeout) + } + + @Suppress("DEPRECATION") + @Test + fun `builder accepts deprecated kotlin duration for timeout`() { + val request = + RunInSessionRequest.builder() + .command("echo hi") + .timeout(5.seconds) + .build() + + assertEquals(Duration.ofSeconds(5), request.timeout) + } +} From 371417055129df58e507ba5b8a948cea87556ef8 Mon Sep 17 00:00:00 2001 From: "ninan.nn" Date: Thu, 21 May 2026 15:19:17 +0800 Subject: [PATCH 48/58] fix(sdk): make command timeout setters non-null --- .../models/execd/executions/RunCommandRequest.kt | 8 ++++---- .../models/execd/executions/RunInSessionRequest.kt | 8 ++++---- .../opensandbox/sandbox/domain/services/Commands.kt | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt index 0ecf2bec5..18b9fbdfd 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunCommandRequest.kt @@ -76,17 +76,17 @@ class RunCommandRequest private constructor( * Maximum execution time; server will terminate the command when reached. 
* If omitted, the server will not enforce any timeout. */ - fun timeout(timeout: Duration?): Builder { + fun timeout(timeout: Duration): Builder { this.timeout = timeout return this } @Deprecated( message = "Use java.time.Duration instead.", - replaceWith = ReplaceWith("timeout(timeout?.toJavaDuration())", "kotlin.time.toJavaDuration"), + replaceWith = ReplaceWith("timeout(timeout.toJavaDuration())", "kotlin.time.toJavaDuration"), ) - fun timeout(timeout: kotlin.time.Duration?): Builder { - return timeout(timeout?.toJavaDuration()) + fun timeout(timeout: kotlin.time.Duration): Builder { + return timeout(timeout.toJavaDuration()) } fun uid(uid: Int?): Builder { diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt index 25f0e9575..699a0e698 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/models/execd/executions/RunInSessionRequest.kt @@ -55,17 +55,17 @@ class RunInSessionRequest private constructor( return this } - fun timeout(timeout: Duration?): Builder { + fun timeout(timeout: Duration): Builder { this.timeout = timeout return this } @Deprecated( message = "Use java.time.Duration instead.", - replaceWith = ReplaceWith("timeout(timeout?.toJavaDuration())", "kotlin.time.toJavaDuration"), + replaceWith = ReplaceWith("timeout(timeout.toJavaDuration())", "kotlin.time.toJavaDuration"), ) - fun timeout(timeout: kotlin.time.Duration?): Builder { - return timeout(timeout?.toJavaDuration()) + fun timeout(timeout: kotlin.time.Duration): Builder { + return timeout(timeout.toJavaDuration()) } fun handlers(handlers: ExecutionHandlers?): Builder { diff --git 
a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt index 65a3c20b0..db9b8940e 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Commands.kt @@ -116,14 +116,14 @@ interface Commands { workingDirectory: String? = null, timeout: Duration? = null, ): Execution { - return runInSession( - sessionId, + val builder = RunInSessionRequest.builder() .command(command) .workingDirectory(workingDirectory) - .timeout(timeout) - .build(), - ) + if (timeout != null) { + builder.timeout(timeout) + } + return runInSession(sessionId, builder.build()) } @Deprecated( From f8d2572041dd25997c5143bf1e9d466cda108198 Mon Sep 17 00:00:00 2001 From: "ninan.nn" Date: Thu, 21 May 2026 15:24:54 +0800 Subject: [PATCH 49/58] fix(sdk): reject oversized command timeouts --- .../adapters/converter/ExecutionConverter.kt | 12 +++++++++++- .../adapters/service/CommandsAdapter.kt | 3 ++- .../adapters/service/CommandsAdapterTest.kt | 10 ++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt index 00fe67e40..e07126ba3 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/converter/ExecutionConverter.kt @@ -18,6 +18,7 @@ package com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter import 
com.alibaba.opensandbox.sandbox.domain.models.execd.executions.CommandStatus import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunCommandRequest +import java.time.Duration import com.alibaba.opensandbox.sandbox.api.models.execd.CommandStatusResponse as ApiCommandStatusResponse import com.alibaba.opensandbox.sandbox.api.models.execd.RunCommandRequest as ApiRunCommandRequest @@ -27,7 +28,7 @@ object ExecutionConverter { command = command, background = background, cwd = workingDirectory, - timeout = timeout?.toMillis(), + timeout = timeout?.toCommandTimeoutMillis(), uid = uid, gid = gid, envs = envs, @@ -46,3 +47,12 @@ object ExecutionConverter { ) } } + +internal fun Duration.toCommandTimeoutMillis(): Long { + require(!isNegative) { "Timeout must be non-negative, got: $this" } + return try { + toMillis() + } catch (e: ArithmeticException) { + throw IllegalArgumentException("Timeout is too large to represent in milliseconds: $this", e) + } +} diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt index 0cbb9db3d..3eb201537 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapter.kt @@ -42,6 +42,7 @@ import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.Executi import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.ExecutionEventDispatcher import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.jsonParser import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.parseSandboxError +import 
com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.toCommandTimeoutMillis import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.toSandboxException import okhttp3.Headers.Companion.toHeaders import okhttp3.HttpUrl.Companion.toHttpUrlOrNull @@ -196,7 +197,7 @@ internal class CommandsAdapter( RunInSessionRequestApi( command = request.command, cwd = request.workingDirectory, - timeout = request.timeout?.toMillis(), + timeout = request.timeout?.toCommandTimeoutMillis(), ) val runUrl = execdBaseUrl diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt index a906bd77a..87cca06a8 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/CommandsAdapterTest.kt @@ -25,6 +25,7 @@ import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.ExecutionH import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunCommandRequest import com.alibaba.opensandbox.sandbox.domain.models.execd.executions.RunInSessionRequest import com.alibaba.opensandbox.sandbox.domain.models.sandboxes.SandboxEndpoint +import com.alibaba.opensandbox.sandbox.infrastructure.adapters.converter.toCommandTimeoutMillis import kotlinx.serialization.json.Json import kotlinx.serialization.json.booleanOrNull import kotlinx.serialization.json.intOrNull @@ -359,6 +360,15 @@ data: {"type":"execution_complete","execution_time":100,"timestamp":167253120100 assertEquals(5000L, requestBodyJson["timeout"]?.jsonPrimitive?.content?.toLong()) } + @Test + fun `command timeout conversion should reject durations too large for milliseconds`() { + val 
exception = + assertThrows(IllegalArgumentException::class.java) { + Duration.ofSeconds(Long.MAX_VALUE).toCommandTimeoutMillis() + } + assertTrue(exception.message!!.contains("too large to represent in milliseconds")) + } + @Test fun `runInSession should infer non-zero exit code from command error event`() { val initEvent = """data: {"type":"init","text":"cmd-123","timestamp":1672531200000}""" From 9014df93e538ef3202e7f1e604d3960d955010fc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 21 May 2026 09:11:19 +0000 Subject: [PATCH 50/58] chore: bump image-committer to v0.1.0 --- kubernetes/charts/opensandbox-controller/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kubernetes/charts/opensandbox-controller/README.md b/kubernetes/charts/opensandbox-controller/README.md index 644e073e3..990eb017e 100644 --- a/kubernetes/charts/opensandbox-controller/README.md +++ b/kubernetes/charts/opensandbox-controller/README.md @@ -164,7 +164,7 @@ The chart exposes the snapshot-related settings below: ```yaml controller: snapshot: - imageCommitterImage: my-registry/image-committer:v1.0.0 + imageCommitterImage: my-registry/image-committer:v0.1.0 commitJobTimeout: 15m registry: my-registry/snapshots registryInsecure: false From 018b7018c94f9bdd6ca28b65b118f6314a857ca7 Mon Sep 17 00:00:00 2001 From: "pingshan.wj" Date: Thu, 21 May 2026 21:52:31 +0800 Subject: [PATCH 51/58] test(k8s): make e2e suite portable across cluster modes Decouples the kubernetes/ e2e suite from the assumption that it always runs against a freshly-built Kind cluster, so it can also be pointed at an externally-provided Kubernetes cluster (e.g. minikube, a shared dev cluster, or a CI-provisioned one) via KUBECONFIG. - New test/utils/cluster_mode.go centralises mode + env-driven defaults (E2E_MODE, E2E_POD_SECURITY_ENFORCE, registry / namespace / credential knobs for the pause/resume sub-suite). 
Default mode stays "kind", so existing `make test-e2e-main` behaviour is unchanged. - LoadImageToKindClusterWithName becomes a no-op when E2E_MODE!=kind, since images are expected to live in a registry the target cluster can pull from. - BeforeSuite skips docker-build / kind-load entirely outside Kind mode. - The pod-security label step is now controlled by E2E_POD_SECURITY_ENFORCE; setting it to an empty value skips the label, which is useful on clusters that enforce their own admission policy. Default value preserves the current "restricted" behaviour. - pause_resume_test.go: registry namespace / address / credentials are now overridable via env; deploy/undeploy switched to `make install/deploy/undeploy/uninstall` for consistency with e2e_test.go; registry-deployment.yaml is rendered as a template so the source image can be pointed at a mirror. - registry-deployment.yaml: parameterised image and switched the Service to ClusterIP (the suite only consumes it via cluster DNS, so the previous NodePort 30500 was unnecessary and can collide on shared clusters). - Added Ginkgo Labels (Core/Manager/Pool/Batch/Task/PauseResume) so consumers can use -ginkgo.label-filter to subset the suite. --- kubernetes/Makefile | 7 +- kubernetes/config/manager/manager.yaml | 1 + kubernetes/test/e2e/e2e_suite_test.go | 25 ++-- kubernetes/test/e2e/e2e_test.go | 34 ++--- kubernetes/test/e2e/pause_resume_test.go | 50 +++++--- .../e2e/testdata/registry-deployment.yaml | 9 +- kubernetes/test/utils/cluster_mode.go | 117 ++++++++++++++++++ kubernetes/test/utils/utils.go | 9 +- 8 files changed, 205 insertions(+), 47 deletions(-) create mode 100644 kubernetes/test/utils/cluster_mode.go diff --git a/kubernetes/Makefile b/kubernetes/Makefile index 232a2346e..7357ceecd 100644 --- a/kubernetes/Makefile +++ b/kubernetes/Makefile @@ -56,6 +56,8 @@ CONTROLLER_IMG ?= controller:dev TASK_EXECUTOR_IMG ?= task-executor:dev # IMAGE_COMMITTER_IMG defines the image for the image-committer service. 
IMAGE_COMMITTER_IMG ?= image-committer:dev +# SNAPSHOT_REGISTRY defines the OCI registry used by the controller for snapshot images. +SNAPSHOT_REGISTRY ?= docker-registry.default.svc.cluster.local:5000 # Get the currently used golang install path (in GOPATH/bin, unless GOBIN is set) ifeq (,$(shell go env GOBIN)) @@ -364,7 +366,10 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified .PHONY: deploy deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. cd config/manager && $(KUSTOMIZE) edit set image controller=${CONTROLLER_IMG} - $(KUSTOMIZE) build config/default | $(KUBECTL) apply -f - + $(KUSTOMIZE) build config/default | \ + sed 's|--snapshot-registry=docker-registry.default.svc.cluster.local:5000|--snapshot-registry=$(SNAPSHOT_REGISTRY)|' | \ + sed 's|--image-committer-image=image-committer:dev|--image-committer-image=$(IMAGE_COMMITTER_IMG)|' | \ + $(KUBECTL) apply -f - .PHONY: undeploy undeploy: kustomize ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. 
diff --git a/kubernetes/config/manager/manager.yaml b/kubernetes/config/manager/manager.yaml index 34190590d..ba904c456 100644 --- a/kubernetes/config/manager/manager.yaml +++ b/kubernetes/config/manager/manager.yaml @@ -67,6 +67,7 @@ spec: - --snapshot-registry-insecure=true - --snapshot-push-secret=registry-snapshot-push-secret - --resume-pull-secret=registry-pull-secret + - --image-committer-image=image-committer:dev image: controller:dev name: manager ports: [] diff --git a/kubernetes/test/e2e/e2e_suite_test.go b/kubernetes/test/e2e/e2e_suite_test.go index 41f90827a..a7a7719bf 100644 --- a/kubernetes/test/e2e/e2e_suite_test.go +++ b/kubernetes/test/e2e/e2e_suite_test.go @@ -36,6 +36,13 @@ func TestE2E(t *testing.T) { } var _ = BeforeSuite(func() { + if utils.SkipImageBuild() { + _, _ = fmt.Fprintf(GinkgoWriter, + "E2E_MODE=%s SKIP_IMAGE_BUILD=true: skipping docker build & kind load (images expected to be pre-built)\n", + utils.Mode()) + return + } + dockerBuildArgs := os.Getenv("DOCKER_BUILD_ARGS") By("building the manager(Operator) image") @@ -79,22 +86,22 @@ var _ = BeforeSuite(func() { err = utils.LoadImageToKindClusterWithName(utils.ImageCommitterImage) ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the image-committer image into Kind") - By("pulling the registry:2 image (required for pause/resume tests)") - cmd = exec.Command("docker", "pull", "--platform", "linux/amd64", "registry:2") + By("pulling the registry image (required for pause/resume tests)") + cmd = exec.Command("docker", "pull", "--platform", "linux/amd64", utils.RegistrySourceImage()) _, err = utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to pull registry:2 image") + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to pull registry image") - By("loading the registry:2 image on Kind") - err = utils.LoadImageToKindClusterWithName("registry:2") - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the registry:2 image into Kind") + 
By("loading the registry image on Kind") + err = utils.LoadImageToKindClusterWithName(utils.RegistrySourceImage()) + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the registry image into Kind") By("pulling the alpine image (required for commit jobs)") - cmd = exec.Command("docker", "pull", "alpine:latest") + cmd = exec.Command("docker", "pull", utils.AlpineImage()) _, err = utils.Run(cmd) - ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to pull alpine:latest image") + ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to pull alpine image") By("loading the alpine image on Kind") - err = utils.LoadImageToKindClusterWithName("alpine:latest") + err = utils.LoadImageToKindClusterWithName(utils.AlpineImage()) ExpectWithOffset(1, err).NotTo(HaveOccurred(), "Failed to load the alpine image into Kind") }) diff --git a/kubernetes/test/e2e/e2e_test.go b/kubernetes/test/e2e/e2e_test.go index 50bcd3902..4b3ddcfb1 100644 --- a/kubernetes/test/e2e/e2e_test.go +++ b/kubernetes/test/e2e/e2e_test.go @@ -34,7 +34,7 @@ import ( // namespace where the project is deployed in const namespace = "opensandbox-system" -var _ = Describe("Manager", Ordered, func() { +var _ = Describe("Manager", Ordered, Label("Core"), func() { var controllerPodName string // Before running the tests, set up the environment by creating the namespace, @@ -49,11 +49,15 @@ var _ = Describe("Manager", Ordered, func() { Expect(err.Error()).To(ContainSubstring("AlreadyExists"), "Failed to create namespace") } - By("labeling the namespace to enforce the restricted security policy") - cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, - "pod-security.kubernetes.io/enforce=restricted") - _, err = utils.Run(cmd) - Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with restricted policy") + if psa := utils.PodSecurityEnforce(); psa != "" { + By("labeling the namespace to enforce the " + psa + " security policy") + cmd = exec.Command("kubectl", "label", 
"--overwrite", "ns", namespace, + "pod-security.kubernetes.io/enforce="+psa) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with "+psa+" policy") + } else { + By("skipping pod-security label (E2E_POD_SECURITY_ENFORCE is empty)") + } By("installing CRDs") cmd = exec.Command("make", "install") @@ -132,7 +136,7 @@ var _ = Describe("Manager", Ordered, func() { SetDefaultEventuallyTimeout(2 * time.Minute) SetDefaultEventuallyPollingInterval(time.Second) - Context("Manager", func() { + Context("Manager", Label("Manager"), func() { It("should run successfully", func() { By("validating that the controller-manager pod is running as expected") verifyControllerUp := func(g Gomega) { @@ -167,7 +171,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Pool", func() { + Context("Pool", Label("Pool"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -869,7 +873,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("BatchSandbox", func() { + Context("BatchSandbox", Label("Batch"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -1391,7 +1395,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Task", func() { + Context("Task", Label("Task"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -1557,7 +1561,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Pool Update", func() { + Context("Pool Update", Label("Pool"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -1815,7 +1819,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Pool State Recovery", func() { + Context("Pool State Recovery", Label("Pool"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -2440,7 +2444,7 @@ var _ = Describe("Manager", Ordered, 
func() { }) }) - Context("Pool Recycle", func() { + Context("Pool Recycle", Label("Pool"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { @@ -3084,7 +3088,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Pool Allocator Integrity", func() { + Context("Pool Allocator Integrity", Label("Pool"), func() { const testNamespace = "default" BeforeAll(func() { @@ -4362,7 +4366,7 @@ var _ = Describe("Manager", Ordered, func() { }) }) - Context("Pool Auto-Assign", func() { + Context("Pool Auto-Assign", Label("Pool"), func() { BeforeAll(func() { By("waiting for controller to be ready") Eventually(func(g Gomega) { diff --git a/kubernetes/test/e2e/pause_resume_test.go b/kubernetes/test/e2e/pause_resume_test.go index 4b3996f21..2ea379500 100644 --- a/kubernetes/test/e2e/pause_resume_test.go +++ b/kubernetes/test/e2e/pause_resume_test.go @@ -30,14 +30,14 @@ import ( "github.com/alibaba/OpenSandbox/sandbox-k8s/test/utils" ) -const ( - pauseResumeNamespace = "default" - registryServiceAddr = "docker-registry.default.svc.cluster.local:5000" - registryUsername = "testuser" - registryPassword = "testpass" +var ( + pauseResumeNamespace = utils.PauseResumeNamespace() + registryServiceAddr = utils.PauseResumeRegistryAddr() + registryUsername = utils.PauseResumeRegistryUser() + registryPassword = utils.PauseResumeRegistryPass() ) -var _ = Describe("PauseResume", Ordered, func() { +var _ = Describe("PauseResume", Ordered, Label("PauseResume"), func() { SetDefaultEventuallyTimeout(3 * time.Minute) SetDefaultEventuallyPollingInterval(time.Second) @@ -49,19 +49,34 @@ var _ = Describe("PauseResume", Ordered, func() { Expect(err.Error()).To(ContainSubstring("AlreadyExists")) } - By("labeling the namespace to enforce the restricted security policy") - cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, - "pod-security.kubernetes.io/enforce=restricted") - _, err = utils.Run(cmd) - 
Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with restricted policy") + if pauseResumeNamespace != namespace { + By("creating pause-resume namespace") + cmd = exec.Command("kubectl", "create", "ns", pauseResumeNamespace) + _, err = utils.Run(cmd) + if err != nil { + Expect(err.Error()).To(ContainSubstring("AlreadyExists")) + } + } + + if psa := utils.PodSecurityEnforce(); psa != "" { + By("labeling the namespace to enforce the " + psa + " security policy") + cmd = exec.Command("kubectl", "label", "--overwrite", "ns", namespace, + "pod-security.kubernetes.io/enforce="+psa) + _, err = utils.Run(cmd) + Expect(err).NotTo(HaveOccurred(), "Failed to label namespace with "+psa+" policy") + } else { + By("skipping pod-security label (E2E_POD_SECURITY_ENFORCE is empty)") + } By("installing CRDs") - cmd = exec.Command("kubectl", "apply", "-f", "config/crd/bases") + cmd = exec.Command("make", "install") _, err = utils.Run(cmd) Expect(err).NotTo(HaveOccurred(), "Failed to install CRDs") By("deploying the controller-manager") - cmd = exec.Command("kubectl", "apply", "-k", "config/default") + cmd = exec.Command("make", "deploy", + fmt.Sprintf("CONTROLLER_IMG=%s", utils.ControllerImage), + fmt.Sprintf("SNAPSHOT_REGISTRY=%s", registryServiceAddr)) _, err = utils.Run(cmd) Expect(err).NotTo(HaveOccurred(), "Failed to deploy the controller-manager") @@ -82,7 +97,10 @@ var _ = Describe("PauseResume", Ordered, func() { Expect(err).NotTo(HaveOccurred()) By("deploying Docker Registry") - registryYAML, err := renderTemplate("testdata/registry-deployment.yaml", nil) + registryYAML, err := renderTemplate("testdata/registry-deployment.yaml", map[string]interface{}{ + "RegistryImage": utils.RegistrySourceImage(), + "Namespace": pauseResumeNamespace, + }) Expect(err).NotTo(HaveOccurred()) registryFile := filepath.Join("/tmp", "test-registry.yaml") @@ -126,11 +144,11 @@ var _ = Describe("PauseResume", Ordered, func() { utils.Run(cmd) By("undeploying the controller-manager") - 
cmd = exec.Command("kubectl", "delete", "-k", "config/default", "--ignore-not-found=true") + cmd = exec.Command("make", "undeploy") utils.Run(cmd) By("uninstalling CRDs") - cmd = exec.Command("kubectl", "delete", "-f", "config/crd/bases", "--ignore-not-found=true") + cmd = exec.Command("make", "uninstall") utils.Run(cmd) By("removing manager namespace") diff --git a/kubernetes/test/e2e/testdata/registry-deployment.yaml b/kubernetes/test/e2e/testdata/registry-deployment.yaml index b0f3dc7d7..b97044312 100644 --- a/kubernetes/test/e2e/testdata/registry-deployment.yaml +++ b/kubernetes/test/e2e/testdata/registry-deployment.yaml @@ -2,7 +2,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: docker-registry - namespace: default + namespace: {{ .Namespace }} spec: replicas: 1 selector: @@ -15,7 +15,7 @@ spec: spec: containers: - name: registry - image: registry:2 + image: {{ .RegistryImage }} ports: - containerPort: 5000 env: @@ -41,12 +41,11 @@ apiVersion: v1 kind: Service metadata: name: docker-registry - namespace: default + namespace: {{ .Namespace }} spec: - type: NodePort + type: ClusterIP ports: - port: 5000 targetPort: 5000 - nodePort: 30500 selector: app: docker-registry \ No newline at end of file diff --git a/kubernetes/test/utils/cluster_mode.go b/kubernetes/test/utils/cluster_mode.go new file mode 100644 index 000000000..2a014e6dd --- /dev/null +++ b/kubernetes/test/utils/cluster_mode.go @@ -0,0 +1,117 @@ +// Copyright 2025 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package utils + +import "os" + +const ( + // ModeKind runs e2e against a local Kind cluster (default). + ModeKind = "kind" + // ModeExternal runs e2e against an externally-provided Kubernetes cluster + // using the kubeconfig pointed to by KUBECONFIG. Use this when targeting + // minikube, a shared dev cluster, a CI-provisioned cluster, etc. + ModeExternal = "external" +) + +// Mode returns the e2e cluster mode. Reads E2E_MODE env, defaults to ModeKind. +func Mode() string { + if v := os.Getenv("E2E_MODE"); v != "" { + return v + } + return ModeKind +} + +// IsKind reports whether the current e2e mode is the local Kind cluster. +func IsKind() bool { return Mode() == ModeKind } + +// IsExternal reports whether the current e2e mode targets an +// externally-provided cluster via KUBECONFIG. +func IsExternal() bool { return !IsKind() } + +// SkipImageBuild reports whether the suite should skip the docker-build / +// kind-load steps in BeforeSuite. Non-Kind modes default to true because +// images are expected to be pre-built and pushed to a registry the target +// cluster can pull from. +func SkipImageBuild() bool { + if v := os.Getenv("SKIP_IMAGE_BUILD"); v != "" { + return v == "1" || v == "true" || v == "TRUE" + } + return IsExternal() +} + +// RegistrySourceImage returns the upstream registry image used by the +// in-cluster docker-registry deployment for pause/resume tests. Override +// via REGISTRY_SOURCE_IMAGE to point at a mirror reachable from the target +// cluster. +func RegistrySourceImage() string { + if v := os.Getenv("REGISTRY_SOURCE_IMAGE"); v != "" { + return v + } + return "registry:2" +} + +// AlpineImage returns the alpine image used by commit jobs. 
+func AlpineImage() string { + if v := os.Getenv("ALPINE_IMAGE"); v != "" { + return v + } + return "alpine:latest" +} + +// PauseResumeNamespace returns the namespace used by pause/resume tests for +// the in-cluster docker-registry and registry secrets. +func PauseResumeNamespace() string { + if v := os.Getenv("PAUSE_RESUME_NAMESPACE"); v != "" { + return v + } + return "default" +} + +// PauseResumeRegistryAddr returns the in-cluster docker-registry service +// address used by pause/resume tests. When unset, it derives the host from +// PauseResumeNamespace() so overriding only PAUSE_RESUME_NAMESPACE is +// sufficient and credential auths stay aligned with the registry endpoint. +func PauseResumeRegistryAddr() string { + if v := os.Getenv("PAUSE_RESUME_REGISTRY_ADDR"); v != "" { + return v + } + return "docker-registry." + PauseResumeNamespace() + ".svc.cluster.local:5000" +} + +// PauseResumeRegistryUser returns the registry username for pause/resume tests. +func PauseResumeRegistryUser() string { + if v := os.Getenv("PAUSE_RESUME_REGISTRY_USER"); v != "" { + return v + } + return "testuser" +} + +// PauseResumeRegistryPass returns the registry password for pause/resume tests. +func PauseResumeRegistryPass() string { + if v := os.Getenv("PAUSE_RESUME_REGISTRY_PASS"); v != "" { + return v + } + return "testpass" +} + +// PodSecurityEnforce returns the value applied to the +// `pod-security.kubernetes.io/enforce` namespace label. Empty string means +// the suite must skip applying the label (some platforms reject restricted). 
+func PodSecurityEnforce() string { + if v, ok := os.LookupEnv("E2E_POD_SECURITY_ENFORCE"); ok { + return v + } + return "restricted" +} diff --git a/kubernetes/test/utils/utils.go b/kubernetes/test/utils/utils.go index 25669e35e..b947586ef 100644 --- a/kubernetes/test/utils/utils.go +++ b/kubernetes/test/utils/utils.go @@ -163,8 +163,15 @@ func IsCertManagerCRDsInstalled() bool { return false } -// LoadImageToKindClusterWithName loads a local docker image to the kind cluster +// LoadImageToKindClusterWithName loads a local docker image to the kind cluster. +// When E2E_MODE is not "kind" this is a no-op: the image is expected to live in +// a registry the target cluster can pull from, so there is no Kind node to +// load into. func LoadImageToKindClusterWithName(name string) error { + if IsExternal() { + _, _ = fmt.Fprintf(GinkgoWriter, "skipping kind load for %q (E2E_MODE=%s)\n", name, Mode()) + return nil + } cluster := "kind" if v, ok := os.LookupEnv("KIND_CLUSTER"); ok { cluster = v From 751560cbb1e0b12d59d5819878b8839f22422c16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E7=84=B6?= Date: Mon, 25 May 2026 09:55:19 +0800 Subject: [PATCH 52/58] chore(helm): bump image-committer to v0.1.0 Sync helm values with image-committer v0.1.0 release. Use sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com mirror, note DockerHub location opensandbox/image-committer:v0.1.0 in comments. 
Co-Authored-By: Claude Opus 4.7 --- kubernetes/charts/opensandbox-controller/values.yaml | 3 ++- kubernetes/charts/opensandbox/values.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/kubernetes/charts/opensandbox-controller/values.yaml b/kubernetes/charts/opensandbox-controller/values.yaml index 8d9b7c168..97a35dc3c 100644 --- a/kubernetes/charts/opensandbox-controller/values.yaml +++ b/kubernetes/charts/opensandbox-controller/values.yaml @@ -47,7 +47,8 @@ controller: # -- Pause/Resume snapshot configuration snapshot: # -- Image used for commit operations (must contain nerdctl tool) - imageCommitterImage: "image-committer:dev" + # DockerHub: opensandbox/image-committer:v0.1.0 + imageCommitterImage: "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/image-committer:v0.1.0" # -- Containerd socket path of host containerdSocketPath: "/var/run/containerd/containerd.sock" # -- Timeout duration for commit jobs diff --git a/kubernetes/charts/opensandbox/values.yaml b/kubernetes/charts/opensandbox/values.yaml index b20b88bcf..6a9c968d4 100644 --- a/kubernetes/charts/opensandbox/values.yaml +++ b/kubernetes/charts/opensandbox/values.yaml @@ -9,7 +9,8 @@ opensandbox-controller: logLevel: info replicaCount: 1 snapshot: - imageCommitterImage: image-committer:dev + # DockerHub: opensandbox/image-committer:v0.1.0 + imageCommitterImage: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/image-committer:v0.1.0 commitJobTimeout: 10m registry: "" registryInsecure: false From d10f0e3d4c1cc0d5b903e4212486f52406b1faad Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Mon, 25 May 2026 11:28:47 +0800 Subject: [PATCH 53/58] fix(execd): extend mitm CA wait to 300s and log wait duration (#943) The bootstrap script waited at most 30s for /opt/opensandbox/mitmproxy-ca-cert.pem before skipping system CA trust setup. When the egress sidecar is recovering from a transient failure (e.g. 
mitmproxy OOM-killed and being restarted with backoff), 30s is not enough and the sandbox starts without TLS interception support, silently breaking HTTPS for system libraries. Extend the wait to 300s and log the actual wait duration on success so the boot timeline is visible in execd logs. --- components/execd/bootstrap.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/components/execd/bootstrap.sh b/components/execd/bootstrap.sh index 32f1a717b..328e9b755 100755 --- a/components/execd/bootstrap.sh +++ b/components/execd/bootstrap.sh @@ -129,7 +129,7 @@ trust_mitm_ca_nss() { MITM_CA="/opt/opensandbox/mitmproxy-ca-cert.pem" if is_truthy "${OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT:-}"; then i=0 - while [ "$i" -lt 30 ]; do + while [ "$i" -lt 300 ]; do if [ -f "$MITM_CA" ] && [ -s "$MITM_CA" ]; then break fi @@ -137,9 +137,12 @@ if is_truthy "${OPENSANDBOX_EGRESS_MITMPROXY_TRANSPARENT:-}"; then i=$((i + 1)) done if [ ! -f "$MITM_CA" ] || [ ! -s "$MITM_CA" ]; then - echo "warning: timed out after 30s waiting for $MITM_CA (egress mitm CA export); continuing without system CA trust" >&2 - elif ! trust_mitm_ca "$MITM_CA"; then - echo "warning: failed to install mitm CA into system trust store; TLS interception may not work for system libraries" >&2 + echo "warning: timed out after 300s waiting for $MITM_CA (egress mitm CA export); continuing without system CA trust" >&2 + else + echo "mitm CA ready at $MITM_CA after ${i}s" + if ! 
trust_mitm_ca "$MITM_CA"; then + echo "warning: failed to install mitm CA into system trust store; TLS interception may not work for system libraries" >&2 + fi fi if [ -f "$MITM_CA" ] && [ -s "$MITM_CA" ]; then From 277262361e81bec91a0e696e02fcf8d6cafc58ad Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Mon, 25 May 2026 11:32:54 +0800 Subject: [PATCH 54/58] fix(execd): kill entire process group on command cancel (#924) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(execd): kill entire process group on command cancel When a foreground command was cancelled (client disconnect, timeout, or DELETE /command), only the bash group leader received SIGKILL — child processes spawned via `&` or pipelines kept running as orphans because exec.CommandContext's internal kill targets a single pid, and killPid sent signals to the leader only. Fix runCommand's ctx.Done() branch to send SIGKILL to -pid (the whole group, since the leader is launched with Setpgid: true), mirroring runBackgroundCommand. Rewrite killPid to signal -pid for SIGTERM/SIGKILL and to use kill(-pid, 0) for liveness probing, so Interrupt() also terminates descendants. Adds regression tests covering both cancel and Interrupt paths. Fixes #922 Co-Authored-By: Claude Opus 4.7 * fix(execd): guard Interrupt against stale PID after command exit killPid now signals the whole process group (-pid). Combined with the fact that commandClientMap retains finished sessions, a late or retried Interrupt could otherwise terminate every process in an unrelated process group whose PGID has reused the recorded PID. - markCommandFinished clears kernel.pid alongside kernel.running so the stale PID is no longer accessible. - commandSnapshot now reads under c.mu.RLock for a consistent view of running/pid relative to markCommandFinished's write under c.mu.Lock. 
- Interrupt() (unix and windows) snapshots the kernel and refuses to signal when the command has already finished. Adds a regression test ensuring Interrupt on a completed session returns an error and that pid is cleared. Co-Authored-By: Claude Opus 4.7 * fix(execd): don't surface slow group teardown as Interrupt failure kill(2) on a process group only guarantees delivery to at least one member, and kill(-pid, 0) keeps reporting the group as observable while any unreaped zombie lingers. The previous post-SIGKILL probe ran for only 150ms and then returned a hard error, so Interrupt could surface a 500 even though the kill signal had already been delivered. Likewise on macOS, SIGKILL on a group that has been reduced to zombies returns EPERM, which the previous code reported as a kill failure even though SIGTERM had already taken effect. - After a successful SIGKILL, log a warning when the probe loop still observes the group instead of returning an error. - When SIGTERM was delivered but the SIGKILL syscall fails (commonly EPERM on a zombie-only group), log and return nil — the kill is in flight and the kernel will reap the group once Wait() runs. Adds a regression test that runs killPid against a Setpgid group with no concurrent reaper, exercising the zombie-lingering path. Co-Authored-By: Claude Opus 4.7 * fix(execd): only group-kill on real cancellation, not after success Execute() defers cancel() for every foreground command, including successful ones, so the signal-forwarding goroutine's ctx.Done() branch also fired on the normal-success path. With the new group-wide SIGKILL on -cmd.Process.Pid, that post-completion signal could hit a recycled pid/pgid and kill an unrelated process group inside the sandbox. Gate the goroutine on the existing `done` channel (closed after cmd.Wait() returns or on start failure): exit cleanly when the command has finished, so only genuine cancellations — timeout, client abort, Interrupt — trigger the group kill. 
A double-check inside the ctx.Done() branch handles the race where ctx is cancelled at the same instant cmd.Wait() returns. Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- components/execd/pkg/runtime/command.go | 25 ++ .../execd/pkg/runtime/command_signal_test.go | 246 ++++++++++++++++++ .../execd/pkg/runtime/command_status.go | 7 + components/execd/pkg/runtime/interrupt.go | 92 ++++--- .../execd/pkg/runtime/interrupt_windows.go | 10 +- 5 files changed, 343 insertions(+), 37 deletions(-) create mode 100644 components/execd/pkg/runtime/command_signal_test.go diff --git a/components/execd/pkg/runtime/command.go b/components/execd/pkg/runtime/command.go index 893956366..fcabe1414 100644 --- a/components/execd/pkg/runtime/command.go +++ b/components/execd/pkg/runtime/command.go @@ -170,7 +170,32 @@ func (c *Controller) runCommand(ctx context.Context, request *ExecuteCodeRequest safego.Go(func() { for { select { + case <-done: + // cmd.Wait() has returned (or start failed). The pid is + // about to be — or already has been — reaped, so we + // must not signal it. Execute()'s defer cancel() fires + // after every foreground command, including successful + // ones, so without this gate the SIGKILL below would + // run on a recycled pid/pgid and could kill an + // unrelated process group. + return case <-ctx.Done(): + // Re-check `done` to avoid a race with cmd.Wait() + // returning concurrently. If cmd.Wait() has just + // finished, the leader pid may be reaped and recycled + // at any moment; signaling -pid would then target a + // foreign process group. + select { + case <-done: + return + default: + } + // Genuine cancellation (timeout, client disconnect, + // Interrupt). Kill the whole process group so children + // don't outlive the cancelled context. 
+ if cmd.Process != nil { + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + } return case sig := <-signals: if sig == nil { diff --git a/components/execd/pkg/runtime/command_signal_test.go b/components/execd/pkg/runtime/command_signal_test.go new file mode 100644 index 000000000..bd23b52d3 --- /dev/null +++ b/components/execd/pkg/runtime/command_signal_test.go @@ -0,0 +1,246 @@ +// Copyright 2025 Alibaba Group Holding Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//go:build !windows +// +build !windows + +package runtime + +import ( + "context" + "errors" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "testing" + "time" + + "github.com/alibaba/opensandbox/execd/pkg/jupyter/execute" + "github.com/stretchr/testify/require" +) + +// TestRunCommand_CancelKillsChildren verifies that cancelling the context +// terminates not only the bash group leader but also its descendant +// processes. Regression test for +// https://github.com/alibaba/OpenSandbox/issues/922. 
+func TestRunCommand_CancelKillsChildren(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not found in PATH") + } + + pidFile := filepath.Join(t.TempDir(), "child.pid") + + c := NewController("", "") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + started := make(chan struct{}) + var once sync.Once + + req := &ExecuteCodeRequest{ + // Spawn a sleep child, record its pid, then wait so the bash + // leader stays alive until the context is cancelled. + Code: `sleep 30 & echo $! > "` + pidFile + `"; echo READY; wait`, + Cwd: t.TempDir(), + Timeout: 30 * time.Second, + Hooks: ExecuteResultHook{ + OnExecuteInit: func(_ string) {}, + OnExecuteStdout: func(s string) { + if strings.TrimSpace(s) == "READY" { + once.Do(func() { close(started) }) + } + }, + OnExecuteStderr: func(_ string) {}, + OnExecuteError: func(_ *execute.ErrorOutput) {}, + OnExecuteComplete: func(_ time.Duration) {}, + }, + } + + done := make(chan struct{}) + go func() { + _ = c.runCommand(ctx, req) + close(done) + }() + + select { + case <-started: + case <-time.After(10 * time.Second): + cancel() + <-done + t.Fatal("command did not emit READY in time") + } + + pidBytes, err := os.ReadFile(pidFile) + require.NoError(t, err, "expected child pid file") + childPid, err := strconv.Atoi(strings.TrimSpace(string(pidBytes))) + require.NoError(t, err) + require.Positive(t, childPid) + + require.NoError(t, syscall.Kill(childPid, 0), "child should be alive before cancel") + + cancel() + + select { + case <-done: + case <-time.After(5 * time.Second): + t.Fatal("runCommand did not return after cancel") + } + + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if err := syscall.Kill(childPid, 0); err != nil { + require.True(t, errors.Is(err, syscall.ESRCH), + "unexpected liveness probe error: %v", err) + return + } + time.Sleep(50 * time.Millisecond) + } + t.Fatalf("child pid %d still alive 2s after cancel — process leak", 
childPid) +} + +// TestInterrupt_AfterFinished_ReturnsError verifies that an Interrupt +// arriving after the command has completed does not signal a recycled PID. +// Without this guard, group-wide kill would amplify the stale-PID hazard +// to every process in an unrelated process group. +func TestInterrupt_AfterFinished_ReturnsError(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not found in PATH") + } + + c := NewController("", "") + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + var session string + completeCh := make(chan struct{}, 1) + req := &ExecuteCodeRequest{ + Code: `echo done`, + Cwd: t.TempDir(), + Timeout: 5 * time.Second, + Hooks: ExecuteResultHook{ + OnExecuteInit: func(s string) { session = s }, + OnExecuteStdout: func(_ string) {}, + OnExecuteStderr: func(_ string) {}, + OnExecuteError: func(_ *execute.ErrorOutput) {}, + OnExecuteComplete: func(_ time.Duration) { completeCh <- struct{}{} }, + }, + } + require.NoError(t, c.runCommand(ctx, req)) + + select { + case <-completeCh: + case <-time.After(3 * time.Second): + t.Fatal("command did not complete in time") + } + require.NotEmpty(t, session) + + err := c.Interrupt(session) + require.Error(t, err, "Interrupt on finished session must error") + require.Contains(t, err.Error(), "not running") + + snap := c.commandSnapshot(session) + require.NotNil(t, snap) + require.False(t, snap.running, "running flag should be cleared") + require.Equal(t, 0, snap.pid, "pid should be cleared to avoid stale-PID kill") +} + +// TestKillPid_ZombieLeaderDoesNotFail verifies that killPid does not +// return an error when a group leader becomes a zombie before its parent +// has reaped it. kill(-pid, 0) keeps reporting the group as observable +// while the zombie lingers, but SIGKILL has already been delivered and +// the kernel will tear the group down once Wait() runs. 
Treating that +// state as a failure caused Interrupt to surface a 500 even though the +// kill succeeded. +func TestKillPid_ZombieLeaderDoesNotFail(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not found in PATH") + } + + cmd := exec.Command("bash", "-c", `sleep 30 & wait`) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + require.NoError(t, cmd.Start()) + // Deliberately omit a reaper goroutine so the leader stays as a + // zombie after kill — that is the condition we want to exercise. + t.Cleanup(func() { + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + _, _ = cmd.Process.Wait() + }) + + // Give bash a moment to spawn the sleep child so the group has more + // than just the leader. + time.Sleep(100 * time.Millisecond) + + c := &Controller{} + require.NoError(t, c.killPid(cmd.Process.Pid), + "slow post-SIGKILL teardown must not be reported as a hard failure") +} + +// TestKillPid_TerminatesEntireProcessGroup verifies that killPid signals +// the whole process group, not just the leader. Regression test for +// https://github.com/alibaba/OpenSandbox/issues/922. +func TestKillPid_TerminatesEntireProcessGroup(t *testing.T) { + if _, err := exec.LookPath("bash"); err != nil { + t.Skip("bash not found in PATH") + } + + pidFile := filepath.Join(t.TempDir(), "child.pid") + cmd := exec.Command("bash", "-c", + `sleep 30 & echo $! > "`+pidFile+`"; wait`) + cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + require.NoError(t, cmd.Start()) + // Reap the leader concurrently so it doesn't linger as a zombie that + // keeps the process group "alive" from killPid's liveness probe + // perspective. Mirrors how runCommand's cmd.Wait() reaps in production. 
+ waitDone := make(chan struct{}) + go func() { + _, _ = cmd.Process.Wait() + close(waitDone) + }() + t.Cleanup(func() { + _ = syscall.Kill(-cmd.Process.Pid, syscall.SIGKILL) + <-waitDone + }) + + var childPid int + deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + if data, err := os.ReadFile(pidFile); err == nil { + if pid, perr := strconv.Atoi(strings.TrimSpace(string(data))); perr == nil && pid > 0 { + childPid = pid + break + } + } + time.Sleep(50 * time.Millisecond) + } + require.Positive(t, childPid, "failed to capture child pid") + require.NoError(t, syscall.Kill(childPid, 0), "child should be alive before kill") + + c := &Controller{} + require.NoError(t, c.killPid(cmd.Process.Pid)) + + deadline = time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + if err := syscall.Kill(childPid, 0); err != nil { + require.True(t, errors.Is(err, syscall.ESRCH), + "unexpected liveness probe error: %v", err) + return + } + time.Sleep(50 * time.Millisecond) + } + t.Fatalf("child pid %d still alive 2s after killPid — process leak", childPid) +} diff --git a/components/execd/pkg/runtime/command_status.go b/components/execd/pkg/runtime/command_status.go index 6dbc6d4f2..c0883d0fc 100644 --- a/components/execd/pkg/runtime/command_status.go +++ b/components/execd/pkg/runtime/command_status.go @@ -40,6 +40,9 @@ type CommandOutput struct { } func (c *Controller) commandSnapshot(session string) *commandKernel { + c.mu.RLock() + defer c.mu.RUnlock() + var kernel *commandKernel if v, ok := c.commandClientMap.Load(session); ok { kernel, _ = v.(*commandKernel) @@ -128,4 +131,8 @@ func (c *Controller) markCommandFinished(session string, exitCode int, errMsg st kernel.errMsg = errMsg kernel.running = false kernel.finishedAt = &now + // Clear the PID so a late or retried Interrupt cannot signal a recycled + // process. Group-wide kill would otherwise amplify the impact of a + // stale-PID hit to every process in the unrelated process group. 
+ kernel.pid = 0 } diff --git a/components/execd/pkg/runtime/interrupt.go b/components/execd/pkg/runtime/interrupt.go index b9cd2a545..3419f1ae7 100644 --- a/components/execd/pkg/runtime/interrupt.go +++ b/components/execd/pkg/runtime/interrupt.go @@ -20,13 +20,9 @@ package runtime import ( "errors" "fmt" - "os" - "strings" "syscall" "time" - "github.com/alibaba/opensandbox/internal/safego" - "github.com/alibaba/opensandbox/execd/pkg/log" ) @@ -38,8 +34,16 @@ func (c *Controller) Interrupt(sessionID string) error { log.Warning("Interrupting Jupyter kernel %s", kernel.kernelID) return kernel.client.InterruptKernel(kernel.kernelID) case c.getCommandKernel(sessionID) != nil: - kernel := c.getCommandKernel(sessionID) - return c.killPid(kernel.pid) + // Snapshot under c.mu so running/pid are observed consistently with + // markCommandFinished. killPid signals the entire process group, so + // guarding against a stale PID is critical: a late Interrupt on a + // finished session must not blast SIGTERM/SIGKILL at an unrelated + // process group that has reused the PID. + snapshot := c.commandSnapshot(sessionID) + if snapshot == nil || !snapshot.running || snapshot.pid <= 0 { + return fmt.Errorf("command session %s is not running", sessionID) + } + return c.killPid(snapshot.pid) case c.getBashSession(sessionID) != nil: return c.closeBashSession(sessionID) default: @@ -48,53 +52,71 @@ func (c *Controller) Interrupt(sessionID string) error { } // killPid sends SIGTERM followed by SIGKILL if needed. +// +// Commands are launched with Setpgid: true, so pid is also the process group +// id. We signal the entire group via syscall.Kill(-pid, sig) so child and +// grandchild processes are terminated, not just the group leader. +// +// kill(2) on a process group only guarantees delivery to at least one +// member, and kill(-pid, 0) keeps reporting the group as observable while +// any unreaped zombie lingers. 
The probe loops below are therefore +// best-effort logging — once a kill signal has been delivered, a slow or +// asynchronous teardown is not treated as a hard failure that would +// surface as a 500 from Interrupt. func (c *Controller) killPid(pid int) error { - process, err := os.FindProcess(pid) - if err != nil { - return err + if pid <= 0 { + return fmt.Errorf("invalid pid %d", pid) } - log.Warning("Attempting to terminate process %d", pid) + log.Warning("Attempting to terminate process group %d", pid) - if err := process.Signal(syscall.SIGTERM); err != nil { - if strings.Contains(err.Error(), "already finished") { + sigtermDelivered := false + if err := syscall.Kill(-pid, syscall.SIGTERM); err != nil { + if errors.Is(err, syscall.ESRCH) { return nil } - log.Warning("SIGTERM failed for pid %d: %v, trying SIGKILL", pid, err) + log.Warning("SIGTERM failed for pgroup %d: %v, trying SIGKILL", pid, err) } else { - done := make(chan error, 1) - safego.Go(func() { - _, err := process.Wait() - done <- err - }) - - select { - case err := <-done: - if err == nil { - log.Info("Process %d terminated gracefully", pid) - return nil + sigtermDelivered = true + // Probe the group for liveness. os.Process.Wait() doesn't apply + // because the leader is not a child of this goroutine. 
+ deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + if err := syscall.Kill(-pid, 0); err != nil { + if errors.Is(err, syscall.ESRCH) { + log.Info("Process group %d terminated gracefully", pid) + return nil + } } - case <-time.After(3 * time.Second): - log.Warning("Process %d did not terminate after SIGTERM, using SIGKILL", pid) + time.Sleep(50 * time.Millisecond) } + log.Warning("Process group %d did not exit after SIGTERM, escalating to SIGKILL", pid) } - if err := process.Signal(syscall.SIGKILL); err != nil { - if strings.Contains(err.Error(), "already finished") { + if err := syscall.Kill(-pid, syscall.SIGKILL); err != nil { + if errors.Is(err, syscall.ESRCH) { + return nil + } + if sigtermDelivered { + // SIGTERM was already delivered to at least one member, so the + // kill is in flight. SIGKILL failure here is commonly EPERM on + // a group reduced to zombies — the kernel will reap them once + // the parent runs Wait(). Surface as a warning rather than a + // hard error. 
+ log.Warning("SIGKILL on pgroup %d failed: %v; teardown likely already in progress", pid, err) return nil } - return fmt.Errorf("failed to kill process %d: %w", pid, err) + return fmt.Errorf("failed to kill process group %d: %w", pid, err) } for range 3 { - if err := process.Signal(syscall.Signal(0)); err != nil { - if strings.Contains(err.Error(), "already finished") || - strings.Contains(err.Error(), "no such process") { - log.Info("Process %d confirmed terminated", pid) + if err := syscall.Kill(-pid, 0); err != nil { + if errors.Is(err, syscall.ESRCH) { + log.Info("Process group %d confirmed terminated", pid) return nil } } time.Sleep(50 * time.Millisecond) } - - return fmt.Errorf("process %d might still be running", pid) + log.Warning("Process group %d still observable after SIGKILL; teardown may complete asynchronously", pid) + return nil } diff --git a/components/execd/pkg/runtime/interrupt_windows.go b/components/execd/pkg/runtime/interrupt_windows.go index 6e1044d77..bbcd3ccdb 100644 --- a/components/execd/pkg/runtime/interrupt_windows.go +++ b/components/execd/pkg/runtime/interrupt_windows.go @@ -35,8 +35,14 @@ func (c *Controller) Interrupt(sessionID string) error { log.Warning("Interrupting Jupyter kernel %s", kernel.kernelID) return kernel.client.InterruptKernel(kernel.kernelID) case c.getCommandKernel(sessionID) != nil: - kernel := c.getCommandKernel(sessionID) - return c.killPid(kernel.pid) + // Guard against a stale PID after the command has finished: the + // kernel is retained in commandClientMap, so a late Interrupt could + // otherwise terminate an unrelated process that reused the PID. 
+ snapshot := c.commandSnapshot(sessionID) + if snapshot == nil || !snapshot.running || snapshot.pid <= 0 { + return fmt.Errorf("command session %s is not running", sessionID) + } + return c.killPid(snapshot.pid) default: return errors.New("no such session") } From 68a5710c5385697ca9cf0830ae3e8a3e9e69d921 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 25 May 2026 04:18:15 +0000 Subject: [PATCH 55/58] chore: bump execd to v1.0.18 --- examples/agent-sandbox/README.md | 2 +- examples/code-interpreter/README.md | 2 +- examples/windows/pool-win-example.yaml | 2 +- kubernetes/charts/opensandbox-server/values.yaml | 2 +- kubernetes/config/samples/sandbox_v1alpha1_pool.yaml | 2 +- .../config/samples/sandbox_v1alpha1_pool_restart.yaml | 2 +- oseps/0004-secure-container-runtime.md | 6 +++--- oseps/0007-fast-sandbox-runtime-support.md | 2 +- server/DEVELOPMENT.md | 2 +- server/docker-compose.example.yaml | 4 ++-- .../examples/example.config.k8s.toml | 2 +- .../examples/example.config.k8s.zh.toml | 2 +- server/opensandbox_server/examples/example.config.toml | 2 +- .../opensandbox_server/examples/example.config.zh.toml | 2 +- server/tests/test_docker_service.py | 10 +++++----- 15 files changed, 22 insertions(+), 22 deletions(-) diff --git a/examples/agent-sandbox/README.md b/examples/agent-sandbox/README.md index e29ba4dbf..c93b9ef95 100644 --- a/examples/agent-sandbox/README.md +++ b/examples/agent-sandbox/README.md @@ -23,7 +23,7 @@ opensandbox-server init-config ~/.sandbox.toml --example docker ```toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [kubernetes] namespace = "default" diff --git a/examples/code-interpreter/README.md b/examples/code-interpreter/README.md index d254c1c86..eb689dfe2 100644 --- a/examples/code-interpreter/README.md +++ b/examples/code-interpreter/README.md @@ -104,7 +104,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - 
image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18 command: [ "/bin/sh", "-c" ] args: - | diff --git a/examples/windows/pool-win-example.yaml b/examples/windows/pool-win-example.yaml index 815d4fd92..45283d992 100644 --- a/examples/windows/pool-win-example.yaml +++ b/examples/windows/pool-win-example.yaml @@ -58,7 +58,7 @@ spec: command: - /bin/sh - -c - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/kubernetes/charts/opensandbox-server/values.yaml b/kubernetes/charts/opensandbox-server/values.yaml index 339c31416..afdde64de 100644 --- a/kubernetes/charts/opensandbox-server/values.yaml +++ b/kubernetes/charts/opensandbox-server/values.yaml @@ -85,7 +85,7 @@ configToml: | [runtime] type = "kubernetes" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18" [kubernetes] kubeconfig_path = "" diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml index d34303647..6dc7b04ee 100644 --- a/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml +++ b/kubernetes/config/samples/sandbox_v1alpha1_pool.yaml @@ -31,7 +31,7 @@ spec: - name: opensandbox-bin mountPath: /opt/opensandbox/bin - name: execd-installer - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18 command: [ "/bin/sh", "-c" ] args: - | diff --git a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml index f3bd63ae3..55ae43ad9 100644 --- 
a/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml +++ b/kubernetes/config/samples/sandbox_v1alpha1_pool_restart.yaml @@ -56,7 +56,7 @@ spec: command: - /bin/sh - -c - image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17 + image: sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18 name: execd-installer volumeMounts: - mountPath: /opt/opensandbox/bin diff --git a/oseps/0004-secure-container-runtime.md b/oseps/0004-secure-container-runtime.md index ca253e528..eddfadca6 100644 --- a/oseps/0004-secure-container-runtime.md +++ b/oseps/0004-secure-container-runtime.md @@ -180,7 +180,7 @@ Extension to `~/.sandbox.toml`. A single `[secure_runtime]` section configures t ```toml [runtime] type = "docker" # or "kubernetes" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" # Secure container runtime configuration. # When enabled, ALL sandboxes on this server use the specified runtime. @@ -210,7 +210,7 @@ Example 1 — gVisor on Docker: # ~/.sandbox.toml [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [secure_runtime] type = "gvisor" @@ -224,7 +224,7 @@ Example 2 — Kata Containers (QEMU) on Kubernetes: # ~/.sandbox.toml [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [secure_runtime] type = "kata" diff --git a/oseps/0007-fast-sandbox-runtime-support.md b/oseps/0007-fast-sandbox-runtime-support.md index fc85cb84c..e11beef78 100644 --- a/oseps/0007-fast-sandbox-runtime-support.md +++ b/oseps/0007-fast-sandbox-runtime-support.md @@ -611,7 +611,7 @@ api_key = "your-secret-key" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [kubernetes] namespace = "default" diff --git a/server/DEVELOPMENT.md b/server/DEVELOPMENT.md index eec56ba11..49e8ab269 100644 --- a/server/DEVELOPMENT.md +++ 
b/server/DEVELOPMENT.md @@ -61,7 +61,7 @@ This guide provides comprehensive information for developers working on OpenSand [runtime] type = "docker" - execd_image = "opensandbox/execd:v1.0.17" + execd_image = "opensandbox/execd:v1.0.18" [docker] network_mode = "host" diff --git a/server/docker-compose.example.yaml b/server/docker-compose.example.yaml index 28fe252a5..4bf565483 100644 --- a/server/docker-compose.example.yaml +++ b/server/docker-compose.example.yaml @@ -10,8 +10,8 @@ configs: [runtime] type = "docker" - # execd_image = "opensandbox/execd:v1.0.17" - execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" + # execd_image = "opensandbox/execd:v1.0.18" + execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18" [egress] image = "opensandbox/egress:v1.0.12" diff --git a/server/opensandbox_server/examples/example.config.k8s.toml b/server/opensandbox_server/examples/example.config.k8s.toml index 3cfa2c74a..5f397e9f4 100644 --- a/server/opensandbox_server/examples/example.config.k8s.toml +++ b/server/opensandbox_server/examples/example.config.k8s.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.k8s.zh.toml b/server/opensandbox_server/examples/example.config.k8s.zh.toml index fc7dba498..a1b7cd81c 100644 --- a/server/opensandbox_server/examples/example.config.k8s.zh.toml +++ b/server/opensandbox_server/examples/example.config.k8s.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "kubernetes" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18" [storage] # 允许进行 bind mount 的宿主机路径前缀白名单。 diff --git a/server/opensandbox_server/examples/example.config.toml b/server/opensandbox_server/examples/example.config.toml index fa4c84fe3..9f509392a 100644 --- a/server/opensandbox_server/examples/example.config.toml +++ b/server/opensandbox_server/examples/example.config.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "opensandbox/execd:v1.0.17" +execd_image = "opensandbox/execd:v1.0.18" [storage] # Allowlist of host path prefixes permitted for bind mounts. 
diff --git a/server/opensandbox_server/examples/example.config.zh.toml b/server/opensandbox_server/examples/example.config.zh.toml index 0820638c6..689eeda27 100644 --- a/server/opensandbox_server/examples/example.config.zh.toml +++ b/server/opensandbox_server/examples/example.config.zh.toml @@ -32,7 +32,7 @@ level = "INFO" [runtime] type = "docker" -execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.17" +execd_image = "sandbox-registry.cn-zhangjiakou.cr.aliyuncs.com/opensandbox/execd:v1.0.18" [storage] allowed_host_paths = [] diff --git a/server/tests/test_docker_service.py b/server/tests/test_docker_service.py index fe993fc26..1bde714da 100644 --- a/server/tests/test_docker_service.py +++ b/server/tests/test_docker_service.py @@ -1456,7 +1456,7 @@ async def test_create_sandbox_windows_profile_injects_runtime_defaults(mock_dock mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.18" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1539,7 +1539,7 @@ async def test_create_sandbox_windows_profile_rejects_missing_runtime_devices(mo mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.18" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1578,7 +1578,7 @@ async def test_create_sandbox_windows_profile_rejects_below_minimum_resource_lim mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.18" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( 
@@ -1615,7 +1615,7 @@ async def test_create_sandbox_windows_profile_accepts_dockur_demo_like_request(m mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.18" cfg.docker.network_mode = "bridge" service = DockerSandboxService(config=cfg) request = CreateSandboxRequest( @@ -1669,7 +1669,7 @@ async def test_create_sandbox_windows_profile_with_network_policy_maps_windows_p mock_docker.from_env.return_value = mock_client cfg = _app_config() - cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.17" + cfg.runtime.execd_image = "ghcr.io/opensandbox/execd:v1.0.18" cfg.docker.network_mode = "bridge" cfg.egress = EgressConfig(image="opensandbox/egress:latest") service = DockerSandboxService(config=cfg) From 22409289d503c2b26682913939cb021d5df8aa71 Mon Sep 17 00:00:00 2001 From: epha <62273713+Pangjiping@users.noreply.github.com> Date: Tue, 26 May 2026 11:04:38 +0800 Subject: [PATCH 56/58] feat(egress): add DELETE /policy endpoint for removing egress rules by target (#864) * feat(egress): add DELETE /policy endpoint for removing egress rules by target Add DELETE handler that accepts a JSON array of target strings, removes matching rules case-insensitively, and commits the updated policy. Targets not found are silently ignored (idempotent). Spec and README docs updated. Co-Authored-By: Claude Opus 4.7 * feat(sdks): expose egress DELETE /policy across all sandbox SDKs Wires the new DELETE /policy endpoint through Go, JavaScript, Python (async + sync), Kotlin, and C# sandbox SDKs so users can remove egress rules by target through the supported facades. Regenerates the JS and Python OpenAPI clients (TypeScript and Kotlin generators now emit the delete operation; Python generator produces a new delete_policy module), then adds matching handwritten adapter/service/sandbox methods and unit tests. 
Extends the C# HttpClientWrapper with a DeleteAsync(path, body, ct) overload since DELETE-with-body was not previously supported. Adds an async Python e2e test (test_01ac_network_policy_delete) that provisions a sandbox with two allow rules, deletes one (plus a nonexistent target to verify idempotency), and confirms the policy mutation, defaultAction preservation, and resulting traffic behavior. Co-Authored-By: Claude Opus 4.7 * fix(egress): DELETE response shape + status codes - Drop Policy field from no-match path (consistent with POST/PATCH success responses and the spec's DELETE example). - Return 500 instead of 400 for marshal/parse failures on self-synthesized policy JSON (internal inconsistency, not client error); upgrade matching log level to Error. - Revert unrelated trailing-whitespace cleanup in smoke-nft.sh copyright header. Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- components/egress/policy_server.go | 95 +++++++- components/egress/policy_server_test.go | 144 ++++++++++++ components/egress/policy_utils.go | 26 +++ components/egress/tests/smoke-nft.sh | 41 ++++ .../src/OpenSandbox/Adapters/EgressAdapter.cs | 7 + .../OpenSandbox/Internal/HttpClientWrapper.cs | 17 ++ .../sandbox/csharp/src/OpenSandbox/Sandbox.cs | 17 ++ .../src/OpenSandbox/Services/IEgress.cs | 4 + .../SandboxEgressLifecycleTests.cs | 14 ++ sdks/sandbox/go/egress.go | 12 + sdks/sandbox/go/opensandbox_test.go | 41 ++++ sdks/sandbox/go/sandbox_egress.go | 9 + .../javascript/src/adapters/egressAdapter.ts | 10 + sdks/sandbox/javascript/src/api/egress.ts | 36 ++- sdks/sandbox/javascript/src/sandbox.ts | 4 + .../sandbox/javascript/src/services/egress.ts | 8 + .../alibaba/opensandbox/sandbox/Sandbox.kt | 13 ++ .../sandbox/domain/services/Egress.kt | 2 + .../adapters/service/EgressAdapter.kt | 9 + .../opensandbox/sandbox/SandboxTest.kt | 10 + .../opensandbox/adapters/egress_adapter.py | 13 ++ .../api/egress/api/policy/delete_policy.py | 211 ++++++++++++++++++ 
.../sandbox/python/src/opensandbox/sandbox.py | 11 + .../python/src/opensandbox/services/egress.py | 14 ++ .../sync/adapters/egress_adapter.py | 13 ++ .../python/src/opensandbox/sync/sandbox.py | 11 + .../src/opensandbox/sync/services/egress.py | 10 + specs/README.md | 1 + specs/README_zh.md | 1 + specs/egress-api.yaml | 42 ++++ tests/python/tests/test_sandbox_e2e.py | 76 +++++++ 31 files changed, 914 insertions(+), 8 deletions(-) create mode 100644 sdks/sandbox/python/src/opensandbox/api/egress/api/policy/delete_policy.py diff --git a/components/egress/policy_server.go b/components/egress/policy_server.go index a62eefc5a..8d85746b8 100644 --- a/components/egress/policy_server.go +++ b/components/egress/policy_server.go @@ -147,8 +147,10 @@ func (s *policyServer) handlePolicy(w http.ResponseWriter, r *http.Request) { s.handlePost(w, r) case http.MethodPatch: s.handlePatch(w, r) + case http.MethodDelete: + s.handleDelete(w, r) default: - w.Header().Set("Allow", "GET, POST, PUT, PATCH") + w.Header().Set("Allow", "GET, POST, PUT, PATCH, DELETE") http.Error(w, "method not allowed", http.StatusMethodNotAllowed) } } @@ -222,15 +224,16 @@ func (s *policyServer) handlePatch(w http.ResponseWriter, r *http.Request) { defer s.mu.Unlock() raw, err := readPolicyRequestBody(r) - if err != nil || raw == "" { - if err != nil { - logEgressUpdateFailedWarn(fmt.Sprintf("failed to read body: %v", err)) - } else { - logEgressUpdateFailedWarn("empty patch body") - } + if err != nil { + logEgressUpdateFailedWarn(fmt.Sprintf("failed to read body: %v", err)) http.Error(w, fmt.Sprintf("failed to read body: %v", err), http.StatusBadRequest) return } + if raw == "" { + logEgressUpdateFailedWarn("empty patch body") + http.Error(w, "empty body", http.StatusBadRequest) + return + } var patchRules []policy.EgressRule if err := json.Unmarshal([]byte(raw), &patchRules); err != nil { @@ -268,6 +271,84 @@ func (s *policyServer) handlePatch(w http.ResponseWriter, r *http.Request) { }) } +func (s 
*policyServer) handleDelete(w http.ResponseWriter, r *http.Request) { + defer r.Body.Close() + s.mu.Lock() + defer s.mu.Unlock() + + raw, err := readPolicyRequestBody(r) + if err != nil { + logEgressUpdateFailedWarn(fmt.Sprintf("failed to read body: %v", err)) + http.Error(w, fmt.Sprintf("failed to read body: %v", err), http.StatusBadRequest) + return + } + if raw == "" { + logEgressUpdateFailedWarn("empty delete body") + http.Error(w, "empty body", http.StatusBadRequest) + return + } + + var targets []string + if err := json.Unmarshal([]byte(raw), &targets); err != nil { + logEgressUpdateFailedWarn(fmt.Sprintf("invalid delete targets: %v", err)) + http.Error(w, fmt.Sprintf("invalid delete targets: %v", err), http.StatusBadRequest) + return + } + if len(targets) == 0 { + logEgressUpdateFailedWarn("empty delete targets array") + http.Error(w, "invalid delete targets: empty array", http.StatusBadRequest) + return + } + + base := s.proxy.CurrentPolicy() + if base == nil { + base = policy.DefaultDenyPolicy() + } + oldCount := len(base.Egress) + newEgress, removedRules := removeRulesByTarget(base.Egress, targets) + removed := oldCount - len(newEgress) + + if removed == 0 { + mode := modeFromPolicy(base) + writeJSON(w, http.StatusOK, policyStatusResponse{ + Status: "ok", + Mode: mode, + EnforcementMode: s.enforcementMode, + Reason: "no matching targets found", + }) + return + } + + rawMerged, err := json.Marshal(policy.NetworkPolicy{ + DefaultAction: base.DefaultAction, + Egress: newEgress, + }) + if err != nil { + logEgressUpdateFailedError(fmt.Sprintf("failed to marshal updated policy: %v", err)) + http.Error(w, fmt.Sprintf("internal error: %v", err), http.StatusInternalServerError) + return + } + newPolicy, err := policy.ParsePolicy(string(rawMerged)) + if err != nil { + logEgressUpdateFailedError(fmt.Sprintf("invalid policy after delete: %v", err)) + http.Error(w, fmt.Sprintf("internal error: %v", err), http.StatusInternalServerError) + return + } + + mode := 
modeFromPolicy(newPolicy) + log.Infof("policy API: deleting %d egress rule(s) by target, removed=%d, mode=%s, enforcement=%s", len(targets), removed, mode, s.enforcementMode) + if !s.commitPolicy(r.Context(), w, newPolicy, "delete") { + return + } + logEgressUpdated(newPolicy.DefaultAction, removedRules) + log.Infof("policy API: delete applied successfully") + writeJSON(w, http.StatusOK, policyStatusResponse{ + Status: "ok", + Mode: mode, + EnforcementMode: s.enforcementMode, + }) +} + // commitPolicy applies one logical change: optional disk persist → merge always file rules → nft // static (with nameserver allow-IPs) → then update in-memory user policy (POST/PATCH/GET view). func (s *policyServer) commitPolicy(ctx context.Context, w http.ResponseWriter, pol *policy.NetworkPolicy, op string) bool { diff --git a/components/egress/policy_server_test.go b/components/egress/policy_server_test.go index 74e33771e..a2a0aacbd 100644 --- a/components/egress/policy_server_test.go +++ b/components/egress/policy_server_test.go @@ -245,6 +245,150 @@ func TestHandlePatch_RejectsWhenOverMaxEgressRules(t *testing.T) { require.Len(t, proxy.updated.Egress, 2, "policy should be unchanged") } +func TestHandleDelete_RemovesMatchingTargets(t *testing.T) { + initial := &policy.NetworkPolicy{ + DefaultAction: policy.ActionDeny, + Egress: []policy.EgressRule{ + {Action: policy.ActionAllow, Target: "example.com"}, + {Action: policy.ActionDeny, Target: "blocked.com"}, + {Action: policy.ActionAllow, Target: "keep.com"}, + }, + } + proxy := &stubProxy{updated: initial} + nft := &stubNft{} + srv := &policyServer{proxy: proxy, nft: nft, enforcementMode: "dns+nft"} + + body := `["blocked.com","nonexistent.com"]` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusOK, resp.StatusCode, "expected 200 OK") + require.Equal(t, 1, nft.calls, "expected nft 
ApplyStatic called once") + require.NotNil(t, proxy.updated, "expected proxy policy updated") + require.Equal(t, policy.ActionDeny, proxy.updated.DefaultAction, "defaultAction should be preserved") + require.Len(t, proxy.updated.Egress, 2, "expected 2 rules remaining after delete") + require.Equal(t, policy.ActionAllow, proxy.updated.Egress[0].Action) + require.Equal(t, "example.com", proxy.updated.Egress[0].Target) + require.Equal(t, policy.ActionAllow, proxy.updated.Egress[1].Action) + require.Equal(t, "keep.com", proxy.updated.Egress[1].Target) +} + +func TestHandleDelete_CaseInsensitiveMatch(t *testing.T) { + initial := &policy.NetworkPolicy{ + DefaultAction: policy.ActionDeny, + Egress: []policy.EgressRule{ + {Action: policy.ActionAllow, Target: "Example.COM"}, + {Action: policy.ActionDeny, Target: "Blocked.COM"}, + }, + } + proxy := &stubProxy{updated: initial} + nft := &stubNft{} + srv := &policyServer{proxy: proxy, nft: nft, enforcementMode: "dns+nft"} + + body := `["example.com"]` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusOK, resp.StatusCode, "expected 200 OK") + require.NotNil(t, proxy.updated) + require.Len(t, proxy.updated.Egress, 1, "expected 1 rule remaining") + require.Equal(t, "Blocked.COM", proxy.updated.Egress[0].Target, "unmatched rule should remain") +} + +func TestHandleDelete_NoMatchReturns200(t *testing.T) { + initial := &policy.NetworkPolicy{ + DefaultAction: policy.ActionDeny, + Egress: []policy.EgressRule{ + {Action: policy.ActionAllow, Target: "keep.com"}, + }, + } + proxy := &stubProxy{updated: initial} + nft := &stubNft{} + srv := &policyServer{proxy: proxy, nft: nft, enforcementMode: "dns+nft"} + + body := `["nonexistent.com"]` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := 
w.Result() + require.Equal(t, http.StatusOK, resp.StatusCode, "expected 200 OK even when no targets match") + require.Equal(t, 0, nft.calls, "nft should not be called when nothing changes") + require.Len(t, proxy.updated.Egress, 1, "policy should be unchanged") +} + +func TestHandleDelete_EmptyBodyReturns400(t *testing.T) { + proxy := &stubProxy{updated: policy.DefaultDenyPolicy()} + srv := &policyServer{proxy: proxy, nft: nil, enforcementMode: "dns"} + + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader("")) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusBadRequest, resp.StatusCode, "expected 400 for empty body") +} + +func TestHandleDelete_EmptyArrayReturns400(t *testing.T) { + proxy := &stubProxy{updated: policy.DefaultDenyPolicy()} + srv := &policyServer{proxy: proxy, nft: nil, enforcementMode: "dns"} + + body := `[]` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusBadRequest, resp.StatusCode, "expected 400 for empty array") +} + +func TestHandleDelete_InvalidJSONReturns400(t *testing.T) { + proxy := &stubProxy{updated: policy.DefaultDenyPolicy()} + srv := &policyServer{proxy: proxy, nft: nil, enforcementMode: "dns"} + + body := `not-json` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusBadRequest, resp.StatusCode, "expected 400 for invalid JSON") +} + +func TestHandleDelete_NftFailureReturns500(t *testing.T) { + initial := &policy.NetworkPolicy{ + DefaultAction: policy.ActionDeny, + Egress: []policy.EgressRule{ + {Action: policy.ActionAllow, Target: "example.com"}, + }, + } + proxy := &stubProxy{updated: initial} + nft := &stubNft{err: errors.New("nft apply failed")} + srv := 
&policyServer{proxy: proxy, nft: nft, enforcementMode: "dns+nft"} + + body := `["example.com"]` + req := httptest.NewRequest(http.MethodDelete, "/policy", strings.NewReader(body)) + w := httptest.NewRecorder() + + srv.handlePolicy(w, req) + + resp := w.Result() + require.Equal(t, http.StatusInternalServerError, resp.StatusCode, "expected 500 on nft failure") + require.Equal(t, 1, nft.calls, "expected nft ApplyStatic called once") + require.Len(t, proxy.updated.Egress, 1, "proxy should not be updated on nft failure") + require.Equal(t, "example.com", proxy.updated.Egress[0].Target, "original rule should remain") +} + func TestHandlePost_RejectsWhenOverMaxEgressRules(t *testing.T) { proxy := &stubProxy{} nft := &stubNft{} diff --git a/components/egress/policy_utils.go b/components/egress/policy_utils.go index 10c0a6cad..b3aa42176 100644 --- a/components/egress/policy_utils.go +++ b/components/egress/policy_utils.go @@ -83,6 +83,32 @@ func mergeEgressRules(base, additions []policy.EgressRule) []policy.EgressRule { return out } +// removeRulesByTarget returns a new slice with rules matching targets removed, +// plus the removed rules. Domain targets are matched case-insensitively. +// Targets not found are silently ignored. +func removeRulesByTarget(rules []policy.EgressRule, targets []string) (kept, removed []policy.EgressRule) { + if len(targets) == 0 || len(rules) == 0 { + return rules, nil + } + removeSet := make(map[string]struct{}, len(targets)) + for _, t := range targets { + key := strings.ToLower(strings.TrimSpace(t)) + if key == "" { + continue + } + removeSet[key] = struct{}{} + } + kept = make([]policy.EgressRule, 0, len(rules)) + for _, r := range rules { + if _, ok := removeSet[strings.ToLower(r.Target)]; ok { + removed = append(removed, r) + } else { + kept = append(kept, r) + } + } + return kept, removed +} + // mergeKey: domain targets lowercased for dedupe; IP/CIDR left as-is. 
func mergeKey(r policy.EgressRule) string { if r.Target == "" { diff --git a/components/egress/tests/smoke-nft.sh b/components/egress/tests/smoke-nft.sh index eac5c232e..ff704f7dd 100755 --- a/components/egress/tests/smoke-nft.sh +++ b/components/egress/tests/smoke-nft.sh @@ -155,6 +155,47 @@ else pass "www.mozilla.org blocked after patch" fi +info "DELETE: deny two hosts, then delete one rule" +curl -sSf -XPOST "http://127.0.0.1:${POLICY_PORT}/policy" \ + -d '{"defaultAction":"allow","egress":[{"action":"deny","target":"api.github.com"},{"action":"deny","target":"www.cloudflare.com"}]}' + +info "Test: both hosts should be blocked before delete" +if run_in_app -I https://api.github.com --max-time 8 >/dev/null 2>&1; then + fail "api.github.com should be blocked before delete" +fi +if run_in_app -I https://www.cloudflare.com --max-time 8 >/dev/null 2>&1; then + fail "www.cloudflare.com should be blocked before delete" +fi +pass "both hosts blocked before delete" + +info "Deleting api.github.com rule" +curl -sSf -XDELETE "http://127.0.0.1:${POLICY_PORT}/policy" \ + -d '["api.github.com"]' + +info "Test: api.github.com allowed, www.cloudflare.com still blocked after delete" +run_in_app -I https://api.github.com --max-time 20 >/dev/null 2>&1 || fail "api.github.com should be allowed after delete" +pass "api.github.com allowed after delete" +if run_in_app -I https://www.cloudflare.com --max-time 8 >/dev/null 2>&1; then + fail "www.cloudflare.com should remain blocked after delete" +fi +pass "www.cloudflare.com still blocked" + +info "Deleting non-existent target (idempotent)" +resp="$(curl -sSf -XDELETE "http://127.0.0.1:${POLICY_PORT}/policy" -d '["nonexistent.com"]')" +if echo "${resp}" | grep -q '"no matching targets found"'; then + pass "idempotent delete returns no matching targets found" +else + fail "expected no matching targets found, got: ${resp}" +fi + +info "Deleting with empty body (expect 400)" +http_code="$(curl -s -o /dev/null -w '%{http_code}' -XDELETE 
"http://127.0.0.1:${POLICY_PORT}/policy" -d '')" +if [ "${http_code}" = "400" ]; then + pass "empty body returns 400" +else + fail "empty body should return 400, got ${http_code}" +fi + info "Always-rule dynamic check (single transition)" curl -sSf -XPOST "http://127.0.0.1:${POLICY_PORT}/policy" \ -d '{"defaultAction":"deny","egress":[{"action":"allow","target":"api.github.com"}]}' diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Adapters/EgressAdapter.cs b/sdks/sandbox/csharp/src/OpenSandbox/Adapters/EgressAdapter.cs index 05eb00527..dd913783a 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Adapters/EgressAdapter.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Adapters/EgressAdapter.cs @@ -54,6 +54,13 @@ public async Task PatchRulesAsync( await _client.PatchAsync("/policy", normalizedRules, cancellationToken).ConfigureAwait(false); } + public async Task DeleteRulesAsync( + IReadOnlyList targets, + CancellationToken cancellationToken = default) + { + await _client.DeleteAsync("/policy", targets.ToList(), cancellationToken).ConfigureAwait(false); + } + private static NetworkPolicy ParseNetworkPolicy(JsonElement element) { var policy = new NetworkPolicy(); diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Internal/HttpClientWrapper.cs b/sdks/sandbox/csharp/src/OpenSandbox/Internal/HttpClientWrapper.cs index dbc575598..af6462533 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Internal/HttpClientWrapper.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Internal/HttpClientWrapper.cs @@ -189,6 +189,23 @@ public async Task DeleteAsync( await EnsureSuccessAsync(response, cancellationToken).ConfigureAwait(false); } + public async Task DeleteAsync( + string path, + object body, + CancellationToken cancellationToken) + { + var url = BuildUrl(path); + _logger.LogDebug("HTTP DELETE {Url}", url); + using var request = new HttpRequestMessage(HttpMethod.Delete, url); + ApplyDefaultHeaders(request); + + var json = JsonSerializer.Serialize(body, JsonOptions); + request.Content = new 
StringContent(json, Encoding.UTF8, "application/json"); + + using var response = await _httpClient.SendAsync(request, cancellationToken).ConfigureAwait(false); + await EnsureSuccessAsync(response, cancellationToken).ConfigureAwait(false); + } + public async Task SendAsync( HttpRequestMessage request, CancellationToken cancellationToken = default) diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Sandbox.cs b/sdks/sandbox/csharp/src/OpenSandbox/Sandbox.cs index 132b26451..a93193f66 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Sandbox.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Sandbox.cs @@ -593,6 +593,23 @@ public async Task PatchEgressRulesAsync( await _egress.PatchRulesAsync(rules, cancellationToken).ConfigureAwait(false); } + /// + /// Deletes egress rules for this sandbox by target. + /// + /// Each entry is a FQDN or wildcard domain. Matching rules are removed + /// from the currently enforced policy. Targets not present in the policy + /// are silently ignored (idempotent). The current defaultAction is + /// preserved. + /// + /// Target FQDNs or wildcard domains to remove. + /// Cancellation token. + public async Task DeleteEgressRulesAsync( + IReadOnlyList targets, + CancellationToken cancellationToken = default) + { + await _egress.DeleteRulesAsync(targets, cancellationToken).ConfigureAwait(false); + } + /// /// Gets the endpoint for a port. 
/// diff --git a/sdks/sandbox/csharp/src/OpenSandbox/Services/IEgress.cs b/sdks/sandbox/csharp/src/OpenSandbox/Services/IEgress.cs index aaaca49c4..5f8fde0b4 100644 --- a/sdks/sandbox/csharp/src/OpenSandbox/Services/IEgress.cs +++ b/sdks/sandbox/csharp/src/OpenSandbox/Services/IEgress.cs @@ -26,4 +26,8 @@ public interface IEgress Task PatchRulesAsync( IReadOnlyList rules, CancellationToken cancellationToken = default); + + Task DeleteRulesAsync( + IReadOnlyList targets, + CancellationToken cancellationToken = default); } diff --git a/sdks/sandbox/csharp/tests/OpenSandbox.Tests/SandboxEgressLifecycleTests.cs b/sdks/sandbox/csharp/tests/OpenSandbox.Tests/SandboxEgressLifecycleTests.cs index 685539a6f..b33ffa24e 100644 --- a/sdks/sandbox/csharp/tests/OpenSandbox.Tests/SandboxEgressLifecycleTests.cs +++ b/sdks/sandbox/csharp/tests/OpenSandbox.Tests/SandboxEgressLifecycleTests.cs @@ -55,12 +55,15 @@ await sandbox.PatchEgressRulesAsync([new NetworkRule Action = NetworkRuleAction.Allow, Target = "www.github.com" }]); + await sandbox.DeleteEgressRulesAsync(["www.github.com", "*.blocked.org"]); sandboxes.EndpointCalls.Should().Equal(Constants.DefaultExecdPort, Constants.DefaultEgressPort); adapterFactory.EgressStackCallCount.Should().Be(1); adapterFactory.LastEgressBaseUrl.Should().Be($"http://127.0.0.1:{Constants.DefaultEgressPort}"); egress.GetPolicyCallCount.Should().Be(1); egress.PatchRulesCallCount.Should().Be(1); + egress.DeleteRulesCallCount.Should().Be(1); + egress.LastDeleteTargets.Should().Equal("www.github.com", "*.blocked.org"); } [Fact] @@ -300,6 +303,10 @@ private sealed class StubEgress : IEgress public int PatchRulesCallCount { get; private set; } + public int DeleteRulesCallCount { get; private set; } + + public IReadOnlyList LastDeleteTargets { get; private set; } = []; + public Task GetPolicyAsync(CancellationToken cancellationToken = default) { GetPolicyCallCount++; @@ -319,6 +326,13 @@ public Task PatchRulesAsync(IReadOnlyList rules, CancellationToken 
PatchRulesCallCount++; return Task.CompletedTask; } + + public Task DeleteRulesAsync(IReadOnlyList targets, CancellationToken cancellationToken = default) + { + DeleteRulesCallCount++; + LastDeleteTargets = targets.ToList(); + return Task.CompletedTask; + } } private sealed class StubFiles : ISandboxFiles diff --git a/sdks/sandbox/go/egress.go b/sdks/sandbox/go/egress.go index 6a5ef1cf4..d0536ea23 100644 --- a/sdks/sandbox/go/egress.go +++ b/sdks/sandbox/go/egress.go @@ -55,3 +55,15 @@ func (c *EgressClient) PatchPolicy(ctx context.Context, rules []NetworkRule) (*P } return &resp, nil } + +// DeletePolicy removes egress rules matching the given targets from the current +// policy. Each target is a FQDN or wildcard domain. Targets not present in the +// policy are silently ignored (idempotent). The current defaultAction is +// preserved. +func (c *EgressClient) DeletePolicy(ctx context.Context, targets []string) (*PolicyStatusResponse, error) { + var resp PolicyStatusResponse + if err := c.doRequest(ctx, "DELETE", "/policy", targets, &resp); err != nil { + return nil, err + } + return &resp, nil +} diff --git a/sdks/sandbox/go/opensandbox_test.go b/sdks/sandbox/go/opensandbox_test.go index da7cec6b9..7ae3cf71f 100644 --- a/sdks/sandbox/go/opensandbox_test.go +++ b/sdks/sandbox/go/opensandbox_test.go @@ -640,6 +640,47 @@ func TestPatchPolicy(t *testing.T) { require.Len(t, got.Policy.Egress, 2) } +func TestDeletePolicy(t *testing.T) { + want := PolicyStatusResponse{ + Status: "ok", + Mode: "deny_all", + Policy: &NetworkPolicy{ + DefaultAction: "deny", + Egress: []NetworkRule{ + {Action: "allow", Target: "api.example.com"}, + }, + }, + } + + _, client := newEgressServer(t, func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodDelete { + assert.Fail(t, fmt.Sprintf("expected DELETE, got %s", r.Method)) + } + + var targets []string + if err := json.NewDecoder(r.Body).Decode(&targets); err != nil { + assert.Fail(t, fmt.Sprintf("decode body: %v", err)) 
+ } + if len(targets) != 2 { + assert.Fail(t, fmt.Sprintf("expected 2 targets in request, got %d", len(targets))) + } + if targets[0] != "bad.example.com" || targets[1] != "*.blocked.org" { + assert.Fail(t, fmt.Sprintf("unexpected targets: %v", targets)) + } + + jsonResponse(w, http.StatusOK, want) + }) + + got, err := client.DeletePolicy(context.Background(), []string{ + "bad.example.com", + "*.blocked.org", + }) + require.NoErrorf(t, err, "DeletePolicy") + require.NotNil(t, got.Policy) + require.Len(t, got.Policy.Egress, 1) + require.Equal(t, "api.example.com", got.Policy.Egress[0].Target) +} + func TestPing(t *testing.T) { _, client := newExecdServer(t, func(w http.ResponseWriter, r *http.Request) { if r.Method != http.MethodGet { diff --git a/sdks/sandbox/go/sandbox_egress.go b/sdks/sandbox/go/sandbox_egress.go index 471293688..baa32403d 100644 --- a/sdks/sandbox/go/sandbox_egress.go +++ b/sdks/sandbox/go/sandbox_egress.go @@ -31,3 +31,12 @@ func (s *Sandbox) PatchEgressRules(ctx context.Context, rules []NetworkRule) (*P } return s.egress.PatchPolicy(ctx, rules) } + +// DeleteEgressRules removes egress rules matching the given targets from the +// current egress policy. Targets not present in the policy are silently ignored. 
+func (s *Sandbox) DeleteEgressRules(ctx context.Context, targets []string) (*PolicyStatusResponse, error) { + if err := s.resolveEgress(ctx); err != nil { + return nil, err + } + return s.egress.DeletePolicy(ctx, targets) +} diff --git a/sdks/sandbox/javascript/src/adapters/egressAdapter.ts b/sdks/sandbox/javascript/src/adapters/egressAdapter.ts index 93aa90a8f..e2262ddae 100644 --- a/sdks/sandbox/javascript/src/adapters/egressAdapter.ts +++ b/sdks/sandbox/javascript/src/adapters/egressAdapter.ts @@ -22,6 +22,8 @@ type ApiGetPolicyOk = EgressPaths["/policy"]["get"]["responses"][200]["content"]["application/json"]; type ApiPatchRulesRequest = EgressPaths["/policy"]["patch"]["requestBody"]["content"]["application/json"]; +type ApiDeleteRulesRequest = + EgressPaths["/policy"]["delete"]["requestBody"]["content"]["application/json"]; export class EgressAdapter implements Egress { constructor(private readonly client: EgressClient) {} @@ -43,4 +45,12 @@ export class EgressAdapter implements Egress { }); throwOnOpenApiFetchError({ error, response }, "Patch sandbox egress rules failed"); } + + async deleteRules(targets: string[]): Promise { + const body: ApiDeleteRulesRequest = targets; + const { error, response } = await this.client.DELETE("/policy", { + body, + }); + throwOnOpenApiFetchError({ error, response }, "Delete sandbox egress rules failed"); + } } diff --git a/sdks/sandbox/javascript/src/api/egress.ts b/sdks/sandbox/javascript/src/api/egress.ts index 2cca3230a..934e869d7 100644 --- a/sdks/sandbox/javascript/src/api/egress.ts +++ b/sdks/sandbox/javascript/src/api/egress.ts @@ -54,7 +54,41 @@ export interface paths { }; put?: never; post?: never; - delete?: never; + /** + * Delete egress rules + * @description Remove specific egress rules from the currently enforced policy by target. + * + * - Accepts a list of target strings (FQDNs or wildcard domains). 
+ * - Matching rules are removed; targets not found in the current policy + * are silently ignored (idempotent). + */ + delete: { + parameters: { + query?: never; + header?: never; + path?: never; + cookie?: never; + }; + requestBody: { + content: { + "application/json": string[]; + }; + }; + responses: { + /** @description Rules removed successfully. */ + 200: { + headers: { + [name: string]: unknown; + }; + content: { + "application/json": components["schemas"]["PolicyStatusResponse"]; + }; + }; + 400: components["responses"]["BadRequest"]; + 401: components["responses"]["Unauthorized"]; + 500: components["responses"]["InternalServerError"]; + }; + }; options?: never; head?: never; /** diff --git a/sdks/sandbox/javascript/src/sandbox.ts b/sdks/sandbox/javascript/src/sandbox.ts index 4315f7a44..6ad984322 100644 --- a/sdks/sandbox/javascript/src/sandbox.ts +++ b/sdks/sandbox/javascript/src/sandbox.ts @@ -570,6 +570,10 @@ export class Sandbox { await Sandbox._priv.get(this)!.egress.patchRules(rules); } + async deleteEgressRules(targets: string[]): Promise { + await Sandbox._priv.get(this)!.egress.deleteRules(targets); + } + /** * Get sandbox endpoint for a port (STRICT: no scheme), e.g. "localhost:44772" or "domain/route/.../44772". */ diff --git a/sdks/sandbox/javascript/src/services/egress.ts b/sdks/sandbox/javascript/src/services/egress.ts index 0a248d725..ff9efe3b7 100644 --- a/sdks/sandbox/javascript/src/services/egress.ts +++ b/sdks/sandbox/javascript/src/services/egress.ts @@ -24,4 +24,12 @@ export interface Egress { * the first rule for a target wins. The current defaultAction is preserved. */ patchRules(rules: NetworkRule[]): Promise; + /** + * Delete egress rules by target. + * + * Each entry is a FQDN or wildcard domain. Matching rules are removed from + * the currently enforced policy. Targets not present in the policy are + * silently ignored (idempotent). The current defaultAction is preserved. 
+ */ + deleteRules(targets: string[]): Promise; } diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/Sandbox.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/Sandbox.kt index 75cf9b66c..b490015c1 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/Sandbox.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/Sandbox.kt @@ -528,6 +528,19 @@ class Sandbox internal constructor( egressService.patchRules(rules) } + /** + * Deletes egress rules for this sandbox by target. + * + * Each entry is a FQDN or wildcard domain. Matching rules are removed from + * the currently enforced policy. Targets not present in the policy are + * silently ignored (idempotent). The current defaultAction is preserved. + * + * @throws SandboxException if operation fails + */ + fun deleteEgressRules(targets: List) { + egressService.deleteRules(targets) + } + /** * Pauses the sandbox while preserving its state. 
* diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Egress.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Egress.kt index 61aa78e5b..dc0b44b37 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Egress.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/domain/services/Egress.kt @@ -23,4 +23,6 @@ interface Egress { fun getPolicy(): NetworkPolicy fun patchRules(rules: List) + + fun deleteRules(targets: List) } diff --git a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/EgressAdapter.kt b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/EgressAdapter.kt index fc3b6192a..dc4460065 100644 --- a/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/EgressAdapter.kt +++ b/sdks/sandbox/kotlin/sandbox/src/main/kotlin/com/alibaba/opensandbox/sandbox/infrastructure/adapters/service/EgressAdapter.kt @@ -66,4 +66,13 @@ internal class EgressAdapter( throw e.toSandboxException() } } + + override fun deleteRules(targets: List) { + try { + api.policyDelete(targets) + } catch (e: Exception) { + logger.error("Failed to delete egress rules via endpoint {}", egressEndpoint.endpoint, e) + throw e.toSandboxException() + } + } } diff --git a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/SandboxTest.kt b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/SandboxTest.kt index 9c0041368..295f92e15 100644 --- a/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/SandboxTest.kt +++ b/sdks/sandbox/kotlin/sandbox/src/test/kotlin/com/alibaba/opensandbox/sandbox/SandboxTest.kt @@ -218,6 +218,16 @@ class SandboxTest { verify { egressService.patchRules(rules) } } 
+ @Test + fun `deleteEgressRules should delegate to egressService`() { + val targets = listOf("bad.example.com", "*.blocked.org") + every { egressService.deleteRules(targets) } just Runs + + sandbox.deleteEgressRules(targets) + + verify { egressService.deleteRules(targets) } + } + @Test fun `builder manualCleanup should clear timeout`() { val builder = diff --git a/sdks/sandbox/python/src/opensandbox/adapters/egress_adapter.py b/sdks/sandbox/python/src/opensandbox/adapters/egress_adapter.py index 99a00e9d4..7db99867a 100644 --- a/sdks/sandbox/python/src/opensandbox/adapters/egress_adapter.py +++ b/sdks/sandbox/python/src/opensandbox/adapters/egress_adapter.py @@ -110,3 +110,16 @@ async def patch_rules(self, rules: list[NetworkRule]) -> None: except Exception as e: logger.error("Failed to patch egress policy via endpoint %s", self.endpoint.endpoint, exc_info=e) raise ExceptionConverter.to_sandbox_exception(e) from e + + async def delete_rules(self, targets: list[str]) -> None: + try: + from opensandbox.api.egress.api.policy import delete_policy + + response_obj = await delete_policy.asyncio_detailed( + client=self._client, + body=list(targets), + ) + handle_api_error(response_obj, "Delete egress rules") + except Exception as e: + logger.error("Failed to delete egress rules via endpoint %s", self.endpoint.endpoint, exc_info=e) + raise ExceptionConverter.to_sandbox_exception(e) from e diff --git a/sdks/sandbox/python/src/opensandbox/api/egress/api/policy/delete_policy.py b/sdks/sandbox/python/src/opensandbox/api/egress/api/policy/delete_policy.py new file mode 100644 index 000000000..cae7bb9a6 --- /dev/null +++ b/sdks/sandbox/python/src/opensandbox/api/egress/api/policy/delete_policy.py @@ -0,0 +1,211 @@ +# +# Copyright 2026 Alibaba Group Holding Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from http import HTTPStatus +from typing import Any + +import httpx + +from ... import errors +from ...client import AuthenticatedClient, Client +from ...models.policy_status_response import PolicyStatusResponse +from ...types import Response + + +def _get_kwargs( + *, + body: list[str], +) -> dict[str, Any]: + headers: dict[str, Any] = {} + + _kwargs: dict[str, Any] = { + "method": "delete", + "url": "/policy", + } + + _kwargs["json"] = body + + headers["Content-Type"] = "application/json" + + _kwargs["headers"] = headers + return _kwargs + + +def _parse_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> PolicyStatusResponse | str | None: + if response.status_code == 200: + response_200 = PolicyStatusResponse.from_dict(response.json()) + + return response_200 + + if response.status_code == 400: + response_400 = response.text + return response_400 + + if response.status_code == 401: + response_401 = response.text + return response_401 + + if response.status_code == 500: + response_500 = response.text + return response_500 + + if client.raise_on_unexpected_status: + raise errors.UnexpectedStatus(response.status_code, response.content) + else: + return None + + +def _build_response( + *, client: AuthenticatedClient | Client, response: httpx.Response +) -> Response[PolicyStatusResponse | str]: + return Response( + status_code=HTTPStatus(response.status_code), + content=response.content, + headers=response.headers, + parsed=_parse_response(client=client, response=response), + ) + + +def sync_detailed( + *, + client: 
AuthenticatedClient | Client, + body: list[str], +) -> Response[PolicyStatusResponse | str]: + """Delete egress rules + + Remove specific egress rules from the currently enforced policy by target. + + - Accepts a list of target strings (FQDNs or wildcard domains). + - Matching rules are removed; targets not found in the current policy + are silently ignored (idempotent). + + Args: + body (list[str]): Example: ['bad.example.com', '*.blocked.org']. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[PolicyStatusResponse | str] + """ + + kwargs = _get_kwargs( + body=body, + ) + + response = client.get_httpx_client().request( + **kwargs, + ) + + return _build_response(client=client, response=response) + + +def sync( + *, + client: AuthenticatedClient | Client, + body: list[str], +) -> PolicyStatusResponse | str | None: + """Delete egress rules + + Remove specific egress rules from the currently enforced policy by target. + + - Accepts a list of target strings (FQDNs or wildcard domains). + - Matching rules are removed; targets not found in the current policy + are silently ignored (idempotent). + + Args: + body (list[str]): Example: ['bad.example.com', '*.blocked.org']. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + PolicyStatusResponse | str + """ + + return sync_detailed( + client=client, + body=body, + ).parsed + + +async def asyncio_detailed( + *, + client: AuthenticatedClient | Client, + body: list[str], +) -> Response[PolicyStatusResponse | str]: + """Delete egress rules + + Remove specific egress rules from the currently enforced policy by target. 
+ + - Accepts a list of target strings (FQDNs or wildcard domains). + - Matching rules are removed; targets not found in the current policy + are silently ignored (idempotent). + + Args: + body (list[str]): Example: ['bad.example.com', '*.blocked.org']. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. + + Returns: + Response[PolicyStatusResponse | str] + """ + + kwargs = _get_kwargs( + body=body, + ) + + response = await client.get_async_httpx_client().request(**kwargs) + + return _build_response(client=client, response=response) + + +async def asyncio( + *, + client: AuthenticatedClient | Client, + body: list[str], +) -> PolicyStatusResponse | str | None: + """Delete egress rules + + Remove specific egress rules from the currently enforced policy by target. + + - Accepts a list of target strings (FQDNs or wildcard domains). + - Matching rules are removed; targets not found in the current policy + are silently ignored (idempotent). + + Args: + body (list[str]): Example: ['bad.example.com', '*.blocked.org']. + + Raises: + errors.UnexpectedStatus: If the server returns an undocumented status code and Client.raise_on_unexpected_status is True. + httpx.TimeoutException: If the request takes longer than Client.timeout. 
+ + Returns: + PolicyStatusResponse | str + """ + + return ( + await asyncio_detailed( + client=client, + body=body, + ) + ).parsed diff --git a/sdks/sandbox/python/src/opensandbox/sandbox.py b/sdks/sandbox/python/src/opensandbox/sandbox.py index ac6b06b24..90e997c77 100644 --- a/sdks/sandbox/python/src/opensandbox/sandbox.py +++ b/sdks/sandbox/python/src/opensandbox/sandbox.py @@ -310,6 +310,17 @@ async def patch_egress_rules(self, rules: list[NetworkRule]) -> None: """ await self._egress_service.patch_rules(rules) + async def delete_egress_rules(self, targets: list[str]) -> None: + """ + Delete egress rules for this sandbox by target. + + Each entry is a FQDN or wildcard domain. Matching rules are removed + from the currently enforced policy. Targets not present in the policy + are silently ignored (idempotent). The current defaultAction is + preserved. + """ + await self._egress_service.delete_rules(targets) + async def pause(self) -> None: """ Pause the sandbox while preserving its state. diff --git a/sdks/sandbox/python/src/opensandbox/services/egress.py b/sdks/sandbox/python/src/opensandbox/services/egress.py index 89e8a162f..c863bd40f 100644 --- a/sdks/sandbox/python/src/opensandbox/services/egress.py +++ b/sdks/sandbox/python/src/opensandbox/services/egress.py @@ -50,3 +50,17 @@ async def patch_rules(self, rules: list[NetworkRule]) -> None: SandboxException: if the operation fails """ ... + + async def delete_rules(self, targets: list[str]) -> None: + """ + Delete egress rules by target via the sidecar policy API. + + Each entry is a FQDN or wildcard domain. Matching rules are removed + from the currently enforced policy. Targets not present in the policy + are silently ignored (idempotent). The current defaultAction is + preserved. + + Raises: + SandboxException: if the operation fails + """ + ... 
diff --git a/sdks/sandbox/python/src/opensandbox/sync/adapters/egress_adapter.py b/sdks/sandbox/python/src/opensandbox/sync/adapters/egress_adapter.py index bf3e48714..56ffc1971 100644 --- a/sdks/sandbox/python/src/opensandbox/sync/adapters/egress_adapter.py +++ b/sdks/sandbox/python/src/opensandbox/sync/adapters/egress_adapter.py @@ -110,3 +110,16 @@ def patch_rules(self, rules: list[NetworkRule]) -> None: except Exception as e: logger.error("Failed to patch egress policy via endpoint %s", self.endpoint.endpoint, exc_info=e) raise ExceptionConverter.to_sandbox_exception(e) from e + + def delete_rules(self, targets: list[str]) -> None: + try: + from opensandbox.api.egress.api.policy import delete_policy + + response_obj = delete_policy.sync_detailed( + client=self._client, + body=list(targets), + ) + handle_api_error(response_obj, "Delete egress rules") + except Exception as e: + logger.error("Failed to delete egress rules via endpoint %s", self.endpoint.endpoint, exc_info=e) + raise ExceptionConverter.to_sandbox_exception(e) from e diff --git a/sdks/sandbox/python/src/opensandbox/sync/sandbox.py b/sdks/sandbox/python/src/opensandbox/sync/sandbox.py index c236067ff..83cce5919 100644 --- a/sdks/sandbox/python/src/opensandbox/sync/sandbox.py +++ b/sdks/sandbox/python/src/opensandbox/sync/sandbox.py @@ -318,6 +318,17 @@ def patch_egress_rules(self, rules: list[NetworkRule]) -> None: """ self._egress_service.patch_rules(rules) + def delete_egress_rules(self, targets: list[str]) -> None: + """ + Delete egress rules for this sandbox by target. + + Each entry is a FQDN or wildcard domain. Matching rules are removed + from the currently enforced policy. Targets not present in the policy + are silently ignored (idempotent). The current defaultAction is + preserved. + """ + self._egress_service.delete_rules(targets) + def pause(self) -> None: """ Pause the sandbox while preserving its state. 
diff --git a/sdks/sandbox/python/src/opensandbox/sync/services/egress.py b/sdks/sandbox/python/src/opensandbox/sync/services/egress.py index c9d2ec730..71fb5977a 100644 --- a/sdks/sandbox/python/src/opensandbox/sync/services/egress.py +++ b/sdks/sandbox/python/src/opensandbox/sync/services/egress.py @@ -38,3 +38,13 @@ def patch_rules(self, rules: list[NetworkRule]) -> None: preserved. """ ... + + def delete_rules(self, targets: list[str]) -> None: + """Delete egress rules by target via the sidecar policy API. + + Each entry is a FQDN or wildcard domain. Matching rules are removed + from the currently enforced policy. Targets not present in the policy + are silently ignored (idempotent). The current defaultAction is + preserved. + """ + ... diff --git a/specs/README.md b/specs/README.md index c56523504..1d04d34da 100644 --- a/specs/README.md +++ b/specs/README.md @@ -122,6 +122,7 @@ the sandbox endpoint for the egress port and then calling the sidecar endpoint d **Main Endpoints:** - `GET /policy` - Get the current egress policy - `PATCH /policy` - Merge new egress rules into the current policy +- `DELETE /policy` - Remove specific egress rules from the current policy by target ## Technical Features diff --git a/specs/README_zh.md b/specs/README_zh.md index f251cb0da..0f4ef1c6d 100644 --- a/specs/README_zh.md +++ b/specs/README_zh.md @@ -121,6 +121,7 @@ **主要端点:** - `GET /policy` - 获取当前 egress 策略 - `PATCH /policy` - 将新的 egress 规则合并到当前策略 +- `DELETE /policy` - 按 target 删除当前策略中的指定 egress 规则 ## 技术特性 diff --git a/specs/egress-api.yaml b/specs/egress-api.yaml index f7dce4ec5..fba36525f 100644 --- a/specs/egress-api.yaml +++ b/specs/egress-api.yaml @@ -109,6 +109,48 @@ paths: $ref: '#/components/responses/Unauthorized' '500': $ref: '#/components/responses/InternalServerError' + delete: + tags: [Policy] + summary: Delete egress rules + description: | + Remove specific egress rules from the currently enforced policy by target. 
+ + - Accepts a list of target strings (FQDNs or wildcard domains). + - Matching rules are removed; targets not found in the current policy + are silently ignored (idempotent). + requestBody: + required: true + content: + application/json: + schema: + type: array + minItems: 1 + items: + type: string + description: FQDN or wildcard domain to remove from the policy. + example: + - bad.example.com + - "*.blocked.org" + responses: + '200': + description: Rules removed successfully. + content: + application/json: + schema: + $ref: '#/components/schemas/PolicyStatusResponse' + examples: + removed: + summary: Rules removed + value: + status: ok + mode: deny_all + enforcementMode: dns + '400': + $ref: '#/components/responses/BadRequest' + '401': + $ref: '#/components/responses/Unauthorized' + '500': + $ref: '#/components/responses/InternalServerError' components: responses: BadRequest: diff --git a/tests/python/tests/test_sandbox_e2e.py b/tests/python/tests/test_sandbox_e2e.py index 0eb331d7c..3815ed74f 100644 --- a/tests/python/tests/test_sandbox_e2e.py +++ b/tests/python/tests/test_sandbox_e2e.py @@ -462,6 +462,82 @@ async def test_01aa_network_policy_get_and_patch(self): pass await sandbox.close() + @pytest.mark.timeout(180) + @pytest.mark.order(1) + async def test_01ac_network_policy_delete(self): + if is_kubernetes_runtime(): + pytest.skip("Network policy is not covered in the Kubernetes runtime suite") + + logger.info("=" * 80) + logger.info("TEST 1ac: networkPolicy delete (async)") + logger.info("=" * 80) + + cfg = create_connection_config() + sandbox = await Sandbox.create( + image=SandboxImageSpec(get_sandbox_image()), + resource=get_e2e_sandbox_resource(), + connection_config=cfg, + timeout=timedelta(minutes=5), + ready_timeout=timedelta(seconds=30), + network_policy=NetworkPolicy( + defaultAction="deny", + egress=[ + NetworkRule(action="allow", target="pypi.org"), + NetworkRule(action="allow", target="www.github.com"), + ], + ), + ) + try: + await 
asyncio.sleep(5) + + # Baseline: both targets reachable under deny-default policy. + initial_policy = await sandbox.get_egress_policy() + assert initial_policy.egress is not None + assert any(r.target == "pypi.org" and r.action == "allow" for r in initial_policy.egress) + assert any( + r.target == "www.github.com" and r.action == "allow" for r in initial_policy.egress + ) + pypi_ok = await sandbox.commands.run("curl -I https://pypi.org") + assert pypi_ok.error is None + github_ok = await sandbox.commands.run("curl -I https://www.github.com") + assert github_ok.error is None + + # Delete the github allow-rule. Include a non-existent target to + # confirm DELETE is idempotent (no error, silently ignored). + await sandbox.delete_egress_rules(["www.github.com", "nonexistent.example.com"]) + await asyncio.sleep(2) + + deleted_policy = await sandbox.get_egress_policy() + assert deleted_policy.egress is not None + assert not any( + r.target == "www.github.com" for r in deleted_policy.egress + ), "www.github.com rule should be removed" + assert any( + r.target == "pypi.org" and r.action == "allow" for r in deleted_policy.egress + ), "pypi.org rule should remain (other targets untouched)" + assert deleted_policy.default_action == "deny", "defaultAction must be preserved" + + # github now falls under default-deny; pypi still allowed. + github_blocked = await sandbox.commands.run("curl -I https://www.github.com") + assert github_blocked.error is not None + pypi_still_ok = await sandbox.commands.run("curl -I https://pypi.org") + assert pypi_still_ok.error is None + + # Second delete of the same target is a no-op. 
+ await sandbox.delete_egress_rules(["www.github.com"]) + await asyncio.sleep(1) + unchanged_policy = await sandbox.get_egress_policy() + assert unchanged_policy.egress is not None + assert {r.target for r in unchanged_policy.egress} == { + r.target for r in deleted_policy.egress + } + finally: + try: + await sandbox.kill() + except Exception: + pass + await sandbox.close() + @pytest.mark.timeout(240) @pytest.mark.order(1) async def test_01ab_network_policy_get_and_patch_with_server_proxy(self): From 47447fafb9c1d89f9416640c3249d1e4fe8f5ac6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 26 May 2026 04:05:47 +0000 Subject: [PATCH 57/58] chore(deps): bump idna from 3.11 to 3.15 in /cli Bumps [idna](https://github.com/kjd/idna) from 3.11 to 3.15. - [Release notes](https://github.com/kjd/idna/releases) - [Changelog](https://github.com/kjd/idna/blob/master/HISTORY.md) - [Commits](https://github.com/kjd/idna/compare/v3.11...v3.15) --- updated-dependencies: - dependency-name: idna dependency-version: '3.15' dependency-type: indirect ... 
Signed-off-by: dependabot[bot] --- cli/uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/uv.lock b/cli/uv.lock index f957c991f..be0135be6 100644 --- a/cli/uv.lock +++ b/cli/uv.lock @@ -233,11 +233,11 @@ wheels = [ [[package]] name = "idna" -version = "3.11" +version = "3.15" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +sdist = { url = "https://files.pythonhosted.org/packages/82/77/7b3966d0b9d1d31a36ddf1746926a11dface89a83409bf1483f0237aa758/idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc", size = 199245, upload-time = "2026-05-12T22:45:57.011Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, + { url = "https://files.pythonhosted.org/packages/d2/23/408243171aa9aaba178d3e2559159c24c1171a641aa83b67bdd3394ead8e/idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8", size = 72340, upload-time = "2026-05-12T22:45:55.733Z" }, ] [[package]] From e567cc496f9dbdd241ae76fd6bbfc4d4bcbc26e7 Mon Sep 17 00:00:00 2001 From: longsuizhi Date: Thu, 28 May 2026 15:55:00 +0800 Subject: [PATCH 58/58] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=20Pool=20?= =?UTF-8?q?=E6=A8=A1=E5=BC=8F=E4=B8=8B=20Pod=20=E8=A2=AB=E5=88=A0=E9=99=A4?= =?UTF-8?q?=E5=90=8E=20BatchSandbox=20=E4=B8=8D=E4=BC=9A=E9=87=8D=E6=96=B0?= =?UTF-8?q?=E5=88=86=E9=85=8D=E6=96=B0=20Pod=20=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit 当 Pool 中已分配给 BatchSandbox 的 Pod 被外部删除时,alloc-status 注解中仍保留 已删除 Pod 的名称,导致 supplement 计算为 0,无法触发重新分配。 本次修复在 getSandboxRequest 中增加了存活检测:将已删除的 Pod 从有效分配中 排除并加入 ToRelease 队列,使 supplement > 0 从而触发 Pool 重新分配新 Pod。 --- kubernetes/internal/controller/allocator.go | 46 +++++++++++++++++---- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/kubernetes/internal/controller/allocator.go b/kubernetes/internal/controller/allocator.go index 008b14669..436c22242 100644 --- a/kubernetes/internal/controller/allocator.go +++ b/kubernetes/internal/controller/allocator.go @@ -460,6 +460,12 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec return nil, err } + // Build a set of live pool pod names for dead-pod detection during allocation requests. + livePodSet := make(map[string]struct{}, len(spec.Pods)) + for _, p := range spec.Pods { + livePodSet[p.Name] = struct{}{} + } + // Fetch pool allocation once and reuse it for both stale-sandbox cleanup and available-pod filtering. // This avoids a double store read on every reconcile. podAllocation, err := allocator.GetPoolAllocation(ctx, spec.Pool) @@ -473,7 +479,7 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec // handles them without any special-casing outside this function. // Terminating sandboxes are handled inside getSandboxRequest: they receive no new supplement and // all unreleased pods are queued for release. - allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation) + allRequest, err := allocator.getAllRequest(ctx, spec.Sandboxes, podAllocation, livePodSet) if err != nil { return nil, err } @@ -494,13 +500,13 @@ func (allocator *defaultAllocator) Schedule(ctx context.Context, spec *AllocSpec // orphan entries for pods in podAllocation whose sandbox is no longer in the sandboxes list // (e.g. force-deleted). 
Orphan entries carry PodSupplement=0 and ToRelease set to the orphan // pods so the normal recycle path handles them without special-casing in the caller. -func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string) ([]*algorithm.SandboxRequest, error) { +func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes []*sandboxv1alpha1.BatchSandbox, podAllocation map[string]string, livePodSet map[string]struct{}) ([]*algorithm.SandboxRequest, error) { log := logf.FromContext(ctx) existingSandboxes := make(map[string]struct{}, len(sandboxes)) allRequest := make([]*algorithm.SandboxRequest, 0, len(sandboxes)) for _, sandbox := range sandboxes { existingSandboxes[sandbox.Name] = struct{}{} - request, err := allocator.getSandboxRequest(ctx, sandbox) + request, err := allocator.getSandboxRequest(ctx, sandbox, livePodSet) if err != nil { return nil, err } @@ -523,7 +529,7 @@ func (allocator *defaultAllocator) getAllRequest(ctx context.Context, sandboxes return allRequest, nil } -func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox) (*algorithm.SandboxRequest, error) { +func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbox *sandboxv1alpha1.BatchSandbox, livePodSet map[string]struct{}) (*algorithm.SandboxRequest, error) { log := logf.FromContext(ctx) allocated, err := allocator.GetSandboxAllocation(ctx, sandbox) if err != nil { @@ -539,15 +545,37 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo releasedSet[r] = struct{}{} } + // Filter out pods that no longer exist in the pool (e.g. externally deleted). + // Dead pods are treated as released so the sandbox can receive replacement allocations. 
+ liveAllocated := make([]string, 0, len(allocated)) + deadPods := make([]string, 0) + for _, p := range allocated { + if _, exists := releasedSet[p]; exists { + // Already released, keep in allocated for bookkeeping consistency. + liveAllocated = append(liveAllocated, p) + continue + } + if _, alive := livePodSet[p]; alive { + liveAllocated = append(liveAllocated, p) + } else { + deadPods = append(deadPods, p) + } + } + if len(deadPods) > 0 { + log.Info("Detected dead allocated pods, queuing for release to trigger re-allocation", + "sandbox", sandbox.Name, "deadPods", deadPods) + } + // Terminating sandboxes should not receive new allocations. // Queue all unreleased allocated pods for release and set supplement to zero. if !sandbox.DeletionTimestamp.IsZero() { toRelease := make([]string, 0) - for _, p := range allocated { + for _, p := range liveAllocated { if _, ok := releasedSet[p]; !ok { toRelease = append(toRelease, p) } } + toRelease = append(toRelease, deadPods...) if len(toRelease) > 0 { log.Info("Queuing terminating sandbox pods for release", "sandbox", sandbox.Name, "pods", toRelease) } @@ -571,15 +599,19 @@ func (allocator *defaultAllocator) getSandboxRequest(ctx context.Context, sandbo toRelease = append(toRelease, r) } } + // Also queue dead pods for release so their allocation records are cleaned up. + toRelease = append(toRelease, deadPods...) replica := int32(0) if sandbox.Spec.Replicas != nil { replica = *sandbox.Spec.Replicas } + // Use liveAllocated count (excluding dead pods) to compute supplement, + // so deleted pods trigger re-allocation from the pool. supplement := int32(0) - if replica-int32(len(allocated)) > 0 { - supplement = replica - int32(len(allocated)) + if replica-int32(len(liveAllocated)) > 0 { + supplement = replica - int32(len(liveAllocated)) } return &algorithm.SandboxRequest{