From 388007c7fadc3112d0ba563bd39a6e07277c77a1 Mon Sep 17 00:00:00 2001 From: Alice-space Date: Thu, 14 May 2026 18:08:39 +0800 Subject: [PATCH 1/2] refactor(llm): replace immediate SIGKILL with graceful subprocess shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce shared.WaitOrKill() that escalates gracefully: wait for natural exit after stdin close → SIGTERM → SIGKILL, each with a 10s grace period. Replace all direct Kill()+Wait() calls across claude_stream_driver, jsonrpc_line_client, and opencode_appserver_driver. Also fix a nil-Process bug in jsonrpc_line_client.Close() that would call cmd.Wait() on a never-started command. Co-Authored-By: Claude Opus 4.7 --- internal/llm/claude_stream_driver.go | 6 ++-- internal/llm/internal/shared/shared.go | 42 +++++++++++++++++++++-- internal/llm/jsonrpc_line_client.go | 4 +-- internal/llm/opencode_appserver_driver.go | 15 +++----- 4 files changed, 49 insertions(+), 18 deletions(-) diff --git a/internal/llm/claude_stream_driver.go b/internal/llm/claude_stream_driver.go index 9817a090..220dde4c 100644 --- a/internal/llm/claude_stream_driver.go +++ b/internal/llm/claude_stream_driver.go @@ -106,8 +106,7 @@ func (d *claudeStreamDriver) Close() error { _ = stdin.Close() } if cmd != nil && cmd.Process != nil { - _ = cmd.Process.Kill() - err = cmd.Wait() + err = shared.WaitOrKill(cmd) } close(d.events) }) @@ -159,8 +158,7 @@ func (d *claudeStreamDriver) ensureStarted(ctx context.Context, req RunRequest) if d.closed { d.mu.Unlock() _ = stdin.Close() - _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = shared.WaitOrKill(cmd) return ErrInteractiveClosed } d.cmd = cmd diff --git a/internal/llm/internal/shared/shared.go b/internal/llm/internal/shared/shared.go index effc575d..43d3abf1 100644 --- a/internal/llm/internal/shared/shared.go +++ b/internal/llm/internal/shared/shared.go @@ -1,7 +1,11 @@ -// Package shared provides scanner buffer constants used across LLM providers. +// Package shared provides utilities and constants used across LLM providers. package shared -import "time" +import ( + "os/exec" + "syscall" + "time" +) // Default scanner buffer and token size constants used across providers. const ( @@ -12,3 +16,37 @@ const ( // AuthCheckTimeout is the default timeout for CLI login/auth status checks. const AuthCheckTimeout = 15 * time.Second + +const ( + // SubprocessGracePeriod is how long to wait for a subprocess to exit + // naturally after its stdin is closed before escalating to SIGTERM. + SubprocessGracePeriod = 10 * time.Second + // SubprocessTermGracePeriod is how long to wait after SIGTERM before + // escalating to SIGKILL. + SubprocessTermGracePeriod = 10 * time.Second +) + +// WaitOrKill closes the subprocess gracefully: wait for natural exit after +// stdin close, then SIGTERM, then SIGKILL as last resort. +func WaitOrKill(cmd *exec.Cmd) error { + done := make(chan error, 1) + go func() { + done <- cmd.Wait() + }() + select { + case err := <-done: + return err + case <-time.After(SubprocessGracePeriod): + } + if err := cmd.Process.Signal(syscall.SIGTERM); err != nil { + _ = cmd.Process.Kill() + return <-done + } + select { + case err := <-done: + return err + case <-time.After(SubprocessTermGracePeriod): + } + _ = cmd.Process.Kill() + return <-done +} diff --git a/internal/llm/jsonrpc_line_client.go b/internal/llm/jsonrpc_line_client.go index c6a97aa8..0749e4f6 100644 --- a/internal/llm/jsonrpc_line_client.go +++ b/internal/llm/jsonrpc_line_client.go @@ -176,9 +176,9 @@ func (c *lineRPCClient) Close() error { } _ = c.stdin.Close() if c.cmd.Process != nil { - _ = c.cmd.Process.Kill() + return shared.WaitOrKill(c.cmd) } - return c.cmd.Wait() + return nil } func (c *lineRPCClient) write(payload any) error { diff --git a/internal/llm/opencode_appserver_driver.go b/internal/llm/opencode_appserver_driver.go index f03cf949..a18dd10f 100644 --- a/internal/llm/opencode_appserver_driver.go +++ b/internal/llm/opencode_appserver_driver.go @@ -117,8 +117,7 @@ func (d *openCodeAppServerDriver) Close() error { d.eventCancel = nil d.mu.Unlock() if cmd != nil && cmd.Process != nil { - _ = cmd.Process.Kill() - err = cmd.Wait() + err = shared.WaitOrKill(cmd) } if cancelEvents != nil { cancelEvents() @@ -176,15 +175,13 @@ func (d *openCodeAppServerDriver) ensureServer(ctx context.Context, req RunReque select { case serverURL := <-urlCh: if serverURL == "" { - _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = shared.WaitOrKill(cmd) return fmt.Errorf("opencode serve exited before reporting URL: %s", strings.TrimSpace(d.stderr.String())) } d.mu.Lock() if d.closed { d.mu.Unlock() - _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = shared.WaitOrKill(cmd) return ErrInteractiveClosed } d.cmd = cmd @@ -193,8 +190,7 @@ func (d *openCodeAppServerDriver) ensureServer(ctx context.Context, req RunReque d.ensureEventStream() return nil case <-ctx.Done(): - _ = cmd.Process.Kill() - _ = cmd.Wait() + _ = shared.WaitOrKill(cmd) return ctx.Err() } } @@ -710,8 +706,7 @@ func (d *openCodeAppServerDriver) resetServerForNextRequest() { d.eventCancel = nil } if d.cmd != nil && d.cmd.Process != nil { - _ = d.cmd.Process.Kill() - _ = d.cmd.Wait() + _ = shared.WaitOrKill(d.cmd) } d.cmd = nil } From f32812abd15465be82b56f02eaeec14fab4ecdac Mon Sep 17 00:00:00 2001 From: Alice-space Date: Thu, 14 May 2026 18:23:07 +0800 Subject: [PATCH 2/2] fix(llm): fix race in resetServerForNextRequest and add WaitOrKill tests resetServerForNextRequest was holding d.mu while calling WaitOrKill which can block up to 20s. Release the lock before waiting so new requests are not blocked. Change SubprocessGracePeriod/SubprocessTermGracePeriod from const to var for testability, and add unit tests covering normal exit, non-zero exit, already-exited, SIGTERM escalation, and SIGTERM acceptance. Co-Authored-By: Claude Opus 4.7 --- internal/llm/internal/shared/shared.go | 2 +- internal/llm/internal/shared/shared_test.go | 130 ++++++++++++++++++++ internal/llm/opencode_appserver_driver.go | 10 +- 3 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 internal/llm/internal/shared/shared_test.go diff --git a/internal/llm/internal/shared/shared.go b/internal/llm/internal/shared/shared.go index 43d3abf1..03efb788 100644 --- a/internal/llm/internal/shared/shared.go +++ b/internal/llm/internal/shared/shared.go @@ -17,7 +17,7 @@ const ( // AuthCheckTimeout is the default timeout for CLI login/auth status checks. const AuthCheckTimeout = 15 * time.Second -const ( +var ( // SubprocessGracePeriod is how long to wait for a subprocess to exit // naturally after its stdin is closed before escalating to SIGTERM. SubprocessGracePeriod = 10 * time.Second diff --git a/internal/llm/internal/shared/shared_test.go b/internal/llm/internal/shared/shared_test.go new file mode 100644 index 00000000..e0059c4c --- /dev/null +++ b/internal/llm/internal/shared/shared_test.go @@ -0,0 +1,130 @@ +package shared + +import ( + "os/exec" + "syscall" + "testing" + "time" +) + +func TestWaitOrKill_NormalExit(t *testing.T) { + cmd := exec.Command("true") + if err := cmd.Start(); err != nil { + t.Fatalf("start: %v", err) + } + if err := WaitOrKill(cmd); err != nil { + t.Fatalf("expected nil for exit 0, got %v", err) + } +} + +func TestWaitOrKill_NonZeroExit(t *testing.T) { + cmd := exec.Command("sh", "-c", "exit 42") + if err := cmd.Start(); err != nil { + t.Fatalf("start: %v", err) + } + err := WaitOrKill(cmd) + if err == nil { + t.Fatal("expected non-nil error for exit 42") + } + exitErr, ok := err.(*exec.ExitError) + if !ok { + t.Fatalf("expected *exec.ExitError, got %T: %v", err, err) + } + if status := exitErr.Sys().(syscall.WaitStatus); status.ExitStatus() != 42 { + t.Fatalf("expected exit code 42, got %d", status.ExitStatus()) + } +} + +func TestWaitOrKill_AlreadyExited(t *testing.T) { + cmd := exec.Command("true") + if err := cmd.Run(); err != nil { + t.Fatalf("run: %v", err) + } + // WaitOrKill on already-waited process should return an error + // because cmd.Wait() can only be called once. + err := WaitOrKill(cmd) + if err == nil { + t.Fatal("expected error for already-waited process") + } +} + +func TestWaitOrKill_SIGTERM_Escalation(t *testing.T) { + // Restore original durations after test. + oldGrace := SubprocessGracePeriod + oldTerm := SubprocessTermGracePeriod + SubprocessGracePeriod = 100 * time.Millisecond + SubprocessTermGracePeriod = 100 * time.Millisecond + defer func() { + SubprocessGracePeriod = oldGrace + SubprocessTermGracePeriod = oldTerm + }() + + // Start a process that ignores SIGTERM. + // We use a shell script that traps SIGTERM and sleeps. + cmd := exec.Command("sh", "-c", "trap '' TERM; sleep 60") + if err := cmd.Start(); err != nil { + t.Fatalf("start: %v", err) + } + + start := time.Now() + err := WaitOrKill(cmd) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected non-nil error from killed process") + } + + // Should have been killed after both grace periods expire + // (wait 100ms + SIGTERM 100ms ≈ 200ms, then SIGKILL). + // Allow some slack for scheduling. + if elapsed < 150*time.Millisecond { + t.Fatalf("expected at least ~200ms before kill, took %v", elapsed) + } + if elapsed > 2*time.Second { + t.Fatalf("kill took too long: %v", elapsed) + } + + exitErr, ok := err.(*exec.ExitError) + if !ok { + t.Fatalf("expected *exec.ExitError, got %T: %v", err, err) + } + status := exitErr.Sys().(syscall.WaitStatus) + if !status.Signaled() { + t.Fatalf("expected process killed by signal, got status %v", status) + } +} + +func TestWaitOrKill_SIGTERM_Accepted(t *testing.T) { + oldGrace := SubprocessGracePeriod + oldTerm := SubprocessTermGracePeriod + SubprocessGracePeriod = 100 * time.Millisecond + SubprocessTermGracePeriod = 100 * time.Millisecond + defer func() { + SubprocessGracePeriod = oldGrace + SubprocessTermGracePeriod = oldTerm + }() + + // Start a process that accepts SIGTERM (the default). + cmd := exec.Command("sleep", "60") + if err := cmd.Start(); err != nil { + t.Fatalf("start: %v", err) + } + + err := WaitOrKill(cmd) + if err == nil { + t.Fatal("expected non-nil error from signaled process") + } + + exitErr, ok := err.(*exec.ExitError) + if !ok { + t.Fatalf("expected *exec.ExitError, got %T: %v", err, err) + } + status := exitErr.Sys().(syscall.WaitStatus) + if !status.Signaled() { + t.Fatalf("expected process killed by signal, got status %v", status) + } + // SIGTERM (15) is the expected signal. + if status.Signal() != syscall.SIGTERM { + t.Fatalf("expected SIGTERM (15), got signal %d", status.Signal()) + } +} diff --git a/internal/llm/opencode_appserver_driver.go b/internal/llm/opencode_appserver_driver.go index a18dd10f..20113dd5 100644 --- a/internal/llm/opencode_appserver_driver.go +++ b/internal/llm/opencode_appserver_driver.go @@ -694,7 +694,6 @@ func (d *openCodeAppServerDriver) hasOpenCodeAssistantText() bool { func (d *openCodeAppServerDriver) resetServerForNextRequest() { d.mu.Lock() - defer d.mu.Unlock() d.baseURL = "" d.sessionID = "" d.activeID = "" @@ -705,10 +704,13 @@ func (d *openCodeAppServerDriver) resetServerForNextRequest() { d.eventCancel() d.eventCancel = nil } - if d.cmd != nil && d.cmd.Process != nil { - _ = shared.WaitOrKill(d.cmd) - } + cmd := d.cmd d.cmd = nil + d.mu.Unlock() + + if cmd != nil && cmd.Process != nil { + _ = shared.WaitOrKill(cmd) + } } func (d *openCodeAppServerDriver) emit(event TurnEvent) {