From 34a0e3e9257c5ec469da967dbe27fa57e05e179b Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Mon, 1 Jun 2026 17:57:46 -0700 Subject: [PATCH 1/9] fix: flush tunnel logger before agent shutdown The agent-side tunnelLogger forwards log lines over the tunnel asynchronously via a background worker. On shutdown (e.g. a failing lifecycle hook), the agent process exited and cancelled the context before the worker drained the channel, silently dropping the last queued line(s). Add a Flush() that drains the channel and is deferred in the container setup Run path, send with a cancellation-detached context, and flush before Fatal exits. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/agent/container/setup.go | 8 ++ pkg/agent/tunnelserver/logger.go | 135 ++++++++++++++++++------------- 2 files changed, 87 insertions(+), 56 deletions(-) diff --git a/cmd/agent/container/setup.go b/cmd/agent/container/setup.go index 55db5a291d..e9286c1c58 100644 --- a/cmd/agent/container/setup.go +++ b/cmd/agent/container/setup.go @@ -104,6 +104,14 @@ func (cmd *SetupContainerCmd) Run(ctx context.Context) error { return err } + // The tunnel logger forwards messages over the tunnel asynchronously, so + // flush before returning to make sure the final log lines (e.g. the last + // stderr output of a failing lifecycle hook) reach the client before the + // connection is torn down. ctx is still alive while this defer runs. + if f, ok := logger.(interface{ Flush() }); ok { + defer f.Flush() + } + workspaceInfo, setupInfo, err := cmd.parseWorkspaceAndSetupInfo(logger) if err != nil { return err diff --git a/pkg/agent/tunnelserver/logger.go b/pkg/agent/tunnelserver/logger.go index 75c40f2fd8..493c749786 100644 --- a/pkg/agent/tunnelserver/logger.go +++ b/pkg/agent/tunnelserver/logger.go @@ -27,7 +27,7 @@ func NewTunnelLogger(ctx context.Context, client tunnel.TunnelClient, debug bool ctx: ctx, client: client, level: level, - logChan: make(chan *tunnel.LogMessage, 1000), // Buffer size of 1000 messages + logChan: make(chan logEntry, 1000), // Buffer size of 1000 messages } go logger.worker() @@ -35,28 +35,83 @@ func NewTunnelLogger(ctx context.Context, client tunnel.TunnelClient, debug bool return logger } +// logEntry is what flows through the logger channel. It is either a log +// message to forward over the tunnel, or a flush request whose ack channel is +// closed once all preceding messages have been sent. +type logEntry struct { + msg *tunnel.LogMessage + flush chan struct{} +} + type tunnelLogger struct { ctx context.Context level logrus.Level client tunnel.TunnelClient - logChan chan *tunnel.LogMessage + logChan chan logEntry fields logrus.Fields } func (s *tunnelLogger) worker() { for { select { - case msg := <-s.logChan: - ctx, cancel := context.WithTimeout(s.ctx, 5*time.Second) - _, _ = s.client.Log(ctx, msg) - // ignore error since we can't use the logger itself - cancel() + case entry := <-s.logChan: + s.handle(entry) case <-s.ctx.Done(): - return + // Best-effort drain so the final lines aren't lost when the + // agent shuts down before the channel has been emptied. + for { + select { + case entry := <-s.logChan: + s.handle(entry) + default: + return + } + } } } } +// handle forwards a single message over the tunnel, or acknowledges a flush +// request. Sends use a context detached from s.ctx's cancellation so that +// queued messages can still be delivered during shutdown as long as the +// underlying connection is alive. +func (s *tunnelLogger) handle(entry logEntry) { + if entry.flush != nil { + close(entry.flush) + return + } + + ctx, cancel := context.WithTimeout(context.WithoutCancel(s.ctx), 5*time.Second) + _, _ = s.client.Log(ctx, entry.msg) + // ignore error since we can't use the logger itself + cancel() +} + +// Flush blocks until all messages queued before the call have been forwarded +// over the tunnel. It must be called before the process exits (and before +// s.ctx is cancelled) to guarantee the final log lines are delivered. +func (s *tunnelLogger) Flush() { + ack := make(chan struct{}) + select { + case s.logChan <- logEntry{flush: ack}: + case <-s.ctx.Done(): + return + } + + select { + case <-ack: + case <-s.ctx.Done(): + } +} + +// enqueue formats and queues a message for the worker to forward. +func (s *tunnelLogger) enqueue(level tunnel.LogLevel, message string) { + s.logChan <- logEntry{msg: &tunnel.LogMessage{ + LogLevel: level, + Message: s.formatMessage(message), + }} +} + // formatMessage appends structured fields to the message. func (s *tunnelLogger) formatMessage(message string) string { if len(s.fields) == 0 { @@ -75,10 +130,7 @@ func (s *tunnelLogger) Debug(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_DEBUG, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_DEBUG, fmt.Sprintln(args...)) } func (s *tunnelLogger) Debugf(format string, args ...any) { @@ -86,10 +138,7 @@ func (s *tunnelLogger) Debugf(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_DEBUG, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_DEBUG, fmt.Sprintf(format, args...)+"\n") } func (s *tunnelLogger) Info(args ...any) { @@ -97,10 +146,7 @@ func (s *tunnelLogger) Info(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_INFO, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_INFO, fmt.Sprintln(args...)) } func (s *tunnelLogger) Infof(format string, args ...any) { @@ -108,10 +154,7 @@ func (s *tunnelLogger) Infof(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_INFO, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_INFO, fmt.Sprintf(format, args...)+"\n") } func (s *tunnelLogger) Warn(args ...any) { @@ -119,10 +162,7 @@ func (s *tunnelLogger) Warn(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_WARNING, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_WARNING, fmt.Sprintln(args...)) } func (s *tunnelLogger) Warnf(format string, args ...any) { @@ -130,10 +170,7 @@ func (s *tunnelLogger) Warnf(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_WARNING, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_WARNING, fmt.Sprintf(format, args...)+"\n") } func (s *tunnelLogger) Error(args ...any) { @@ -141,10 +178,7 @@ func (s *tunnelLogger) Error(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_ERROR, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_ERROR, fmt.Sprintln(args...)) } func (s *tunnelLogger) Errorf(format string, args ...any) { @@ -152,10 +186,7 @@ func (s *tunnelLogger) Errorf(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_ERROR, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_ERROR, fmt.Sprintf(format, args...)+"\n") } func (s *tunnelLogger) Fatal(args ...any) { @@ -163,11 +194,10 @@ func (s *tunnelLogger) Fatal(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_ERROR, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_ERROR, fmt.Sprintln(args...)) + // make sure the message is delivered before we exit + s.Flush() os.Exit(1) } @@ -176,11 +206,10 @@ func (s *tunnelLogger) Fatalf(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_ERROR, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_ERROR, fmt.Sprintf(format, args...)+"\n") + // make sure the message is delivered before we exit + s.Flush() os.Exit(1) } @@ -189,10 +218,7 @@ func (s *tunnelLogger) Done(args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_DONE, - Message: s.formatMessage(fmt.Sprintln(args...)), - } + s.enqueue(tunnel.LogLevel_DONE, fmt.Sprintln(args...)) } func (s *tunnelLogger) Donef(format string, args ...any) { @@ -200,10 +226,7 @@ func (s *tunnelLogger) Donef(format string, args ...any) { return } - s.logChan <- &tunnel.LogMessage{ - LogLevel: tunnel.LogLevel_DONE, - Message: s.formatMessage(fmt.Sprintf(format, args...) + "\n"), - } + s.enqueue(tunnel.LogLevel_DONE, fmt.Sprintf(format, args...)+"\n") } func (s *tunnelLogger) Print(level logrus.Level, args ...any) { From 10baeeca0cca1f94a00ff7cc7ef1a275523afbc2 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Mon, 1 Jun 2026 18:51:25 -0700 Subject: [PATCH 2/9] fix: satisfy funcorder lint, move unexported logger methods to end Co-Authored-By: Claude Opus 4.8 (1M context) --- pkg/agent/tunnelserver/logger.go | 48 ++++++++++++++++---------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/pkg/agent/tunnelserver/logger.go b/pkg/agent/tunnelserver/logger.go index 493c749786..59deca8521 100644 --- a/pkg/agent/tunnelserver/logger.go +++ b/pkg/agent/tunnelserver/logger.go @@ -71,22 +71,6 @@ func (s *tunnelLogger) worker() { } } -// handle forwards a single message over the tunnel, or acknowledges a flush -// request. Sends use a context detached from s.ctx's cancellation so that -// queued messages can still be delivered during shutdown as long as the -// underlying connection is alive. -func (s *tunnelLogger) handle(entry logEntry) { - if entry.flush != nil { - close(entry.flush) - return - } - - ctx, cancel := context.WithTimeout(context.WithoutCancel(s.ctx), 5*time.Second) - _, _ = s.client.Log(ctx, entry.msg) - // ignore error since we can't use the logger itself - cancel() -} - // Flush blocks until all messages queued before the call have been forwarded // over the tunnel. It must be called before the process exits (and before // s.ctx is cancelled) to guarantee the final log lines are delivered. @@ -104,14 +88,6 @@ func (s *tunnelLogger) Flush() { } } -// enqueue formats and queues a message for the worker to forward. -func (s *tunnelLogger) enqueue(level tunnel.LogLevel, message string) { - s.logChan <- logEntry{msg: &tunnel.LogMessage{ - LogLevel: level, - Message: s.formatMessage(message), - }} -} - // formatMessage appends structured fields to the message. func (s *tunnelLogger) formatMessage(message string) string { if len(s.fields) == 0 { @@ -338,3 +314,27 @@ func (s *tunnelLogger) WithFields(fields logrus.Fields) log.Logger { func (*tunnelLogger) LogrLogSink() logr.LogSink { return nil } + +// enqueue formats and queues a message for the worker to forward. +func (s *tunnelLogger) enqueue(level tunnel.LogLevel, message string) { + s.logChan <- logEntry{msg: &tunnel.LogMessage{ + LogLevel: level, + Message: s.formatMessage(message), + }} +} + +// handle forwards a single message over the tunnel, or acknowledges a flush +// request. Sends use a context detached from s.ctx's cancellation so that +// queued messages can still be delivered during shutdown as long as the +// underlying connection is alive. +func (s *tunnelLogger) handle(entry logEntry) { + if entry.flush != nil { + close(entry.flush) + return + } + + ctx, cancel := context.WithTimeout(context.WithoutCancel(s.ctx), 5*time.Second) + _, _ = s.client.Log(ctx, entry.msg) + // ignore error since we can't use the logger itself + cancel() +} From 7e0944a08d59c8677d516b968f756994a4bbf230 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Mon, 1 Jun 2026 19:22:39 -0700 Subject: [PATCH 3/9] refactor: use named tunnelserver.Flusher interface for log flush Replace the inline anonymous interface{ Flush() } assertion with a named Flusher interface, plus a compile-time assertion that *tunnelLogger implements it, so the contract is explicit and rename-safe. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/agent/container/setup.go | 2 +- pkg/agent/tunnelserver/logger.go | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/cmd/agent/container/setup.go b/cmd/agent/container/setup.go index e9286c1c58..8e3d905831 100644 --- a/cmd/agent/container/setup.go +++ b/cmd/agent/container/setup.go @@ -108,7 +108,7 @@ func (cmd *SetupContainerCmd) Run(ctx context.Context) error { // flush before returning to make sure the final log lines (e.g. the last // stderr output of a failing lifecycle hook) reach the client before the // connection is torn down. ctx is still alive while this defer runs. - if f, ok := logger.(interface{ Flush() }); ok { + if f, ok := logger.(tunnelserver.Flusher); ok { defer f.Flush() } diff --git a/pkg/agent/tunnelserver/logger.go b/pkg/agent/tunnelserver/logger.go index 59deca8521..f386fc6767 100644 --- a/pkg/agent/tunnelserver/logger.go +++ b/pkg/agent/tunnelserver/logger.go @@ -35,6 +35,15 @@ func NewTunnelLogger(ctx context.Context, client tunnel.TunnelClient, debug bool return logger } +// Flusher is implemented by loggers that buffer output and can block until +// everything queued so far has been delivered. NewTunnelLogger returns a value +// satisfying this interface. +type Flusher interface { + Flush() +} + +var _ Flusher = (*tunnelLogger)(nil) + // logEntry is what flows through the logger channel. It is either a log // message to forward over the tunnel, or a flush request whose ack channel is // closed once all preceding messages have been sent. From b2a189a44cb9e24e7c8c18eb46096bb2168afc4f Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 09:46:37 -0700 Subject: [PATCH 4/9] test(e2e): guard against dropped final lifecycle hook log line Add an up-handle-errors e2e spec: a postCreateCommand prints a unique marker to stderr then exits non-zero. The agent forwards lifecycle hook output over the tunnel asynchronously, so on failure it must flush the final queued line before tearing down. Asserts devpod up fails and the marker still appears in the captured log output. Regression guard for the bug fixed in this PR. Co-Authored-By: Claude Opus 4.8 (1M context) --- e2e/tests/up/handles_errors.go | 35 +++++++++++++++++++ .../.devcontainer.json | 5 +++ 2 files changed, 40 insertions(+) create mode 100644 e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json diff --git a/e2e/tests/up/handles_errors.go b/e2e/tests/up/handles_errors.go index 575acf583a..5a7577cb31 100644 --- a/e2e/tests/up/handles_errors.go +++ b/e2e/tests/up/handles_errors.go @@ -63,6 +63,41 @@ var _ = ginkgo.Describe( ginkgo.SpecTimeout(framework.GetTimeout()), ) + ginkgo.It( + "forwards the final lifecycle hook log line before the agent exits", + func(ctx context.Context) { + f, err := setupDockerProvider(initialDir+"/bin", "docker") + framework.ExpectNoError(err) + ginkgo.DeferCleanup(func(cleanupCtx context.Context) { + _ = f.DevPodProviderDelete(cleanupCtx, "docker") + }) + + tempDir, err := framework.CopyToTempDir( + "tests/up/testdata/docker-lifecycle-stderr", + ) + framework.ExpectNoError(err) + ginkgo.DeferCleanup(framework.CleanupTempDir, initialDir, tempDir) + ginkgo.DeferCleanup(func(cleanupCtx context.Context) { + _ = f.DevPodWorkspaceDelete(cleanupCtx, tempDir, "--force") + }) + + // The postCreateCommand prints a marker to stderr and exits + // non-zero. The agent forwards lifecycle hook output over the + // tunnel asynchronously, so on failure it must flush the final + // queued line before tearing down. Regression guard for the + // dropped-last-line bug. + stdout, stderr, err := f.DevPodUpStreams(ctx, tempDir, "--log-output=json") + framework.ExpectError(err, "expected lifecycle hook failure") + framework.ExpectNoError( + findMessage( + strings.NewReader(stdout+stderr), + "DEVPOD_LIFECYCLE_FLUSH_MARKER", + ), + ) + }, + ginkgo.SpecTimeout(framework.GetTimeout()), + ) + ginkgo.It( "ensure workspace cleanup when failing to create a workspace", func(ctx context.Context) { diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json new file mode 100644 index 0000000000..bd119fd4e5 --- /dev/null +++ b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json @@ -0,0 +1,5 @@ +{ + "name": "lifecycle-stderr", + "image": "mcr.microsoft.com/devcontainers/go:1", + "postCreateCommand": "echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2; exit 1" +} From 40deba7fe87f227417093392cd4323f6111edb97 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 11:07:25 -0700 Subject: [PATCH 5/9] test(e2e): emit a burst before the marker so the drop reproduces A lone final log line drains before teardown (the async worker has ~ms to send it), so the original single-line fixture passed even without the flush fix. Emit a burst of stderr lines ending with the marker so the sender is still busy when the hook fails, making the dropped-last-line reliably reproduce without the fix. Co-Authored-By: Claude Opus 4.8 (1M context) --- e2e/tests/up/handles_errors.go | 13 ++++++++----- .../docker-lifecycle-stderr/.devcontainer.json | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/e2e/tests/up/handles_errors.go b/e2e/tests/up/handles_errors.go index 5a7577cb31..fa0581afb7 100644 --- a/e2e/tests/up/handles_errors.go +++ b/e2e/tests/up/handles_errors.go @@ -81,11 +81,14 @@ var _ = ginkgo.Describe( _ = f.DevPodWorkspaceDelete(cleanupCtx, tempDir, "--force") }) - // The postCreateCommand prints a marker to stderr and exits - // non-zero. The agent forwards lifecycle hook output over the - // tunnel asynchronously, so on failure it must flush the final - // queued line before tearing down. Regression guard for the - // dropped-last-line bug. + // The postCreateCommand prints a burst of stderr lines ending + // with a marker, then exits non-zero. The agent forwards + // lifecycle hook output over the tunnel asynchronously; the + // burst keeps the sender busy so the final marker is still + // queued when the hook fails, and the agent must flush it + // before tearing down. Without the flush the marker is dropped + // (a lone final line drains in time and would not reproduce the + // bug). Regression guard for the dropped-last-line bug. stdout, stderr, err := f.DevPodUpStreams(ctx, tempDir, "--log-output=json") framework.ExpectError(err, "expected lifecycle hook failure") framework.ExpectNoError( diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json index bd119fd4e5..de2a54bf6e 100644 --- a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json +++ b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json @@ -1,5 +1,5 @@ { "name": "lifecycle-stderr", "image": "mcr.microsoft.com/devcontainers/go:1", - "postCreateCommand": "echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2; exit 1" + "postCreateCommand": "i=0; while [ $i -lt 300 ]; do echo \"filler line $i\" >&2; i=$((i+1)); done; echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2; exit 1" } From 1672e8bfd3917da2d48b6b0decd0d82d866809a7 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 11:30:16 -0700 Subject: [PATCH 6/9] test(e2e): move lifecycle hook to a script to avoid marker leak devpod's "failed to run " error echoes the command verbatim, so an inline marker matched the error text even when the hook's output line was actually dropped (false positive). Move the burst+marker into .devcontainer/postCreateCommand.sh so the command echo contains only the script path; the marker can now only match a genuinely forwarded line. Co-Authored-By: Claude Opus 4.8 (1M context) --- e2e/tests/up/handles_errors.go | 18 +++++++++------- .../.devcontainer.json | 5 ----- .../.devcontainer/devcontainer.json | 5 +++++ .../.devcontainer/postCreateCommand.sh | 21 +++++++++++++++++++ 4 files changed, 36 insertions(+), 13 deletions(-) delete mode 100644 e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json create mode 100644 e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/devcontainer.json create mode 100644 e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh diff --git a/e2e/tests/up/handles_errors.go b/e2e/tests/up/handles_errors.go index fa0581afb7..096f1d21bf 100644 --- a/e2e/tests/up/handles_errors.go +++ b/e2e/tests/up/handles_errors.go @@ -81,14 +81,16 @@ var _ = ginkgo.Describe( _ = f.DevPodWorkspaceDelete(cleanupCtx, tempDir, "--force") }) - // The postCreateCommand prints a burst of stderr lines ending - // with a marker, then exits non-zero. The agent forwards - // lifecycle hook output over the tunnel asynchronously; the - // burst keeps the sender busy so the final marker is still - // queued when the hook fails, and the agent must flush it - // before tearing down. Without the flush the marker is dropped - // (a lone final line drains in time and would not reproduce the - // bug). Regression guard for the dropped-last-line bug. + // The postCreateCommand runs a script that prints a burst of + // stderr lines ending with a marker, then exits non-zero. The + // agent forwards lifecycle hook output over the tunnel + // asynchronously; the burst keeps the sender busy so the final + // marker is still queued when the hook fails, and the agent + // must flush it before tearing down. Without the flush the + // marker is dropped. The marker lives in the script (not the + // inline command) so it cannot leak into devpod's "failed to + // run " error and cause a false positive. Regression + // guard for the dropped-last-line bug. stdout, stderr, err := f.DevPodUpStreams(ctx, tempDir, "--log-output=json") framework.ExpectError(err, "expected lifecycle hook failure") framework.ExpectNoError( diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json deleted file mode 100644 index de2a54bf6e..0000000000 --- a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "lifecycle-stderr", - "image": "mcr.microsoft.com/devcontainers/go:1", - "postCreateCommand": "i=0; while [ $i -lt 300 ]; do echo \"filler line $i\" >&2; i=$((i+1)); done; echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2; exit 1" -} diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/devcontainer.json b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/devcontainer.json new file mode 100644 index 0000000000..2820040787 --- /dev/null +++ b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/devcontainer.json @@ -0,0 +1,5 @@ +{ + "name": "lifecycle-stderr", + "image": "mcr.microsoft.com/devcontainers/go:1", + "postCreateCommand": "sh .devcontainer/postCreateCommand.sh" +} diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh new file mode 100644 index 0000000000..c49990d697 --- /dev/null +++ b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# Regression fixture for the dropped-last-line bug (see the e2e spec +# "forwards the final lifecycle hook log line before the agent exits"). +# +# Emit a burst of stderr lines ending with a marker, then fail. The agent +# forwards lifecycle hook output over the tunnel asynchronously; the burst +# keeps the sender busy so the marker is still queued when the hook exits +# non-zero. Without the flush-on-shutdown fix the agent tears down before the +# marker is sent and it is dropped (the observed drain window is only ~11 +# lines, so the 300-line burst leaves a wide margin). +# +# The marker lives in this script rather than the inline postCreateCommand so +# it cannot leak into devpod's "failed to run " error message and +# produce a false positive. +i=0 +while [ "$i" -lt 300 ]; do + echo "filler line $i" >&2 + i=$((i + 1)) +done +echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2 +exit 1 From 8c3727536347799fd0277766954cfd1733bab9e0 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 13:13:25 -0700 Subject: [PATCH 7/9] fix: gate logger Flush on worker completion, not ctx cancellation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flush previously short-circuited on s.ctx.Done(), so once the context was cancelled it could skip enqueueing the flush marker or return before the worker drained it — defeating the barrier on exactly the shutdown path it guards (deferred flush in SetupContainerCmd.Run, Fatal/Fatalf). Add a done channel closed when worker() exits and wait on that instead. This keeps Flush a real barrier under cancellation while still avoiding a deadlock when the worker has already stopped. --- pkg/agent/tunnelserver/logger.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pkg/agent/tunnelserver/logger.go b/pkg/agent/tunnelserver/logger.go index f386fc6767..e105633f24 100644 --- a/pkg/agent/tunnelserver/logger.go +++ b/pkg/agent/tunnelserver/logger.go @@ -28,6 +28,7 @@ func NewTunnelLogger(ctx context.Context, client tunnel.TunnelClient, debug bool client: client, level: level, logChan: make(chan logEntry, 1000), // Buffer size of 1000 messages + done: make(chan struct{}), } go logger.worker() @@ -57,10 +58,12 @@ type tunnelLogger struct { level logrus.Level client tunnel.TunnelClient logChan chan logEntry + done chan struct{} // closed when worker() exits fields logrus.Fields } func (s *tunnelLogger) worker() { + defer close(s.done) for { select { case entry := <-s.logChan: @@ -81,19 +84,21 @@ func (s *tunnelLogger) worker() { } // Flush blocks until all messages queued before the call have been forwarded -// over the tunnel. It must be called before the process exits (and before -// s.ctx is cancelled) to guarantee the final log lines are delivered. +// over the tunnel, or until the worker has stopped. It gates on worker +// completion rather than s.ctx so it stays a real barrier even when s.ctx is +// already cancelled (e.g. the deferred flush on the shutdown path, or +// Fatal/Fatalf). func (s *tunnelLogger) Flush() { ack := make(chan struct{}) select { case s.logChan <- logEntry{flush: ack}: - case <-s.ctx.Done(): + case <-s.done: return } select { case <-ack: - case <-s.ctx.Done(): + case <-s.done: } } @@ -316,6 +321,7 @@ func (s *tunnelLogger) WithFields(fields logrus.Fields) log.Logger { client: s.client, level: s.level, logChan: s.logChan, + done: s.done, fields: newFields, } } From 25ef54c0c87bca313498ae7bf0a65e92d8e17a61 Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 13:13:54 -0700 Subject: [PATCH 8/9] fix: bound logger Flush by a single overall deadline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handle() applies a 5s timeout per message, and Flush waits for the whole queue to drain, so a stalled-but-open tunnel with a full buffer could block shutdown for buffer-size × 5s (~5000s). Cap the entire flush with one 30s deadline; the worker (and the process) tear down afterward. --- pkg/agent/tunnelserver/logger.go | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pkg/agent/tunnelserver/logger.go b/pkg/agent/tunnelserver/logger.go index e105633f24..7edcd3e778 100644 --- a/pkg/agent/tunnelserver/logger.go +++ b/pkg/agent/tunnelserver/logger.go @@ -83,22 +83,34 @@ func (s *tunnelLogger) worker() { } } +// flushTimeout caps the entire drain. handle() applies a per-message timeout, +// so without an overall bound a backlog on a stalled tunnel could block +// shutdown for buffer-size × per-message timeout. The cap is generous so a +// healthy drain of a full backlog always completes. +const flushTimeout = 30 * time.Second + // Flush blocks until all messages queued before the call have been forwarded -// over the tunnel, or until the worker has stopped. It gates on worker -// completion rather than s.ctx so it stays a real barrier even when s.ctx is -// already cancelled (e.g. the deferred flush on the shutdown path, or -// Fatal/Fatalf). +// over the tunnel, until the worker has stopped, or until flushTimeout elapses. +// It gates on worker completion rather than s.ctx so it stays a real barrier +// even when s.ctx is already cancelled (e.g. the deferred flush on the shutdown +// path, or Fatal/Fatalf). func (s *tunnelLogger) Flush() { ack := make(chan struct{}) + timer := time.NewTimer(flushTimeout) + defer timer.Stop() + select { case s.logChan <- logEntry{flush: ack}: case <-s.done: return + case <-timer.C: + return } select { case <-ack: case <-s.done: + case <-timer.C: } } From d25ac668e764706c5a518e6e9a5089541c6c022e Mon Sep 17 00:00:00 2001 From: Terence Ho Date: Tue, 2 Jun 2026 13:24:42 -0700 Subject: [PATCH 9/9] style: shfmt postCreateCommand.sh fixture (4-space indent) --- .../.devcontainer/postCreateCommand.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh index c49990d697..4918cb2eaa 100644 --- a/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh +++ b/e2e/tests/up/testdata/docker-lifecycle-stderr/.devcontainer/postCreateCommand.sh @@ -14,8 +14,8 @@ # produce a false positive. i=0 while [ "$i" -lt 300 ]; do - echo "filler line $i" >&2 - i=$((i + 1)) + echo "filler line $i" >&2 + i=$((i + 1)) done echo DEVPOD_LIFECYCLE_FLUSH_MARKER >&2 exit 1