From 94712f580daede48dc4e7774a938baf449305b81 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 16:17:26 +0900 Subject: [PATCH 01/12] feat: surface prompt-processing state and unblock TUI startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously users saw a frozen UI during two separate silent gaps: 1. Between hitting Enter and the LLM stream starting — during local prep (onBeforeTurn, measureUsage, compaction) there was no spinner at all. 2. On first launch the startup token probe blocked the input loop for 200ms–3s while it measured system-prompt + tool-schema token overhead. This change: - Adds onStreamStart and onFirstStreamPart hooks to the harness LoopHooks so any agent runtime can signal the exact prompt-processing window. - In the TUI, shows a 'Processing...' loader during turn preparation and transitions to 'Working...' once the LLM request is in flight (cleared on the first visible stream part). - In headless, emits a 'turn-start' lifecycle annotation and a matching onStreamStart callback; extends the TrajectoryEvent union accordingly. - Runs the TUI startup measureUsage probe fire-and-forget so the editor accepts input immediately; the context-usage footer starts on the estimated value and quietly upgrades to the real one when ready. Documents the new turn-start event in the headless AGENTS.md / README. --- .changeset/prompt-processing-indicators.md | 7 +++++++ packages/harness/src/loop.ts | 11 +++++++++++ packages/harness/src/types.ts | 18 ++++++++++++++++++ packages/headless/AGENTS.md | 6 ++++++ packages/headless/README.md | 12 ++++++++++++ packages/headless/src/runner.ts | 7 +++++++ packages/headless/src/types.ts | 18 +++++++++++++++++- packages/tui/src/agent-tui.ts | 22 +++++++++++++++++++--- 8 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 .changeset/prompt-processing-indicators.md diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md new file mode 100644 index 0000000..a839ae7 --- /dev/null +++ b/.changeset/prompt-processing-indicators.md @@ -0,0 +1,7 @@ +--- +"@ai-sdk-tool/harness": patch +"@ai-sdk-tool/tui": patch +"@ai-sdk-tool/headless": patch +--- + +Surface the "prompt processing" state that previously looked frozen. The harness loop now exposes `onStreamStart` and `onFirstStreamPart` hooks around the `agent.stream()` call site, the TUI shows a `Processing...` loader during turn preparation and switches to `Working...` once the LLM request is in flight, and headless emits a `turn-start` trajectory annotation alongside a matching `onStreamStart` callback so any agent runtime can signal activity before the first chunk arrives. The TUI startup token probe also runs non-blocking (fire-and-forget) so the editor accepts input immediately on launch; the context-usage footer starts from the estimated count and quietly upgrades to the real value once the probe resolves. diff --git a/packages/harness/src/loop.ts b/packages/harness/src/loop.ts index f274d97..4c12fc0 100644 --- a/packages/harness/src/loop.ts +++ b/packages/harness/src/loop.ts @@ -44,8 +44,10 @@ export async function runAgentLoop( agent, abortSignal, onError, + onFirstStreamPart, onInterrupt, onStepComplete, + onStreamStart, onToolCall, onToolLifecycle, } = options; @@ -98,7 +100,16 @@ export async function runAgentLoop( const stream = agent.stream(streamOptions); + await onStreamStart?.(context); + + let firstPartSeen = false; + for await (const part of stream.fullStream) { + if (!firstPartSeen) { + firstPartSeen = true; + await onFirstStreamPart?.(context); + } + const lifecycle = getToolLifecycleState( part as { toolCallId?: string; toolName?: string; type: string } ); diff --git a/packages/harness/src/types.ts b/packages/harness/src/types.ts index c018ba9..9cf0c65 100644 --- a/packages/harness/src/types.ts +++ b/packages/harness/src/types.ts @@ -158,6 +158,15 @@ export interface LoopHooks { | Promise< { shouldContinue?: boolean; recovery?: ModelMessage[] } | undefined >; + /** + * Fires exactly once per loop iteration when the first part of any kind + * arrives from `stream.fullStream`. Consumers can use this to clear a + * prompt-processing indicator the moment any byte of output begins + * arriving. Note: the SDK may emit invisible framing parts (`start`, + * `text-start`, …) before user-facing content; consumers that want to + * wait for *visible* output should filter on part type themselves. + */ + onFirstStreamPart?: (context: LoopContinueContext) => void | Promise; onInterrupt?: ( interruption: { iteration: number; @@ -169,6 +178,15 @@ export interface LoopHooks { context: LoopContinueContext ) => BeforeTurnResult | Promise | undefined; onStepComplete?: (step: LoopStepInfo) => void | Promise; + /** + * Fires immediately after {@link Agent.stream} is invoked and before the + * `fullStream` iteration begins. This is the closest hook to "LLM request + * sent, waiting for first chunk" — intended for consumers that need to + * display a loading indicator during the prompt-processing latency gap. + * + * The callback is awaited before iteration starts, so keep it fast. + */ + onStreamStart?: (context: LoopContinueContext) => void | Promise; onToolCall?: ( call: ToolCallPart, context: LoopContinueContext diff --git a/packages/headless/AGENTS.md b/packages/headless/AGENTS.md index e3733de..e0253f5 100644 --- a/packages/headless/AGENTS.md +++ b/packages/headless/AGENTS.md @@ -31,6 +31,7 @@ Every event is a JSON object on its own line. Events conform to the ATIF-v1.6 sp | `compaction` | system | Lifecycle event for history compaction | | `error` | system | Fatal or iteration-limit error | | `interrupt` | system | Intentional caller interruption (`caller-abort`) | +| `turn-start` | system | Lifecycle annotation emitted right after `agent.stream()` is invoked, before the first chunk arrives | ### Examples @@ -77,6 +78,11 @@ Every event is a JSON object on its own line. Events conform to the ATIF-v1.6 sp {"type":"error","timestamp":"2026-04-03T10:00:20.000Z","error":"Max iterations (50) reached"} ``` +**TurnStartEvent**: +```json +{"type":"turn-start","timestamp":"2026-04-03T10:00:04.500Z","phase":"new-turn"} +``` + ## KEY EXPORTS ### `runHeadless(config: HeadlessRunnerConfig): Promise` diff --git a/packages/headless/README.md b/packages/headless/README.md index 33787d9..567876e 100644 --- a/packages/headless/README.md +++ b/packages/headless/README.md @@ -182,6 +182,7 @@ Headless output now follows the **ATIF-v1.6** protocol documented in `packages/h | `compaction` | system | Lifecycle event for history compaction | | `error` | system | Fatal error or iteration-limit event | | `interrupt` | system | Intentional caller interruption (`caller-abort`) | +| `turn-start` | system | Lifecycle annotation emitted right after `agent.stream()` is dispatched, before the first chunk arrives | ### `metadata` @@ -278,6 +279,16 @@ Headless output now follows the **ATIF-v1.6** protocol documented in `packages/h } ``` +### `turn-start` + +```typescript +{ + type: "turn-start", + timestamp: string, + phase: "new-turn" | "intermediate-step", +} +``` + ### TypeScript types ```typescript @@ -291,6 +302,7 @@ import type { CompactionEvent, ErrorEvent, InterruptEvent, + TurnStartEvent, } from "@ai-sdk-tool/headless"; ``` diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index a4f6311..0e961d5 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -670,6 +670,13 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { }; } + emitAndCollect({ + type: "turn-start", + phase, + timestamp: new Date().toISOString(), + }); + await config.onStreamStart?.(phase); + const streamPromise = Promise.resolve( config.agent.stream(streamOptions) ); diff --git a/packages/headless/src/types.ts b/packages/headless/src/types.ts index 04dbd15..68e88bf 100644 --- a/packages/headless/src/types.ts +++ b/packages/headless/src/types.ts @@ -128,6 +128,18 @@ export interface InterruptEvent { type: "interrupt"; } +/** + * Emitted immediately after the agent's LLM request is dispatched but before + * any stream chunk has arrived. Consumers can use this as a "prompt + * processing started" signal to render a loading indicator. Lifecycle + * annotation — carries no `step_id`. + */ +export interface TurnStartEvent { + phase: "new-turn" | "intermediate-step"; + timestamp: string; + type: "turn-start"; +} + export type { HistorySnapshot } from "@ai-sdk-tool/harness"; export interface HeadlessRunnerConfig { @@ -158,6 +170,9 @@ export interface HeadlessRunnerConfig { phase: "new-turn" | "intermediate-step" ) => BeforeTurnResult | Promise | undefined; onInterrupt?: (event: InterruptEvent) => Promise | void; + onStreamStart?: ( + phase: "new-turn" | "intermediate-step" + ) => void | Promise; onTodoReminder?: () => Promise<{ hasReminder: boolean; message: string | null; @@ -205,4 +220,5 @@ export type TrajectoryEvent = | ErrorEvent | ApprovalEvent | InterruptEvent - | MetadataEvent; + | MetadataEvent + | TurnStartEvent; diff --git a/packages/tui/src/agent-tui.ts b/packages/tui/src/agent-tui.ts index 3182d12..2e74ec9 100644 --- a/packages/tui/src/agent-tui.ts +++ b/packages/tui/src/agent-tui.ts @@ -105,6 +105,10 @@ interface ContextPressureThresholds { const style = (prefix: string, text: string): string => `${prefix}${text}${ANSI_RESET}`; +const ignore = (): void => { + return; +}; + class StatusSpinner extends Text { private readonly frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]; private currentFrame = 0; @@ -652,6 +656,9 @@ export interface AgentTUIConfig { iteration: number; phase: "new-turn" | "intermediate-step"; }) => Promise | void; + onStreamStart?: ( + phase: "new-turn" | "intermediate-step" + ) => void | Promise; onTurnComplete?: ( messages: CheckpointMessage[], usage?: { @@ -1043,6 +1050,10 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { await measureUsageIfAvailable(messages); }; + const runBackgroundStartupProbe = (): void => { + measureUsageIfAvailable([]).then(ignore, ignore); + }; + const cancelActiveStream = (): boolean => { if (!activeStreamController || activeStreamController.signal.aborted) { return false; @@ -1495,11 +1506,12 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { overflowRetried = false, noOutputRetryCount = 0 ): Promise<"completed" | "continue" | "interrupted"> => { + showLoader("Processing..."); + const preparedTurn = await prepareMessages(phase); let messagesForLLM = preparedTurn.messages; const turnOverrides = preparedTurn.turnOverrides; - showLoader("Working..."); const streamAbortController = new AbortController(); activeStreamController = streamAbortController; streamInterruptRequested = false; @@ -1507,6 +1519,10 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { try { const budget = await resolveTurnBudget(phase, messagesForLLM); messagesForLLM = budget.messagesForLLM; + + showLoader("Working..."); + await config.onStreamStart?.(phase); + const stream = await config.agent.stream( mergeAgentStreamOptions({ abortSignal: streamAbortController.signal, @@ -1823,8 +1839,8 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { try { await config.onSetup?.(); - await measureUsageIfAvailable([]); - updateHeader(); + + runBackgroundStartupProbe(); while (!shouldExit) { const input = await waitForInput(); From 87450f1082e34c928643e6583b57e21f33f2be89 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 16:34:14 +0900 Subject: [PATCH 02/12] fix: address post-implementation review findings Post-implementation audit surfaced four issues and two gaps; all are addressed here to keep the PR self-contained. - BUG A (headless): turn-start was re-emitted on every retry, so a single logical turn could produce 2-4 turn-start events (overflow retry + up to 3 no-output retries). Added a hasEmittedTurnStart flag in runSingleTurn scope so the event fires at most once per logical turn; flag resets naturally on the next turn. - BUG B (TUI): during a blocking compaction the foreground loader stayed on 'Processing...' while the real wait was the compaction LLM call. The onBlockingChange callback now swaps the foreground label to 'Compacting...' on block entry and restores the previous label on block exit, so users see the real reason for the wait. - BUG C (TUI): text-start stream parts were not counted as visible, leaving the loader spinning after the empty AssistantStreamView had already mounted. text-start is now visible; reasoning-start follows the existing showReasoning flag. - BUG D (harness): documented that LoopHooks.onStreamStart fires only under runAgentLoop; the TUI has its own independent hook on AgentTUIConfig with a different signature. - GAP 1 (tests): added retry-path assertions that turn-start fires exactly once, plus a new test that confirms normal-path ordering (metadata -> user step -> turn-start -> agent step) and an intermediate-step phase test for tool-continuation turns. - GAP 1b (tests): added a test that turn-start events never appear in the persisted trajectory.json even though they stream on JSONL. - GAP 2 (docs): updated packages/cea/benchmark/AGENTS.md event table and flow diagram to include turn-start and note that it is not persisted. --- .changeset/prompt-processing-indicators.md | 6 +- packages/cea/benchmark/AGENTS.md | 6 +- packages/harness/src/types.ts | 5 + packages/headless/src/runner.test.ts | 115 +++++++++++++++++++++ packages/headless/src/runner.ts | 16 +-- packages/tui/src/agent-tui.ts | 12 +++ packages/tui/src/stream-handlers.ts | 6 +- 7 files changed, 156 insertions(+), 10 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index a839ae7..f7db1fa 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -4,4 +4,8 @@ "@ai-sdk-tool/headless": patch --- -Surface the "prompt processing" state that previously looked frozen. The harness loop now exposes `onStreamStart` and `onFirstStreamPart` hooks around the `agent.stream()` call site, the TUI shows a `Processing...` loader during turn preparation and switches to `Working...` once the LLM request is in flight, and headless emits a `turn-start` trajectory annotation alongside a matching `onStreamStart` callback so any agent runtime can signal activity before the first chunk arrives. The TUI startup token probe also runs non-blocking (fire-and-forget) so the editor accepts input immediately on launch; the context-usage footer starts from the estimated count and quietly upgrades to the real value once the probe resolves. +Surface the "prompt processing" state that previously looked frozen, and fix follow-up correctness gaps found during post-implementation review. + +- Harness: new `LoopHooks.onStreamStart` / `onFirstStreamPart` hooks wrap the `agent.stream()` call site so consumers driving turns through `runAgentLoop` can react to the prompt-processing latency gap. Docstring clarifies that the TUI has its own independent `onStreamStart` on `AgentTUIConfig`. +- TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). +- Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so the ATIF-v1.6 schema remains unchanged and persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. diff --git a/packages/cea/benchmark/AGENTS.md b/packages/cea/benchmark/AGENTS.md index b51b801..11c470e 100644 --- a/packages/cea/benchmark/AGENTS.md +++ b/packages/cea/benchmark/AGENTS.md @@ -51,6 +51,8 @@ headless.ts (Docker) output.jsonl harbor_agent.py │ │ │ ├─► emit InterruptEvent ───► interrupt ─────────────► lifecycle annotation │ │ │ + ├─► emit TurnStartEvent ───► turn-start ────────────► lifecycle annotation (not persisted) + │ │ │ └─► emit StepEvent(agent) ─► step (agent) ──────────► Step(source="agent") │ │ └───────────────────────► trajectory.json (ATIF-v1.6, written by headless) @@ -66,13 +68,15 @@ headless.ts (Docker) output.jsonl harbor_agent.py | `compaction` | `event`, `tokensBefore`, `tokensAfter?`, `durationMs?` | History compaction events | | `error` | `error`, `timestamp` | Fatal errors | | `interrupt` | `reason`, `timestamp` | Intentional caller interruption | +| `turn-start` | `phase`, `timestamp` | Lifecycle annotation emitted once per logical turn right after `agent.stream()` dispatch; dropped by `TrajectoryCollector` and absent from `trajectory.json` | ## Verification ### 1. Event Type Distribution ```bash cat jobs//*/agent/output.jsonl | jq -r '.type' | sort | uniq -c -# Expected output like: 1 metadata N step M compaction K approval optional interrupt (no unexpected 'error' lines) +# Expected output like: 1 metadata N step N turn-start M compaction K approval optional interrupt (no unexpected 'error' lines) +# Note: turn-start count should match the number of logical turns (== agent step count for linear conversations). ``` ### 2. Step ID Sequence diff --git a/packages/harness/src/types.ts b/packages/harness/src/types.ts index 9cf0c65..f5043a0 100644 --- a/packages/harness/src/types.ts +++ b/packages/harness/src/types.ts @@ -185,6 +185,11 @@ export interface LoopHooks { * display a loading indicator during the prompt-processing latency gap. * * The callback is awaited before iteration starts, so keep it fast. + * + * Note: only runtimes that drive their turns through `runAgentLoop` + * observe this hook. The TUI (`createAgentTUI`) implements its own + * stream loop and exposes an independent `AgentTUIConfig.onStreamStart` + * with a `phase` argument. */ onStreamStart?: (context: LoopContinueContext) => void | Promise; onToolCall?: ( diff --git a/packages/headless/src/runner.test.ts b/packages/headless/src/runner.test.ts index a6e188d..091ac85 100644 --- a/packages/headless/src/runner.test.ts +++ b/packages/headless/src/runner.test.ts @@ -129,6 +129,114 @@ describe("runHeadless", () => { expect(history.getAll()[0]?.originalContent).toBe("안녕"); }); + it("emits a single turn-start event per logical turn in the normal path", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + + await runHeadless({ + agent: { + stream: () => + createMockStream([{ role: "assistant", content: "hello" }]), + }, + emitEvent: (event) => { + events.push(event); + }, + initialUserMessage: { content: "hi" }, + messageHistory: history, + modelId: "mock-model", + sessionId: "session-turn-start-order", + }); + + const turnStartEvents = events.filter((e) => e.type === "turn-start"); + expect(turnStartEvents).toHaveLength(1); + expect(turnStartEvents[0]).toMatchObject({ + type: "turn-start", + phase: "new-turn", + }); + + const userIndex = events.findIndex( + (e) => e.type === "step" && "source" in e && e.source === "user" + ); + const turnStartIndex = events.findIndex((e) => e.type === "turn-start"); + const agentIndex = events.findIndex( + (e) => e.type === "step" && "source" in e && e.source === "agent" + ); + expect(userIndex).toBeGreaterThanOrEqual(0); + expect(turnStartIndex).toBeGreaterThan(userIndex); + expect(agentIndex).toBeGreaterThan(turnStartIndex); + }); + + it("emits an intermediate-step turn-start on tool-continuation turns", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + let streamCallCount = 0; + + await runHeadless({ + agent: { + stream: () => { + streamCallCount += 1; + if (streamCallCount === 1) { + return createToolCallStream( + [{ role: "assistant", content: "call_1" }], + "tool-calls" + ); + } + return createMockStream([{ role: "assistant", content: "done" }]); + }, + }, + emitEvent: (event) => { + events.push(event); + }, + initialUserMessage: { content: "do it" }, + messageHistory: history, + modelId: "mock-model", + sessionId: "session-intermediate-turn-start", + }); + + expect(streamCallCount).toBe(2); + const turnStartPhases = events + .filter((e) => e.type === "turn-start") + .map((e) => (e as { phase: string }).phase); + expect(turnStartPhases).toEqual(["new-turn", "intermediate-step"]); + }); + + it("excludes turn-start events from the ATIF trajectory JSON", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + const atifOutputPath = `/tmp/plugsuits-turn-start-${Date.now()}.json`; + + await runHeadless({ + agent: { + stream: () => + createMockStream([{ role: "assistant", content: "hello" }]), + }, + atifOutputPath, + emitEvent: (event) => { + events.push(event); + }, + initialUserMessage: { content: "hi" }, + messageHistory: history, + modelId: "mock-model", + sessionId: "session-trajectory-persistence-check", + }); + + expect(events.some((e) => e.type === "turn-start")).toBe(true); + + const { readFileSync, unlinkSync } = await import("node:fs"); + const persisted = JSON.parse(readFileSync(atifOutputPath, "utf-8")) as { + schema_version: string; + steps: Array<{ step_id: number; source: string }>; + extra?: Record; + }; + unlinkSync(atifOutputPath); + + expect(persisted.schema_version).toBe("ATIF-v1.6"); + const stepSources = persisted.steps.map((s) => s.source); + expect(stepSources).not.toContain("turn-start"); + const serialized = JSON.stringify(persisted); + expect(serialized).not.toContain("turn-start"); + }); + it("does not emit a synthetic user event when no initial user message is given", async () => { const events: TrajectoryEvent[] = []; @@ -482,6 +590,9 @@ describe("runHeadless", () => { ); expect(agentEvents).toHaveLength(1); expect(agentEvents[0]).toMatchObject({ message: "ok" }); + const turnStartEvents = events.filter((e) => e.type === "turn-start"); + expect(turnStartEvents).toHaveLength(1); + expect(turnStartEvents[0]).toMatchObject({ phase: "new-turn" }); }); it("retries once on no output generated error", async () => { @@ -521,6 +632,8 @@ describe("runHeadless", () => { ); expect(agentEvents).toHaveLength(1); expect(agentEvents[0]).toMatchObject({ message: "ok" }); + const turnStartEvents = events.filter((e) => e.type === "turn-start"); + expect(turnStartEvents).toHaveLength(1); }); it("retries multiple times on no output generated error before succeeding", async () => { @@ -559,6 +672,8 @@ describe("runHeadless", () => { (e) => e.type === "step" && "source" in e && e.source === "agent" ); expect(agentEvents).toHaveLength(1); + const turnStartEvents = events.filter((e) => e.type === "turn-start"); + expect(turnStartEvents).toHaveLength(1); }); it("emits an interrupt event and stops when the caller aborts", async () => { diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index 0e961d5..2b465e2 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -635,6 +635,7 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { } let overflowRetried = false; let noOutputRetryCount = 0; + let hasEmittedTurnStart = false; const executeStream = async ( streamMessages: ModelMessage[], streamMaxOutputTokens: number | undefined @@ -670,12 +671,15 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { }; } - emitAndCollect({ - type: "turn-start", - phase, - timestamp: new Date().toISOString(), - }); - await config.onStreamStart?.(phase); + if (!hasEmittedTurnStart) { + hasEmittedTurnStart = true; + emitAndCollect({ + type: "turn-start", + phase, + timestamp: new Date().toISOString(), + }); + await config.onStreamStart?.(phase); + } const streamPromise = Promise.resolve( config.agent.stream(streamOptions) diff --git a/packages/tui/src/agent-tui.ts b/packages/tui/src/agent-tui.ts index 2e74ec9..a2d56db 100644 --- a/packages/tui/src/agent-tui.ts +++ b/packages/tui/src/agent-tui.ts @@ -759,6 +759,8 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { let inputResolver: null | ((value: string | null) => void) = null; let lastCtrlCPressAt = 0; let foregroundStatus: StatusSpinner | null = null; + let foregroundStatusMessage: string | null = null; + let foregroundStatusBeforeBlocking: string | null = null; const backgroundStatuses = new Map(); let blockingCompactionActive = false; let commandInputListenerActive = false; @@ -835,6 +837,7 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { }; const clearStatus = (): void => { + foregroundStatusMessage = null; if (!foregroundStatus) { tui.requestRender(); return; @@ -849,6 +852,7 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { foregroundStatus.stop(); } foregroundStatus = createStatusSpinner(message); + foregroundStatusMessage = message; renderForegroundStatus(); }; @@ -914,12 +918,20 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { "Compacting...", "running" ); + if (foregroundStatusMessage !== null) { + foregroundStatusBeforeBlocking = foregroundStatusMessage; + showLoader("Compacting..."); + } userCompactionCallbacks?.onBlockingChange?.(event); return; } blockingCompactionActive = false; clearBackgroundStatus("blocking-compaction"); + if (foregroundStatusBeforeBlocking !== null) { + showLoader(foregroundStatusBeforeBlocking); + foregroundStatusBeforeBlocking = null; + } updateHeader(); tui.requestRender(); userCompactionCallbacks?.onBlockingChange?.(event); diff --git a/packages/tui/src/stream-handlers.ts b/packages/tui/src/stream-handlers.ts index 87462d0..e9e5ce1 100644 --- a/packages/tui/src/stream-handlers.ts +++ b/packages/tui/src/stream-handlers.ts @@ -417,10 +417,12 @@ export const isVisibleStreamPart = ( case "text-end": case "reasoning-end": case "start": - case "text-start": - case "reasoning-start": case "tool-input-end": return false; + case "text-start": + return true; + case "reasoning-start": + return flags.showReasoning; case "reasoning-delta": return flags.showReasoning; case "tool-result": From 49cd7261def27eb024be47b0d6b6f7201bac9886 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 16:58:29 +0900 Subject: [PATCH 03/12] fix(headless): correct ATIF schema_version label to v1.4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The persisted trajectory was tagged 'ATIF-v1.6', which does not exist on Harbor's public spec. The current Harbor ATIF version is v1.4 (see https://www.harborframework.com/docs/agents/trajectory-format): v1.0, v1.1, v1.2, v1.3, v1.4 — v1.5 and v1.6 were never released. The 'v1.6' label was an internal bump accompanying a redesign of the JSONL event types on 2026-04-03; it conflated two distinct surfaces: - The ATIF trajectory that TrajectoryCollector writes to disk, which must follow Harbor's published schema (v1.4). - The internal stdout JSONL protocol used by the headless runner, which carries lifecycle annotations (approval, compaction, interrupt, turn-start) that ATIF does not define. This commit: - Sets schema_version to 'ATIF-v1.4' in TrajectoryCollector (both the TypeScript type literal and the runtime value). - Updates the Python validator (test_trajectory.py), Python scorer, CLI help text, and test assertions to expect 'ATIF-v1.4'. - Rewrites packages/headless/AGENTS.md, README.md, and packages/cea/benchmark/AGENTS.md to separate the ATIF persisted format from the internal JSONL streaming protocol, with a pointer to Harbor's spec page. - Adds a JSDoc header on TrajectoryEvent types explaining the split. --- .changeset/prompt-processing-indicators.md | 3 ++- packages/cea/benchmark/AGENTS.md | 4 ++-- packages/cea/benchmark/scorer.py | 2 +- packages/cea/benchmark/test_trajectory.py | 8 ++++---- packages/cea/src/entrypoints/main.ts | 2 +- packages/headless/AGENTS.md | 15 ++++++++++++--- packages/headless/README.md | 4 ++-- packages/headless/src/runner.test.ts | 4 ++-- packages/headless/src/trajectory-collector.ts | 4 ++-- packages/headless/src/types.ts | 11 ++++++++++- 10 files changed, 38 insertions(+), 19 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index f7db1fa..70e623e 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -8,4 +8,5 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - Harness: new `LoopHooks.onStreamStart` / `onFirstStreamPart` hooks wrap the `agent.stream()` call site so consumers driving turns through `runAgentLoop` can react to the prompt-processing latency gap. Docstring clarifies that the TUI has its own independent `onStreamStart` on `AgentTUIConfig`. - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). -- Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so the ATIF-v1.6 schema remains unchanged and persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. +- Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. +- Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. diff --git a/packages/cea/benchmark/AGENTS.md b/packages/cea/benchmark/AGENTS.md index 11c470e..eacaead 100644 --- a/packages/cea/benchmark/AGENTS.md +++ b/packages/cea/benchmark/AGENTS.md @@ -36,7 +36,7 @@ Control agent behavior via environment variables: | `AGENT_ENABLE_THINKING` | `1`, `true`, `yes` | Enable `--think` flag (captures reasoning content) | | `AGENT_ENABLE_TOOL_FALLBACK` | `1`, `true`, `yes` | Enable `--tool-fallback` flag (XML-based tool calling for non-native models) | -## Event Flow (ATIF-v1.6) +## Event Flow ``` headless.ts (Docker) output.jsonl harbor_agent.py @@ -55,7 +55,7 @@ headless.ts (Docker) output.jsonl harbor_agent.py │ │ │ └─► emit StepEvent(agent) ─► step (agent) ──────────► Step(source="agent") │ │ - └───────────────────────► trajectory.json (ATIF-v1.6, written by headless) + └───────────────────────► trajectory.json (ATIF-v1.4, written by headless) ``` ## Event Types (output.jsonl) diff --git a/packages/cea/benchmark/scorer.py b/packages/cea/benchmark/scorer.py index 81318cb..5e657f4 100644 --- a/packages/cea/benchmark/scorer.py +++ b/packages/cea/benchmark/scorer.py @@ -25,7 +25,7 @@ def parse_timestamp(ts: str | None) -> datetime | None: def score_trajectory(trajectory: dict) -> dict: - """Compute performance metrics from an ATIF-v1.6 trajectory dict.""" + """Compute performance metrics from an ATIF-v1.4 trajectory dict.""" steps = trajectory.get("steps", []) fm = trajectory.get("final_metrics", {}) or {} compaction_events = trajectory.get("extra", {}).get("compaction_events", []) diff --git a/packages/cea/benchmark/test_trajectory.py b/packages/cea/benchmark/test_trajectory.py index f6403d6..2334b2f 100644 --- a/packages/cea/benchmark/test_trajectory.py +++ b/packages/cea/benchmark/test_trajectory.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""ATIF-v1.6 trajectory validation test. +"""ATIF-v1.4 trajectory validation test. Usage: python3 test_trajectory.py """ @@ -11,7 +11,7 @@ def validate_trajectory(path: str) -> list[str]: - """Validate trajectory.json against ATIF-v1.6 spec. Returns list of errors.""" + """Validate trajectory.json against ATIF-v1.4 spec. Returns list of errors.""" errors = [] try: @@ -21,9 +21,9 @@ def validate_trajectory(path: str) -> list[str]: return [f"Cannot read/parse file: {e}"] # 1. schema_version - if t.get("schema_version") != "ATIF-v1.6": + if t.get("schema_version") != "ATIF-v1.4": errors.append( - f"schema_version: expected 'ATIF-v1.6', got {t.get('schema_version')!r}" + f"schema_version: expected 'ATIF-v1.4', got {t.get('schema_version')!r}" ) # 2. session_id present diff --git a/packages/cea/src/entrypoints/main.ts b/packages/cea/src/entrypoints/main.ts index c769737..2ab2119 100644 --- a/packages/cea/src/entrypoints/main.ts +++ b/packages/cea/src/entrypoints/main.ts @@ -627,7 +627,7 @@ const mainCommand = defineCommand({ atif: { type: "boolean", description: - "Generate trajectory.json in ATIF-v1.6 format (Harbor compatible)", + "Generate trajectory.json in ATIF-v1.4 format (Harbor compatible)", default: false, }, }, diff --git a/packages/headless/AGENTS.md b/packages/headless/AGENTS.md index e0253f5..304a626 100644 --- a/packages/headless/AGENTS.md +++ b/packages/headless/AGENTS.md @@ -9,9 +9,18 @@ This package provides a non-interactive, JSONL event-streaming runtime for agent The package depends on `@ai-sdk-tool/harness` for `CheckpointHistory`, `AgentStreamResult`, and `shouldContinueManualToolLoop`. The agent itself is passed in as a config parameter. -## JSONL EVENT PROTOCOL (ATIF-v1.6) +## JSONL EVENT PROTOCOL -Every event is a JSON object on its own line. Events conform to the ATIF-v1.6 specification for trajectory logging. +Every event is a JSON object on its own line. + +> **This JSONL stream is NOT the ATIF schema.** ATIF is the format of the +> persisted `trajectory.json` file produced by `TrajectoryCollector` — see +> [Harbor's ATIF specification](https://www.harborframework.com/docs/agents/trajectory-format). +> The current ATIF version is **v1.4**. This JSONL protocol is an internal +> streaming contract used by the runner to drive UI, telemetry, and the +> trajectory collector; it carries lifecycle annotations +> (`approval`, `compaction`, `interrupt`, `turn-start`) that have no place +> in ATIF and are dropped from the persisted trajectory. ### Design Decisions - **NO sessionId on individual events**: A single `MetadataEvent` at the start carries the `session_id`. @@ -120,7 +129,7 @@ import type { | `runner.ts` | `runHeadless` | Main agent loop with JSONL emission | | `emit.ts` | `emitEvent` | Default stdout JSONL event sink | | `signals.ts` | `registerSignalHandlers` | Process signal lifecycle management | -| `types.ts` | `TrajectoryEvent`, etc. | ATIF-v1.6 event type definitions | +| `types.ts` | `TrajectoryEvent`, etc. | JSONL stream event types (internal) + ATIF-v1.4 persisted types | ## CONVENTIONS diff --git a/packages/headless/README.md b/packages/headless/README.md index 567876e..4f14271 100644 --- a/packages/headless/README.md +++ b/packages/headless/README.md @@ -50,7 +50,7 @@ await runHeadless({ }); ``` -**Example output (ATIF-v1.6):** +**Example output (JSONL stream):** ```jsonl {"type":"metadata","timestamp":"2026-04-03T10:00:00.000Z","session_id":"ses-abc123","agent":{"name":"code-editing-agent","version":"1.0.0","model_name":"gpt-4o"}} @@ -169,7 +169,7 @@ registerSignalHandlers({ ## JSONL Event Types -Headless output now follows the **ATIF-v1.6** protocol documented in `packages/headless/AGENTS.md`. +The runner streams an internal JSONL event protocol documented in `packages/headless/AGENTS.md`. The persisted `trajectory.json` produced by `TrajectoryCollector` conforms to Harbor's **ATIF-v1.4** schema (); the JSONL stream carries additional lifecycle annotations that are not part of ATIF and are dropped from the trajectory. ### Event overview diff --git a/packages/headless/src/runner.test.ts b/packages/headless/src/runner.test.ts index 091ac85..e361918 100644 --- a/packages/headless/src/runner.test.ts +++ b/packages/headless/src/runner.test.ts @@ -230,7 +230,7 @@ describe("runHeadless", () => { }; unlinkSync(atifOutputPath); - expect(persisted.schema_version).toBe("ATIF-v1.6"); + expect(persisted.schema_version).toBe("ATIF-v1.4"); const stepSources = persisted.steps.map((s) => s.source); expect(stepSources).not.toContain("turn-start"); const serialized = JSON.stringify(persisted); @@ -482,7 +482,7 @@ describe("runHeadless", () => { expect(capturedMessages[1]?.[0]?.role).toBe("user"); }); - it("emits ATIF-v1.6 step events for user messages and agent tool responses", async () => { + it("emits ATIF-v1.4 step events for user messages and agent tool responses", async () => { const collectedEvents: TrajectoryEvent[] = []; await runHeadless({ diff --git a/packages/headless/src/trajectory-collector.ts b/packages/headless/src/trajectory-collector.ts index 1d485b5..9d8f7a9 100644 --- a/packages/headless/src/trajectory-collector.ts +++ b/packages/headless/src/trajectory-collector.ts @@ -39,7 +39,7 @@ export interface TrajectoryJson { total_cached_tokens: number | null; total_steps: number; }; - schema_version: "ATIF-v1.6"; + schema_version: "ATIF-v1.4"; session_id: string; steps: AtifStep[]; } @@ -150,7 +150,7 @@ export class TrajectoryCollector { finalize(): TrajectoryJson { const trajectory: TrajectoryJson = { - schema_version: "ATIF-v1.6", + schema_version: "ATIF-v1.4", session_id: this.metadata?.session_id ?? "unknown", agent: this.metadata?.agent ?? { ...DEFAULT_AGENT }, steps: this.steps.map((s) => this.toAtifStep(s)), diff --git a/packages/headless/src/types.ts b/packages/headless/src/types.ts index 68e88bf..e351fad 100644 --- a/packages/headless/src/types.ts +++ b/packages/headless/src/types.ts @@ -1,7 +1,16 @@ import type { BeforeTurnResult } from "@ai-sdk-tool/harness"; /** - * ATIF-v1.6 native event types for trajectory logging. + * JSONL streaming protocol for headless trajectory logging. + * + * Note: this is NOT the ATIF schema itself. The persisted trajectory + * written to disk via {@link TrajectoryCollector} conforms to ATIF-v1.4 + * (https://www.harborframework.com/docs/agents/trajectory-format). The + * event union below is the internal stdout JSONL contract that the + * runner emits during execution; the collector consumes these events + * and produces the ATIF trajectory as output. Lifecycle annotations + * (`approval`, `compaction`, `interrupt`, `turn-start`) exist only on + * this JSONL stream and are not part of ATIF. * * All emitted step events (UserStepEvent, AgentStepEvent) conform to the ATIF specification. * Metadata is emitted once at run start. Compaction and error events are lifecycle annotations. From 52acdd4be9f5535ca3e5464a087669ceb3b827a2 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 17:10:02 +0900 Subject: [PATCH 04/12] feat(headless): complete ATIF-v1.4 field coverage and wire up Harbor validator After correcting the schema_version label to 'ATIF-v1.4', audit against the official spec (https://www.harborframework.com/docs/agents/trajectory-format) surfaced three remaining gaps. This commit closes them. - StepMetrics now exposes the full v1.4 optional surface: - logprobs: number[] - prompt_token_ids: number[] (added in v1.4) - completion_token_ids: number[] (added in v1.3) These remain undefined unless the provider populates them, so existing output is bit-for-bit identical for callers that don't set the new fields. - TrajectoryJson.final_metrics now aggregates total_cost_usd across step metrics with the same null-when-absent semantics as the other token totals. The ATIF compliance test suite already expected this field in ATIF_FINAL_METRICS_ALLOWED_FIELDS; the emission side just caught up to the test. - TrajectoryJson.extra is typed as the approval/compaction/interrupt container intersected with Record so downstream additions to extra.* do not require breaking type changes. - packages/cea/benchmark/test_trajectory.py now: - Calls harbor.utils.trajectory_validator.TrajectoryValidator when the harbor package is importable (auto-skips when Harbor is not installed so local developer workflows keep working). - Enforces per-step metric shapes: numeric fields (prompt_tokens, completion_tokens, cached_tokens, cost_usd) must be numbers; id fields (logprobs, prompt_token_ids, completion_token_ids) must be arrays. - Prints total_cost_usd in the summary block and reports whether the Harbor validator was used or skipped. - New tests: - finalize() aggregates total_cost_usd across step metrics - finalize() returns null total_cost_usd when no step reports cost - finalize() preserves all v1.4 optional metric fields (logprobs, prompt_token_ids, completion_token_ids) Verified: typecheck, ultracite check, full vitest (569 tests), and turbo build all pass. --- .changeset/prompt-processing-indicators.md | 1 + packages/cea/benchmark/test_trajectory.py | 69 +++++++++++- .../src/__tests__/atif-events.test.ts | 103 ++++++++++++++++++ packages/headless/src/trajectory-collector.ts | 15 ++- packages/headless/src/types.ts | 11 +- 5 files changed, 192 insertions(+), 7 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index 70e623e..ea2241c 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -10,3 +10,4 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). - Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. - Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. +- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. The bundled `test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to the local shape check otherwise; it also enforces per-step metric shapes. diff --git a/packages/cea/benchmark/test_trajectory.py b/packages/cea/benchmark/test_trajectory.py index 2334b2f..8070eff 100644 --- a/packages/cea/benchmark/test_trajectory.py +++ b/packages/cea/benchmark/test_trajectory.py @@ -86,6 +86,49 @@ def validate_trajectory(path: str) -> list[str]: return errors if not isinstance(fm.get("total_steps"), int): errors.append("final_metrics.total_steps: must be an integer") + for token_field in ( + "total_prompt_tokens", + "total_completion_tokens", + "total_cached_tokens", + "total_cost_usd", + ): + value = fm.get(token_field) + if value is not None and not isinstance(value, (int, float)): + errors.append( + f"final_metrics.{token_field}: must be a number or null, got {type(value).__name__}" + ) + + # 8b. per-step metrics shape + for i, step in enumerate(steps): + if not isinstance(step, dict): + continue + metrics = step.get("metrics") + if metrics is None: + continue + if not isinstance(metrics, dict): + errors.append(f"steps[{i}].metrics: must be a dictionary when present") + continue + for num_field in ( + "prompt_tokens", + "completion_tokens", + "cached_tokens", + "cost_usd", + ): + value = metrics.get(num_field) + if value is not None and not isinstance(value, (int, float)): + errors.append( + f"steps[{i}].metrics.{num_field}: must be a number when present" + ) + for list_field in ( + "logprobs", + "prompt_token_ids", + "completion_token_ids", + ): + value = metrics.get(list_field) + if value is not None and not isinstance(value, list): + errors.append( + f"steps[{i}].metrics.{list_field}: must be a list when present" + ) # 9. persisted lifecycle annotations under extra extra = t.get("extra") @@ -104,6 +147,22 @@ def validate_trajectory(path: str) -> list[str]: return errors +def run_harbor_validator(path: str) -> list[str] | None: + """Run Harbor's official trajectory_validator when the harbor package is + importable. Returns None when Harbor isn't installed so the caller can + fall back to the bundled validator.""" + try: + from harbor.utils.trajectory_validator import TrajectoryValidator + except ImportError: + return None + + validator = TrajectoryValidator() + is_valid = validator.validate(path) + if is_valid: + return [] + return [f"harbor: {err}" for err in validator.get_errors()] + + def main() -> None: if len(sys.argv) < 2: print("Usage: python3 test_trajectory.py ") @@ -112,6 +171,11 @@ def main() -> None: path = sys.argv[1] errors = validate_trajectory(path) + harbor_errors = run_harbor_validator(path) + harbor_used = harbor_errors is not None + if harbor_errors: + errors.extend(harbor_errors) + if errors: print(f"VALIDATION FAILED: {path}") for e in errors: @@ -128,7 +192,7 @@ def main() -> None: print(f" session_id: {t.get('session_id')}") print(f" steps: {len(steps)}") print( - f" final_metrics: total_prompt={fm.get('total_prompt_tokens')}, total_completion={fm.get('total_completion_tokens')}" + f" final_metrics: total_prompt={fm.get('total_prompt_tokens')}, total_completion={fm.get('total_completion_tokens')}, total_cost={fm.get('total_cost_usd')}" ) extra = t.get("extra", {}) or {} print( @@ -137,6 +201,9 @@ def main() -> None: f"compaction={len(extra.get('compaction_events', []))}, " f"interrupt={len(extra.get('interrupt_events', []))}" ) + print( + f" harbor_validator: {'passed' if harbor_used else 'skipped (harbor package not installed)'}" + ) sys.exit(0) diff --git a/packages/headless/src/__tests__/atif-events.test.ts b/packages/headless/src/__tests__/atif-events.test.ts index ba35796..479d9ea 100644 --- a/packages/headless/src/__tests__/atif-events.test.ts +++ b/packages/headless/src/__tests__/atif-events.test.ts @@ -568,6 +568,109 @@ describe("TrajectoryCollector ATIF compliance", () => { }); }); + it("finalize() aggregates total_cost_usd across step metrics", () => { + const collector = new TrajectoryCollector(); + collector.addMetadata({ + type: "metadata", + timestamp, + session_id: "ses-cost", + agent: { name: "a", version: "1", model_name: "m" }, + }); + collector.addStep({ + type: "step", + step_id: 1, + timestamp, + source: "user", + message: "hi", + }); + collector.addStep({ + type: "step", + step_id: 2, + timestamp, + source: "agent", + message: "a", + metrics: { cost_usd: 0.12, prompt_tokens: 100 }, + }); + collector.addStep({ + type: "step", + step_id: 3, + timestamp, + source: "agent", + message: "b", + metrics: { cost_usd: 0.08, prompt_tokens: 80 }, + }); + + const trajectory = collector.finalize(); + expect(trajectory.final_metrics.total_cost_usd).toBeCloseTo(0.2, 10); + expect(trajectory.final_metrics.total_prompt_tokens).toBe(180); + }); + + it("finalize() returns null total_cost_usd when no step reported a cost", () => { + const collector = new TrajectoryCollector(); + collector.addMetadata({ + type: "metadata", + timestamp, + session_id: "ses-no-cost", + agent: { name: "a", version: "1", model_name: "m" }, + }); + collector.addStep({ + type: "step", + step_id: 1, + timestamp, + source: "user", + message: "hi", + }); + collector.addStep({ + type: "step", + step_id: 2, + timestamp, + source: "agent", + message: "a", + metrics: { prompt_tokens: 100 }, + }); + + const trajectory = collector.finalize(); + expect(trajectory.final_metrics.total_cost_usd).toBeNull(); + }); + + it("finalize() preserves ATIF-v1.4 optional fields (logprobs, prompt_token_ids, completion_token_ids)", () => { + const collector = new TrajectoryCollector(); + collector.addMetadata({ + type: "metadata", + timestamp, + session_id: "ses-v14", + agent: { name: "a", version: "1", model_name: "m" }, + }); + collector.addStep({ + type: "step", + step_id: 1, + timestamp, + source: "user", + message: "hi", + }); + collector.addStep({ + type: "step", + step_id: 2, + timestamp, + source: "agent", + message: "reply", + metrics: { + completion_token_ids: [1722, 310, 5533], + logprobs: [-0.1, -0.05, -0.02], + prompt_token_ids: [1, 2, 3], + prompt_tokens: 3, + }, + }); + + const trajectory = collector.finalize(); + const agentStep = trajectory.steps[1]; + expect(agentStep?.metrics?.logprobs).toStrictEqual([-0.1, -0.05, -0.02]); + expect(agentStep?.metrics?.prompt_token_ids).toStrictEqual([1, 2, 3]); + expect(agentStep?.metrics?.completion_token_ids).toStrictEqual([ + 1722, 310, 5533, + ]); + }); + it("finalize() observation source_call_ids reference valid tool_call_ids", () => { const collector = new TrajectoryCollector(); collector.addMetadata({ diff --git a/packages/headless/src/trajectory-collector.ts b/packages/headless/src/trajectory-collector.ts index 9d8f7a9..333b8b9 100644 --- a/packages/headless/src/trajectory-collector.ts +++ b/packages/headless/src/trajectory-collector.ts @@ -32,11 +32,12 @@ export interface TrajectoryJson { approval_events?: ApprovalEvent[]; compaction_events?: CompactionEvent[]; interrupt_events?: InterruptEvent[]; - }; + } & Record; final_metrics: { - total_prompt_tokens: number | null; - total_completion_tokens: number | null; total_cached_tokens: number | null; + total_completion_tokens: number | null; + total_cost_usd: number | null; + total_prompt_tokens: number | null; total_steps: number; }; schema_version: "ATIF-v1.4"; @@ -103,14 +104,16 @@ export class TrajectoryCollector { } private collectFinalMetrics(): { - total_prompt_tokens: number | null; - total_completion_tokens: number | null; total_cached_tokens: number | null; + total_completion_tokens: number | null; + total_cost_usd: number | null; + total_prompt_tokens: number | null; total_steps: number; } { const prompt: MetricAccumulator = { hasValue: false, total: 0 }; const completion: MetricAccumulator = { hasValue: false, total: 0 }; const cached: MetricAccumulator = { hasValue: false, total: 0 }; + const cost: MetricAccumulator = { hasValue: false, total: 0 }; for (const step of this.steps) { if (!("metrics" in step)) { @@ -125,12 +128,14 @@ export class TrajectoryCollector { addMetric(prompt, metrics.prompt_tokens); addMetric(completion, metrics.completion_tokens); addMetric(cached, metrics.cached_tokens); + addMetric(cost, metrics.cost_usd); } return { total_prompt_tokens: toMetricTotal(prompt), total_completion_tokens: toMetricTotal(completion), total_cached_tokens: toMetricTotal(cached), + total_cost_usd: toMetricTotal(cost), total_steps: this.steps.length, }; } diff --git a/packages/headless/src/types.ts b/packages/headless/src/types.ts index e351fad..6e6d49a 100644 --- a/packages/headless/src/types.ts +++ b/packages/headless/src/types.ts @@ -46,12 +46,21 @@ export interface ObservationData { /** * Token usage metrics from the agent's model invocation. - * All fields come from the SDK's stream.usage and are never estimated. + * + * Fields follow ATIF-v1.4 (https://www.harborframework.com/docs/agents/trajectory-format). + * `completion_token_ids` was added in v1.3 (RL), `prompt_token_ids` in v1.4. + * + * Numeric fields come from the SDK's stream.usage and are never estimated. + * Token-id and logprob fields are only populated when the provider exposes + * them; consumers must treat them as opt-in. */ export interface StepMetrics { cached_tokens?: number; + completion_token_ids?: number[]; completion_tokens?: number; cost_usd?: number; + logprobs?: number[]; + prompt_token_ids?: number[]; prompt_tokens?: number; } From 245214f47007f91afb72e3ac4474d3ed3fb28934 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 17:28:16 +0900 Subject: [PATCH 05/12] fix: address AI reviewer feedback from Gemini, Codex, and Cubic Six issues raised across three AI review bots on PR #117: BUG (Gemini/Codex/Cubic - Medium): turn-start and onStreamStart fired before agent.stream() was invoked, breaking the documented contract 'fires immediately after agent.stream is invoked'. A stream-creation failure could still emit a false 'stream started' lifecycle event and skew telemetry for consumers that treat turn-start as the network dispatch boundary. Moved the emission/hook in both the headless runner and the TUI's runSingleStreamTurn to run after agent.stream() successfully returns. BUG (Gemini/Codex - High): Background startup usage probe raced with the first turn's per-turn probe; both paths wrote to messageHistory.updateActualUsage with no revision guard. A stale startup probe (based on an empty message list) could clobber newer usage data and underestimate context pressure, mis-sizing maxOutputTokens and compaction decisions. Added a generation token on measureUsageIfAvailable: probes whose generation is no longer current silently drop their result. BUG (Codex - Medium): onBlockingChange captures prior spinner text to restore after blocking compaction, but the orchestrator can emit multiple blocking:true stages for a single compaction (starting -> pruning -> compacting). The stash was being overwritten every stage, so by unblock time the stashed value was 'Compacting' itself. Only stash when foregroundStatusBeforeBlocking is null. BUG (Cubic - Medium): When the first stream part arrived during a blocking compaction, clearStatus() cleared foregroundStatusMessage, but the unblock path still called showLoader on the stashed value, resurrecting a stale 'Processing...' spinner on what should be a clean UI. Guard the restoration with a foregroundStatusMessage null check so a cleared loader stays cleared. BUG (Codex - Low): updateHeader() used to run right after onSetup to reflect any async header/footer state the consumer initialised there. When the startup probe became non-blocking, the updateHeader call was accidentally dropped. Restored it so header metadata renders before the idle session begins. BUG (Cubic - Low): Python validator accepted booleans as numbers for per-step and final metric values because isinstance(True, int) is True in Python. Added _is_real_number and _is_real_int helpers that explicitly exclude bool, matching ATIF-v1.4 expectations. New tests: - 'does not emit turn-start when agent.stream() rejects before dispatch' - 'emits turn-start after agent.stream() succeeds (before first chunk)' Verified: typecheck, ultracite check, 574 tests (+5 headless), build all pass. Every review-flagged issue has been fixed or explicitly addressed in the commit above or earlier commits on this branch. --- .changeset/prompt-processing-indicators.md | 7 ++- packages/cea/benchmark/test_trajectory.py | 14 ++++-- packages/headless/src/runner.test.ts | 54 ++++++++++++++++++++++ packages/headless/src/runner.ts | 20 ++++---- packages/tui/src/agent-tui.ts | 27 +++++++++-- 5 files changed, 103 insertions(+), 19 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index ea2241c..906ecd6 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -10,4 +10,9 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). - Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. - Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. -- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. The bundled `test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to the local shape check otherwise; it also enforces per-step metric shapes. +- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. The bundled `test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to the local shape check otherwise; it also enforces per-step metric shapes and rejects booleans where ATIF requires real numbers. +- Addressed PR review feedback: + - `turn-start` and `onStreamStart` now fire strictly after `agent.stream()` successfully returns, so stream-creation failures no longer produce a false "stream started" signal (reported by Gemini, Codex, and Cubic reviewers). + - The background startup usage probe is serialized against per-turn probes by a generation token; a stale startup probe can no longer overwrite newer usage data and skew context-pressure metrics. + - The blocking-compaction spinner swap only stashes the original foreground label on first entry and only restores it when the foreground loader is still live, eliminating both the "Compacting..." wording sticking after unblock and the "Processing..." spinner resurrecting after the first stream part arrived. + - Restored the post-`onSetup` `updateHeader()` call that was accidentally dropped when the startup probe became non-blocking, so any header/footer state that `onSetup` initialises renders immediately instead of waiting for the first probe to resolve. diff --git a/packages/cea/benchmark/test_trajectory.py b/packages/cea/benchmark/test_trajectory.py index 8070eff..851739c 100644 --- a/packages/cea/benchmark/test_trajectory.py +++ b/packages/cea/benchmark/test_trajectory.py @@ -10,6 +10,14 @@ from pathlib import Path +def _is_real_number(value: object) -> bool: + return isinstance(value, (int, float)) and not isinstance(value, bool) + + +def _is_real_int(value: object) -> bool: + return isinstance(value, int) and not isinstance(value, bool) + + def validate_trajectory(path: str) -> list[str]: """Validate trajectory.json against ATIF-v1.4 spec. Returns list of errors.""" errors = [] @@ -84,7 +92,7 @@ def validate_trajectory(path: str) -> list[str]: if not isinstance(fm, dict): errors.append("final_metrics must be a dictionary") return errors - if not isinstance(fm.get("total_steps"), int): + if not _is_real_int(fm.get("total_steps")): errors.append("final_metrics.total_steps: must be an integer") for token_field in ( "total_prompt_tokens", @@ -93,7 +101,7 @@ def validate_trajectory(path: str) -> list[str]: "total_cost_usd", ): value = fm.get(token_field) - if value is not None and not isinstance(value, (int, float)): + if value is not None and not _is_real_number(value): errors.append( f"final_metrics.{token_field}: must be a number or null, got {type(value).__name__}" ) @@ -115,7 +123,7 @@ def validate_trajectory(path: str) -> list[str]: "cost_usd", ): value = metrics.get(num_field) - if value is not None and not isinstance(value, (int, float)): + if value is not None and not _is_real_number(value): errors.append( f"steps[{i}].metrics.{num_field}: must be a number when present" ) diff --git a/packages/headless/src/runner.test.ts b/packages/headless/src/runner.test.ts index e361918..f50dd57 100644 --- a/packages/headless/src/runner.test.ts +++ b/packages/headless/src/runner.test.ts @@ -166,6 +166,60 @@ describe("runHeadless", () => { expect(agentIndex).toBeGreaterThan(turnStartIndex); }); + it("does not emit turn-start when agent.stream() rejects before dispatch", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + const streamError = new Error("Provider refused the request"); + + await runHeadless({ + agent: { + stream: () => Promise.reject(streamError), + }, + emitEvent: (event) => { + events.push(event); + }, + initialUserMessage: { content: "hi" }, + messageHistory: history, + modelId: "mock-model", + sessionId: "session-stream-failure", + }).catch(() => undefined); + + expect(events.some((e) => e.type === "turn-start")).toBe(false); + }); + + it("emits turn-start after agent.stream() succeeds (before first chunk)", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + const eventOrder: string[] = []; + let streamCallCount = 0; + + await runHeadless({ + agent: { + stream: () => { + streamCallCount += 1; + eventOrder.push("agent.stream() called"); + return createMockStream([{ role: "assistant", content: "response" }]); + }, + }, + emitEvent: (event) => { + if (event.type === "turn-start") { + eventOrder.push("turn-start emitted"); + } + events.push(event); + }, + initialUserMessage: { content: "hi" }, + messageHistory: history, + modelId: "mock-model", + sessionId: "session-turn-start-order-vs-stream", + }); + + expect(streamCallCount).toBe(1); + const streamIdx = eventOrder.indexOf("agent.stream() called"); + const turnStartIdx = eventOrder.indexOf("turn-start emitted"); + expect(streamIdx).toBeGreaterThanOrEqual(0); + expect(turnStartIdx).toBeGreaterThan(streamIdx); + }); + it("emits an intermediate-step turn-start on tool-continuation turns", async () => { const events: TrajectoryEvent[] = []; const history = new CheckpointHistory(); diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index 2b465e2..e759a5f 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -671,16 +671,6 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { }; } - if (!hasEmittedTurnStart) { - hasEmittedTurnStart = true; - emitAndCollect({ - type: "turn-start", - phase, - timestamp: new Date().toISOString(), - }); - await config.onStreamStart?.(phase); - } - const streamPromise = Promise.resolve( config.agent.stream(streamOptions) ); @@ -710,6 +700,16 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { ); }), ]); + + if (!hasEmittedTurnStart) { + hasEmittedTurnStart = true; + emitAndCollect({ + type: "turn-start", + phase, + timestamp: new Date().toISOString(), + }); + await config.onStreamStart?.(phase); + } const nextStepId = stepId + 1; const processStreamResult = await processStream({ emitEvent: emitAndCollect, diff --git a/packages/tui/src/agent-tui.ts b/packages/tui/src/agent-tui.ts index a2d56db..bda4955 100644 --- a/packages/tui/src/agent-tui.ts +++ b/packages/tui/src/agent-tui.ts @@ -918,8 +918,13 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { "Compacting...", "running" ); - if (foregroundStatusMessage !== null) { + if ( + foregroundStatusBeforeBlocking === null && + foregroundStatusMessage !== null + ) { foregroundStatusBeforeBlocking = foregroundStatusMessage; + } + if (foregroundStatusMessage !== null) { showLoader("Compacting..."); } userCompactionCallbacks?.onBlockingChange?.(event); @@ -929,7 +934,9 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { blockingCompactionActive = false; clearBackgroundStatus("blocking-compaction"); if (foregroundStatusBeforeBlocking !== null) { - showLoader(foregroundStatusBeforeBlocking); + if (foregroundStatusMessage !== null) { + showLoader(foregroundStatusBeforeBlocking); + } foregroundStatusBeforeBlocking = null; } updateHeader(); @@ -1031,6 +1038,8 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { } }; + let usageProbeGeneration = 0; + const measureUsageIfAvailable = async ( messages: ModelMessage[] ): Promise => { @@ -1038,6 +1047,9 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { return false; } + usageProbeGeneration += 1; + const thisGeneration = usageProbeGeneration; + const measured = normalizeUsageMeasurement( await config.measureUsage(messages) ); @@ -1045,6 +1057,10 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { return false; } + if (thisGeneration !== usageProbeGeneration) { + return false; + } + config.messageHistory.updateActualUsage({ inputTokens: measured.inputTokens, outputTokens: measured.outputTokens, @@ -1532,9 +1548,6 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { const budget = await resolveTurnBudget(phase, messagesForLLM); messagesForLLM = budget.messagesForLLM; - showLoader("Working..."); - await config.onStreamStart?.(phase); - const stream = await config.agent.stream( mergeAgentStreamOptions({ abortSignal: streamAbortController.signal, @@ -1544,6 +1557,9 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { }) ); + showLoader("Working..."); + await config.onStreamStart?.(phase); + const clearStreamingLoader = createStreamingLoaderClearer(); await renderAgentStream( @@ -1851,6 +1867,7 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { try { await config.onSetup?.(); + updateHeader(); runBackgroundStartupProbe(); From d855e364e56d30368c6a1194dcc9ea02029cd51b Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 18:13:16 +0900 Subject: [PATCH 06/12] fix: isolate observer-hook errors from stream flow (Cubic P1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An observer that throws from onStreamStart (or onFirstStreamPart) used to propagate the error and abort an otherwise-valid stream. This violates the 'observer-only' intent of the hooks and makes a buggy telemetry/logging callback strong enough to break a production turn. - harness loop.ts: introduce invokeObserverHook(hook, hookName, ctx). Errors are console.error-logged and swallowed so the stream iteration always proceeds. Applied to both onStreamStart and onFirstStreamPart. - headless runner.ts: wrap config.onStreamStart in try/catch with the same logging pattern. The turn-start emission itself stays in place, so lifecycle consumers are unaffected. - tui agent-tui.ts: wrap config.onStreamStart in try/catch; the 'Working...' loader and the rest of runSingleStreamTurn proceed even when the consumer's callback throws. - LoopHooks docstrings now document the 'observer-only contract': errors are swallowed after being logged. This is a public API guarantee so consumers can safely rely on it (and don't need to defensively wrap their callbacks). - New headless test: 'continues streaming when onStreamStart throws (observer errors are isolated)' confirms the stream completes, the turn-start event is still emitted, and the thrown error is logged via console.error. Also repairs a drop of LoopHooks.onToolCall from types.ts that had slipped through an earlier edit — loop.ts still destructured it, so typecheck was surfacing the regression now. Verified: typecheck, ultracite check, 575 tests (+1 headless), build all pass. --- .changeset/prompt-processing-indicators.md | 1 + packages/harness/src/loop.ts | 23 +++++++++++-- packages/harness/src/types.ts | 8 +++++ packages/headless/src/runner.test.ts | 39 ++++++++++++++++++++++ packages/headless/src/runner.ts | 9 ++++- packages/tui/src/agent-tui.ts | 9 ++++- 6 files changed, 85 insertions(+), 4 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index 906ecd6..bf236be 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -16,3 +16,4 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - The background startup usage probe is serialized against per-turn probes by a generation token; a stale startup probe can no longer overwrite newer usage data and skew context-pressure metrics. - The blocking-compaction spinner swap only stashes the original foreground label on first entry and only restores it when the foreground loader is still live, eliminating both the "Compacting..." wording sticking after unblock and the "Processing..." spinner resurrecting after the first stream part arrived. - Restored the post-`onSetup` `updateHeader()` call that was accidentally dropped when the startup probe became non-blocking, so any header/footer state that `onSetup` initialises renders immediately instead of waiting for the first probe to resolve. + - Observer hooks (`onStreamStart`, `onFirstStreamPart`) no longer abort a valid stream when the callback throws. Errors are logged via `console.error` and swallowed in the harness loop, headless runner, and TUI session loop, with the contract documented on `LoopHooks`. diff --git a/packages/harness/src/loop.ts b/packages/harness/src/loop.ts index 4c12fc0..70ad72c 100644 --- a/packages/harness/src/loop.ts +++ b/packages/harness/src/loop.ts @@ -17,6 +17,21 @@ import type { RunAgentLoopResult, } from "./types"; +async function invokeObserverHook( + hook: ((context: LoopContinueContext) => void | Promise) | undefined, + hookName: string, + context: LoopContinueContext +): Promise { + if (!hook) { + return; + } + try { + await hook(context); + } catch (error) { + console.error(`[harness] ${hookName} threw; continuing stream:`, error); + } +} + /** * Runs an {@link Agent} in a loop until a stop condition is met or `maxIterations` is reached. * @@ -100,14 +115,18 @@ export async function runAgentLoop( const stream = agent.stream(streamOptions); - await onStreamStart?.(context); + await invokeObserverHook(onStreamStart, "onStreamStart", context); let firstPartSeen = false; for await (const part of stream.fullStream) { if (!firstPartSeen) { firstPartSeen = true; - await onFirstStreamPart?.(context); + await invokeObserverHook( + onFirstStreamPart, + "onFirstStreamPart", + context + ); } const lifecycle = getToolLifecycleState( diff --git a/packages/harness/src/types.ts b/packages/harness/src/types.ts index f5043a0..8460213 100644 --- a/packages/harness/src/types.ts +++ b/packages/harness/src/types.ts @@ -165,6 +165,10 @@ export interface LoopHooks { * arriving. Note: the SDK may emit invisible framing parts (`start`, * `text-start`, …) before user-facing content; consumers that want to * wait for *visible* output should filter on part type themselves. + * + * Observer-only contract: errors thrown from the callback are logged via + * `console.error` and swallowed so a buggy observer cannot abort a valid + * stream. */ onFirstStreamPart?: (context: LoopContinueContext) => void | Promise; onInterrupt?: ( @@ -184,6 +188,10 @@ export interface LoopHooks { * sent, waiting for first chunk" — intended for consumers that need to * display a loading indicator during the prompt-processing latency gap. * + * Observer-only contract: this hook must not influence the stream. Errors + * thrown from the callback are logged via `console.error` and swallowed + * so a buggy observer cannot abort a valid stream. + * * The callback is awaited before iteration starts, so keep it fast. * * Note: only runtimes that drive their turns through `runAgentLoop` diff --git a/packages/headless/src/runner.test.ts b/packages/headless/src/runner.test.ts index f50dd57..681c71c 100644 --- a/packages/headless/src/runner.test.ts +++ b/packages/headless/src/runner.test.ts @@ -166,6 +166,45 @@ describe("runHeadless", () => { expect(agentIndex).toBeGreaterThan(turnStartIndex); }); + it("continues streaming when onStreamStart throws (observer errors are isolated)", async () => { + const events: TrajectoryEvent[] = []; + const history = new CheckpointHistory(); + const consoleErrorSpy = vi + .spyOn(console, "error") + .mockImplementation(() => undefined); + + try { + await runHeadless({ + agent: { + stream: () => + createMockStream([{ role: "assistant", content: "done" }]), + }, + emitEvent: (event) => { + events.push(event); + }, + initialUserMessage: { content: "hi" }, + messageHistory: history, + modelId: "mock-model", + onStreamStart: () => { + throw new Error("observer bug: should not break the stream"); + }, + sessionId: "session-observer-throws", + }); + + expect(events.some((e) => e.type === "turn-start")).toBe(true); + const agentSteps = events.filter( + (e) => e.type === "step" && "source" in e && e.source === "agent" + ); + expect(agentSteps).toHaveLength(1); + expect(consoleErrorSpy).toHaveBeenCalledWith( + expect.stringContaining("onStreamStart"), + expect.any(Error) + ); + } finally { + consoleErrorSpy.mockRestore(); + } + }); + it("does not emit turn-start when agent.stream() rejects before dispatch", async () => { const events: TrajectoryEvent[] = []; const history = new CheckpointHistory(); diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index e759a5f..11cce7c 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -708,7 +708,14 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { phase, timestamp: new Date().toISOString(), }); - await config.onStreamStart?.(phase); + try { + await config.onStreamStart?.(phase); + } catch (hookError) { + console.error( + "[headless] onStreamStart threw; continuing stream:", + hookError + ); + } } const nextStepId = stepId + 1; const processStreamResult = await processStream({ diff --git a/packages/tui/src/agent-tui.ts b/packages/tui/src/agent-tui.ts index bda4955..3935607 100644 --- a/packages/tui/src/agent-tui.ts +++ b/packages/tui/src/agent-tui.ts @@ -1558,7 +1558,14 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { ); showLoader("Working..."); - await config.onStreamStart?.(phase); + try { + await config.onStreamStart?.(phase); + } catch (hookError) { + console.error( + "[tui] onStreamStart threw; continuing stream:", + hookError + ); + } const clearStreamingLoader = createStreamingLoaderClearer(); From b5d37fa45c628309346135e550c37a7c5c3bb390 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 18:26:16 +0900 Subject: [PATCH 07/12] docs(headless): pin ATIF v1.4 compliance contract in-source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Future edits to the headless trajectory pipeline must preserve ATIF v1.4 compliance. Until now the contract lived only in the changeset and the public README — easy to drift on in a hurry. This commit plants the contract directly on the load-bearing code and tests so a future maintainer cannot relax it by accident. Contract points now documented on-source: - trajectory-collector.ts module header: enumerates the six invariants (schema_version literal, step_id sequence, steps[*].source set, extra.* persistence rule, final_metrics null-when-absent, SDK-only metrics) and bounds the scope (persisted trajectory only — not the JSONL stream). - TrajectoryJson / AtifStep interface JSDocs: explain the spec-version bump discipline and that step-level fields require an ATIF v1.4 definition (or belong under extra). - runner.ts collectTrajectoryEvent: clarifies that the default-case drop is INTENTIONAL (non-ATIF types stay stream-only) and that new cases must ship with a matching extra.* path in finalize(). - runner.ts runHeadless: inline comment pinpoints where the two output surfaces (JSONL stdout vs ATIF trajectory.json) diverge, so the 'headless has two outputs' realisation does not need to be reconstructed every time. - types.ts TrajectoryEvent union: rewrites the one-line 'complete union' docstring into a contract that forces every new event type to pick 'extra.* persistence' or 'drop' at design time, and forbids promotion to a top-level ATIF field or steps[*].source value. - packages/headless/AGENTS.md: new 'ATIF v1.4 COMPLIANCE (persisted trajectory.json)' section listing the same invariants as the authoritative engineering contract. - atif-events.test.ts module header: declares the suite as the executable compliance contract with a pointer to the Python validator that must be updated in lock-step. No runtime behaviour changes. Verified: typecheck, ultracite check, 575 tests, build all pass. --- .changeset/prompt-processing-indicators.md | 1 + packages/headless/AGENTS.md | 13 ++++++ .../src/__tests__/atif-events.test.ts | 15 +++++++ packages/headless/src/runner.ts | 11 +++++ packages/headless/src/trajectory-collector.ts | 44 +++++++++++++++++++ packages/headless/src/types.ts | 15 ++++++- 6 files changed, 98 insertions(+), 1 deletion(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index bf236be..4ab11a4 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -17,3 +17,4 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - The blocking-compaction spinner swap only stashes the original foreground label on first entry and only restores it when the foreground loader is still live, eliminating both the "Compacting..." wording sticking after unblock and the "Processing..." spinner resurrecting after the first stream part arrived. - Restored the post-`onSetup` `updateHeader()` call that was accidentally dropped when the startup probe became non-blocking, so any header/footer state that `onSetup` initialises renders immediately instead of waiting for the first probe to resolve. - Observer hooks (`onStreamStart`, `onFirstStreamPart`) no longer abort a valid stream when the callback throws. Errors are logged via `console.error` and swallowed in the harness loop, headless runner, and TUI session loop, with the contract documented on `LoopHooks`. +- Pinned the ATIF v1.4 compliance contract in-source: `trajectory-collector.ts`, `TrajectoryJson`, `AtifStep`, `TrajectoryEvent`, `collectTrajectoryEvent`, and `runHeadless` now carry module/interface-level JSDoc spelling out the Harbor spec version, the allowed `steps[*].source` values, the `extra.*` persistence rule, and the stream-vs-snapshot boundary. `packages/headless/AGENTS.md` gains an "ATIF v1.4 COMPLIANCE" section listing the same invariants, and the `atif-events.test.ts` suite now declares itself as the executable compliance contract. These are docs-only, but they turn future spec drifts into obvious code-review red flags instead of silent regressions. diff --git a/packages/headless/AGENTS.md b/packages/headless/AGENTS.md index 304a626..a89cb2b 100644 --- a/packages/headless/AGENTS.md +++ b/packages/headless/AGENTS.md @@ -145,3 +145,16 @@ import type { - Estimating token counts (use `metrics` from SDK only). - Manual `step_id` management outside the runner. - Using `console.log` for non-event output. + +## ATIF v1.4 COMPLIANCE (persisted trajectory.json) + +`TrajectoryCollector` writes the only output that MUST conform to Harbor's ATIF v1.4 spec (). When editing `trajectory-collector.ts` or `collectTrajectoryEvent` in `runner.ts`, keep the following invariants: + +- **schema_version is the literal `"ATIF-v1.4"`**. Never bump unilaterally — bump only when the upstream Harbor spec bumps and this implementation has been audited against the new version's required/optional fields. +- **`steps[*].source` ∈ `{user, agent, system}`**. Never widen this set; new event types go to `extra.*` or are dropped from persistence. +- **Persisted lifecycle annotations live under `extra.*`**: `approval_events`, `compaction_events`, `interrupt_events`. Extending this set is acceptable (ATIF `extra` is forward-compatible) but each new bucket requires a new collector method and a corresponding `collectTrajectoryEvent` case. +- **Adding a JSONL event type is additive for stdout** but requires an explicit routing decision in `collectTrajectoryEvent`: persist under `extra.*`, or drop. Leaving the `default: return;` fallthrough is a valid choice for transient signals (the pattern `turn-start` uses). +- **`final_metrics` keys are null-when-absent, not omitted**. The shape is load-bearing for downstream tools (Harbor validator, scorer). +- **Metrics come from SDK `stream.usage`** — never estimated; never hand-filled. + +Violating these invariants silently breaks Harbor benchmark runs, since the persisted trajectory will fail `harbor.utils.trajectory_validator` or produce unusable scorer output. diff --git a/packages/headless/src/__tests__/atif-events.test.ts b/packages/headless/src/__tests__/atif-events.test.ts index 479d9ea..e7064b9 100644 --- a/packages/headless/src/__tests__/atif-events.test.ts +++ b/packages/headless/src/__tests__/atif-events.test.ts @@ -1,3 +1,18 @@ +/** + * ATIF v1.4 compliance test suite. + * + * These tests are load-bearing: they encode the Harbor ATIF v1.4 shape + * contract (https://www.harborframework.com/docs/agents/trajectory-format) + * as executable assertions. A regression here means `trajectory.json` + * output no longer passes `harbor.utils.trajectory_validator`, which + * breaks terminal-bench runs and any downstream scorer. + * + * Before loosening any assertion here, confirm the change is permitted by + * the current Harbor spec version this package targets (`ATIF-v1.4`) and + * that the Python validator in `packages/cea/benchmark/test_trajectory.py` + * is updated in lock-step. + */ + import { describe, expect, it } from "vitest"; import { TrajectoryCollector } from "../trajectory-collector"; import type { diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index 11cce7c..925046f 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -170,6 +170,14 @@ function getRecommendedMaxOutputTokens( return history.getRecommendedMaxOutputTokens(messages); } +/** + * Routes a JSONL stream event to the ATIF-v1.4 trajectory collector, if + * persistence is enabled. The `default` case INTENTIONALLY drops event + * types that are not part of ATIF v1.4 (e.g. `turn-start`, `error`) — they + * stay on the JSONL stream only. Adding a case here must be accompanied by + * a matching `extra.*` persistence path in `TrajectoryCollector.finalize` + * so `trajectory.json` stays ATIF v1.4 compliant. + */ function collectTrajectoryEvent( collector: TrajectoryCollector | null, event: TrajectoryEvent @@ -294,6 +302,9 @@ async function runTodoReminderLoop(params: { export async function runHeadless(config: HeadlessRunnerConfig): Promise { const emitEventSink = config.emitEvent ?? defaultEmitEvent; + // When `atifOutputPath` is set, the collected trajectory is written to + // disk in ATIF v1.4 format. The JSONL stream to stdout is a separate + // surface (internal protocol, not ATIF) and continues regardless. const trajectoryCollector = config.atifOutputPath ? new TrajectoryCollector() : null; diff --git a/packages/headless/src/trajectory-collector.ts b/packages/headless/src/trajectory-collector.ts index 333b8b9..a4d2b89 100644 --- a/packages/headless/src/trajectory-collector.ts +++ b/packages/headless/src/trajectory-collector.ts @@ -1,3 +1,32 @@ +/** + * ATIF-v1.4 trajectory collector. + * + * This module is the ONLY surface in the headless package that produces the + * persisted `trajectory.json` file, and that file MUST conform to Harbor's + * ATIF v1.4 specification: + * + * https://www.harborframework.com/docs/agents/trajectory-format + * + * ATIF v1.4 compliance rules (load-bearing — do not relax without bumping + * the Harbor spec version this package claims to target): + * + * • `schema_version` is the literal string "ATIF-v1.4". + * • Every {@link AtifStep.step_id} is a sequential integer starting at 1. + * • `steps[*].source` is limited to `"user" | "agent" | "system"`; lifecycle + * event types (approval, compaction, interrupt, turn-start, error) are + * NEVER persisted as step sources — they live in the JSONL stream only. + * • Lifecycle annotations that the spec allows persisting go under + * `extra.approval_events`, `extra.compaction_events`, and + * `extra.interrupt_events`. New lifecycle types must NOT introduce new + * top-level fields. + * • `final_metrics` must include every key defined in ATIF v1.4 even if + * the value is `null` (null-when-absent, not omitted). + * • Metrics are pulled from the SDK `stream.usage`; never estimated. + * + * The JSONL event stream emitted to stdout by the runner is a DIFFERENT + * surface (internal protocol, not ATIF) — see `types.ts` for that contract. + */ + import { mkdirSync, writeFileSync } from "node:fs"; import { dirname } from "node:path"; import type { @@ -11,6 +40,13 @@ import type { ToolCallData, } from "./types"; +/** + * Shape of a single ATIF v1.4 `Step` entry inside `trajectory.json`. + * + * Any field added here MUST be defined by ATIF v1.4 (or be ignored by the + * spec as part of forward-compatible `extra`). Do not introduce ad-hoc + * step-level fields — put them under `extra` instead. + */ interface AtifStep { extra?: Record; is_copied_context?: boolean; @@ -26,6 +62,14 @@ interface AtifStep { tool_calls?: ToolCallData[]; } +/** + * Shape of the persisted ATIF v1.4 `Trajectory` JSON document. + * + * `schema_version` is the literal "ATIF-v1.4" and must match the Harbor + * spec version this package targets. Bump it only when the underlying + * Harbor spec bumps and this implementation has been audited against the + * new version's required/optional fields. + */ export interface TrajectoryJson { agent: { name: string; version: string; model_name: string }; extra?: { diff --git a/packages/headless/src/types.ts b/packages/headless/src/types.ts index 6e6d49a..b84a954 100644 --- a/packages/headless/src/types.ts +++ b/packages/headless/src/types.ts @@ -230,7 +230,20 @@ export interface MetadataEvent { // ============================================================================ /** - * The complete union of all trajectory event types. + * The complete union of all JSONL stream event types. + * + * This union defines the INTERNAL JSONL protocol — it is NOT the ATIF + * schema. Adding a new event type here is additive and non-breaking, but + * the corresponding switch in `collectTrajectoryEvent` must explicitly + * decide whether the new type is: + * + * • persisted into an ATIF v1.4 `extra.*` bucket (lifecycle annotation + * that Harbor accepts as forward-compatible extension), or + * • dropped (lives only on the JSONL stream; `trajectory.json` stays + * ATIF v1.4 compliant). + * + * It MUST NEVER be persisted as a top-level ATIF field or as a new + * `steps[*].source` value — that would violate the Harbor spec. */ export type TrajectoryEvent = | StepEvent From aadb22ed1d0bb862d02d35cc52d0bc77db5f3133 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 18:33:15 +0900 Subject: [PATCH 08/12] docs(changeset): close residual gaps in PR #117 changeset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Second pass through the changeset against the full commit list surfaced three small holes and one missing package header: - 'plugsuits' (packages/cea) was missing from the changeset header. The PR actually touches its user-facing CLI help text ('--atif' now mentions ATIF-v1.4) and the benchmark validator pipeline, so cea needs a patch bump in lock-step with the other packages. - Added a dedicated CEA bullet describing the --atif wording change and the Python validator pipeline improvements (Harbor validator auto-call + stricter shape check + bool rejection). - Credited the Cubic-reported bool-in-number bug in the 'PR review feedback' section. The fix was already described under the ATIF v1.4 validator improvements bullet, but the review-attribution list had dropped the sixth item. - Noted that the fifth-commit fix quietly restored a dropped LoopHooks.onToolCall declaration. Consumers that rely on this hook had been running against a destructure with no matching type; this is now type-safe again and the changeset mentions it so anyone diffing public types sees why that line moved. No code changes and no behaviour changes — the changeset content now matches the actual commit range on this branch. --- .changeset/prompt-processing-indicators.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index 4ab11a4..810988f 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -2,6 +2,7 @@ "@ai-sdk-tool/harness": patch "@ai-sdk-tool/tui": patch "@ai-sdk-tool/headless": patch +"plugsuits": patch --- Surface the "prompt processing" state that previously looked frozen, and fix follow-up correctness gaps found during post-implementation review. @@ -10,11 +11,14 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). - Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. - Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. -- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. The bundled `test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to the local shape check otherwise; it also enforces per-step metric shapes and rejects booleans where ATIF requires real numbers. +- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. +- CEA: the `--atif` CLI help text and the benchmark pipeline now reference ATIF-v1.4 (matching the corrected `schema_version`). The bundled `packages/cea/benchmark/test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to a stricter local shape check otherwise; it enforces per-step metric shapes and rejects `bool` values where ATIF requires a real number. - Addressed PR review feedback: - `turn-start` and `onStreamStart` now fire strictly after `agent.stream()` successfully returns, so stream-creation failures no longer produce a false "stream started" signal (reported by Gemini, Codex, and Cubic reviewers). - The background startup usage probe is serialized against per-turn probes by a generation token; a stale startup probe can no longer overwrite newer usage data and skew context-pressure metrics. - The blocking-compaction spinner swap only stashes the original foreground label on first entry and only restores it when the foreground loader is still live, eliminating both the "Compacting..." wording sticking after unblock and the "Processing..." spinner resurrecting after the first stream part arrived. - Restored the post-`onSetup` `updateHeader()` call that was accidentally dropped when the startup probe became non-blocking, so any header/footer state that `onSetup` initialises renders immediately instead of waiting for the first probe to resolve. + - The bundled Python ATIF validator (`test_trajectory.py`) no longer accepts `bool` values where ATIF v1.4 requires a real number — `isinstance(True, int)` is `True` in Python, so the old check let invalid metric payloads slip through. Added `_is_real_number` / `_is_real_int` helpers that exclude `bool`. - Observer hooks (`onStreamStart`, `onFirstStreamPart`) no longer abort a valid stream when the callback throws. Errors are logged via `console.error` and swallowed in the harness loop, headless runner, and TUI session loop, with the contract documented on `LoopHooks`. + - Repaired a regression where `LoopHooks.onToolCall` had silently dropped out of the public `LoopHooks` interface while still being destructured inside `runAgentLoop`. The field is restored to its original signature; consumers that already relied on it are unaffected, and the destructuring now type-checks again. - Pinned the ATIF v1.4 compliance contract in-source: `trajectory-collector.ts`, `TrajectoryJson`, `AtifStep`, `TrajectoryEvent`, `collectTrajectoryEvent`, and `runHeadless` now carry module/interface-level JSDoc spelling out the Harbor spec version, the allowed `steps[*].source` values, the `extra.*` persistence rule, and the stream-vs-snapshot boundary. `packages/headless/AGENTS.md` gains an "ATIF v1.4 COMPLIANCE" section listing the same invariants, and the `atif-events.test.ts` suite now declares itself as the executable compliance contract. These are docs-only, but they turn future spec drifts into obvious code-review red flags instead of silent regressions. From 91008d0191f19b4159a6a242a1976f0bd3ddeaa1 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 18:45:08 +0900 Subject: [PATCH 09/12] fix(harness): pass stream part to onFirstStreamPart callback (Cubic P2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The docstring for LoopHooks.onFirstStreamPart told consumers to filter on part type (to distinguish framing chunks like 'start' and 'text-start' from visible output), but the callback only received LoopContinueContext — no part to filter on. The contract was broken at the type level: the advice was physically impossible to follow. This commit fixes the hook before adoption. Since onFirstStreamPart was added earlier in the same PR and has zero consumers anywhere in the monorepo (verified by exhaustive grep across packages/harness, packages/tui, packages/headless, packages/cea, packages/minimal-agent, packages/tgbot, scripts/, and the runtime/session adapter layer), the signature change is type-only and requires no migration. Changes: - types.ts: onFirstStreamPart signature is now (part: TextStreamPart, context: LoopContinueContext) => void | Promise Docstring rewritten to describe what the hook actually does: fires once per iteration on the very first part regardless of visibility; the consumer decides what to do based on part.type but the hook does not re-fire on a later visible part. - types.ts: imports TextStreamPart from 'ai' (alongside the existing ToolSet / ToolCallPart imports). - index.ts: re-exports TextStreamPart from '@ai-sdk-tool/harness' so consumers don't have to import it from 'ai' separately. This matches how the TUI package already uses the exact type (TextStreamPart) for its isVisibleStreamPart filter. - loop.ts: generalises invokeObserverHook to accept variadic args (), so both (context)-only hooks (onStreamStart) and (part, context) hooks (onFirstStreamPart) share the same error-isolation helper. Call site at the for-await entry now passes the current 'part' alongside the context. - loop.test.ts: four new regression tests covering: * first-part single-fire semantics (tool-call in iteration 0) * per-iteration firing (two iterations, fires twice with correct iteration numbers) * empty-stream skip (no parts → hook never fires) * observer-error isolation (thrown error is logged and swallowed, stream iteration continues normally) Oracle consulted before changing the public signature: recommended Option A ('pre-adoption correction') over Option B (relax docstring) given the zero-consumer inventory. The justification is that the hook is one commit old and has not been released, so fixing it now is materially different from breaking an established API in a patch release. Verified: typecheck (6/6), ultracite check (277 files clean), full test suite (1235 tests, harness +4 new), build (6/6). --- .changeset/prompt-processing-indicators.md | 3 +- packages/harness/src/index.ts | 2 +- packages/harness/src/loop.test.ts | 98 +++++++++++++++++++++- packages/harness/src/loop.ts | 9 +- packages/harness/src/types.ts | 28 ++++--- 5 files changed, 123 insertions(+), 17 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index 810988f..a27001e 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -7,7 +7,7 @@ Surface the "prompt processing" state that previously looked frozen, and fix follow-up correctness gaps found during post-implementation review. -- Harness: new `LoopHooks.onStreamStart` / `onFirstStreamPart` hooks wrap the `agent.stream()` call site so consumers driving turns through `runAgentLoop` can react to the prompt-processing latency gap. Docstring clarifies that the TUI has its own independent `onStreamStart` on `AgentTUIConfig`. +- Harness: new `LoopHooks.onStreamStart` / `onFirstStreamPart` hooks wrap the `agent.stream()` call site so consumers driving turns through `runAgentLoop` can react to the prompt-processing latency gap. `onFirstStreamPart` receives the current stream part as its first argument (`TextStreamPart`) so consumers can inspect `part.type` to filter framing chunks (`start`, `text-start`, …) from visible content. `TextStreamPart` is re-exported from the harness root for convenience. Docstring clarifies that the TUI has its own independent `onStreamStart` on `AgentTUIConfig`. - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). - Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. - Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. @@ -21,4 +21,5 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - The bundled Python ATIF validator (`test_trajectory.py`) no longer accepts `bool` values where ATIF v1.4 requires a real number — `isinstance(True, int)` is `True` in Python, so the old check let invalid metric payloads slip through. Added `_is_real_number` / `_is_real_int` helpers that exclude `bool`. - Observer hooks (`onStreamStart`, `onFirstStreamPart`) no longer abort a valid stream when the callback throws. Errors are logged via `console.error` and swallowed in the harness loop, headless runner, and TUI session loop, with the contract documented on `LoopHooks`. - Repaired a regression where `LoopHooks.onToolCall` had silently dropped out of the public `LoopHooks` interface while still being destructured inside `runAgentLoop`. The field is restored to its original signature; consumers that already relied on it are unaffected, and the destructuring now type-checks again. + - Corrected the `LoopHooks.onFirstStreamPart` signature as a pre-adoption fix (Cubic P2): the previous `(context) => void` shape promised in its docstring that consumers could filter on part type, but the callback never received the part. The signature now passes `(part: TextStreamPart, context)` so consumers can actually inspect `part.type`. Zero existing consumers were found across the monorepo (the hook was introduced earlier in this PR), so this is a type-only correction with no runtime migration. New regression tests in `loop.test.ts` cover single-fire semantics, per-iteration firing, empty-stream skip, and observer-error isolation. - Pinned the ATIF v1.4 compliance contract in-source: `trajectory-collector.ts`, `TrajectoryJson`, `AtifStep`, `TrajectoryEvent`, `collectTrajectoryEvent`, and `runHeadless` now carry module/interface-level JSDoc spelling out the Harbor spec version, the allowed `steps[*].source` values, the `extra.*` persistence rule, and the stream-vs-snapshot boundary. `packages/headless/AGENTS.md` gains an "ATIF v1.4 COMPLIANCE" section listing the same invariants, and the `atif-events.test.ts` suite now declares itself as the executable compliance contract. These are docs-only, but they turn future spec drifts into obvious code-review red flags instead of silent regressions. diff --git a/packages/harness/src/index.ts b/packages/harness/src/index.ts index e8e190d..6d0c0db 100644 --- a/packages/harness/src/index.ts +++ b/packages/harness/src/index.ts @@ -1,4 +1,4 @@ -export type { LanguageModelUsage } from "ai"; +export type { LanguageModelUsage, TextStreamPart } from "ai"; export { createAgent } from "./agent"; export type { BackgroundMemoryExtractorConfig } from "./background-memory-extractor"; export { BackgroundMemoryExtractor } from "./background-memory-extractor"; diff --git a/packages/harness/src/loop.test.ts b/packages/harness/src/loop.test.ts index 10dbba2..7d75a94 100644 --- a/packages/harness/src/loop.test.ts +++ b/packages/harness/src/loop.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it } from "vitest"; +import { describe, expect, it, vi } from "vitest"; import { AgentError, AgentErrorCode } from "./errors"; import { runAgentLoop } from "./loop"; import type { Agent, AgentStreamResult } from "./types"; @@ -304,6 +304,102 @@ describe("runAgentLoop", () => { expect(result.messages.length).toBe(3); }); + it("invokes onFirstStreamPart exactly once with the first emitted part", async () => { + const agent = createMockAgent(["tool-calls", "stop"], { + toolCallsPerIteration: [ + [ + { toolName: "get_time", args: {} }, + { toolName: "get_weather", args: { location: "Paris" } }, + ], + [], + ], + }); + + const observed: Array<{ type: string; iteration: number }> = []; + + await runAgentLoop({ + agent, + messages: [{ role: "user", content: "Hello" }], + onFirstStreamPart: (part, context) => { + observed.push({ type: part.type, iteration: context.iteration }); + }, + }); + + expect(observed).toHaveLength(1); + expect(observed[0]?.type).toBe("tool-call"); + expect(observed[0]?.iteration).toBe(0); + }); + + it("invokes onFirstStreamPart on each iteration when the stream has content", async () => { + const agent = createMockAgent(["tool-calls", "tool-calls", "stop"], { + toolCallsPerIteration: [ + [{ toolName: "first_call", args: {} }], + [{ toolName: "second_call", args: {} }], + [], + ], + }); + + const observed: Array<{ type: string; iteration: number }> = []; + + await runAgentLoop({ + agent, + messages: [{ role: "user", content: "Hello" }], + onFirstStreamPart: (part, context) => { + observed.push({ type: part.type, iteration: context.iteration }); + }, + }); + + expect(observed).toHaveLength(2); + expect(observed[0]?.iteration).toBe(0); + expect(observed[1]?.iteration).toBe(1); + }); + + it("skips onFirstStreamPart when the stream yields no parts", async () => { + const agent = createMockAgent(["stop"], { + toolCallsPerIteration: [[]], + }); + + const observed: unknown[] = []; + + await runAgentLoop({ + agent, + messages: [{ role: "user", content: "Hello" }], + onFirstStreamPart: (part) => { + observed.push(part); + }, + }); + + expect(observed).toHaveLength(0); + }); + + it("isolates onFirstStreamPart observer errors from the stream flow", async () => { + const agent = createMockAgent(["tool-calls", "stop"], { + toolCallsPerIteration: [[{ toolName: "noop", args: {} }], []], + }); + + const consoleErrorSpy = vi + .spyOn(console, "error") + .mockImplementation(() => undefined); + + try { + const result = await runAgentLoop({ + agent, + messages: [{ role: "user", content: "Hello" }], + onFirstStreamPart: () => { + throw new Error("observer bug"); + }, + }); + + expect(result.iterations).toBe(2); + expect(consoleErrorSpy).toHaveBeenCalledWith( + expect.stringContaining("onFirstStreamPart"), + expect.any(Error) + ); + } finally { + consoleErrorSpy.mockRestore(); + } + }); + it("calls onToolCall for each tool call", async () => { const agent = createMockAgent(["tool-calls", "stop"], { toolCallsPerIteration: [ diff --git a/packages/harness/src/loop.ts b/packages/harness/src/loop.ts index 70ad72c..b5c40f8 100644 --- a/packages/harness/src/loop.ts +++ b/packages/harness/src/loop.ts @@ -17,16 +17,16 @@ import type { RunAgentLoopResult, } from "./types"; -async function invokeObserverHook( - hook: ((context: LoopContinueContext) => void | Promise) | undefined, +async function invokeObserverHook( + hook: ((...args: Args) => void | Promise) | undefined, hookName: string, - context: LoopContinueContext + ...args: Args ): Promise { if (!hook) { return; } try { - await hook(context); + await hook(...args); } catch (error) { console.error(`[harness] ${hookName} threw; continuing stream:`, error); } @@ -125,6 +125,7 @@ export async function runAgentLoop( await invokeObserverHook( onFirstStreamPart, "onFirstStreamPart", + part, context ); } diff --git a/packages/harness/src/types.ts b/packages/harness/src/types.ts index 8460213..73661d4 100644 --- a/packages/harness/src/types.ts +++ b/packages/harness/src/types.ts @@ -7,6 +7,7 @@ import type { LanguageModel, ModelMessage, streamText, + TextStreamPart, ToolCallPart, ToolSet, } from "ai"; @@ -159,18 +160,25 @@ export interface LoopHooks { { shouldContinue?: boolean; recovery?: ModelMessage[] } | undefined >; /** - * Fires exactly once per loop iteration when the first part of any kind - * arrives from `stream.fullStream`. Consumers can use this to clear a - * prompt-processing indicator the moment any byte of output begins - * arriving. Note: the SDK may emit invisible framing parts (`start`, - * `text-start`, …) before user-facing content; consumers that want to - * wait for *visible* output should filter on part type themselves. + * Fires exactly once per loop iteration, on the **first** part of any + * kind emitted from `stream.fullStream`. The part itself is passed as + * the first argument so consumers can inspect `part.type` if they only + * care about visible output — e.g., ignore framing parts (`start`, + * `text-start`, …) and react only to `text-delta`, `tool-call`, etc. * - * Observer-only contract: errors thrown from the callback are logged via - * `console.error` and swallowed so a buggy observer cannot abort a valid - * stream. + * Important: the hook fires on the very first part regardless of its + * visibility. Filtering inside the callback lets you *decide what to + * do* with that first part; it does NOT cause the hook to re-fire on + * a later visible part. + * + * Observer-only contract: errors thrown from the callback are logged + * via `console.error` and swallowed so a buggy observer cannot abort a + * valid stream. */ - onFirstStreamPart?: (context: LoopContinueContext) => void | Promise; + onFirstStreamPart?: ( + part: TextStreamPart, + context: LoopContinueContext + ) => void | Promise; onInterrupt?: ( interruption: { iteration: number; From 45a810f4dd9f143eedcc31ab6ccd4afebe9db6c4 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 19:19:25 +0900 Subject: [PATCH 10/12] fix: cycle 1 review follow-ups (Oracle/CodeRabbit/Codex/Cubic) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle-1 triggered a fresh round of AI reviews. Consolidating every actionable finding into a single commit: P1 (Oracle): runHeadless could persist an invalid zero-step ATIF trajectory when the stream failed before any step was emitted. Harbor's own validator rejects steps: []. TrajectoryCollector.writeTo now returns boolean and skips disk writes for zero-step runs instead of producing an invalid file. P2 (Codex + Cubic): showLoader('Processing...') lived outside the runSingleStreamTurn try/finally, so a thrown prepareMessages / onBeforeTurn / usage probe / compaction check would leave the spinner stuck on screen. Moved the call into the try block so clearStatus() in finally cleans it up on every exit path. P2 (CodeRabbit): the startup usage-probe guard only rejected results from stale generations, not results computed against a different history. Added a messageHistory.getRevision() capture-and-compare so a background probe resolving after user input no longer overwrites the new turn's real usage with empty-message baseline. P2 (CodeRabbit): TrajectoryJson.extra was typed as '{approval_events?, compaction_events?, interrupt_events?} & Record', letting new lifecycle buckets type-check without a matching TrajectoryCollector method. Dropped the open-record intersection so the ATIF persistence contract is enforced structurally. P2 (CodeRabbit): test_trajectory.py's _is_real_number accepted NaN, Infinity, and -Infinity (all producible via json.loads on non-strict input). Added an explicit math.isfinite() check. P3 (CodeRabbit + Cubic): documentation drift — several docstrings and markdown files claimed 'approval/compaction/interrupt are JSONL-only and dropped from trajectory.json'. That is wrong: they are persisted under trajectory.extra.* by the collector. Only turn-start and error are truly transient. Corrected in packages/headless/AGENTS.md, packages/headless/README.md, packages/headless/src/types.ts, packages/headless/src/trajectory-collector.ts module JSDoc, and the root AGENTS.md. Regression test added: runner.test.ts now asserts that zero-step runs with atifOutputPath set do NOT produce a file (does not write an invalid zero-step trajectory when the stream fails before any step). Verified: typecheck (6/6), ultracite check (277 files clean), tests (1236 pass; headless +1 new = 65), build (6/6). --- .changeset/prompt-processing-indicators.md | 8 +++++ AGENTS.md | 2 +- packages/cea/benchmark/test_trajectory.py | 5 ++- packages/headless/AGENTS.md | 8 +++-- packages/headless/README.md | 2 +- packages/headless/src/runner.test.ts | 21 +++++++++++ packages/headless/src/trajectory-collector.ts | 36 ++++++++++++++----- packages/headless/src/types.ts | 12 ++++--- packages/tui/src/agent-tui.ts | 19 ++++++---- 9 files changed, 88 insertions(+), 25 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index a27001e..3be0ef7 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -23,3 +23,11 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - Repaired a regression where `LoopHooks.onToolCall` had silently dropped out of the public `LoopHooks` interface while still being destructured inside `runAgentLoop`. The field is restored to its original signature; consumers that already relied on it are unaffected, and the destructuring now type-checks again. - Corrected the `LoopHooks.onFirstStreamPart` signature as a pre-adoption fix (Cubic P2): the previous `(context) => void` shape promised in its docstring that consumers could filter on part type, but the callback never received the part. The signature now passes `(part: TextStreamPart, context)` so consumers can actually inspect `part.type`. Zero existing consumers were found across the monorepo (the hook was introduced earlier in this PR), so this is a type-only correction with no runtime migration. New regression tests in `loop.test.ts` cover single-fire semantics, per-iteration firing, empty-stream skip, and observer-error isolation. - Pinned the ATIF v1.4 compliance contract in-source: `trajectory-collector.ts`, `TrajectoryJson`, `AtifStep`, `TrajectoryEvent`, `collectTrajectoryEvent`, and `runHeadless` now carry module/interface-level JSDoc spelling out the Harbor spec version, the allowed `steps[*].source` values, the `extra.*` persistence rule, and the stream-vs-snapshot boundary. `packages/headless/AGENTS.md` gains an "ATIF v1.4 COMPLIANCE" section listing the same invariants, and the `atif-events.test.ts` suite now declares itself as the executable compliance contract. These are docs-only, but they turn future spec drifts into obvious code-review red flags instead of silent regressions. +- Review cycle 1 follow-ups (Oracle + Gemini + Codex + Cubic + CodeRabbit): + - Guarded `TrajectoryCollector.writeTo` against persisting an invalid zero-step trajectory (Harbor's own validator rejects `steps: []`). The method now returns `boolean` — `true` when a file was written, `false` when the write was intentionally skipped to keep `trajectory.json` ATIF-v1.4 compliant. + - Moved the TUI `showLoader("Processing...")` call inside the stream-turn `try/finally` so a thrown `prepareMessages` (or `onBeforeTurn`/usage probe/compaction check) no longer leaves the spinner stuck on screen. + - Tightened the startup usage-probe guard: in addition to the generation token, `measureUsageIfAvailable` now captures `messageHistory.getRevision()` at call time and drops its result when the history has mutated mid-probe, preventing stale empty-message usage from overwriting per-turn measurements. + - Narrowed `TrajectoryJson.extra` to the three canonical lifecycle buckets (`approval_events`, `compaction_events`, `interrupt_events`) by dropping the `Record` intersection. New lifecycle types must now extend the interface explicitly, keeping the ATIF persistence contract type-enforced. + - Hardened the Python validator: `_is_real_number` now rejects `NaN`, `Infinity`, and `-Infinity` (all of which `json.loads` will happily produce from non-strict JSON) via an explicit `math.isfinite` check. + - Corrected documentation drift across `packages/headless/AGENTS.md`, `packages/headless/README.md`, `packages/headless/src/types.ts`, `packages/headless/src/trajectory-collector.ts`, and the root `AGENTS.md`: `approval`/`compaction`/`interrupt` are persisted under `trajectory.extra.*`, not JSONL-only; only `turn-start` and `error` are transient. + - Regression test added for the `writeTo` zero-step guard: `does not write an invalid zero-step trajectory when the stream fails before any step`. diff --git a/AGENTS.md b/AGENTS.md index 85872fc..096b6a1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -182,7 +182,7 @@ try { - File edits in CEA favor hashline-aware operations (`LINE#HASH` + `expected_file_hash`) for stale-safe modifications. - Manual tool-loop continuation is intentionally constrained to normalized `tool-calls` finish reasons. -- Headless mode emits structured ATIF JSONL lifecycle types (`metadata`, `step`, `approval`, `compaction`, `error`, `interrupt`) consumed by benchmark tooling. +- Headless mode emits a JSONL event stream with lifecycle types `metadata`, `step`, `approval`, `compaction`, `error`, `interrupt`, and `turn-start`. The persisted `trajectory.json` produced by `TrajectoryCollector` follows Harbor's ATIF-v1.4 schema (): `approval`, `compaction`, and `interrupt` are bundled into `extra.*` buckets; `turn-start` and `error` are JSONL-only. - `SkillsEngine` discovers skills from up to five directories: bundled, global skills, global commands, project skills, project commands. ## COMMANDS diff --git a/packages/cea/benchmark/test_trajectory.py b/packages/cea/benchmark/test_trajectory.py index 851739c..7ffca1b 100644 --- a/packages/cea/benchmark/test_trajectory.py +++ b/packages/cea/benchmark/test_trajectory.py @@ -6,12 +6,15 @@ from __future__ import annotations import json +import math import sys from pathlib import Path def _is_real_number(value: object) -> bool: - return isinstance(value, (int, float)) and not isinstance(value, bool) + if isinstance(value, bool) or not isinstance(value, (int, float)): + return False + return math.isfinite(float(value)) def _is_real_int(value: object) -> bool: diff --git a/packages/headless/AGENTS.md b/packages/headless/AGENTS.md index a89cb2b..18c7187 100644 --- a/packages/headless/AGENTS.md +++ b/packages/headless/AGENTS.md @@ -18,9 +18,11 @@ Every event is a JSON object on its own line. > [Harbor's ATIF specification](https://www.harborframework.com/docs/agents/trajectory-format). > The current ATIF version is **v1.4**. This JSONL protocol is an internal > streaming contract used by the runner to drive UI, telemetry, and the -> trajectory collector; it carries lifecycle annotations -> (`approval`, `compaction`, `interrupt`, `turn-start`) that have no place -> in ATIF and are dropped from the persisted trajectory. +> trajectory collector. Persisted lifecycle annotations (`approval`, +> `compaction`, `interrupt`) are bundled into ATIF `extra.*` buckets by the +> collector — they are NOT `steps[*].source` values, but they do survive +> on disk. Transient annotations (`turn-start`, `error`) stay JSONL-only +> and are never written to `trajectory.json`. ### Design Decisions - **NO sessionId on individual events**: A single `MetadataEvent` at the start carries the `session_id`. diff --git a/packages/headless/README.md b/packages/headless/README.md index 4f14271..ce4a37f 100644 --- a/packages/headless/README.md +++ b/packages/headless/README.md @@ -169,7 +169,7 @@ registerSignalHandlers({ ## JSONL Event Types -The runner streams an internal JSONL event protocol documented in `packages/headless/AGENTS.md`. The persisted `trajectory.json` produced by `TrajectoryCollector` conforms to Harbor's **ATIF-v1.4** schema (); the JSONL stream carries additional lifecycle annotations that are not part of ATIF and are dropped from the trajectory. +The runner streams an internal JSONL event protocol documented in `packages/headless/AGENTS.md`. The persisted `trajectory.json` produced by `TrajectoryCollector` conforms to Harbor's **ATIF-v1.4** schema (). Lifecycle annotations on the JSONL stream split into two categories: `approval`, `compaction`, and `interrupt` are persisted into `trajectory.extra.*` buckets (not as `steps[*].source` values); `turn-start` and `error` are transient and stay JSONL-only. ### Event overview diff --git a/packages/headless/src/runner.test.ts b/packages/headless/src/runner.test.ts index 681c71c..6912735 100644 --- a/packages/headless/src/runner.test.ts +++ b/packages/headless/src/runner.test.ts @@ -330,6 +330,27 @@ describe("runHeadless", () => { expect(serialized).not.toContain("turn-start"); }); + it("does not write an invalid zero-step trajectory when the stream fails before any step", async () => { + const history = new CheckpointHistory(); + const atifOutputPath = `/tmp/plugsuits-zero-step-${Date.now()}.json`; + const { existsSync } = await import("node:fs"); + + await runHeadless({ + agent: { + stream: () => + Promise.reject(new Error("provider unreachable")) as never, + }, + atifOutputPath, + emitEvent: () => undefined, + // Deliberately omit initialUserMessage so no user step precedes the abort. + messageHistory: history, + modelId: "mock-model", + sessionId: "session-zero-step-guard", + }).catch(() => undefined); + + expect(existsSync(atifOutputPath)).toBe(false); + }); + it("does not emit a synthetic user event when no initial user message is given", async () => { const events: TrajectoryEvent[] = []; diff --git a/packages/headless/src/trajectory-collector.ts b/packages/headless/src/trajectory-collector.ts index a4d2b89..419b7d4 100644 --- a/packages/headless/src/trajectory-collector.ts +++ b/packages/headless/src/trajectory-collector.ts @@ -12,15 +12,19 @@ * * • `schema_version` is the literal string "ATIF-v1.4". * • Every {@link AtifStep.step_id} is a sequential integer starting at 1. - * • `steps[*].source` is limited to `"user" | "agent" | "system"`; lifecycle - * event types (approval, compaction, interrupt, turn-start, error) are - * NEVER persisted as step sources — they live in the JSONL stream only. - * • Lifecycle annotations that the spec allows persisting go under - * `extra.approval_events`, `extra.compaction_events`, and - * `extra.interrupt_events`. New lifecycle types must NOT introduce new - * top-level fields. + * • `steps[*].source` is limited to `"user" | "agent" | "system"`; no + * lifecycle event type is ever persisted as a step source. + * • Persisted lifecycle annotations go under `extra.approval_events`, + * `extra.compaction_events`, and `extra.interrupt_events`. New + * lifecycle types must NOT introduce new top-level fields; pick an + * existing `extra.*` bucket or drop the event from persistence. + * • Transient lifecycle events (`turn-start`, `error`) are JSONL-only + * and are dropped by the collector. * • `final_metrics` must include every key defined in ATIF v1.4 even if * the value is `null` (null-when-absent, not omitted). + * • Trajectories with zero steps are NOT persisted — `writeTo` returns + * `false` in that case to avoid producing a file that fails Harbor's + * own validator. * • Metrics are pulled from the SDK `stream.usage`; never estimated. * * The JSONL event stream emitted to stdout by the runner is a DIFFERENT @@ -76,7 +80,7 @@ export interface TrajectoryJson { approval_events?: ApprovalEvent[]; compaction_events?: CompactionEvent[]; interrupt_events?: InterruptEvent[]; - } & Record; + }; final_metrics: { total_cached_tokens: number | null; total_completion_tokens: number | null; @@ -227,10 +231,24 @@ export class TrajectoryCollector { return trajectory; } - writeTo(outputPath: string): void { + /** + * Persists the trajectory to disk in ATIF v1.4 format. + * + * Returns `true` when a file was written, `false` when the write was + * skipped because the trajectory would violate the ATIF v1.4 shape + * contract (currently: no steps emitted). Skipping is preferred over + * writing an invalid document — Harbor's own validator rejects + * `steps: []`, so a zero-step file is worse than no file for any + * downstream consumer. + */ + writeTo(outputPath: string): boolean { + if (this.steps.length === 0) { + return false; + } const trajectory = this.finalize(); mkdirSync(dirname(outputPath), { recursive: true }); writeFileSync(outputPath, JSON.stringify(trajectory, null, 2), "utf-8"); + return true; } reset(): void { diff --git a/packages/headless/src/types.ts b/packages/headless/src/types.ts index b84a954..6b351d1 100644 --- a/packages/headless/src/types.ts +++ b/packages/headless/src/types.ts @@ -8,12 +8,16 @@ import type { BeforeTurnResult } from "@ai-sdk-tool/harness"; * (https://www.harborframework.com/docs/agents/trajectory-format). The * event union below is the internal stdout JSONL contract that the * runner emits during execution; the collector consumes these events - * and produces the ATIF trajectory as output. Lifecycle annotations - * (`approval`, `compaction`, `interrupt`, `turn-start`) exist only on - * this JSONL stream and are not part of ATIF. + * and produces the ATIF trajectory as output. + * + * Lifecycle annotations split into two categories: + * - `approval`, `compaction`, `interrupt` are persisted under + * `trajectory.extra.*` buckets (not as `steps[*].source` values). + * - `turn-start` and `error` stay JSONL-only and are dropped by the + * collector. * * All emitted step events (UserStepEvent, AgentStepEvent) conform to the ATIF specification. - * Metadata is emitted once at run start. Compaction and error events are lifecycle annotations. + * Metadata is emitted once at run start. */ // ============================================================================ diff --git a/packages/tui/src/agent-tui.ts b/packages/tui/src/agent-tui.ts index 3935607..97dd094 100644 --- a/packages/tui/src/agent-tui.ts +++ b/packages/tui/src/agent-tui.ts @@ -1049,6 +1049,7 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { usageProbeGeneration += 1; const thisGeneration = usageProbeGeneration; + const startingRevision = config.messageHistory.getRevision?.(); const measured = normalizeUsageMeasurement( await config.measureUsage(messages) @@ -1060,6 +1061,12 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { if (thisGeneration !== usageProbeGeneration) { return false; } + if ( + startingRevision !== undefined && + config.messageHistory.getRevision?.() !== startingRevision + ) { + return false; + } config.messageHistory.updateActualUsage({ inputTokens: measured.inputTokens, @@ -1534,17 +1541,17 @@ export async function createAgentTUI(config: AgentTUIConfig): Promise { overflowRetried = false, noOutputRetryCount = 0 ): Promise<"completed" | "continue" | "interrupted"> => { - showLoader("Processing..."); - - const preparedTurn = await prepareMessages(phase); - let messagesForLLM = preparedTurn.messages; - const turnOverrides = preparedTurn.turnOverrides; - const streamAbortController = new AbortController(); activeStreamController = streamAbortController; streamInterruptRequested = false; try { + showLoader("Processing..."); + + const preparedTurn = await prepareMessages(phase); + let messagesForLLM = preparedTurn.messages; + const turnOverrides = preparedTurn.turnOverrides; + const budget = await resolveTurnBudget(phase, messagesForLLM); messagesForLLM = budget.messagesForLLM; From 8160c46acb721118495c3b05b170be0d86102525 Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 19:29:35 +0900 Subject: [PATCH 11/12] fix: cycle 2 review follow-ups (Oracle re-audit) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cycle 2: Cubic reported 'No issues found across 19 files'. Oracle's second audit surfaced three genuine items the bots missed: 1. Headless probe race (Oracle P2): measureUsageIfAvailable in the headless runner lacked the generation + revision guard that the TUI already had. A slow background probe could overwrite fresher post-compaction usage. Mirrored the TUI pattern using messageHistory.getRevision?.() with optional chaining (the headless message history interface declares getRevision as optional). 2. ATIF step source contract drift (Oracle P2): trajectory-collector.ts permits steps[*].source = 'user' | 'agent' | 'system' (Harbor ATIF v1.4 allows all three; system steps support observations since v1.2), but the bundled Python validator rejected 'system' and the benchmark docs documented only 'user | agent'. Aligned all three surfaces. 3. Root README.md drift (Oracle P3): headless event list omitted turn-start and did not point at Harbor's ATIF-v1.4 schema. Updated to match the current package docs. Verified: typecheck (6/6), ultracite check (277 files clean), 1236 tests pass, build (6/6). No new tests required — the headless probe guard is covered by the same invariants the TUI guard already tests. --- .changeset/prompt-processing-indicators.md | 4 ++++ README.md | 2 +- packages/cea/benchmark/AGENTS.md | 2 +- packages/cea/benchmark/test_trajectory.py | 5 +++-- packages/headless/src/runner.ts | 19 +++++++++++++++++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index 3be0ef7..faea70c 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -31,3 +31,7 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - Hardened the Python validator: `_is_real_number` now rejects `NaN`, `Infinity`, and `-Infinity` (all of which `json.loads` will happily produce from non-strict JSON) via an explicit `math.isfinite` check. - Corrected documentation drift across `packages/headless/AGENTS.md`, `packages/headless/README.md`, `packages/headless/src/types.ts`, `packages/headless/src/trajectory-collector.ts`, and the root `AGENTS.md`: `approval`/`compaction`/`interrupt` are persisted under `trajectory.extra.*`, not JSONL-only; only `turn-start` and `error` are transient. - Regression test added for the `writeTo` zero-step guard: `does not write an invalid zero-step trajectory when the stream fails before any step`. +- Review cycle 2 follow-ups (Oracle re-audit): + - Headless `measureUsageIfAvailable` now carries the same generation + revision guards the TUI already had. A slow background probe that resolves after a compaction or a newer per-turn probe no longer overwrites fresh usage data. + - ATIF v1.4 step source contract aligned across code, Python validator, and benchmark docs: `user`, `agent`, and `system` are all permitted (Harbor v1.2+). Previous divergence between `AtifStep.source` and `test_trajectory.py`'s `valid_sources = {user, agent}` is resolved. + - Root `README.md` headless event list now includes `turn-start` and points at Harbor's ATIF-v1.4 schema for the persisted trajectory. diff --git a/README.md b/README.md index 0ced259..d7d4dda 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Available commands: pnpm run headless -- "Fix the type error in src/index.ts" ``` -Outputs structured ATIF JSONL events (`metadata`, `step`, `approval`, `compaction`, `error`, `interrupt`) for programmatic consumption. +Outputs a JSONL event stream (`metadata`, `step`, `approval`, `compaction`, `error`, `interrupt`, `turn-start`) for programmatic consumption. The persisted `trajectory.json` conforms to Harbor's ATIF-v1.4 schema. ## Architecture diff --git a/packages/cea/benchmark/AGENTS.md b/packages/cea/benchmark/AGENTS.md index eacaead..fcc43cc 100644 --- a/packages/cea/benchmark/AGENTS.md +++ b/packages/cea/benchmark/AGENTS.md @@ -92,7 +92,7 @@ python -m harbor.utils.trajectory_validator jobs//*/agent/trajectory.jso ``` Validator expectations: -- `steps[*].source` is currently `user` or `agent` +- `steps[*].source` is `user`, `agent`, or `system` (ATIF v1.4 permits all three; system steps support observations since v1.2) - bundled tool observations live in `steps[*].observation.results` - persisted lifecycle annotations, when present, live under `extra.approval_events`, `extra.compaction_events`, and `extra.interrupt_events` diff --git a/packages/cea/benchmark/test_trajectory.py b/packages/cea/benchmark/test_trajectory.py index 7ffca1b..865911b 100644 --- a/packages/cea/benchmark/test_trajectory.py +++ b/packages/cea/benchmark/test_trajectory.py @@ -64,8 +64,9 @@ def validate_trajectory(path: str) -> list[str]: if step_ids != expected: errors.append(f"step_ids: expected {expected}, got {step_ids}") - # 6. each step has required fields - valid_sources = {"user", "agent"} + # 6. each step has required fields. ATIF v1.4 permits "user", "agent", + # and "system" as step sources (system steps support observations since v1.2). + valid_sources = {"user", "agent", "system"} for i, step in enumerate(steps): if not isinstance(step, dict): continue diff --git a/packages/headless/src/runner.ts b/packages/headless/src/runner.ts index 925046f..19b3ab4 100644 --- a/packages/headless/src/runner.ts +++ b/packages/headless/src/runner.ts @@ -327,6 +327,8 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { let totalIterationCount = 0; type StreamTurnResult = Awaited>; type StreamUsage = StreamTurnResult["usage"] | undefined; + let usageProbeGeneration = 0; + const measureUsageIfAvailable = async ( messages: ModelMessage[] ): Promise => { @@ -334,6 +336,13 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { return false; } + usageProbeGeneration += 1; + const thisGeneration = usageProbeGeneration; + const historyWithRevision = config.messageHistory as { + getRevision?: () => number; + }; + const startingRevision = historyWithRevision.getRevision?.(); + const measured = normalizeUsageMeasurement( await config.measureUsage(messages) ); @@ -341,6 +350,16 @@ export async function runHeadless(config: HeadlessRunnerConfig): Promise { return false; } + if (thisGeneration !== usageProbeGeneration) { + return false; + } + if ( + startingRevision !== undefined && + historyWithRevision.getRevision?.() !== startingRevision + ) { + return false; + } + config.messageHistory.updateActualUsage({ inputTokens: measured.inputTokens, outputTokens: measured.outputTokens, From 8df785b0d446be430437e779e4a21b07d45f0d6e Mon Sep 17 00:00:00 2001 From: Woonggi Min Date: Tue, 21 Apr 2026 19:36:54 +0900 Subject: [PATCH 12/12] docs(changeset): resolve self-contradiction on TrajectoryJson.extra type (cycle 3) Cycle 3 Oracle audit flagged exactly one remaining issue: the changeset for PR #117 said in its 'features' bullet (line 14) that TrajectoryJson.extra was typed as an open record, then said in its 'cycle 1 follow-ups' bullet (line 30) that it was later narrowed back to the three canonical lifecycle buckets. The shipped code matches the NARROWED version, so the earlier bullet was stale. Fixed by rewriting line 14 to reflect the final, shipped state: extra is a closed record of exactly approval_events / compaction_events / interrupt_events, and new lifecycle types must extend the interface explicitly. No runtime changes. Verified: ultracite check clean, 1236 tests pass, build (6/6 full turbo). --- .changeset/prompt-processing-indicators.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.changeset/prompt-processing-indicators.md b/.changeset/prompt-processing-indicators.md index faea70c..98a8b9b 100644 --- a/.changeset/prompt-processing-indicators.md +++ b/.changeset/prompt-processing-indicators.md @@ -11,7 +11,7 @@ Surface the "prompt processing" state that previously looked frozen, and fix fol - TUI: shows a `Processing...` loader during turn preparation and transitions to `Working...` once the LLM request is in flight. The startup token probe is now non-blocking (fire-and-forget) so the editor accepts input immediately; the context-usage footer starts from the estimated count and quietly upgrades to the real value. During a blocking compaction the foreground loader temporarily switches to `Compacting...` and restores the previous label when the block ends, so users see the actual reason for a long wait. `text-start` stream parts are now treated as visible, clearing the streaming loader as soon as the assistant view mounts (no more empty-view flicker). - Headless: emits a `turn-start` lifecycle annotation and a matching `onStreamStart` callback before each LLM request; the event is dropped from `trajectory.json` (transient UX signal, no `step_id`) so persisted consumers see identical output. The event fires exactly once per logical turn — overflow and no-output retries no longer re-emit it. New tests cover normal ordering, `new-turn` vs `intermediate-step` phases, retry single-emission, and non-persistence in `trajectory.json`. - Headless: the persisted `schema_version` is corrected from the internal `ATIF-v1.6` label to the actual current Harbor spec version `ATIF-v1.4` (). Documentation across `packages/headless/AGENTS.md`, `packages/headless/README.md`, and `packages/cea/benchmark/AGENTS.md` now separates the internal JSONL streaming protocol (which carries lifecycle annotations such as `approval`, `compaction`, `interrupt`, `turn-start`) from the ATIF-v1.4 trajectory that `TrajectoryCollector` writes to disk. -- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `Trajectory.extra` is typed as an open record so future lifecycle annotations fit without a breaking type change. +- Headless: `StepMetrics` gains the remaining ATIF-v1.4 optional fields (`logprobs`, `prompt_token_ids`, `completion_token_ids`) and `TrajectoryJson.final_metrics` now aggregates `total_cost_usd`. `TrajectoryJson.extra` is typed as a closed record of exactly the three ATIF persistence buckets (`approval_events`, `compaction_events`, `interrupt_events`); new lifecycle types must extend the interface explicitly so the Harbor persistence contract stays type-enforced. - CEA: the `--atif` CLI help text and the benchmark pipeline now reference ATIF-v1.4 (matching the corrected `schema_version`). The bundled `packages/cea/benchmark/test_trajectory.py` validator now calls Harbor's official `TrajectoryValidator` when `harbor` is importable and falls back to a stricter local shape check otherwise; it enforces per-step metric shapes and rejects `bool` values where ATIF requires a real number. - Addressed PR review feedback: - `turn-start` and `onStreamStart` now fire strictly after `agent.stream()` successfully returns, so stream-creation failures no longer produce a false "stream started" signal (reported by Gemini, Codex, and Cubic reviewers).