From 0f9001e9e3ac61885c8379f5317f0bf390d2baab Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 14:57:14 +0100 Subject: [PATCH 01/17] docs: add responder (interactive skills) design spec #303 Adds the approved design for an LLM-backed surrogate user that answers a skill's follow-up questions per task under inputs.responder, with reply/stop/abstain classification, a runner-driven follow-up loop reusing the agent session, and distinct result tagging for abstain (StatusError) and cap-exhaustion. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...2026-05-29-responder-interactive-skills.md | 345 ++++++++++++++++++ 1 file changed, 345 insertions(+) create mode 100644 docs/plans/2026-05-29-responder-interactive-skills.md diff --git a/docs/plans/2026-05-29-responder-interactive-skills.md b/docs/plans/2026-05-29-responder-interactive-skills.md new file mode 100644 index 00000000..c2a2e752 --- /dev/null +++ b/docs/plans/2026-05-29-responder-interactive-skills.md @@ -0,0 +1,345 @@ +# Design: Responder — driving interactive skills via a surrogate LLM + +- **Issue:** [#303](https://github.com/microsoft/waza/issues/303) +- **Status:** Approved design, ready for implementation planning +- **Date:** 2026-05-29 + +## Problem + +A growing category of Agent Skills is inherently multi-turn: when the agent +doesn't have everything it needs from the initial prompt, it pauses to ask the +user follow-up questions before completing the task. + +**Concrete example — `configure-agent`.** Invoked with "Add a new agent to my +application", the skill must gather a name, system instructions, the set of +tools, and any handoffs before it can generate the agent definition. None of +these can be inferred from the initial prompt, and the structured Q&A *is* part +of the skill's value. + +Today, evaluating such a skill in waza forces a bad trade-off: either pre-bake +every answer into the initial prompt (collapsing the skill into a degenerate +one-shot version that no longer tests what ships) or evaluate manually. The +existing static `follow_up_prompts` mechanism only works when the questions — +and their order — are known in advance, which defeats the purpose of testing the +skill's adaptive questioning. + +## Goal + +Add a **responder**: an LLM-backed surrogate user, configured per task, that +answers the skill's follow-up questions consistently with a described target +configuration. After each agent turn produces chat text, the responder +classifies it into one of three outcomes: + +- **Reply** → send the responder's answer back as a new user prompt, decrement a + follow-up budget, and continue. +- **Stop** → the agent is done (no further questions); exit the loop cleanly. +- **Abstain** → the responder explicitly could not answer. Abort the run with a + distinct failure classification, signalling that the brief/skill is too vague + — *not* a transient model timeout or network blip. + +## Scope + +### In scope + +- Per-task `responder` config under `inputs` (sibling to `follow_up_prompts`). +- A responder component that maintains a persistent surrogate-user session and + classifies each agent message via structured tool-calling. +- Runner integration: a responder-driven follow-up loop reusing the existing + agent session and workspace. +- Distinct result tagging for abstain and cap-exhaustion, surfaced in logs, + results JSON, and the dashboard. +- Schema, validation, tests, and documentation. + +### Out of scope (possible follow-ups) + +- Eval-level responder defaults shared across tasks (each task is self-contained + for now; if many tasks share one target config, the block repeats). +- Per-field override/merge semantics between eval-level and task-level config. + +## User-facing surface + +The responder is configured per task under `inputs`, alongside the existing +`follow_up_prompts` field. The two are mutually exclusive. + +```yaml +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o # optional; defaults to config.model + instructions: | + You are configuring a new agent inside an agentic application. + The agent you want to create has: + - name: research-agent + - system instructions: "Search the web and summarise findings on the + topic the user provides." + - tools: web_search, url_fetch + - handoffs: none + Answer the skill's questions consistently with this configuration, + regardless of the order in which the skill asks for each piece. + If you genuinely can't infer an answer from the above, abstain. + max_followups: 8 +``` + +Because the responder lives per task, the same skill can be exercised against +several target configurations (a research agent, a customer-support agent, a +triage agent with handoffs) by giving each task its own `responder.instructions` +— this is exactly the robustness testing the issue calls for, achieved without +any eval-level override machinery. + +### Configuration fields + +| Field | Required | Default | Notes | +|----------------|----------|------------------|-----------------------------------------| +| `model` | no | `config.model` | Model used for the responder LLM. | +| `instructions` | yes | — | Describes the target config + abstain rule. | +| `max_followups`| yes | — | Must be `>= 1`. Caps responder replies. | + +## Architecture + +The design reuses two patterns already proven in the codebase: + +1. **LLM-backed classification via structured tool-calling**, as used by the + prompt grader (`internal/graders/prompt_grader.go`), against the narrow + `Executor` interface (`Execute(ctx, *ExecutionRequest)`). +2. **Multi-turn agent follow-ups via session + workspace reuse**, as used by the + existing static follow-up loop (`executeFollowUps` in + `internal/orchestration/runner.go`), which resumes the agent session by + passing `SessionID` and `WorkspaceDir` on each `Execute`. + +The responder owns classification; the runner owns the loop and all agent +follow-up plumbing (per-turn timeout, event/usage/tool-call merging). + +```mermaid +sequenceDiagram + participant R as Runner (executeResponderLoop) + participant A as Agent session (engine) + participant C as Responder Classifier + participant S as Responder session (engine) + + R->>A: initial prompt (Execute) + A-->>R: agent chat text (FinalOutput) + loop while budget > 0 + R->>C: Classify(agentMessage) + C->>S: agent question (Execute, persistent session + decision tools) + S-->>C: tool call: respond / stop / abstain + C-->>R: Decision + alt reply + R->>A: follow-up = answer (Execute, reuse SessionID + WorkspaceDir) + A-->>R: agent chat text + Note over R: budget-- + else stop + Note over R: outcome = stopped; break + else abstain + Note over R: outcome = abstained (StatusError); break + end + end + Note over R: budget exhausted while still replying → outcome = cap_exhausted + R->>R: run graders against final state +``` + +### Component 1: Config model (`internal/models`) + +A new `ResponderConfig` carried on `TaskStimulus` (the `inputs` block): + +```go +type ResponderConfig struct { + Model string `yaml:"model,omitempty" json:"model,omitempty"` + Instructions string `yaml:"instructions" json:"instructions"` + MaxFollowups int `yaml:"max_followups" json:"max_followups"` +} + +// TaskStimulus gains: +// Responder *ResponderConfig `yaml:"responder,omitempty" json:"responder,omitempty"` +``` + +**Validation** (in `TestCase.Validate`, surfaced by `LoadTestCase`): + +- If `Responder != nil`: + - `Instructions` must be non-empty. + - `MaxFollowups >= 1`. + - `FollowUps` must be empty (mutual exclusivity; clear error message naming + both fields). + +### Component 2: Responder package (`internal/responder`) + +```go +type DecisionKind int +const ( + DecisionReply DecisionKind = iota + DecisionStop + DecisionAbstain +) + +type Decision struct { + Kind DecisionKind + Answer string // set when Kind == DecisionReply + Reason string // set when Kind == DecisionAbstain +} + +// Executor is the narrow execution surface the responder needs (same shape as +// graders.Executor), enabling unit tests with a fake executor. +type Executor interface { + Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) +} + +type Classifier struct { + exec Executor + model string + instructions string + sessionID string // empty until the first Classify creates the session +} + +func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier +func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error) +``` + +`Classify` behaviour: + +- **Persistent session.** The first call creates the responder session (no + resume `SessionID`); the returned `SessionID` is stored and passed on every + subsequent call so the responder accumulates the back-and-forth like a real + user. The session is owned by the engine and cleaned up at `Shutdown`. +- **First message** carries the responder `instructions` as a preamble plus the + agent's first question and a directive to answer by calling exactly one + decision tool. **Later messages** carry only the agent's latest question + (instructions persist in session context). +- **Structured output** via three tools whose handlers capture the decision: + - `respond(answer: string)` → `DecisionReply` + - `stop()` → `DecisionStop` + - `abstain(reason: string)` → `DecisionAbstain` +- Request uses `NoSkills: true`, `MessageMode: MessageModeEnqueue`, + `Streaming: true`. The responder session does **not** use the agent's + workspace. +- If no decision tool is called (responder malfunction), `Classify` returns an + error — distinct from abstain. + +### Component 3: Runner loop (`internal/orchestration/runner.go`) + +In `executeRun`, after the initial `Execute`: + +- If `tc.Stimulus.Responder != nil` → `executeResponderLoop`. +- Else if `len(tc.Stimulus.FollowUps) > 0` → existing `executeFollowUps`. + +`executeResponderLoop` mirrors `executeFollowUps` plumbing (build request via +`buildExecutionRequest`, set `Message`/`SessionID`/`WorkspaceDir`, apply per-turn +timeout, merge `Events`/`ToolCalls`/`SkillInvocations`/`DurationMs`/`FinalOutput`/ +`WorkspaceFiles`/`Usage` into `resp`). Pseudocode: + +``` +classifier := responder.New(r.engine, *tc.Stimulus.Responder, r.spec.Config.ModelID) +left := tc.Stimulus.Responder.MaxFollowups +sent := 0 +outcome := "completed" +for left > 0 { + decision, err := classifier.Classify(ctx, resp.FinalOutput) + if err != nil { resp.ErrorMsg = "responder error: " + err; outcome = "error"; break } + switch decision.Kind { + case Reply: + // send agent follow-up using decision.Answer (reuse SessionID + WorkspaceDir) + // merge follow-up response into resp; on error set resp.ErrorMsg + break + sent++; left-- + log: responder replied (turn sent, budget left) + case Stop: + outcome = "stopped"; goto done + case Abstain: + resp.ErrorMsg = "responder abstained: " + decision.Reason + outcome = "abstained"; goto done + } +} +if left == 0 && lastDecisionWasReply { + outcome = "cap_exhausted" + log warning: responder budget exhausted while agent still asking +} +done: +attach ResponderInfo{FollowupsSent: sent, Outcome: outcome, Reason: ...} to the run +``` + +Verbose mode emits per-turn progress events (reusing the existing +`EventAgentPrompt` / `EventAgentResponse` style) so `-v` runs show the +responder's answers and the agent's replies. + +### Component 4: Results & reporting (`internal/models/outcome.go`) + +```go +type ResponderInfo struct { + FollowupsSent int `json:"followups_sent"` + Outcome string `json:"outcome"` // completed|stopped|abstained|cap_exhausted|error + Reason string `json:"reason,omitempty"` +} + +// RunResult gains: +// Responder *ResponderInfo `json:"responder,omitempty"` +``` + +Status mapping: + +| Responder outcome | `RunResult.Status` | `ErrorMsg` | Notes | +|-------------------|--------------------|------------------------------------|-------| +| `completed` | unchanged (graded) | — | Agent finished; graders decide pass/fail. | +| `stopped` | unchanged (graded) | — | Responder signalled done. | +| `abstained` | `StatusError` | `responder abstained: ` | Distinct, filterable; separate from timeouts/network errors. | +| `cap_exhausted` | unchanged (graded) | — | Logged + surfaced; graders judge the end state. | +| `error` | `StatusError` | `responder error: ` | Responder malfunction (no decision / session failure). | + +Because abstain reuses `StatusError` but is tagged via `Responder.Outcome`, +reports and the dashboard can distinguish a vague-brief abstain from a genuine +error. The dashboard (`web/`) surfaces `responder.outcome` (and reason) so +abstain and cap-exhaustion are visible per run. + +## Error handling & edge cases + +- **No decision tool called** → `Classify` error → run `error` outcome + (`StatusError`), distinct from abstain. +- **Responder session creation/Execute failure** → propagated as run `error`. +- **Agent follow-up Execute failure** → mirrors `executeFollowUps`: set + `resp.ErrorMsg`, stop the loop. +- **`max_followups` exhausted while agent still asking** → `cap_exhausted`; loop + stops, run proceeds to grading, warning logged. +- **Mutual exclusivity** of `responder` and `follow_up_prompts` enforced at load + time with a clear error. +- **Context cancellation / task timeout** honoured on every responder and agent + turn via the existing per-turn timeout pattern. + +## Testing + +- **`internal/responder`** — fake `Executor` invoking decision-tool handlers: + reply / stop / abstain / no-decision-error; persistent-session resumption + (second `Classify` passes the stored `SessionID`); first-vs-later message + shape (instructions preamble only on first call); model defaulting. +- **`internal/orchestration`** — mock engine + injectable classifier (or fake + executor): reply → agent follow-up sent with reused session/workspace; stop; + abstain → `StatusError` + `Responder.Outcome == "abstained"`; cap exhaustion → + graded + `Responder.Outcome == "cap_exhausted"`; mutual-exclusivity rejection. +- **`internal/models`** — validation: missing instructions, `max_followups < 1`, + both `responder` and `follow_up_prompts` set. +- **Schema** — `internal/validation` and `internal/projectconfig` parity tests + for the new `responder` field. +- All existing tests remain green; `go test ./...` and `golangci-lint run` pass. + +## Documentation + +Per `AGENTS.md`: + +- `README.md` — responder section + YAML example in the eval/inputs docs. +- `site/src/content/docs/` — eval-YAML reference entry for `inputs.responder` + and a short guide on testing interactive skills; build with `npm run build`. +- Schema files kept in sync. +- Dashboard (`web/`) — surface `responder.outcome`/`reason`; regenerate + screenshots if UI changes. +- Reference issue #303 in commits; update tracking issue #66 if applicable. + +## Rationale + +- **Per-task placement** mirrors `follow_up_prompts`, keeps each task + self-contained, makes mutual-exclusivity checking local, and directly serves + the "vary the target config across tasks" use case — without any eval-level + override/merge complexity. +- **Runner owns the loop, responder owns classification** keeps the responder + small and unit-testable, and reuses the battle-tested agent follow-up plumbing + rather than duplicating it. +- **Persistent responder session** models a real user who remembers prior + answers, avoiding contradictory or repeated responses across turns. +- **Abstain as tagged `StatusError`** satisfies the issue's requirement that a + vague-brief abstain be reportable separately from transient errors, without + introducing a new top-level status value that every report/consumer would need + to learn. From e940402b31e5c35bf2aa0b583c1589baea93bd52 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:03:07 +0100 Subject: [PATCH 02/17] docs: add responder (interactive skills) implementation plan #303 Bite-sized TDD task breakdown covering the inputs.responder config model and validation, the internal/responder package (persistent surrogate-user session with reply/stop/abstain classification), the runner-driven follow-up loop, ResponderInfo reporting, JSON schema, docs, and dashboard surfacing. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- ...-interactive-skills-implementation-plan.md | 1310 +++++++++++++++++ 1 file changed, 1310 insertions(+) create mode 100644 docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md diff --git a/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md b/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md new file mode 100644 index 00000000..44659302 --- /dev/null +++ b/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md @@ -0,0 +1,1310 @@ +# Responder (Interactive Skills) Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a per-task LLM-backed "responder" that answers an interactive skill's follow-up questions (reply / stop / abstain), driving multi-turn agent runs automatically. + +**Architecture:** A new `internal/responder` package owns an LLM surrogate-user session that classifies each agent message via structured tool-calling. The `EvalRunner` owns a follow-up loop (`executeResponderLoop`) that reuses the existing agent session/workspace plumbing (mirroring `executeFollowUps`). Config lives per task under `inputs.responder`; abstain is tagged on the run as a distinct `StatusError` outcome. + +**Tech Stack:** Go 1.26, Copilot SDK (`github.com/github/copilot-sdk/go`), `go-viper/mapstructure/v2`, `gopkg.in/yaml.v3`, `santhosh-tekuri/jsonschema/v6`. Tests use the repo's existing `testify`-style assertions and the in-tree `MockEngine` / fake-executor patterns. + +**Design doc:** `docs/plans/2026-05-29-responder-interactive-skills.md` + +**Conventions:** +- Build: `go build ./...` · Test: `go test ./...` · Lint: `golangci-lint run` +- Run a single test: `go test ./internal// -run '^TestName$' -v` +- Conventional commits, reference `#303`. Append the trailer: + `Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>` + (Commits are signed manually by the maintainer — when a step says "Commit", stage the files and present the commit command; the maintainer runs it.) + +--- + +## File Structure + +**Create:** +- `internal/responder/responder.go` — `Classifier`, `Decision`, `DecisionKind`, `New`, `Classify`, decision tools. +- `internal/responder/responder_test.go` — unit tests with a fake executor. + +**Modify:** +- `internal/models/testcase.go` — add `ResponderConfig`, `TaskStimulus.Responder`, validation + mutual exclusivity. +- `internal/models/testcase_test.go` — validation tests. +- `internal/models/outcome.go` — add `ResponderInfo` + `RunResult.Responder`. +- `internal/orchestration/runner.go` — classifier factory field, `executeResponderLoop`, branch in `executeRun`, set `RunResult.Responder`/status. +- `internal/orchestration/runner_test.go` (or a new `responder_loop_test.go`) — loop tests with mock engine + fake classifier. +- `schemas/task.schema.json` — add `responder` to the `inputs` `$def`. +- `README.md` — responder section + YAML example. +- `site/src/content/docs/guides/eval-yaml.mdx` and `site/src/content/docs/reference/schema.mdx` — document `inputs.responder`. + +**Note on scope:** The dashboard (`web/`) surfacing of `responder.outcome` is included as the final task. The pre-existing placement of `follow_up_prompts` in the task schema (it sits at task top-level, while the Go model reads it under `inputs`) is out of scope — do not "fix" it. + +--- + +## Task 1: Config model — `ResponderConfig` and `TaskStimulus.Responder` + +**Files:** +- Modify: `internal/models/testcase.go` +- Test: `internal/models/testcase_test.go` + +- [ ] **Step 1: Write the failing test** + +Add to `internal/models/testcase_test.go`: + +```go +func TestResponderConfigParsesUnderInputs(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "task.yaml") + yaml := ` +id: configure-agent +name: Configure a research agent +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o + instructions: | + The agent you want is research-agent with tools web_search. + If you can't infer an answer, abstain. + max_followups: 8 +` + require.NoError(t, os.WriteFile(path, []byte(yaml), 0o600)) + + tc, err := LoadTestCase(path) + require.NoError(t, err) + require.NotNil(t, tc.Stimulus.Responder) + require.Equal(t, "gpt-4o", tc.Stimulus.Responder.Model) + require.Equal(t, 8, tc.Stimulus.Responder.MaxFollowups) + require.Contains(t, tc.Stimulus.Responder.Instructions, "research-agent") +} +``` + +If `require`/`os`/`filepath` are not already imported in the test file, add them. + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/models/ -run '^TestResponderConfigParsesUnderInputs$' -v` +Expected: FAIL — compile error `tc.Stimulus.Responder undefined` (field does not exist yet). + +- [ ] **Step 3: Add the type and field** + +In `internal/models/testcase.go`, add the new type after the `TaskStimulus` struct: + +```go +// ResponderConfig configures an LLM-backed surrogate user that answers a +// skill's follow-up questions during a multi-turn run. It is mutually +// exclusive with TaskStimulus.FollowUps. +type ResponderConfig struct { + // Model is the model used for the responder LLM. Optional; when empty the + // eval-level config.model is used. + Model string `yaml:"model,omitempty" json:"model,omitempty"` + // Instructions describe the target configuration the responder represents + // and the rule for abstaining. Required. + Instructions string `yaml:"instructions" json:"instructions"` + // MaxFollowups caps how many times the responder may reply before the loop + // stops. Required; must be >= 1. + MaxFollowups int `yaml:"max_followups" json:"max_followups"` +} +``` + +Then add the field to `TaskStimulus` (place it after `FollowUps`): + +```go + Responder *ResponderConfig `yaml:"responder,omitempty" json:"responder,omitempty"` +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `go test ./internal/models/ -run '^TestResponderConfigParsesUnderInputs$' -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add internal/models/testcase.go internal/models/testcase_test.go +git commit -m "feat: add inputs.responder config model #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 2: Config validation — required fields + mutual exclusivity + +**Files:** +- Modify: `internal/models/testcase.go` (the `TestCase.Validate` method, ~line 311) +- Test: `internal/models/testcase_test.go` + +- [ ] **Step 1: Write the failing tests** + +Add to `internal/models/testcase_test.go`: + +```go +func TestResponderValidationRejectsMissingInstructions(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{MaxFollowups: 3}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "instructions") +} + +func TestResponderValidationRejectsZeroMaxFollowups(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 0}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "max_followups") +} + +func TestResponderValidationRejectsBothResponderAndFollowUps(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + FollowUps: []string{"next"}, + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "follow_up_prompts") + require.Contains(t, err.Error(), "responder") +} + +func TestResponderValidationAcceptsValidConfig(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, + }, + } + require.NoError(t, tc.Validate()) +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/models/ -run '^TestResponderValidation' -v` +Expected: FAIL — the "reject" tests get no error (validation not implemented). + +- [ ] **Step 3: Implement validation** + +In `internal/models/testcase.go`, extend `func (tc *TestCase) Validate() error`. Add the following block before the final `return nil`: + +```go + if r := tc.Stimulus.Responder; r != nil { + name := tc.TestID + if name == "" { + name = tc.DisplayName + } + prefix := "test case" + if name != "" { + prefix = fmt.Sprintf("test case %q", name) + } + if strings.TrimSpace(r.Instructions) == "" { + return fmt.Errorf("%s: responder.instructions is required", prefix) + } + if r.MaxFollowups < 1 { + return fmt.Errorf("%s: responder.max_followups must be at least 1, got %d", prefix, r.MaxFollowups) + } + if len(tc.Stimulus.FollowUps) > 0 { + return fmt.Errorf("%s: inputs.responder and inputs.follow_up_prompts are mutually exclusive; use one or the other", prefix) + } + } +``` + +(`fmt` and `strings` are already imported in this file.) + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/models/ -run '^TestResponderValidation' -v` +Expected: PASS (all four). + +- [ ] **Step 5: Commit** + +```bash +git add internal/models/testcase.go internal/models/testcase_test.go +git commit -m "feat: validate inputs.responder fields and mutual exclusivity #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 3: Responder package — `Decision` types and tool definitions + +**Files:** +- Create: `internal/responder/responder.go` +- Test: `internal/responder/responder_test.go` + +This task builds the package skeleton and the three decision tools, verifying that invoking each tool handler records the correct decision. The `Classify` flow is added in Task 4. + +- [ ] **Step 1: Write the failing test** + +Create `internal/responder/responder_test.go`: + +```go +package responder + +import ( + "testing" + + copilot "github.com/github/copilot-sdk/go" + "github.com/stretchr/testify/require" +) + +func TestDecisionToolsRecordReply(t *testing.T) { + d := &decisionRecorder{} + tools := d.tools() + require.Len(t, tools, 3) + + respond := findTool(t, tools, toolRespond) + _, err := respond.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "research-agent"}, + }) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionReply, d.decision.Kind) + require.Equal(t, "research-agent", d.decision.Answer) +} + +func TestDecisionToolsRecordStop(t *testing.T) { + d := &decisionRecorder{} + stop := findTool(t, d.tools(), toolStop) + _, err := stop.Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionStop, d.decision.Kind) +} + +func TestDecisionToolsRecordAbstain(t *testing.T) { + d := &decisionRecorder{} + abstain := findTool(t, d.tools(), toolAbstain) + _, err := abstain.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"reason": "brief too vague"}, + }) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionAbstain, d.decision.Kind) + require.Equal(t, "brief too vague", d.decision.Reason) +} + +func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { + t.Helper() + for _, tl := range tools { + if tl.Name == name { + return tl + } + } + t.Fatalf("tool %q not found", name) + return copilot.Tool{} +} +``` + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/responder/ -v` +Expected: FAIL — package/symbols (`decisionRecorder`, `Decision`, `toolRespond`, ...) do not exist. + +- [ ] **Step 3: Implement the package skeleton and tools** + +Create `internal/responder/responder.go`: + +```go +// Package responder implements an LLM-backed surrogate user that answers an +// interactive skill's follow-up questions during a multi-turn evaluation run. +package responder + +import ( + "context" + + copilot "github.com/github/copilot-sdk/go" + "github.com/go-viper/mapstructure/v2" + "github.com/microsoft/waza/internal/execution" + "github.com/microsoft/waza/internal/models" +) + +// DecisionKind enumerates the responder's possible classifications of an agent +// message. +type DecisionKind int + +const ( + // DecisionReply means the responder answered the agent's question. + DecisionReply DecisionKind = iota + // DecisionStop means the agent is done and no further input is needed. + DecisionStop + // DecisionAbstain means the responder could not answer from its brief. + DecisionAbstain +) + +// Decision is the outcome of classifying a single agent message. +type Decision struct { + Kind DecisionKind + Answer string // set when Kind == DecisionReply + Reason string // set when Kind == DecisionAbstain +} + +const ( + toolRespond = "responder_reply" + toolStop = "responder_stop" + toolAbstain = "responder_abstain" +) + +// Executor is the narrow execution surface the responder needs. The concrete +// AgentEngine satisfies it, and tests supply a fake. +type Executor interface { + Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) +} + +// decisionRecorder captures the single decision tool the responder LLM calls. +type decisionRecorder struct { + decision Decision + set bool +} + +func (d *decisionRecorder) tools() []copilot.Tool { + return []copilot.Tool{ + { + Name: toolRespond, + Description: "Answer the agent's question as the user. Call this exactly once with your answer.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "answer": map[string]any{ + "type": "string", + "description": "Your reply to the agent's question, consistent with your configuration.", + }, + }, + "required": []string{"answer"}, + }, + Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + var args struct { + Answer string `mapstructure:"answer"` + } + _ = mapstructure.Decode(inv.Arguments, &args) + d.decision = Decision{Kind: DecisionReply, Answer: args.Answer} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + { + Name: toolStop, + Description: "Signal that the agent has finished and needs no further input. Call this when there is no question to answer.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{}, + }, + Handler: func(copilot.ToolInvocation) (copilot.ToolResult, error) { + d.decision = Decision{Kind: DecisionStop} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + { + Name: toolAbstain, + Description: "Signal that you cannot answer the agent's question from your configuration. Call this only when the information is genuinely missing.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "reason": map[string]any{ + "type": "string", + "description": "Why you cannot answer.", + }, + }, + "required": []string{"reason"}, + }, + Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + var args struct { + Reason string `mapstructure:"reason"` + } + _ = mapstructure.Decode(inv.Arguments, &args) + d.decision = Decision{Kind: DecisionAbstain, Reason: args.Reason} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + } +} + +// Classifier is added in the next task; declared here so the package compiles +// with its dependencies referenced. +var _ = models.ResponderConfig{} +``` + +(The trailing `var _ = models.ResponderConfig{}` keeps the `models` import used until Task 4 adds `Classifier`; remove it in Task 4.) + +- [ ] **Step 4: Run test to verify it passes** + +Run: `go test ./internal/responder/ -v` +Expected: PASS (three decision-tool tests). + +- [ ] **Step 5: Commit** + +```bash +git add internal/responder/responder.go internal/responder/responder_test.go +git commit -m "feat: add responder decision types and tools #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 4: Responder package — `Classifier` and `Classify` + +**Files:** +- Modify: `internal/responder/responder.go` +- Test: `internal/responder/responder_test.go` + +- [ ] **Step 1: Write the failing tests** + +Add to `internal/responder/responder_test.go` (add imports `context`, `github.com/microsoft/waza/internal/execution`, `github.com/microsoft/waza/internal/models` as needed): + +```go +type fakeExecutor struct { + calls []*execution.ExecutionRequest + respond func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) +} + +func (f *fakeExecutor) Execute(_ context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + f.calls = append(f.calls, req) + return f.respond(req) +} + +func TestClassifyReply(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + // Simulate the model calling the reply tool. + _, err := findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "research-agent"}, + }) + require.NoError(t, err) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}, "gpt-4o") + d, err := c.Classify(context.Background(), "What is the agent name?") + require.NoError(t, err) + require.Equal(t, DecisionReply, d.Kind) + require.Equal(t, "research-agent", d.Answer) +} + +func TestClassifyAbstain(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolAbstain).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"reason": "no info"}, + }) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + d, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) + require.Equal(t, DecisionAbstain, d.Kind) + require.Equal(t, "no info", d.Reason) +} + +func TestClassifyNoDecisionToolIsError(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil // no tool called + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.Error(t, err) +} + +func TestClassifyUsesDefaultModelWhenUnset(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + require.Equal(t, "default-model", req.ModelID) + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "default-model") + _, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) +} + +func TestClassifyPersistsSession(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "a"}, + }) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "INSTR", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q1?") + require.NoError(t, err) + _, err = c.Classify(context.Background(), "Q2?") + require.NoError(t, err) + + require.Len(t, exec.calls, 2) + // First call has no resume session id and includes the instructions. + require.Empty(t, exec.calls[0].SessionID) + require.Contains(t, exec.calls[0].Message, "INSTR") + require.Contains(t, exec.calls[0].Message, "Q1?") + // Second call resumes the responder session and omits the instructions. + require.Equal(t, "resp-1", exec.calls[1].SessionID) + require.NotContains(t, exec.calls[1].Message, "INSTR") + require.Contains(t, exec.calls[1].Message, "Q2?") +} +``` + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/responder/ -run '^TestClassify' -v` +Expected: FAIL — `New`/`Classify` undefined. + +- [ ] **Step 3: Implement `Classifier` and `Classify`** + +In `internal/responder/responder.go`: remove the `var _ = models.ResponderConfig{}` line, add `errors` and `fmt` to imports, and append: + +```go +// Classifier maintains a persistent surrogate-user session and classifies each +// agent message into a Decision. +type Classifier struct { + exec Executor + model string + instructions string + sessionID string // empty until the first Classify creates the session +} + +// New constructs a Classifier. defaultModel is used when cfg.Model is empty. +func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier { + model := cfg.Model + if model == "" { + model = defaultModel + } + return &Classifier{ + exec: exec, + model: model, + instructions: cfg.Instructions, + } +} + +// Classify sends the agent's latest message to the responder LLM and returns +// its decision. The first call seeds the session with the responder +// instructions; subsequent calls resume the same session. +func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error) { + rec := &decisionRecorder{} + + req := &execution.ExecutionRequest{ + ModelID: c.model, + Message: c.buildMessage(agentMessage), + Tools: rec.tools(), + MessageMode: execution.MessageModeEnqueue, + Streaming: true, + SessionID: c.sessionID, + NoSkills: true, + EphemeralSession: true, + SkipWorkspaceCapture: true, + } + + resp, err := c.exec.Execute(ctx, req) + if err != nil { + // If the model already chose a decision before a post-tool follow-up + // turn failed, honour it (mirrors the prompt grader's behaviour). + if rec.set { + return rec.decision, nil + } + return Decision{}, fmt.Errorf("responder execution failed: %w", err) + } + if resp != nil && resp.SessionID != "" { + c.sessionID = resp.SessionID + } + if !rec.set { + return Decision{}, errors.New("responder did not call a decision tool") + } + return rec.decision, nil +} + +func (c *Classifier) buildMessage(agentMessage string) string { + if c.sessionID == "" { + return fmt.Sprintf( + "%s\n\nYou are role-playing as the user. The agent just said:\n\n%s\n\n"+ + "Respond by calling exactly one tool: %s to answer, %s if the agent is finished and needs nothing, or %s if you genuinely cannot answer from your configuration.", + c.instructions, agentMessage, toolRespond, toolStop, toolAbstain, + ) + } + return fmt.Sprintf( + "The agent just said:\n\n%s\n\nRespond by calling exactly one tool (%s, %s, or %s).", + agentMessage, toolRespond, toolStop, toolAbstain, + ) +} +``` + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/responder/ -v` +Expected: PASS (all tests, including Task 3's). + +- [ ] **Step 5: Commit** + +```bash +git add internal/responder/responder.go internal/responder/responder_test.go +git commit -m "feat: add responder Classifier with persistent session #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 5: Results model — `ResponderInfo` on `RunResult` + +**Files:** +- Modify: `internal/models/outcome.go` +- Test: `internal/models/outcome_test.go` + +- [ ] **Step 1: Write the failing test** + +Add to `internal/models/outcome_test.go`: + +```go +func TestResponderInfoSerializes(t *testing.T) { + rr := RunResult{ + RunNumber: 1, + Status: StatusError, + Responder: &ResponderInfo{ + FollowupsSent: 3, + Outcome: "abstained", + Reason: "brief too vague", + }, + } + data, err := json.Marshal(rr) + require.NoError(t, err) + require.Contains(t, string(data), `"responder"`) + require.Contains(t, string(data), `"outcome":"abstained"`) + + // Omitted when nil. + data2, err := json.Marshal(RunResult{RunNumber: 1, Status: StatusPassed}) + require.NoError(t, err) + require.NotContains(t, string(data2), `"responder"`) +} +``` + +(Add `encoding/json` and `testify/require` imports if not already present.) + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/models/ -run '^TestResponderInfoSerializes$' -v` +Expected: FAIL — `ResponderInfo` / `RunResult.Responder` undefined. + +- [ ] **Step 3: Implement the type and field** + +In `internal/models/outcome.go`, add after the `RunResult` struct definition: + +```go +// ResponderInfo records the outcome of a responder-driven multi-turn run. +type ResponderInfo struct { + // FollowupsSent is the number of responder answers sent to the agent. + FollowupsSent int `json:"followups_sent"` + // Outcome is one of: completed, stopped, abstained, cap_exhausted, error. + Outcome string `json:"outcome"` + // Reason holds the responder's reason when Outcome == "abstained" or an + // error message when Outcome == "error". + Reason string `json:"reason,omitempty"` +} +``` + +Add the field to `RunResult` (after `WorkspaceDir`): + +```go + Responder *ResponderInfo `json:"responder,omitempty"` +``` + +Also add named constants near the top of the file (after the `Status` consts) for reuse by the runner: + +```go +// Responder outcome values recorded on RunResult.Responder.Outcome. +const ( + ResponderOutcomeCompleted = "completed" + ResponderOutcomeStopped = "stopped" + ResponderOutcomeAbstained = "abstained" + ResponderOutcomeCapExhausted = "cap_exhausted" + ResponderOutcomeError = "error" +) +``` + +- [ ] **Step 4: Run test to verify it passes** + +Run: `go test ./internal/models/ -run '^TestResponderInfoSerializes$' -v` +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add internal/models/outcome.go internal/models/outcome_test.go +git commit -m "feat: add ResponderInfo to RunResult #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 6: Runner — classifier factory field (injectable for tests) + +**Files:** +- Modify: `internal/orchestration/runner.go` +- Test: `internal/orchestration/runner_test.go` + +This task introduces an injectable factory so the loop (Task 7) can be tested with a fake classifier while defaulting to the real responder. + +- [ ] **Step 1: Add the abstraction and factory (no behaviour change yet)** + +In `internal/orchestration/runner.go`: + +1. Add the import `"github.com/microsoft/waza/internal/responder"` to the import block. + +2. Add an interface and a factory field to the `EvalRunner` struct. After the `verbose bool` field, add: + +```go + // newClassifier builds a responder classifier for a task. Overridable in + // tests; defaults to a responder backed by the runner's engine. + newClassifier func(cfg models.ResponderConfig, defaultModel string) responderClassifier +``` + +3. Add the interface near the other small type declarations (e.g. just above `type EvalRunner struct`): + +```go +// responderClassifier classifies an agent message into a responder decision. +// Implemented by *responder.Classifier; faked in tests. +type responderClassifier interface { + Classify(ctx context.Context, agentMessage string) (responder.Decision, error) +} +``` + +4. In `NewEvalRunner` (after the struct is constructed, before `return`), set the default factory: + +```go + r.newClassifier = func(cfg models.ResponderConfig, defaultModel string) responderClassifier { + return responder.New(r.engine, cfg, defaultModel) + } +``` + +(Adjust to match how `NewEvalRunner` names its receiver/local variable — it returns a `*EvalRunner`; assign the field on that value before returning it.) + +- [ ] **Step 2: Build to verify it compiles** + +Run: `go build ./internal/orchestration/` +Expected: success (no behaviour change yet). `responder.New` satisfies `responderClassifier` because `*responder.Classifier` has the `Classify` method. + +- [ ] **Step 3: Commit** + +```bash +git add internal/orchestration/runner.go +git commit -m "refactor: add injectable responder classifier factory to runner #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 7: Runner — `executeResponderLoop` and branch in `executeRun` + +**Files:** +- Modify: `internal/orchestration/runner.go` +- Test: `internal/orchestration/runner_test.go` (or new `internal/orchestration/responder_loop_test.go`) + +- [ ] **Step 1: Write the failing tests** + +Create `internal/orchestration/responder_loop_test.go`. This uses a fake classifier injected via `r.newClassifier` and a `MockEngine` for agent turns. Adjust the runner/config construction to match existing test helpers in `runner_test.go` (e.g. how other tests build an `EvalRunner` with a spec + `MockEngine`); the assertions below are the contract: + +```go +package orchestration + +import ( + "context" + "testing" + + "github.com/microsoft/waza/internal/models" + "github.com/microsoft/waza/internal/responder" + "github.com/stretchr/testify/require" +) + +// scriptedClassifier returns a queued sequence of decisions. +type scriptedClassifier struct { + decisions []responder.Decision + idx int + calls int +} + +func (s *scriptedClassifier) Classify(_ context.Context, _ string) (responder.Decision, error) { + s.calls++ + d := s.decisions[s.idx] + if s.idx < len(s.decisions)-1 { + s.idx++ + } + return d, nil +} + +func TestResponderLoopReplyThenStop(t *testing.T) { + r := newResponderTestRunner(t) // helper: builds EvalRunner with MockEngine + responder spec + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionReply, Answer: "research-agent"}, + {Kind: responder.DecisionStop}, + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeStopped, rr.Responder.Outcome) + require.Equal(t, 1, rr.Responder.FollowupsSent) +} + +func TestResponderLoopAbstainMarksError(t *testing.T) { + r := newResponderTestRunner(t) + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionAbstain, Reason: "too vague"}, + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 5}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.Equal(t, models.StatusError, rr.Status) + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeAbstained, rr.Responder.Outcome) + require.Contains(t, rr.ErrorMsg, "abstained") + require.Contains(t, rr.ErrorMsg, "too vague") +} + +func TestResponderLoopCapExhausted(t *testing.T) { + r := newResponderTestRunner(t) + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionReply, Answer: "a"}, // always reply + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 2}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeCapExhausted, rr.Responder.Outcome) + require.Equal(t, 2, rr.Responder.FollowupsSent) + require.NotEqual(t, models.StatusError, rr.Status) // graded normally +} +``` + +Add a `newResponderTestRunner(t)` helper in this file. Model it on the existing runner construction in `runner_test.go` — build an `*config.EvalConfig` from a minimal `models.EvalSpec` (skill name, `Config{TrialsPerTask:1, TimeoutSec:30, ModelID:"mock"}`), create a `MockEngine`, call `Initialize`, and return `NewEvalRunner(cfg, engine)`. Inspect `runner_test.go` for the exact helper names already used and reuse them rather than duplicating. + +- [ ] **Step 2: Run tests to verify they fail** + +Run: `go test ./internal/orchestration/ -run '^TestResponderLoop' -v` +Expected: FAIL — `executeResponderLoop` not wired; `rr.Responder` is nil. + +- [ ] **Step 3: Implement the loop and branch** + +In `internal/orchestration/runner.go`, change the follow-up branch in `executeRun`. Replace: + +```go + // Execute follow-up prompts if defined + if len(tc.Stimulus.FollowUps) > 0 { + r.executeFollowUps(ctx, tc, resp) + } +``` + +with: + +```go + // Drive multi-turn: responder loop takes precedence; otherwise static + // follow-ups. Validation guarantees these are mutually exclusive. + var responderInfo *models.ResponderInfo + if tc.Stimulus.Responder != nil { + responderInfo = r.executeResponderLoop(ctx, tc, resp) + } else if len(tc.Stimulus.FollowUps) > 0 { + r.executeFollowUps(ctx, tc, resp) + } +``` + +Then, in the `return models.RunResult{...}` at the end of `executeRun`, add the field: + +```go + Responder: responderInfo, +``` + +Now add the loop method (place it directly after `executeFollowUps`): + +```go +// executeResponderLoop drives a multi-turn run using an LLM-backed surrogate +// user. After each agent turn it classifies the agent's latest message and +// either replies (sending a new agent prompt), stops, or aborts on abstain. +// It mutates resp in place (mirroring executeFollowUps) and returns a summary. +func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse) *models.ResponderInfo { + cfg := *tc.Stimulus.Responder + classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID) + + info := &models.ResponderInfo{Outcome: models.ResponderOutcomeCompleted} + left := cfg.MaxFollowups + lastWasReply := false + + for left > 0 { + decision, err := classifier.Classify(ctx, resp.FinalOutput) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder error: %v", err) + info.Outcome = models.ResponderOutcomeError + info.Reason = err.Error() + return info + } + + switch decision.Kind { + case responder.DecisionStop: + info.Outcome = models.ResponderOutcomeStopped + return info + + case responder.DecisionAbstain: + resp.ErrorMsg = fmt.Sprintf("responder abstained: %s", decision.Reason) + info.Outcome = models.ResponderOutcomeAbstained + info.Reason = decision.Reason + return info + + case responder.DecisionReply: + if !r.sendResponderReply(ctx, tc, resp, decision.Answer, info.FollowupsSent+1) { + // sendResponderReply set resp.ErrorMsg; stop the loop. + info.Outcome = models.ResponderOutcomeError + info.Reason = resp.ErrorMsg + return info + } + info.FollowupsSent++ + left-- + lastWasReply = true + } + } + + if lastWasReply { + info.Outcome = models.ResponderOutcomeCapExhausted + slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions", + "test", tc.DisplayName, "max_followups", cfg.MaxFollowups) + } + return info +} + +// sendResponderReply sends one responder answer to the agent session, reusing +// the session and workspace, and merges the agent's response into resp. It +// returns false (and sets resp.ErrorMsg) on failure. +func (r *EvalRunner) sendResponderReply(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse, answer string, turn int) bool { + followReq, err := r.buildExecutionRequest(tc) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) + return false + } + followReq.Message = answer + followReq.SessionID = resp.SessionID + followReq.WorkspaceDir = resp.WorkspaceDir + + if r.verbose { + r.notifyProgress(ProgressEvent{ + EventType: EventAgentPrompt, + TestName: tc.DisplayName, + Details: map[string]any{"message": answer, "responder_reply": turn}, + }) + } + + timeout, err := r.executionTimeout(tc) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) + return false + } + followCtx, cancelFollow := context.WithTimeout(ctx, timeout) + followResp, err := r.engine.Execute(followCtx, followReq) + cancelFollow() + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d failed: %v", turn, err) + return false + } + if followResp.ErrorMsg != "" { + resp.ErrorMsg = fmt.Sprintf("responder reply %d: %s", turn, followResp.ErrorMsg) + return false + } + + resp.Events = append(resp.Events, followResp.Events...) + resp.ToolCalls = append(resp.ToolCalls, followResp.ToolCalls...) + resp.SkillInvocations = append(resp.SkillInvocations, followResp.SkillInvocations...) + resp.DurationMs += followResp.DurationMs + resp.FinalOutput = followResp.FinalOutput + resp.WorkspaceFiles = followResp.WorkspaceFiles + if followResp.Usage != nil { + if resp.Usage == nil { + resp.Usage = followResp.Usage + } else { + resp.Usage = models.AggregateUsageStats([]*models.UsageStats{resp.Usage, followResp.Usage}) + } + } + return true +} +``` + +Add `"log/slog"` to the import block if it is not already imported. + +- [ ] **Step 4: Run tests to verify they pass** + +Run: `go test ./internal/orchestration/ -run '^TestResponderLoop' -v` +Expected: PASS (reply-then-stop, abstain→error, cap-exhausted). + +- [ ] **Step 5: Run the full orchestration + models + responder suites** + +Run: `go test ./internal/orchestration/ ./internal/models/ ./internal/responder/` +Expected: PASS. + +- [ ] **Step 6: Commit** + +```bash +git add internal/orchestration/runner.go internal/orchestration/responder_loop_test.go +git commit -m "feat: drive interactive skills via responder loop #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 8: JSON schema — add `responder` to the task `inputs` definition + +**Files:** +- Modify: `schemas/task.schema.json` +- Test: `internal/validation/schema_test.go` (add a case) — or rely on the existing doc-examples test once docs are added in Task 9. + +- [ ] **Step 1: Write the failing test** + +Add to `internal/validation/schema_test.go` a test that a task with `inputs.responder` validates. Match the file's existing helper for validating task bytes (look for `ValidateTaskBytes`): + +```go +func TestTaskSchemaAcceptsResponder(t *testing.T) { + yaml := []byte(` +id: t1 +name: Configure agent +inputs: + prompt: "add agent" + responder: + instructions: "be research-agent; abstain if unknown" + max_followups: 8 +`) + errs := ValidateTaskBytes(yaml) + require.Empty(t, errs) +} +``` + +(If the exported helper has a different name/signature, use the one already exercised by neighbouring tests in this file.) + +- [ ] **Step 2: Run test to verify it fails** + +Run: `go test ./internal/validation/ -run '^TestTaskSchemaAcceptsResponder$' -v` +Expected: FAIL — `inputs.responder` rejected because the `inputs` `$def` has `additionalProperties: false`. + +- [ ] **Step 3: Add `responder` to the schema** + +In `schemas/task.schema.json`, inside the `$defs.inputs.properties` object (the block containing `prompt`, `prompt_file`, `context`, `files`, `repos`, `workdir`, `environment`), add: + +```json + "responder": { + "type": "object", + "additionalProperties": false, + "required": ["instructions", "max_followups"], + "description": "LLM-backed surrogate user that answers the skill's follow-up questions. Mutually exclusive with follow_up_prompts.", + "properties": { + "model": { + "type": "string", + "description": "Model used for the responder LLM. Defaults to the eval-level config.model." + }, + "instructions": { + "type": "string", + "minLength": 1, + "description": "Describes the target configuration the responder represents and the rule for abstaining." + }, + "max_followups": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of responder replies before the loop stops." + } + } + } +``` + +Insert it as a sibling of `environment` (add a trailing comma to `environment` as needed to keep valid JSON). + +- [ ] **Step 4: Run test to verify it passes** + +Run: `go test ./internal/validation/ -run '^TestTaskSchemaAcceptsResponder$' -v` +Expected: PASS. Also run `go test ./internal/validation/` to ensure no regressions. + +- [ ] **Step 5: Commit** + +```bash +git add schemas/task.schema.json internal/validation/schema_test.go +git commit -m "feat: add inputs.responder to task JSON schema #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 9: Documentation — README and site + +**Files:** +- Modify: `README.md` +- Modify: `site/src/content/docs/guides/eval-yaml.mdx` +- Modify: `site/src/content/docs/reference/schema.mdx` + +- [ ] **Step 1: Add a README section** + +In `README.md`, near where multi-turn / `follow_up_prompts` is described (search for `follow_up_prompts`), add a "Responder (interactive skills)" subsection with this example and explanation: + +````markdown +#### Responder (interactive skills) + +For skills that ask follow-up questions, configure a `responder` — an LLM that +plays the user and answers the skill's questions. It is mutually exclusive with +`follow_up_prompts`. + +```yaml +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o # optional; defaults to config.model + instructions: | + The agent you want is "research-agent" with system instructions + "Search the web and summarise findings", tools web_search + url_fetch, + and no handoffs. Answer the skill's questions consistently with this. + If you genuinely can't infer an answer, abstain. + max_followups: 8 +``` + +After each agent turn the responder either **replies** (the answer is sent back, +continuing the conversation), **stops** (the agent is done), or **abstains** — +which fails the run with a distinct `abstained` outcome, signalling the brief is +too vague. Each task carries its own responder, so the same skill can be tested +against several target configurations. +```` + +- [ ] **Step 2: Add the eval-yaml guide entry** + +In `site/src/content/docs/guides/eval-yaml.mdx`, after the `follow_up_prompts` section (search for it), add an analogous `responder` section using the same YAML example and explanation as Step 1, in the file's existing prose style. + +- [ ] **Step 3: Add the schema reference entry** + +In `site/src/content/docs/reference/schema.mdx`, after the `### follow_up_prompts` section, add: + +````markdown +### responder + +**Type:** object +**Required:** no + +An LLM-backed surrogate user that answers the skill's follow-up questions during +a multi-turn run. Mutually exclusive with `follow_up_prompts`. + +| Field | Type | Required | Description | +|----------------|---------|----------|--------------------------------------------------------| +| `model` | string | no | Responder model. Defaults to the eval-level `config.model`. | +| `instructions` | string | yes | Target configuration the responder represents + abstain rule. | +| `max_followups`| integer | yes | Max responder replies before the loop stops (`>= 1`). | + +```yaml +inputs: + prompt: "Add a new agent to my application" + responder: + instructions: "Be research-agent with tools web_search; abstain if unknown." + max_followups: 8 +``` + +The responder classifies each agent message as **reply**, **stop**, or +**abstain**. An abstain marks the run as an error with outcome `abstained`, +distinct from model timeouts or network errors. If `max_followups` is reached +while the agent is still asking questions, the loop stops with outcome +`cap_exhausted` and graders evaluate the final state. +```` + +- [ ] **Step 4: Build the docs site to verify** + +Run: `cd site && npm run build` +Expected: build succeeds with no MDX errors. + +- [ ] **Step 5: Commit** + +```bash +git add README.md site/src/content/docs/guides/eval-yaml.mdx site/src/content/docs/reference/schema.mdx +git commit -m "docs: document inputs.responder for interactive skills #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 10: Dashboard — surface responder outcome + +**Files:** +- Modify: dashboard source under `web/` (locate where `RunResult` / run details are rendered). +- Test: `web/` Playwright tests if the changed view has e2e coverage. + +- [ ] **Step 1: Locate where run results are rendered** + +Run: `grep -rn "final_output\|error_msg\|workspace_dir" web/src 2>/dev/null | head` +Identify the component that renders per-run details (the same one that shows `error_msg` / status). + +- [ ] **Step 2: Render `responder` when present** + +In that component, when a run has a `responder` object, display its `outcome` +(and `reason` when present) — e.g. a small badge/label: `Responder: abstained — +brief too vague` or `Responder: cap_exhausted (3 replies)`. Follow the existing +styling/markup conventions in the surrounding component. Keep it read-only; no +new data fetching is required since `responder` is already in the results JSON. + +- [ ] **Step 3: Run the dashboard e2e tests (if the view is covered)** + +Run: `cd web && npx playwright test --project=chromium` +Expected: PASS. If a screenshot snapshot for this view changed intentionally, +regenerate: `cd web && npx playwright test e2e/screenshots.spec.ts --project=chromium`. + +- [ ] **Step 4: Commit** + +```bash +git add web/ +git commit -m "feat: surface responder outcome in dashboard #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Task 11: Full verification + +**Files:** none (verification only). + +- [ ] **Step 1: Build everything** + +Run: `go build ./...` +Expected: success. + +- [ ] **Step 2: Run the full Go test suite** + +Run: `go test ./...` +Expected: PASS. + +- [ ] **Step 3: Lint** + +Run: `golangci-lint run` +Expected: no issues. Fix any lint findings in the files you touched. + +- [ ] **Step 4: Smoke-test with the mock engine (optional but recommended)** + +Create a temporary eval + task using `executor: mock` with an `inputs.responder` +block, run `./waza run -v -o /tmp/responder-results.json`, and +confirm the run completes and `responder` appears in the results JSON. Remove +the temp files afterward. + +- [ ] **Step 5: Final commit (if any verification fixes were made)** + +```bash +git add -A +git commit -m "chore: responder feature verification fixes #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +``` + +--- + +## Self-Review Notes (for the planner) + +- **Spec coverage:** config surface (Task 1), validation + mutual exclusivity (Task 2), responder package with persistent session + reply/stop/abstain + no-decision error (Tasks 3–4), runner loop + cap-exhaustion + abstain→StatusError (Task 7), `ResponderInfo` reporting (Task 5), schema (Task 8), docs (Task 9), dashboard (Task 10). All design sections map to a task. +- **Type consistency:** `ResponderConfig{Model, Instructions, MaxFollowups}`, `Decision{Kind, Answer, Reason}`, `DecisionReply/Stop/Abstain`, `Classifier.Classify`, `responderClassifier` interface, `ResponderInfo{FollowupsSent, Outcome, Reason}`, and the `ResponderOutcome*` constants are used identically across tasks. +- **Known integration point to verify during execution:** the exact `NewEvalRunner` receiver/return shape (Task 6 Step 1.4) and the existing runner test helper names (Task 7 Step 1) — inspect `runner.go` / `runner_test.go` and adapt the shown code to the real signatures rather than assuming. From 25964eae4e53e118addfcdcb1556151e00b2ee7a Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:05:36 +0100 Subject: [PATCH 03/17] feat: add inputs.responder config model #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/models/testcase.go | 16 ++++++++++++++++ internal/models/testcase_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/internal/models/testcase.go b/internal/models/testcase.go index 15b3e220..6367c65b 100644 --- a/internal/models/testcase.go +++ b/internal/models/testcase.go @@ -38,6 +38,22 @@ type TaskStimulus struct { WorkDir string `yaml:"workdir,omitempty" json:"workdir,omitempty"` Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"` FollowUps []string `yaml:"follow_up_prompts,omitempty" json:"follow_ups,omitempty"` + Responder *ResponderConfig `yaml:"responder,omitempty" json:"responder,omitempty"` +} + +// ResponderConfig configures an LLM-backed surrogate user that answers a +// skill's follow-up questions during a multi-turn run. It is mutually +// exclusive with TaskStimulus.FollowUps. +type ResponderConfig struct { + // Model is the model used for the responder LLM. Optional; when empty the + // eval-level config.model is used. + Model string `yaml:"model,omitempty" json:"model,omitempty"` + // Instructions describe the target configuration the responder represents + // and the rule for abstaining. Required. + Instructions string `yaml:"instructions" json:"instructions"` + // MaxFollowups caps how many times the responder may reply before the loop + // stops. Required; must be >= 1. + MaxFollowups int `yaml:"max_followups" json:"max_followups"` } // ResourceRef points to a file or inline content diff --git a/internal/models/testcase_test.go b/internal/models/testcase_test.go index e57bf553..1e68c7a7 100644 --- a/internal/models/testcase_test.go +++ b/internal/models/testcase_test.go @@ -5,6 +5,8 @@ import ( "path/filepath" "strings" "testing" + + "github.com/stretchr/testify/require" ) func TestLoadTestCase_ShouldTriggerField(t *testing.T) { @@ -215,3 +217,28 @@ instruction_files: t.Errorf("Expected second instruction file 'docs/task.instructions.md', got %q", tc.InstructionFiles[1]) } } + +func TestResponderConfigParsesUnderInputs(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "task.yaml") + yaml := ` +id: configure-agent +name: Configure a research agent +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o + instructions: | + The agent you want is research-agent with tools web_search. + If you can't infer an answer, abstain. + max_followups: 8 +` + require.NoError(t, os.WriteFile(path, []byte(yaml), 0o600)) + + tc, err := LoadTestCase(path) + require.NoError(t, err) + require.NotNil(t, tc.Stimulus.Responder) + require.Equal(t, "gpt-4o", tc.Stimulus.Responder.Model) + require.Equal(t, 8, tc.Stimulus.Responder.MaxFollowups) + require.Contains(t, tc.Stimulus.Responder.Instructions, "research-agent") +} From de29bdac27826f4592852fa4d9b5b0f0b438d9e5 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:07:28 +0100 Subject: [PATCH 04/17] feat: validate inputs.responder fields and mutual exclusivity #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/models/testcase.go | 21 +++++++++++++ internal/models/testcase_test.go | 52 ++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/internal/models/testcase.go b/internal/models/testcase.go index 6367c65b..958b161f 100644 --- a/internal/models/testcase.go +++ b/internal/models/testcase.go @@ -335,6 +335,27 @@ func (tc *TestCase) Validate() error { } return fmt.Errorf("timeout_seconds must be at least 1, got %d", *tc.TimeoutSec) } + + if r := tc.Stimulus.Responder; r != nil { + name := tc.TestID + if name == "" { + name = tc.DisplayName + } + prefix := "test case" + if name != "" { + prefix = fmt.Sprintf("test case %q", name) + } + if strings.TrimSpace(r.Instructions) == "" { + return fmt.Errorf("%s: responder.instructions is required", prefix) + } + if r.MaxFollowups < 1 { + return fmt.Errorf("%s: responder.max_followups must be at least 1, got %d", prefix, r.MaxFollowups) + } + if len(tc.Stimulus.FollowUps) > 0 { + return fmt.Errorf("%s: inputs.responder and inputs.follow_up_prompts are mutually exclusive; use one or the other", prefix) + } + } + return nil } diff --git a/internal/models/testcase_test.go b/internal/models/testcase_test.go index 1e68c7a7..00f82fdd 100644 --- a/internal/models/testcase_test.go +++ b/internal/models/testcase_test.go @@ -242,3 +242,55 @@ inputs: require.Equal(t, 8, tc.Stimulus.Responder.MaxFollowups) require.Contains(t, tc.Stimulus.Responder.Instructions, "research-agent") } + +func TestResponderValidationRejectsMissingInstructions(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{MaxFollowups: 3}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "instructions") +} + +func TestResponderValidationRejectsZeroMaxFollowups(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 0}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "max_followups") +} + +func TestResponderValidationRejectsBothResponderAndFollowUps(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + FollowUps: []string{"next"}, + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, + }, + } + err := tc.Validate() + require.Error(t, err) + require.Contains(t, err.Error(), "follow_up_prompts") + require.Contains(t, err.Error(), "responder") +} + +func TestResponderValidationAcceptsValidConfig(t *testing.T) { + tc := &TestCase{ + TestID: "t1", + Stimulus: TaskStimulus{ + Message: "go", + Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, + }, + } + require.NoError(t, tc.Validate()) +} From c32374c8a7c6ad3ac0ae4e387147616177ac17eb Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:10:54 +0100 Subject: [PATCH 05/17] feat: add responder decision types and tools #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/responder/responder.go | 118 +++++++++++++++++++++++++++ internal/responder/responder_test.go | 55 +++++++++++++ 2 files changed, 173 insertions(+) create mode 100644 internal/responder/responder.go create mode 100644 internal/responder/responder_test.go diff --git a/internal/responder/responder.go b/internal/responder/responder.go new file mode 100644 index 00000000..9e88bab5 --- /dev/null +++ b/internal/responder/responder.go @@ -0,0 +1,118 @@ +// Package responder implements an LLM-backed surrogate user that answers an +// interactive skill's follow-up questions during a multi-turn evaluation run. +package responder + +import ( + "context" + + copilot "github.com/github/copilot-sdk/go" + "github.com/go-viper/mapstructure/v2" + "github.com/microsoft/waza/internal/execution" + "github.com/microsoft/waza/internal/models" +) + +// DecisionKind enumerates the responder's possible classifications of an agent +// message. +type DecisionKind int + +const ( + // DecisionReply means the responder answered the agent's question. + DecisionReply DecisionKind = iota + // DecisionStop means the agent is done and no further input is needed. + DecisionStop + // DecisionAbstain means the responder could not answer from its brief. + DecisionAbstain +) + +// Decision is the outcome of classifying a single agent message. +type Decision struct { + Kind DecisionKind + Answer string // set when Kind == DecisionReply + Reason string // set when Kind == DecisionAbstain +} + +const ( + toolRespond = "responder_reply" + toolStop = "responder_stop" + toolAbstain = "responder_abstain" +) + +// Executor is the narrow execution surface the responder needs. The concrete +// AgentEngine satisfies it, and tests supply a fake. +type Executor interface { + Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) +} + +// decisionRecorder captures the single decision tool the responder LLM calls. +type decisionRecorder struct { + decision Decision + set bool +} + +func (d *decisionRecorder) tools() []copilot.Tool { + return []copilot.Tool{ + { + Name: toolRespond, + Description: "Answer the agent's question as the user. Call this exactly once with your answer.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "answer": map[string]any{ + "type": "string", + "description": "Your reply to the agent's question, consistent with your configuration.", + }, + }, + "required": []string{"answer"}, + }, + Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + var args struct { + Answer string `mapstructure:"answer"` + } + _ = mapstructure.Decode(inv.Arguments, &args) + d.decision = Decision{Kind: DecisionReply, Answer: args.Answer} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + { + Name: toolStop, + Description: "Signal that the agent has finished and needs no further input. Call this when there is no question to answer.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{}, + }, + Handler: func(copilot.ToolInvocation) (copilot.ToolResult, error) { + d.decision = Decision{Kind: DecisionStop} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + { + Name: toolAbstain, + Description: "Signal that you cannot answer the agent's question from your configuration. Call this only when the information is genuinely missing.", + Parameters: map[string]any{ + "type": "object", + "properties": map[string]any{ + "reason": map[string]any{ + "type": "string", + "description": "Why you cannot answer.", + }, + }, + "required": []string{"reason"}, + }, + Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + var args struct { + Reason string `mapstructure:"reason"` + } + _ = mapstructure.Decode(inv.Arguments, &args) + d.decision = Decision{Kind: DecisionAbstain, Reason: args.Reason} + d.set = true + return copilot.ToolResult{}, nil + }, + }, + } +} + +// Classifier is added in the next task; declared here so the package compiles +// with its dependencies referenced. +var _ = models.ResponderConfig{} diff --git a/internal/responder/responder_test.go b/internal/responder/responder_test.go new file mode 100644 index 00000000..fece423f --- /dev/null +++ b/internal/responder/responder_test.go @@ -0,0 +1,55 @@ +package responder + +import ( + "testing" + + copilot "github.com/github/copilot-sdk/go" + "github.com/stretchr/testify/require" +) + +func TestDecisionToolsRecordReply(t *testing.T) { + d := &decisionRecorder{} + tools := d.tools() + require.Len(t, tools, 3) + + respond := findTool(t, tools, toolRespond) + _, err := respond.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "research-agent"}, + }) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionReply, d.decision.Kind) + require.Equal(t, "research-agent", d.decision.Answer) +} + +func TestDecisionToolsRecordStop(t *testing.T) { + d := &decisionRecorder{} + stop := findTool(t, d.tools(), toolStop) + _, err := stop.Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionStop, d.decision.Kind) +} + +func TestDecisionToolsRecordAbstain(t *testing.T) { + d := &decisionRecorder{} + abstain := findTool(t, d.tools(), toolAbstain) + _, err := abstain.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"reason": "brief too vague"}, + }) + require.NoError(t, err) + require.True(t, d.set) + require.Equal(t, DecisionAbstain, d.decision.Kind) + require.Equal(t, "brief too vague", d.decision.Reason) +} + +func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { + t.Helper() + for _, tl := range tools { + if tl.Name == name { + return tl + } + } + t.Fatalf("tool %q not found", name) + return copilot.Tool{} +} From 6732d923db7ff4e14e05e776016cd80f1a4f1531 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:13:10 +0100 Subject: [PATCH 06/17] feat: add responder Classifier with persistent session #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/responder/responder.go | 74 +++++++++++++++++++++- internal/responder/responder_test.go | 94 ++++++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 3 deletions(-) diff --git a/internal/responder/responder.go b/internal/responder/responder.go index 9e88bab5..7387a35d 100644 --- a/internal/responder/responder.go +++ b/internal/responder/responder.go @@ -4,6 +4,8 @@ package responder import ( "context" + "errors" + "fmt" copilot "github.com/github/copilot-sdk/go" "github.com/go-viper/mapstructure/v2" @@ -113,6 +115,72 @@ func (d *decisionRecorder) tools() []copilot.Tool { } } -// Classifier is added in the next task; declared here so the package compiles -// with its dependencies referenced. -var _ = models.ResponderConfig{} +// Classifier maintains a persistent surrogate-user session and classifies each +// agent message into a Decision. +type Classifier struct { + exec Executor + model string + instructions string + sessionID string // empty until the first Classify creates the session +} + +// New constructs a Classifier. defaultModel is used when cfg.Model is empty. +func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier { + model := cfg.Model + if model == "" { + model = defaultModel + } + return &Classifier{ + exec: exec, + model: model, + instructions: cfg.Instructions, + } +} + +// Classify sends the agent's latest message to the responder LLM and returns +// its decision. The first call seeds the session with the responder +// instructions; subsequent calls resume the same session. +func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error) { + rec := &decisionRecorder{} + + req := &execution.ExecutionRequest{ + ModelID: c.model, + Message: c.buildMessage(agentMessage), + Tools: rec.tools(), + MessageMode: execution.MessageModeEnqueue, + Streaming: true, + SessionID: c.sessionID, + NoSkills: true, + EphemeralSession: true, + SkipWorkspaceCapture: true, + } + + resp, err := c.exec.Execute(ctx, req) + if err != nil { + if rec.set { + return rec.decision, nil + } + return Decision{}, fmt.Errorf("responder execution failed: %w", err) + } + if resp != nil && resp.SessionID != "" { + c.sessionID = resp.SessionID + } + if !rec.set { + return Decision{}, errors.New("responder did not call a decision tool") + } + return rec.decision, nil +} + +func (c *Classifier) buildMessage(agentMessage string) string { + if c.sessionID == "" { + return fmt.Sprintf( + "%s\n\nYou are role-playing as the user. The agent just said:\n\n%s\n\n"+ + "Respond by calling exactly one tool: %s to answer, %s if the agent is finished and needs nothing, or %s if you genuinely cannot answer from your configuration.", + c.instructions, agentMessage, toolRespond, toolStop, toolAbstain, + ) + } + return fmt.Sprintf( + "The agent just said:\n\n%s\n\nRespond by calling exactly one tool (%s, %s, or %s).", + agentMessage, toolRespond, toolStop, toolAbstain, + ) +} diff --git a/internal/responder/responder_test.go b/internal/responder/responder_test.go index fece423f..60b84e33 100644 --- a/internal/responder/responder_test.go +++ b/internal/responder/responder_test.go @@ -1,9 +1,12 @@ package responder import ( + "context" "testing" copilot "github.com/github/copilot-sdk/go" + "github.com/microsoft/waza/internal/execution" + "github.com/microsoft/waza/internal/models" "github.com/stretchr/testify/require" ) @@ -43,6 +46,97 @@ func TestDecisionToolsRecordAbstain(t *testing.T) { require.Equal(t, "brief too vague", d.decision.Reason) } +type fakeExecutor struct { + calls []*execution.ExecutionRequest + respond func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) +} + +func (f *fakeExecutor) Execute(_ context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + f.calls = append(f.calls, req) + return f.respond(req) +} + +func TestClassifyReply(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, err := findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "research-agent"}, + }) + require.NoError(t, err) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}, "gpt-4o") + d, err := c.Classify(context.Background(), "What is the agent name?") + require.NoError(t, err) + require.Equal(t, DecisionReply, d.Kind) + require.Equal(t, "research-agent", d.Answer) +} + +func TestClassifyAbstain(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolAbstain).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"reason": "no info"}, + }) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + d, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) + require.Equal(t, DecisionAbstain, d.Kind) + require.Equal(t, "no info", d.Reason) +} + +func TestClassifyNoDecisionToolIsError(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.Error(t, err) +} + +func TestClassifyUsesDefaultModelWhenUnset(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + require.Equal(t, "default-model", req.ModelID) + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "default-model") + _, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) +} + +func TestClassifyPersistsSession(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "a"}, + }) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "INSTR", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q1?") + require.NoError(t, err) + _, err = c.Classify(context.Background(), "Q2?") + require.NoError(t, err) + + require.Len(t, exec.calls, 2) + require.Empty(t, exec.calls[0].SessionID) + require.Contains(t, exec.calls[0].Message, "INSTR") + require.Contains(t, exec.calls[0].Message, "Q1?") + require.Equal(t, "resp-1", exec.calls[1].SessionID) + require.NotContains(t, exec.calls[1].Message, "INSTR") + require.Contains(t, exec.calls[1].Message, "Q2?") +} + func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { t.Helper() for _, tl := range tools { From 574035321d0843ede8a5c61456a1ef5278ef9cf9 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:14:46 +0100 Subject: [PATCH 07/17] feat: add ResponderInfo to RunResult #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/models/outcome.go | 21 +++++++++++++++++++++ internal/models/outcome_test.go | 21 +++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/internal/models/outcome.go b/internal/models/outcome.go index c1fba4cb..a47540ee 100644 --- a/internal/models/outcome.go +++ b/internal/models/outcome.go @@ -20,6 +20,15 @@ const ( StatusNA Status = "n/a" ) +// Responder outcome values recorded on RunResult.Responder.Outcome. +const ( + ResponderOutcomeCompleted = "completed" + ResponderOutcomeStopped = "stopped" + ResponderOutcomeAbstained = "abstained" + ResponderOutcomeCapExhausted = "cap_exhausted" + ResponderOutcomeError = "error" +) + // GraderKind identifies the type of grader (e.g. regex, file, code). type GraderKind string @@ -62,6 +71,17 @@ func AllGraderKinds() []string { return names } +// ResponderInfo records the outcome of a responder-driven multi-turn run. +type ResponderInfo struct { + // FollowupsSent is the number of responder answers sent to the agent. + FollowupsSent int `json:"followups_sent"` + // Outcome is one of: completed, stopped, abstained, cap_exhausted, error. + Outcome string `json:"outcome"` + // Reason holds the responder's reason when Outcome == "abstained" or an + // error message when Outcome == "error". + Reason string `json:"reason,omitempty"` +} + // EvaluationOutcome represents the complete result of an evaluation run type EvaluationOutcome struct { RunID string `json:"eval_id"` @@ -158,6 +178,7 @@ type RunResult struct { SkillInvocations []SkillInvocation `json:"skill_invocations,omitempty"` Usage *UsageStats `json:"usage,omitempty"` WorkspaceDir string `json:"workspace_dir,omitempty"` + Responder *ResponderInfo `json:"responder,omitempty"` } type GraderResults struct { diff --git a/internal/models/outcome_test.go b/internal/models/outcome_test.go index 6a97a593..47f2ec02 100644 --- a/internal/models/outcome_test.go +++ b/internal/models/outcome_test.go @@ -1,6 +1,7 @@ package models import ( + "encoding/json" "math" "testing" @@ -226,3 +227,23 @@ func TestAggregateUsageStats_AllNil(t *testing.T) { func TestAggregateUsageStats_Empty(t *testing.T) { require.Nil(t, AggregateUsageStats(nil)) } + +func TestResponderInfoSerializes(t *testing.T) { + rr := RunResult{ + RunNumber: 1, + Status: StatusError, + Responder: &ResponderInfo{ + FollowupsSent: 3, + Outcome: "abstained", + Reason: "brief too vague", + }, + } + data, err := json.Marshal(rr) + require.NoError(t, err) + require.Contains(t, string(data), `"responder"`) + require.Contains(t, string(data), `"outcome":"abstained"`) + + data2, err := json.Marshal(RunResult{RunNumber: 1, Status: StatusPassed}) + require.NoError(t, err) + require.NotContains(t, string(data2), `"responder"`) +} From 0670c60eb31ba62516ddad2430567a64b8061b98 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:17:08 +0100 Subject: [PATCH 08/17] refactor: add injectable responder classifier factory to runner #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/orchestration/runner.go | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index 97c9f35d..71de9317 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -19,6 +19,7 @@ import ( "github.com/microsoft/waza/internal/graders" "github.com/microsoft/waza/internal/hooks" "github.com/microsoft/waza/internal/models" + "github.com/microsoft/waza/internal/responder" "github.com/microsoft/waza/internal/template" "github.com/microsoft/waza/internal/transcript" "github.com/microsoft/waza/internal/utils" @@ -26,6 +27,12 @@ import ( copilot "github.com/github/copilot-sdk/go" ) +// responderClassifier classifies an agent message into a responder decision. +// Implemented by *responder.Classifier; faked in tests. +type responderClassifier interface { + Classify(ctx context.Context, agentMessage string) (responder.Decision, error) +} + // EvalRunner orchestrates the execution of tests. // // Deprecated alias: TestRunner is provided for backward compatibility. @@ -34,6 +41,10 @@ type EvalRunner struct { engine execution.AgentEngine verbose bool + // newClassifier builds a responder classifier for a task. Overridable in + // tests; defaults to a responder backed by the runner's engine. + newClassifier func(cfg models.ResponderConfig, defaultModel string) responderClassifier + // Task filtering taskFilters []string @@ -136,6 +147,9 @@ func NewEvalRunner(cfg *config.EvalConfig, engine execution.AgentEngine, opts .. verbose: cfg.Verbose(), listeners: []ProgressListener{}, } + r.newClassifier = func(cfg models.ResponderConfig, defaultModel string) responderClassifier { + return responder.New(r.engine, cfg, defaultModel) + } for _, o := range opts { o(r) } From 8c7bedc1e770dd660147fb61869974b90830cf27 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:20:40 +0100 Subject: [PATCH 09/17] feat: drive interactive skills via responder loop #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/orchestration/responder_loop_test.go | 104 ++++++++++++++++ internal/orchestration/runner.go | 116 +++++++++++++++++- 2 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 internal/orchestration/responder_loop_test.go diff --git a/internal/orchestration/responder_loop_test.go b/internal/orchestration/responder_loop_test.go new file mode 100644 index 00000000..070e3b47 --- /dev/null +++ b/internal/orchestration/responder_loop_test.go @@ -0,0 +1,104 @@ +package orchestration + +import ( + "context" + "testing" + + "github.com/microsoft/waza/internal/config" + "github.com/microsoft/waza/internal/execution" + "github.com/microsoft/waza/internal/models" + "github.com/microsoft/waza/internal/responder" + "github.com/stretchr/testify/require" +) + +// scriptedClassifier returns a queued sequence of decisions, repeating the last. +type scriptedClassifier struct { + decisions []responder.Decision + idx int + calls int +} + +func (s *scriptedClassifier) Classify(_ context.Context, _ string) (responder.Decision, error) { + s.calls++ + d := s.decisions[s.idx] + if s.idx < len(s.decisions)-1 { + s.idx++ + } + return d, nil +} + +func newResponderTestRunner(t *testing.T) *EvalRunner { + t.Helper() + spec := &models.EvalSpec{ + SpecIdentity: models.SpecIdentity{Name: "test-benchmark"}, + SkillName: "my-skill", + Config: models.Config{ + EngineType: "mock", + ModelID: "gpt-4", + TimeoutSec: 120, + }, + } + cfg := config.NewEvalConfig(spec) + engine := execution.NewMockEngine("gpt-4") + require.NoError(t, engine.Initialize(context.Background())) + t.Cleanup(func() { require.NoError(t, engine.Shutdown(context.Background())) }) + return NewEvalRunner(cfg, engine, WithSkipGraders()) +} + +func TestResponderLoopReplyThenStop(t *testing.T) { + r := newResponderTestRunner(t) + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionReply, Answer: "research-agent"}, + {Kind: responder.DecisionStop}, + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeStopped, rr.Responder.Outcome) + require.Equal(t, 1, rr.Responder.FollowupsSent) +} + +func TestResponderLoopAbstainMarksError(t *testing.T) { + r := newResponderTestRunner(t) + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionAbstain, Reason: "too vague"}, + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 5}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.Equal(t, models.StatusError, rr.Status) + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeAbstained, rr.Responder.Outcome) + require.Contains(t, rr.ErrorMsg, "abstained") + require.Contains(t, rr.ErrorMsg, "too vague") +} + +func TestResponderLoopCapExhausted(t *testing.T) { + r := newResponderTestRunner(t) + sc := &scriptedClassifier{decisions: []responder.Decision{ + {Kind: responder.DecisionReply, Answer: "a"}, + }} + r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } + + tc := &models.TestCase{ + TestID: "t1", + Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 2}}, + } + rr := r.executeRun(context.Background(), tc, 1) + + require.NotNil(t, rr.Responder) + require.Equal(t, models.ResponderOutcomeCapExhausted, rr.Responder.Outcome) + require.Equal(t, 2, rr.Responder.FollowupsSent) + require.NotEqual(t, models.StatusError, rr.Status) +} diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index 71de9317..2b0790be 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -3,6 +3,7 @@ package orchestration import ( "context" "fmt" + "log/slog" "math" "os" "path/filepath" @@ -1086,8 +1087,12 @@ func (r *EvalRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum }) } - // Execute follow-up prompts if defined - if len(tc.Stimulus.FollowUps) > 0 { + // Drive multi-turn: responder loop takes precedence; otherwise static + // follow-ups. Validation guarantees these are mutually exclusive. + var responderInfo *models.ResponderInfo + if tc.Stimulus.Responder != nil { + responderInfo = r.executeResponderLoop(ctx, tc, resp) + } else if len(tc.Stimulus.FollowUps) > 0 { r.executeFollowUps(ctx, tc, resp) } @@ -1167,6 +1172,7 @@ func (r *EvalRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum ErrorMsg: resp.ErrorMsg, SkillInvocations: skillInvocations, WorkspaceDir: resp.WorkspaceDir, + Responder: responderInfo, } } @@ -1288,6 +1294,112 @@ func (r *EvalRunner) executeFollowUps(ctx context.Context, tc *models.TestCase, } } +// executeResponderLoop drives a multi-turn run using an LLM-backed surrogate +// user. After each agent turn it classifies the agent's latest message and +// either replies (sending a new agent prompt), stops, or aborts on abstain. +// It mutates resp in place (mirroring executeFollowUps) and returns a summary. +func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse) *models.ResponderInfo { + cfg := *tc.Stimulus.Responder + classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID) + + info := &models.ResponderInfo{Outcome: models.ResponderOutcomeCompleted} + left := cfg.MaxFollowups + lastWasReply := false + + for left > 0 { + decision, err := classifier.Classify(ctx, resp.FinalOutput) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder error: %v", err) + info.Outcome = models.ResponderOutcomeError + info.Reason = err.Error() + return info + } + + switch decision.Kind { + case responder.DecisionStop: + info.Outcome = models.ResponderOutcomeStopped + return info + + case responder.DecisionAbstain: + resp.ErrorMsg = fmt.Sprintf("responder abstained: %s", decision.Reason) + info.Outcome = models.ResponderOutcomeAbstained + info.Reason = decision.Reason + return info + + case responder.DecisionReply: + if !r.sendResponderReply(ctx, tc, resp, decision.Answer, info.FollowupsSent+1) { + info.Outcome = models.ResponderOutcomeError + info.Reason = resp.ErrorMsg + return info + } + info.FollowupsSent++ + left-- + lastWasReply = true + } + } + + if lastWasReply { + info.Outcome = models.ResponderOutcomeCapExhausted + slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions", + "test", tc.DisplayName, "max_followups", cfg.MaxFollowups) + } + return info +} + +// sendResponderReply sends one responder answer to the agent session, reusing +// the session and workspace, and merges the agent's response into resp. It +// returns false (and sets resp.ErrorMsg) on failure. +func (r *EvalRunner) sendResponderReply(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse, answer string, turn int) bool { + followReq, err := r.buildExecutionRequest(tc) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) + return false + } + followReq.Message = answer + followReq.SessionID = resp.SessionID + followReq.WorkspaceDir = resp.WorkspaceDir + + if r.verbose { + r.notifyProgress(ProgressEvent{ + EventType: EventAgentPrompt, + TestName: tc.DisplayName, + Details: map[string]any{"message": answer, "responder_reply": turn}, + }) + } + + timeout, err := r.executionTimeout(tc) + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) + return false + } + followCtx, cancelFollow := context.WithTimeout(ctx, timeout) + followResp, err := r.engine.Execute(followCtx, followReq) + cancelFollow() + if err != nil { + resp.ErrorMsg = fmt.Sprintf("responder reply %d failed: %v", turn, err) + return false + } + if followResp.ErrorMsg != "" { + resp.ErrorMsg = fmt.Sprintf("responder reply %d: %s", turn, followResp.ErrorMsg) + return false + } + + resp.Events = append(resp.Events, followResp.Events...) + resp.ToolCalls = append(resp.ToolCalls, followResp.ToolCalls...) + resp.SkillInvocations = append(resp.SkillInvocations, followResp.SkillInvocations...) + resp.DurationMs += followResp.DurationMs + resp.FinalOutput = followResp.FinalOutput + resp.WorkspaceFiles = followResp.WorkspaceFiles + if followResp.Usage != nil { + if resp.Usage == nil { + resp.Usage = followResp.Usage + } else { + resp.Usage = models.AggregateUsageStats([]*models.UsageStats{resp.Usage, followResp.Usage}) + } + } + return true +} + func (r *EvalRunner) loadResources(tc *models.TestCase) []execution.ResourceFile { var resources []execution.ResourceFile From 857161424528796d46d15f32b08c0ec37af6c211 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:35:48 +0100 Subject: [PATCH 10/17] fix: use persistent responder session with explicit teardown #303 Responder Classify used EphemeralSession=true, which the engine deletes after the first turn, breaking session resume and dropping instructions on every subsequent turn. Switch to a persistent (non-ephemeral) session, add Classifier.Close plus CopilotEngine.DeleteSession to tear it down explicitly, and call Close via defer at the end of the responder loop with a detached context so cleanup runs even on cancellation. Capture sessionID before the error check so an error-with-decision still persists the session id. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/execution/copilot.go | 14 ++++ internal/orchestration/responder_loop_test.go | 2 + internal/orchestration/runner.go | 16 ++++- internal/responder/responder.go | 48 ++++++++++---- internal/responder/responder_test.go | 65 +++++++++++++++++++ 5 files changed, 132 insertions(+), 13 deletions(-) diff --git a/internal/execution/copilot.go b/internal/execution/copilot.go index bc994a49..beaa4f9e 100644 --- a/internal/execution/copilot.go +++ b/internal/execution/copilot.go @@ -574,6 +574,20 @@ func (e *CopilotEngine) doShutdown(ctx context.Context) error { return nil } +// DeleteSession removes a persistent session created via Execute (with +// EphemeralSession=false) and stops tracking it for shutdown cleanup. It is +// used by callers that own a long-lived session, such as the responder, to +// tear it down promptly rather than waiting for engine Shutdown. +func (e *CopilotEngine) DeleteSession(ctx context.Context, sessionID string) error { + if sessionID == "" { + return nil + } + e.sessionsMu.Lock() + delete(e.sessions, sessionID) + e.sessionsMu.Unlock() + return e.client.DeleteSession(ctx, sessionID) +} + // SessionUsage returns the final usage stats for a session. Call after Shutdown() // to get data from session.shutdown events (ModelMetrics, TotalPremiumRequests). // When BYOK was active for this session, the returned stats include sanitized diff --git a/internal/orchestration/responder_loop_test.go b/internal/orchestration/responder_loop_test.go index 070e3b47..5b61bf54 100644 --- a/internal/orchestration/responder_loop_test.go +++ b/internal/orchestration/responder_loop_test.go @@ -27,6 +27,8 @@ func (s *scriptedClassifier) Classify(_ context.Context, _ string) (responder.De return d, nil } +func (s *scriptedClassifier) Close(_ context.Context) error { return nil } + func newResponderTestRunner(t *testing.T) *EvalRunner { t.Helper() spec := &models.EvalSpec{ diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index 2b0790be..4b458714 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -28,10 +28,12 @@ import ( copilot "github.com/github/copilot-sdk/go" ) -// responderClassifier classifies an agent message into a responder decision. -// Implemented by *responder.Classifier; faked in tests. +// responderClassifier classifies an agent message into a responder decision +// and tears down its session when the run finishes. Implemented by +// *responder.Classifier; faked in tests. type responderClassifier interface { Classify(ctx context.Context, agentMessage string) (responder.Decision, error) + Close(ctx context.Context) error } // EvalRunner orchestrates the execution of tests. @@ -1301,6 +1303,16 @@ func (r *EvalRunner) executeFollowUps(ctx context.Context, tc *models.TestCase, func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse) *models.ResponderInfo { cfg := *tc.Stimulus.Responder classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID) + defer func() { + // Tear down the persistent responder session with a detached context so + // cleanup still runs even if ctx was cancelled during the run. + cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + defer cancel() + if err := classifier.Close(cleanupCtx); err != nil { + slog.WarnContext(ctx, "failed to clean up responder session", + "test", tc.DisplayName, "error", err) + } + }() info := &models.ResponderInfo{Outcome: models.ResponderOutcomeCompleted} left := cfg.MaxFollowups diff --git a/internal/responder/responder.go b/internal/responder/responder.go index 7387a35d..f6f59cfe 100644 --- a/internal/responder/responder.go +++ b/internal/responder/responder.go @@ -45,6 +45,13 @@ type Executor interface { Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) } +// sessionDeleter is an optional capability for explicitly tearing down a +// persistent session. *execution.CopilotEngine implements it; engines that do +// not (e.g. the mock) leave Close as a no-op. +type sessionDeleter interface { + DeleteSession(ctx context.Context, sessionID string) error +} + // decisionRecorder captures the single decision tool the responder LLM calls. type decisionRecorder struct { decision Decision @@ -144,33 +151,52 @@ func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decisio rec := &decisionRecorder{} req := &execution.ExecutionRequest{ - ModelID: c.model, - Message: c.buildMessage(agentMessage), - Tools: rec.tools(), - MessageMode: execution.MessageModeEnqueue, - Streaming: true, - SessionID: c.sessionID, - NoSkills: true, - EphemeralSession: true, + ModelID: c.model, + Message: c.buildMessage(agentMessage), + Tools: rec.tools(), + MessageMode: execution.MessageModeEnqueue, + Streaming: true, + SessionID: c.sessionID, + NoSkills: true, + // The responder session must persist across turns so it can be resumed + // (and so its instructions need only be sent once). It is torn down + // explicitly via Close. EphemeralSession would delete it after the + // first turn, breaking resume. + EphemeralSession: false, SkipWorkspaceCapture: true, } resp, err := c.exec.Execute(ctx, req) + if resp != nil && resp.SessionID != "" { + c.sessionID = resp.SessionID + } if err != nil { if rec.set { return rec.decision, nil } return Decision{}, fmt.Errorf("responder execution failed: %w", err) } - if resp != nil && resp.SessionID != "" { - c.sessionID = resp.SessionID - } if !rec.set { return Decision{}, errors.New("responder did not call a decision tool") } return rec.decision, nil } +// Close tears down the persistent responder session if one was created. It is +// safe to call multiple times and is a no-op when the underlying executor does +// not support explicit session deletion. +func (c *Classifier) Close(ctx context.Context) error { + if c.sessionID == "" { + return nil + } + sessionID := c.sessionID + c.sessionID = "" + if d, ok := c.exec.(sessionDeleter); ok { + return d.DeleteSession(ctx, sessionID) + } + return nil +} + func (c *Classifier) buildMessage(agentMessage string) string { if c.sessionID == "" { return fmt.Sprintf( diff --git a/internal/responder/responder_test.go b/internal/responder/responder_test.go index 60b84e33..c7cf0e31 100644 --- a/internal/responder/responder_test.go +++ b/internal/responder/responder_test.go @@ -137,6 +137,71 @@ func TestClassifyPersistsSession(t *testing.T) { require.Contains(t, exec.calls[1].Message, "Q2?") } +func TestClassifyUsesPersistentSession(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) + + require.Len(t, exec.calls, 1) + require.False(t, exec.calls[0].EphemeralSession, + "responder must use a persistent (non-ephemeral) session so it can be resumed across turns") +} + +// deletingExecutor records the sessions it is asked to delete. +type deletingExecutor struct { + fakeExecutor + deleted []string +} + +func (d *deletingExecutor) DeleteSession(_ context.Context, sessionID string) error { + d.deleted = append(d.deleted, sessionID) + return nil +} + +func TestCloseDeletesSession(t *testing.T) { + exec := &deletingExecutor{} + exec.respond = func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) + + require.NoError(t, c.Close(context.Background())) + require.Equal(t, []string{"resp-1"}, exec.deleted) + + // Close is idempotent: the session id is cleared after the first call. + require.NoError(t, c.Close(context.Background())) + require.Equal(t, []string{"resp-1"}, exec.deleted) +} + +func TestCloseWithoutSessionIsNoop(t *testing.T) { + exec := &deletingExecutor{} + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + require.NoError(t, c.Close(context.Background())) + require.Empty(t, exec.deleted) +} + +func TestCloseWithoutDeleterIsNoop(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.NoError(t, err) + require.NoError(t, c.Close(context.Background())) +} + func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { t.Helper() for _, tl := range tools { From 4cf3968d2365e07149be3c6ecfb4e66073fe8afe Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:38:21 +0100 Subject: [PATCH 11/17] feat: add inputs.responder to task JSON schema #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/validation/schema_test.go | 13 +++++++++++++ schemas/task.schema.json | 22 ++++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/internal/validation/schema_test.go b/internal/validation/schema_test.go index ea7ea2ad..a345392f 100644 --- a/internal/validation/schema_test.go +++ b/internal/validation/schema_test.go @@ -125,6 +125,19 @@ inputs: require.Empty(t, errs, "task with instruction_files should have no errors") } +func TestValidateTaskBytes_Responder(t *testing.T) { + yaml := `id: task-1 +name: Configure agent +inputs: + prompt: "add agent" + responder: + instructions: "be research-agent; abstain if unknown" + max_followups: 8 +` + errs := ValidateTaskBytes([]byte(yaml)) + require.Empty(t, errs, "task with inputs.responder should have no errors") +} + func TestValidateTaskBytes_Invalid(t *testing.T) { errs := ValidateTaskBytes([]byte(invalidTaskYAML)) require.NotEmpty(t, errs, "invalid task should have errors") diff --git a/schemas/task.schema.json b/schemas/task.schema.json index 332259c5..9722c777 100644 --- a/schemas/task.schema.json +++ b/schemas/task.schema.json @@ -135,6 +135,28 @@ "type": "string" }, "description": "Environment variables set during task execution." + }, + "responder": { + "type": "object", + "additionalProperties": false, + "required": ["instructions", "max_followups"], + "description": "LLM-backed surrogate user that answers the skill's follow-up questions. Mutually exclusive with follow_up_prompts.", + "properties": { + "model": { + "type": "string", + "description": "Model used for the responder LLM. Defaults to the eval-level config.model." + }, + "instructions": { + "type": "string", + "minLength": 1, + "description": "Describes the target configuration the responder represents and the rule for abstaining." + }, + "max_followups": { + "type": "integer", + "minimum": 1, + "description": "Maximum number of responder replies before the loop stops." + } + } } } }, From 13552633ecd199a5ae5bb786c84e38d54bbed672 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:40:16 +0100 Subject: [PATCH 12/17] docs: document inputs.responder for interactive skills #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 28 ++++++++++++++++++++++ site/src/content/docs/guides/eval-yaml.mdx | 19 +++++++++++++++ site/src/content/docs/reference/schema.mdx | 23 ++++++++++++++++++ 3 files changed, 70 insertions(+) diff --git a/README.md b/README.md index dc02e6c5..ea3d4240 100644 --- a/README.md +++ b/README.md @@ -1193,6 +1193,34 @@ Waza automatically removes each worktree on engine shutdown (`git worktree remov **Out of scope today** (tracked separately): HTTPS / SSH clone strategies, submodules, Git LFS, and auto-detecting "the repo this test is running in" without an explicit `source`. +## Responder (interactive skills) + +For skills that ask follow-up questions, configure a `responder` — an LLM that plays the user and answers the skill's questions. It is mutually exclusive with `follow_up_prompts`. + +```yaml +# task.yaml +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o # optional; defaults to config.model + instructions: | + The agent you want is "research-agent" with system instructions + "Search the web and summarise findings", tools web_search + url_fetch, + and no handoffs. Answer the skill's questions consistently with this. + If you genuinely can't infer an answer, abstain. + max_followups: 8 +``` + +After each agent turn the responder either **replies** (the answer is sent back, continuing the conversation), **stops** (the agent is done), or **abstains** — which fails the run with a distinct `abstained` outcome, signalling the brief is too vague. If `max_followups` is reached while the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state. Each task carries its own responder, so the same skill can be tested against several target configurations. + +**Fields** (under `inputs.responder`): + +| Field | Required | Description | +|---|---|---| +| `instructions` | yes | The target configuration the responder represents and the rule for abstaining. | +| `max_followups` | yes | Maximum number of responder replies before the loop stops (`>= 1`). | +| `model` | no | Model used for the responder LLM. Defaults to the eval-level `config.model`. | + ## CI/CD Integration Waza is designed to work seamlessly with CI/CD pipelines. diff --git a/site/src/content/docs/guides/eval-yaml.mdx b/site/src/content/docs/guides/eval-yaml.mdx index 33d5c00f..05eb4a48 100644 --- a/site/src/content/docs/guides/eval-yaml.mdx +++ b/site/src/content/docs/guides/eval-yaml.mdx @@ -390,6 +390,25 @@ inputs: This is useful for evaluating multi-turn conversations where each step builds on the previous one. Graders run only after all prompts (initial + follow-ups) have completed, so the final output reflects the full conversation. +### Responder (Interactive Skills) + +For skills that ask follow-up questions, configure a `responder` — an LLM that plays the user and answers the skill's questions. It is mutually exclusive with `follow_up_prompts`. + +```yaml +inputs: + prompt: "Add a new agent to my application" + responder: + model: gpt-4o # optional; defaults to config.model + instructions: | + The agent you want is "research-agent" with system instructions + "Search the web and summarise findings", tools web_search + url_fetch, + and no handoffs. Answer the skill's questions consistently with this. + If you genuinely can't infer an answer, abstain. + max_followups: 8 +``` + +After each agent turn the responder either **replies** (the answer is sent back, continuing the conversation), **stops** (the agent is done), or **abstains** — which fails the run with a distinct `abstained` outcome, signalling the brief is too vague. If `max_followups` is reached while the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state. Each task carries its own responder, so the same skill can be tested against several target configurations. + Prompt supports templating: ```yaml diff --git a/site/src/content/docs/reference/schema.mdx b/site/src/content/docs/reference/schema.mdx index 3dbd0986..2c976197 100644 --- a/site/src/content/docs/reference/schema.mdx +++ b/site/src/content/docs/reference/schema.mdx @@ -506,6 +506,29 @@ inputs: Graders evaluate only the final state after all prompts complete. If any follow-up fails, remaining prompts are skipped and the run is marked as an error. +### responder + +**Type:** object +**Required:** no + +An LLM-backed surrogate user that answers the skill's follow-up questions during a multi-turn run. Mutually exclusive with `follow_up_prompts`. + +| Field | Type | Required | Description | +|----------------|---------|----------|--------------------------------------------------------| +| `model` | string | no | Responder model. Defaults to the eval-level `config.model`. | +| `instructions` | string | yes | Target configuration the responder represents + abstain rule. | +| `max_followups`| integer | yes | Max responder replies before the loop stops (`>= 1`). | + +```yaml +inputs: + prompt: "Add a new agent to my application" + responder: + instructions: "Be research-agent with tools web_search; abstain if unknown." + max_followups: 8 +``` + +The responder classifies each agent message as **reply**, **stop**, or **abstain**. An abstain marks the run as an error with outcome `abstained`, distinct from model timeouts or network errors. If `max_followups` is reached while the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state. + ### files **Type:** array From 5b5b0c8ac30f05600a281743b184eb657d60dd8c Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:43:56 +0100 Subject: [PATCH 13/17] feat: surface responder outcome in dashboard #303 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/webapi/additional_test.go | 43 ++++++++++++++++++++++++++++++ internal/webapi/store.go | 7 +++++ internal/webapi/types.go | 8 ++++++ web/src/api/client.ts | 7 +++++ web/src/components/RunDetail.tsx | 27 ++++++++++++++++++- 5 files changed, 91 insertions(+), 1 deletion(-) diff --git a/internal/webapi/additional_test.go b/internal/webapi/additional_test.go index 693bffaf..511ef63e 100644 --- a/internal/webapi/additional_test.go +++ b/internal/webapi/additional_test.go @@ -270,6 +270,49 @@ func TestOutcomeToDetailMapsStatsTranscriptAndDigest(t *testing.T) { } } +func TestOutcomeToDetailMapsResponder(t *testing.T) { + outcome := &models.EvaluationOutcome{ + RunID: "responder-run", + BenchName: "bench-responder", + Setup: models.OutcomeSetup{ModelID: "gpt-4o"}, + Digest: models.OutcomeDigest{TotalTests: 1, Succeeded: 1}, + TestOutcomes: []models.TestOutcome{ + { + DisplayName: "task-with-responder", + Status: models.StatusPassed, + Runs: []models.RunResult{ + { + Responder: &models.ResponderInfo{ + FollowupsSent: 2, + Outcome: models.ResponderOutcomeAbstained, + Reason: "too vague", + }, + }, + }, + }, + }, + } + + detail := outcomeToDetail(outcome) + + if len(detail.Tasks) != 1 { + t.Fatalf("expected 1 task, got %d", len(detail.Tasks)) + } + responder := detail.Tasks[0].Responder + if responder == nil { + t.Fatal("expected responder") + } + if responder.Outcome != "abstained" { + t.Errorf("expected outcome abstained, got %q", responder.Outcome) + } + if responder.FollowupsSent != 2 { + t.Errorf("expected 2 followups sent, got %d", responder.FollowupsSent) + } + if responder.Reason != "too vague" { + t.Errorf("expected reason too vague, got %q", responder.Reason) + } +} + func TestOutcomeToDetailNoTasks(t *testing.T) { detail := outcomeToDetail(&models.EvaluationOutcome{ RunID: "empty", diff --git a/internal/webapi/store.go b/internal/webapi/store.go index 1cbd3ec3..032d4558 100644 --- a/internal/webapi/store.go +++ b/internal/webapi/store.go @@ -211,6 +211,13 @@ func outcomeToDetail(o *models.EvaluationOutcome) *RunDetail { } tr.Transcript = mapTranscriptEvents(run.Transcript) tr.SessionDigest = mapSessionDigest(&run.SessionDigest) + if run.Responder != nil { + tr.Responder = &ResponderInfoResponse{ + FollowupsSent: run.Responder.FollowupsSent, + Outcome: run.Responder.Outcome, + Reason: run.Responder.Reason, + } + } } if tr.GraderResults == nil { tr.GraderResults = []GraderResult{} diff --git a/internal/webapi/types.go b/internal/webapi/types.go index 59de4b5b..0998edbe 100644 --- a/internal/webapi/types.go +++ b/internal/webapi/types.go @@ -33,6 +33,7 @@ type TaskResult struct { GraderResults []GraderResult `json:"graderResults"` Transcript []TranscriptEventResponse `json:"transcript,omitempty"` SessionDigest *SessionDigestResponse `json:"sessionDigest,omitempty"` + Responder *ResponderInfoResponse `json:"responder,omitempty"` BootstrapCI *ConfidenceIntervalResponse `json:"bootstrapCI,omitempty"` IsSignificant *bool `json:"isSignificant,omitempty"` } @@ -68,6 +69,13 @@ type SessionDigestResponse struct { Errors []string `json:"errors"` } +// ResponderInfoResponse is the API representation of a responder-driven run summary. +type ResponderInfoResponse struct { + FollowupsSent int `json:"followupsSent"` + Outcome string `json:"outcome"` + Reason string `json:"reason,omitempty"` +} + // GraderResult is a single grader/validator result. type GraderResult struct { Name string `json:"name"` diff --git a/web/src/api/client.ts b/web/src/api/client.ts index ec6427a4..48ad9939 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -59,6 +59,12 @@ export interface SessionDigest { errors: string[]; } +export interface ResponderInfo { + followupsSent: number; + outcome: string; + reason?: string; +} + export interface TaskResult { name: string; outcome: string; @@ -68,6 +74,7 @@ export interface TaskResult { graderResults: GraderResult[]; transcript?: TranscriptEvent[]; sessionDigest?: SessionDigest; + responder?: ResponderInfo; bootstrapCI?: BootstrapCI; isSignificant?: boolean; } diff --git a/web/src/components/RunDetail.tsx b/web/src/components/RunDetail.tsx index c6dcdcae..00236cb4 100644 --- a/web/src/components/RunDetail.tsx +++ b/web/src/components/RunDetail.tsx @@ -9,7 +9,7 @@ import { Download, } from "lucide-react"; import { useRunDetail } from "../hooks/useApi"; -import type { TaskResult, GraderResult } from "../api/client"; +import type { TaskResult, GraderResult, ResponderInfo } from "../api/client"; import { formatDuration, formatCost, @@ -96,6 +96,30 @@ function SignificanceBadge({ isSignificant }: { isSignificant?: boolean }) { ); } +function ResponderBadge({ responder }: { responder?: ResponderInfo }) { + if (!responder) return null; + + let className = + "inline-flex items-center rounded-full px-2 py-0.5 text-xs font-medium"; + if (responder.outcome === "abstained" || responder.outcome === "error") { + className += " bg-red-500/10 text-red-400"; + } else if (responder.outcome === "cap_exhausted") { + className += " bg-yellow-500/10 text-yellow-400"; + } else { + className += " bg-zinc-700 text-zinc-300"; + } + + const replyLabel = responder.followupsSent === 1 ? "reply" : "replies"; + const reason = responder.reason ? ` — ${responder.reason}` : ""; + + return ( + + Responder: {responder.outcome} ({responder.followupsSent} {replyLabel}) + {reason} + + ); +} + function CIRange({ lower, upper }: { lower: number; upper: number }) { return ( )} {task.name} + From 60e3645abcb9383321bfa7c43e4afb38a3228927 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:50:13 +0100 Subject: [PATCH 14/17] fix: rebuild dashboard bundle and fix lint misspelling #303 Rebuild web/dist/index.html so its asset hash matches the freshly built bundle (fixes TestIndexHTMLReferencesExistingAssets after the responder dashboard change) and correct a misspelling flagged by golangci-lint in the responder cleanup comment. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/orchestration/runner.go | 2 +- web/dist/index.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index 4b458714..3a8fe3ac 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -1305,7 +1305,7 @@ func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCa classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID) defer func() { // Tear down the persistent responder session with a detached context so - // cleanup still runs even if ctx was cancelled during the run. + // cleanup still runs even if ctx was canceled during the run. cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() if err := classifier.Close(cleanupCtx); err != nil { diff --git a/web/dist/index.html b/web/dist/index.html index 4f675d7e..40fc3d36 100644 --- a/web/dist/index.html +++ b/web/dist/index.html @@ -4,7 +4,7 @@ waza — eval dashboard - + From 11a66e57538777b425b0a76f41fc7353e1c36303 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 15:55:25 +0100 Subject: [PATCH 15/17] fix: free responder session usage collector on DeleteSession #303 A non-ephemeral session registers in both e.sessions and e.usageCollectors, but DeleteSession only removed it from e.sessions, orphaning the usage collector for the engine's lifetime. Each responder-driven task leaked one collector; under concurrent runs this accumulated monotonically. Also delete the usageCollectors entry under its mutex. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/execution/copilot.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/internal/execution/copilot.go b/internal/execution/copilot.go index beaa4f9e..581853b3 100644 --- a/internal/execution/copilot.go +++ b/internal/execution/copilot.go @@ -585,6 +585,11 @@ func (e *CopilotEngine) DeleteSession(ctx context.Context, sessionID string) err e.sessionsMu.Lock() delete(e.sessions, sessionID) e.sessionsMu.Unlock() + + e.usageCollectorsMu.Lock() + delete(e.usageCollectors, sessionID) + e.usageCollectorsMu.Unlock() + return e.client.DeleteSession(ctx, sessionID) } From 59c3d51afb9d1a20ee74856bd4089924544db183 Mon Sep 17 00:00:00 2001 From: Adam Dougal Date: Fri, 29 May 2026 16:50:14 +0100 Subject: [PATCH 16/17] chore: remove implementation plan --- ...-interactive-skills-implementation-plan.md | 1310 ----------------- 1 file changed, 1310 deletions(-) delete mode 100644 docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md diff --git a/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md b/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md deleted file mode 100644 index 44659302..00000000 --- a/docs/plans/2026-05-29-responder-interactive-skills-implementation-plan.md +++ /dev/null @@ -1,1310 +0,0 @@ -# Responder (Interactive Skills) Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add a per-task LLM-backed "responder" that answers an interactive skill's follow-up questions (reply / stop / abstain), driving multi-turn agent runs automatically. - -**Architecture:** A new `internal/responder` package owns an LLM surrogate-user session that classifies each agent message via structured tool-calling. The `EvalRunner` owns a follow-up loop (`executeResponderLoop`) that reuses the existing agent session/workspace plumbing (mirroring `executeFollowUps`). Config lives per task under `inputs.responder`; abstain is tagged on the run as a distinct `StatusError` outcome. - -**Tech Stack:** Go 1.26, Copilot SDK (`github.com/github/copilot-sdk/go`), `go-viper/mapstructure/v2`, `gopkg.in/yaml.v3`, `santhosh-tekuri/jsonschema/v6`. Tests use the repo's existing `testify`-style assertions and the in-tree `MockEngine` / fake-executor patterns. - -**Design doc:** `docs/plans/2026-05-29-responder-interactive-skills.md` - -**Conventions:** -- Build: `go build ./...` · Test: `go test ./...` · Lint: `golangci-lint run` -- Run a single test: `go test ./internal// -run '^TestName$' -v` -- Conventional commits, reference `#303`. Append the trailer: - `Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>` - (Commits are signed manually by the maintainer — when a step says "Commit", stage the files and present the commit command; the maintainer runs it.) - ---- - -## File Structure - -**Create:** -- `internal/responder/responder.go` — `Classifier`, `Decision`, `DecisionKind`, `New`, `Classify`, decision tools. -- `internal/responder/responder_test.go` — unit tests with a fake executor. - -**Modify:** -- `internal/models/testcase.go` — add `ResponderConfig`, `TaskStimulus.Responder`, validation + mutual exclusivity. -- `internal/models/testcase_test.go` — validation tests. -- `internal/models/outcome.go` — add `ResponderInfo` + `RunResult.Responder`. -- `internal/orchestration/runner.go` — classifier factory field, `executeResponderLoop`, branch in `executeRun`, set `RunResult.Responder`/status. -- `internal/orchestration/runner_test.go` (or a new `responder_loop_test.go`) — loop tests with mock engine + fake classifier. -- `schemas/task.schema.json` — add `responder` to the `inputs` `$def`. -- `README.md` — responder section + YAML example. -- `site/src/content/docs/guides/eval-yaml.mdx` and `site/src/content/docs/reference/schema.mdx` — document `inputs.responder`. - -**Note on scope:** The dashboard (`web/`) surfacing of `responder.outcome` is included as the final task. The pre-existing placement of `follow_up_prompts` in the task schema (it sits at task top-level, while the Go model reads it under `inputs`) is out of scope — do not "fix" it. - ---- - -## Task 1: Config model — `ResponderConfig` and `TaskStimulus.Responder` - -**Files:** -- Modify: `internal/models/testcase.go` -- Test: `internal/models/testcase_test.go` - -- [ ] **Step 1: Write the failing test** - -Add to `internal/models/testcase_test.go`: - -```go -func TestResponderConfigParsesUnderInputs(t *testing.T) { - dir := t.TempDir() - path := filepath.Join(dir, "task.yaml") - yaml := ` -id: configure-agent -name: Configure a research agent -inputs: - prompt: "Add a new agent to my application" - responder: - model: gpt-4o - instructions: | - The agent you want is research-agent with tools web_search. - If you can't infer an answer, abstain. - max_followups: 8 -` - require.NoError(t, os.WriteFile(path, []byte(yaml), 0o600)) - - tc, err := LoadTestCase(path) - require.NoError(t, err) - require.NotNil(t, tc.Stimulus.Responder) - require.Equal(t, "gpt-4o", tc.Stimulus.Responder.Model) - require.Equal(t, 8, tc.Stimulus.Responder.MaxFollowups) - require.Contains(t, tc.Stimulus.Responder.Instructions, "research-agent") -} -``` - -If `require`/`os`/`filepath` are not already imported in the test file, add them. - -- [ ] **Step 2: Run test to verify it fails** - -Run: `go test ./internal/models/ -run '^TestResponderConfigParsesUnderInputs$' -v` -Expected: FAIL — compile error `tc.Stimulus.Responder undefined` (field does not exist yet). - -- [ ] **Step 3: Add the type and field** - -In `internal/models/testcase.go`, add the new type after the `TaskStimulus` struct: - -```go -// ResponderConfig configures an LLM-backed surrogate user that answers a -// skill's follow-up questions during a multi-turn run. It is mutually -// exclusive with TaskStimulus.FollowUps. -type ResponderConfig struct { - // Model is the model used for the responder LLM. Optional; when empty the - // eval-level config.model is used. - Model string `yaml:"model,omitempty" json:"model,omitempty"` - // Instructions describe the target configuration the responder represents - // and the rule for abstaining. Required. - Instructions string `yaml:"instructions" json:"instructions"` - // MaxFollowups caps how many times the responder may reply before the loop - // stops. Required; must be >= 1. - MaxFollowups int `yaml:"max_followups" json:"max_followups"` -} -``` - -Then add the field to `TaskStimulus` (place it after `FollowUps`): - -```go - Responder *ResponderConfig `yaml:"responder,omitempty" json:"responder,omitempty"` -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `go test ./internal/models/ -run '^TestResponderConfigParsesUnderInputs$' -v` -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add internal/models/testcase.go internal/models/testcase_test.go -git commit -m "feat: add inputs.responder config model #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 2: Config validation — required fields + mutual exclusivity - -**Files:** -- Modify: `internal/models/testcase.go` (the `TestCase.Validate` method, ~line 311) -- Test: `internal/models/testcase_test.go` - -- [ ] **Step 1: Write the failing tests** - -Add to `internal/models/testcase_test.go`: - -```go -func TestResponderValidationRejectsMissingInstructions(t *testing.T) { - tc := &TestCase{ - TestID: "t1", - Stimulus: TaskStimulus{ - Message: "go", - Responder: &ResponderConfig{MaxFollowups: 3}, - }, - } - err := tc.Validate() - require.Error(t, err) - require.Contains(t, err.Error(), "instructions") -} - -func TestResponderValidationRejectsZeroMaxFollowups(t *testing.T) { - tc := &TestCase{ - TestID: "t1", - Stimulus: TaskStimulus{ - Message: "go", - Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 0}, - }, - } - err := tc.Validate() - require.Error(t, err) - require.Contains(t, err.Error(), "max_followups") -} - -func TestResponderValidationRejectsBothResponderAndFollowUps(t *testing.T) { - tc := &TestCase{ - TestID: "t1", - Stimulus: TaskStimulus{ - Message: "go", - FollowUps: []string{"next"}, - Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, - }, - } - err := tc.Validate() - require.Error(t, err) - require.Contains(t, err.Error(), "follow_up_prompts") - require.Contains(t, err.Error(), "responder") -} - -func TestResponderValidationAcceptsValidConfig(t *testing.T) { - tc := &TestCase{ - TestID: "t1", - Stimulus: TaskStimulus{ - Message: "go", - Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2}, - }, - } - require.NoError(t, tc.Validate()) -} -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `go test ./internal/models/ -run '^TestResponderValidation' -v` -Expected: FAIL — the "reject" tests get no error (validation not implemented). - -- [ ] **Step 3: Implement validation** - -In `internal/models/testcase.go`, extend `func (tc *TestCase) Validate() error`. Add the following block before the final `return nil`: - -```go - if r := tc.Stimulus.Responder; r != nil { - name := tc.TestID - if name == "" { - name = tc.DisplayName - } - prefix := "test case" - if name != "" { - prefix = fmt.Sprintf("test case %q", name) - } - if strings.TrimSpace(r.Instructions) == "" { - return fmt.Errorf("%s: responder.instructions is required", prefix) - } - if r.MaxFollowups < 1 { - return fmt.Errorf("%s: responder.max_followups must be at least 1, got %d", prefix, r.MaxFollowups) - } - if len(tc.Stimulus.FollowUps) > 0 { - return fmt.Errorf("%s: inputs.responder and inputs.follow_up_prompts are mutually exclusive; use one or the other", prefix) - } - } -``` - -(`fmt` and `strings` are already imported in this file.) - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `go test ./internal/models/ -run '^TestResponderValidation' -v` -Expected: PASS (all four). - -- [ ] **Step 5: Commit** - -```bash -git add internal/models/testcase.go internal/models/testcase_test.go -git commit -m "feat: validate inputs.responder fields and mutual exclusivity #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 3: Responder package — `Decision` types and tool definitions - -**Files:** -- Create: `internal/responder/responder.go` -- Test: `internal/responder/responder_test.go` - -This task builds the package skeleton and the three decision tools, verifying that invoking each tool handler records the correct decision. The `Classify` flow is added in Task 4. - -- [ ] **Step 1: Write the failing test** - -Create `internal/responder/responder_test.go`: - -```go -package responder - -import ( - "testing" - - copilot "github.com/github/copilot-sdk/go" - "github.com/stretchr/testify/require" -) - -func TestDecisionToolsRecordReply(t *testing.T) { - d := &decisionRecorder{} - tools := d.tools() - require.Len(t, tools, 3) - - respond := findTool(t, tools, toolRespond) - _, err := respond.Handler(copilot.ToolInvocation{ - Arguments: map[string]any{"answer": "research-agent"}, - }) - require.NoError(t, err) - require.True(t, d.set) - require.Equal(t, DecisionReply, d.decision.Kind) - require.Equal(t, "research-agent", d.decision.Answer) -} - -func TestDecisionToolsRecordStop(t *testing.T) { - d := &decisionRecorder{} - stop := findTool(t, d.tools(), toolStop) - _, err := stop.Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) - require.NoError(t, err) - require.True(t, d.set) - require.Equal(t, DecisionStop, d.decision.Kind) -} - -func TestDecisionToolsRecordAbstain(t *testing.T) { - d := &decisionRecorder{} - abstain := findTool(t, d.tools(), toolAbstain) - _, err := abstain.Handler(copilot.ToolInvocation{ - Arguments: map[string]any{"reason": "brief too vague"}, - }) - require.NoError(t, err) - require.True(t, d.set) - require.Equal(t, DecisionAbstain, d.decision.Kind) - require.Equal(t, "brief too vague", d.decision.Reason) -} - -func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { - t.Helper() - for _, tl := range tools { - if tl.Name == name { - return tl - } - } - t.Fatalf("tool %q not found", name) - return copilot.Tool{} -} -``` - -- [ ] **Step 2: Run test to verify it fails** - -Run: `go test ./internal/responder/ -v` -Expected: FAIL — package/symbols (`decisionRecorder`, `Decision`, `toolRespond`, ...) do not exist. - -- [ ] **Step 3: Implement the package skeleton and tools** - -Create `internal/responder/responder.go`: - -```go -// Package responder implements an LLM-backed surrogate user that answers an -// interactive skill's follow-up questions during a multi-turn evaluation run. -package responder - -import ( - "context" - - copilot "github.com/github/copilot-sdk/go" - "github.com/go-viper/mapstructure/v2" - "github.com/microsoft/waza/internal/execution" - "github.com/microsoft/waza/internal/models" -) - -// DecisionKind enumerates the responder's possible classifications of an agent -// message. -type DecisionKind int - -const ( - // DecisionReply means the responder answered the agent's question. - DecisionReply DecisionKind = iota - // DecisionStop means the agent is done and no further input is needed. - DecisionStop - // DecisionAbstain means the responder could not answer from its brief. - DecisionAbstain -) - -// Decision is the outcome of classifying a single agent message. -type Decision struct { - Kind DecisionKind - Answer string // set when Kind == DecisionReply - Reason string // set when Kind == DecisionAbstain -} - -const ( - toolRespond = "responder_reply" - toolStop = "responder_stop" - toolAbstain = "responder_abstain" -) - -// Executor is the narrow execution surface the responder needs. The concrete -// AgentEngine satisfies it, and tests supply a fake. -type Executor interface { - Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) -} - -// decisionRecorder captures the single decision tool the responder LLM calls. -type decisionRecorder struct { - decision Decision - set bool -} - -func (d *decisionRecorder) tools() []copilot.Tool { - return []copilot.Tool{ - { - Name: toolRespond, - Description: "Answer the agent's question as the user. Call this exactly once with your answer.", - Parameters: map[string]any{ - "type": "object", - "properties": map[string]any{ - "answer": map[string]any{ - "type": "string", - "description": "Your reply to the agent's question, consistent with your configuration.", - }, - }, - "required": []string{"answer"}, - }, - Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { - var args struct { - Answer string `mapstructure:"answer"` - } - _ = mapstructure.Decode(inv.Arguments, &args) - d.decision = Decision{Kind: DecisionReply, Answer: args.Answer} - d.set = true - return copilot.ToolResult{}, nil - }, - }, - { - Name: toolStop, - Description: "Signal that the agent has finished and needs no further input. Call this when there is no question to answer.", - Parameters: map[string]any{ - "type": "object", - "properties": map[string]any{}, - }, - Handler: func(copilot.ToolInvocation) (copilot.ToolResult, error) { - d.decision = Decision{Kind: DecisionStop} - d.set = true - return copilot.ToolResult{}, nil - }, - }, - { - Name: toolAbstain, - Description: "Signal that you cannot answer the agent's question from your configuration. Call this only when the information is genuinely missing.", - Parameters: map[string]any{ - "type": "object", - "properties": map[string]any{ - "reason": map[string]any{ - "type": "string", - "description": "Why you cannot answer.", - }, - }, - "required": []string{"reason"}, - }, - Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { - var args struct { - Reason string `mapstructure:"reason"` - } - _ = mapstructure.Decode(inv.Arguments, &args) - d.decision = Decision{Kind: DecisionAbstain, Reason: args.Reason} - d.set = true - return copilot.ToolResult{}, nil - }, - }, - } -} - -// Classifier is added in the next task; declared here so the package compiles -// with its dependencies referenced. -var _ = models.ResponderConfig{} -``` - -(The trailing `var _ = models.ResponderConfig{}` keeps the `models` import used until Task 4 adds `Classifier`; remove it in Task 4.) - -- [ ] **Step 4: Run test to verify it passes** - -Run: `go test ./internal/responder/ -v` -Expected: PASS (three decision-tool tests). - -- [ ] **Step 5: Commit** - -```bash -git add internal/responder/responder.go internal/responder/responder_test.go -git commit -m "feat: add responder decision types and tools #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 4: Responder package — `Classifier` and `Classify` - -**Files:** -- Modify: `internal/responder/responder.go` -- Test: `internal/responder/responder_test.go` - -- [ ] **Step 1: Write the failing tests** - -Add to `internal/responder/responder_test.go` (add imports `context`, `github.com/microsoft/waza/internal/execution`, `github.com/microsoft/waza/internal/models` as needed): - -```go -type fakeExecutor struct { - calls []*execution.ExecutionRequest - respond func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) -} - -func (f *fakeExecutor) Execute(_ context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - f.calls = append(f.calls, req) - return f.respond(req) -} - -func TestClassifyReply(t *testing.T) { - exec := &fakeExecutor{ - respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - // Simulate the model calling the reply tool. - _, err := findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ - Arguments: map[string]any{"answer": "research-agent"}, - }) - require.NoError(t, err) - return &execution.ExecutionResponse{SessionID: "resp-1"}, nil - }, - } - c := New(exec, models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}, "gpt-4o") - d, err := c.Classify(context.Background(), "What is the agent name?") - require.NoError(t, err) - require.Equal(t, DecisionReply, d.Kind) - require.Equal(t, "research-agent", d.Answer) -} - -func TestClassifyAbstain(t *testing.T) { - exec := &fakeExecutor{ - respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - _, _ = findTool(t, req.Tools, toolAbstain).Handler(copilot.ToolInvocation{ - Arguments: map[string]any{"reason": "no info"}, - }) - return &execution.ExecutionResponse{SessionID: "resp-1"}, nil - }, - } - c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") - d, err := c.Classify(context.Background(), "Q?") - require.NoError(t, err) - require.Equal(t, DecisionAbstain, d.Kind) - require.Equal(t, "no info", d.Reason) -} - -func TestClassifyNoDecisionToolIsError(t *testing.T) { - exec := &fakeExecutor{ - respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - return &execution.ExecutionResponse{SessionID: "resp-1"}, nil // no tool called - }, - } - c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") - _, err := c.Classify(context.Background(), "Q?") - require.Error(t, err) -} - -func TestClassifyUsesDefaultModelWhenUnset(t *testing.T) { - exec := &fakeExecutor{ - respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - require.Equal(t, "default-model", req.ModelID) - _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) - return &execution.ExecutionResponse{SessionID: "resp-1"}, nil - }, - } - c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "default-model") - _, err := c.Classify(context.Background(), "Q?") - require.NoError(t, err) -} - -func TestClassifyPersistsSession(t *testing.T) { - exec := &fakeExecutor{ - respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { - _, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ - Arguments: map[string]any{"answer": "a"}, - }) - return &execution.ExecutionResponse{SessionID: "resp-1"}, nil - }, - } - c := New(exec, models.ResponderConfig{Instructions: "INSTR", MaxFollowups: 5}, "gpt-4o") - _, err := c.Classify(context.Background(), "Q1?") - require.NoError(t, err) - _, err = c.Classify(context.Background(), "Q2?") - require.NoError(t, err) - - require.Len(t, exec.calls, 2) - // First call has no resume session id and includes the instructions. - require.Empty(t, exec.calls[0].SessionID) - require.Contains(t, exec.calls[0].Message, "INSTR") - require.Contains(t, exec.calls[0].Message, "Q1?") - // Second call resumes the responder session and omits the instructions. - require.Equal(t, "resp-1", exec.calls[1].SessionID) - require.NotContains(t, exec.calls[1].Message, "INSTR") - require.Contains(t, exec.calls[1].Message, "Q2?") -} -``` - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `go test ./internal/responder/ -run '^TestClassify' -v` -Expected: FAIL — `New`/`Classify` undefined. - -- [ ] **Step 3: Implement `Classifier` and `Classify`** - -In `internal/responder/responder.go`: remove the `var _ = models.ResponderConfig{}` line, add `errors` and `fmt` to imports, and append: - -```go -// Classifier maintains a persistent surrogate-user session and classifies each -// agent message into a Decision. -type Classifier struct { - exec Executor - model string - instructions string - sessionID string // empty until the first Classify creates the session -} - -// New constructs a Classifier. defaultModel is used when cfg.Model is empty. -func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier { - model := cfg.Model - if model == "" { - model = defaultModel - } - return &Classifier{ - exec: exec, - model: model, - instructions: cfg.Instructions, - } -} - -// Classify sends the agent's latest message to the responder LLM and returns -// its decision. The first call seeds the session with the responder -// instructions; subsequent calls resume the same session. -func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error) { - rec := &decisionRecorder{} - - req := &execution.ExecutionRequest{ - ModelID: c.model, - Message: c.buildMessage(agentMessage), - Tools: rec.tools(), - MessageMode: execution.MessageModeEnqueue, - Streaming: true, - SessionID: c.sessionID, - NoSkills: true, - EphemeralSession: true, - SkipWorkspaceCapture: true, - } - - resp, err := c.exec.Execute(ctx, req) - if err != nil { - // If the model already chose a decision before a post-tool follow-up - // turn failed, honour it (mirrors the prompt grader's behaviour). - if rec.set { - return rec.decision, nil - } - return Decision{}, fmt.Errorf("responder execution failed: %w", err) - } - if resp != nil && resp.SessionID != "" { - c.sessionID = resp.SessionID - } - if !rec.set { - return Decision{}, errors.New("responder did not call a decision tool") - } - return rec.decision, nil -} - -func (c *Classifier) buildMessage(agentMessage string) string { - if c.sessionID == "" { - return fmt.Sprintf( - "%s\n\nYou are role-playing as the user. The agent just said:\n\n%s\n\n"+ - "Respond by calling exactly one tool: %s to answer, %s if the agent is finished and needs nothing, or %s if you genuinely cannot answer from your configuration.", - c.instructions, agentMessage, toolRespond, toolStop, toolAbstain, - ) - } - return fmt.Sprintf( - "The agent just said:\n\n%s\n\nRespond by calling exactly one tool (%s, %s, or %s).", - agentMessage, toolRespond, toolStop, toolAbstain, - ) -} -``` - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `go test ./internal/responder/ -v` -Expected: PASS (all tests, including Task 3's). - -- [ ] **Step 5: Commit** - -```bash -git add internal/responder/responder.go internal/responder/responder_test.go -git commit -m "feat: add responder Classifier with persistent session #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 5: Results model — `ResponderInfo` on `RunResult` - -**Files:** -- Modify: `internal/models/outcome.go` -- Test: `internal/models/outcome_test.go` - -- [ ] **Step 1: Write the failing test** - -Add to `internal/models/outcome_test.go`: - -```go -func TestResponderInfoSerializes(t *testing.T) { - rr := RunResult{ - RunNumber: 1, - Status: StatusError, - Responder: &ResponderInfo{ - FollowupsSent: 3, - Outcome: "abstained", - Reason: "brief too vague", - }, - } - data, err := json.Marshal(rr) - require.NoError(t, err) - require.Contains(t, string(data), `"responder"`) - require.Contains(t, string(data), `"outcome":"abstained"`) - - // Omitted when nil. - data2, err := json.Marshal(RunResult{RunNumber: 1, Status: StatusPassed}) - require.NoError(t, err) - require.NotContains(t, string(data2), `"responder"`) -} -``` - -(Add `encoding/json` and `testify/require` imports if not already present.) - -- [ ] **Step 2: Run test to verify it fails** - -Run: `go test ./internal/models/ -run '^TestResponderInfoSerializes$' -v` -Expected: FAIL — `ResponderInfo` / `RunResult.Responder` undefined. - -- [ ] **Step 3: Implement the type and field** - -In `internal/models/outcome.go`, add after the `RunResult` struct definition: - -```go -// ResponderInfo records the outcome of a responder-driven multi-turn run. -type ResponderInfo struct { - // FollowupsSent is the number of responder answers sent to the agent. - FollowupsSent int `json:"followups_sent"` - // Outcome is one of: completed, stopped, abstained, cap_exhausted, error. - Outcome string `json:"outcome"` - // Reason holds the responder's reason when Outcome == "abstained" or an - // error message when Outcome == "error". - Reason string `json:"reason,omitempty"` -} -``` - -Add the field to `RunResult` (after `WorkspaceDir`): - -```go - Responder *ResponderInfo `json:"responder,omitempty"` -``` - -Also add named constants near the top of the file (after the `Status` consts) for reuse by the runner: - -```go -// Responder outcome values recorded on RunResult.Responder.Outcome. -const ( - ResponderOutcomeCompleted = "completed" - ResponderOutcomeStopped = "stopped" - ResponderOutcomeAbstained = "abstained" - ResponderOutcomeCapExhausted = "cap_exhausted" - ResponderOutcomeError = "error" -) -``` - -- [ ] **Step 4: Run test to verify it passes** - -Run: `go test ./internal/models/ -run '^TestResponderInfoSerializes$' -v` -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add internal/models/outcome.go internal/models/outcome_test.go -git commit -m "feat: add ResponderInfo to RunResult #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 6: Runner — classifier factory field (injectable for tests) - -**Files:** -- Modify: `internal/orchestration/runner.go` -- Test: `internal/orchestration/runner_test.go` - -This task introduces an injectable factory so the loop (Task 7) can be tested with a fake classifier while defaulting to the real responder. - -- [ ] **Step 1: Add the abstraction and factory (no behaviour change yet)** - -In `internal/orchestration/runner.go`: - -1. Add the import `"github.com/microsoft/waza/internal/responder"` to the import block. - -2. Add an interface and a factory field to the `EvalRunner` struct. After the `verbose bool` field, add: - -```go - // newClassifier builds a responder classifier for a task. Overridable in - // tests; defaults to a responder backed by the runner's engine. - newClassifier func(cfg models.ResponderConfig, defaultModel string) responderClassifier -``` - -3. Add the interface near the other small type declarations (e.g. just above `type EvalRunner struct`): - -```go -// responderClassifier classifies an agent message into a responder decision. -// Implemented by *responder.Classifier; faked in tests. -type responderClassifier interface { - Classify(ctx context.Context, agentMessage string) (responder.Decision, error) -} -``` - -4. In `NewEvalRunner` (after the struct is constructed, before `return`), set the default factory: - -```go - r.newClassifier = func(cfg models.ResponderConfig, defaultModel string) responderClassifier { - return responder.New(r.engine, cfg, defaultModel) - } -``` - -(Adjust to match how `NewEvalRunner` names its receiver/local variable — it returns a `*EvalRunner`; assign the field on that value before returning it.) - -- [ ] **Step 2: Build to verify it compiles** - -Run: `go build ./internal/orchestration/` -Expected: success (no behaviour change yet). `responder.New` satisfies `responderClassifier` because `*responder.Classifier` has the `Classify` method. - -- [ ] **Step 3: Commit** - -```bash -git add internal/orchestration/runner.go -git commit -m "refactor: add injectable responder classifier factory to runner #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 7: Runner — `executeResponderLoop` and branch in `executeRun` - -**Files:** -- Modify: `internal/orchestration/runner.go` -- Test: `internal/orchestration/runner_test.go` (or new `internal/orchestration/responder_loop_test.go`) - -- [ ] **Step 1: Write the failing tests** - -Create `internal/orchestration/responder_loop_test.go`. This uses a fake classifier injected via `r.newClassifier` and a `MockEngine` for agent turns. Adjust the runner/config construction to match existing test helpers in `runner_test.go` (e.g. how other tests build an `EvalRunner` with a spec + `MockEngine`); the assertions below are the contract: - -```go -package orchestration - -import ( - "context" - "testing" - - "github.com/microsoft/waza/internal/models" - "github.com/microsoft/waza/internal/responder" - "github.com/stretchr/testify/require" -) - -// scriptedClassifier returns a queued sequence of decisions. -type scriptedClassifier struct { - decisions []responder.Decision - idx int - calls int -} - -func (s *scriptedClassifier) Classify(_ context.Context, _ string) (responder.Decision, error) { - s.calls++ - d := s.decisions[s.idx] - if s.idx < len(s.decisions)-1 { - s.idx++ - } - return d, nil -} - -func TestResponderLoopReplyThenStop(t *testing.T) { - r := newResponderTestRunner(t) // helper: builds EvalRunner with MockEngine + responder spec - sc := &scriptedClassifier{decisions: []responder.Decision{ - {Kind: responder.DecisionReply, Answer: "research-agent"}, - {Kind: responder.DecisionStop}, - }} - r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } - - tc := &models.TestCase{ - TestID: "t1", - Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}}, - } - rr := r.executeRun(context.Background(), tc, 1) - - require.NotNil(t, rr.Responder) - require.Equal(t, models.ResponderOutcomeStopped, rr.Responder.Outcome) - require.Equal(t, 1, rr.Responder.FollowupsSent) -} - -func TestResponderLoopAbstainMarksError(t *testing.T) { - r := newResponderTestRunner(t) - sc := &scriptedClassifier{decisions: []responder.Decision{ - {Kind: responder.DecisionAbstain, Reason: "too vague"}, - }} - r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } - - tc := &models.TestCase{ - TestID: "t1", - Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 5}}, - } - rr := r.executeRun(context.Background(), tc, 1) - - require.Equal(t, models.StatusError, rr.Status) - require.NotNil(t, rr.Responder) - require.Equal(t, models.ResponderOutcomeAbstained, rr.Responder.Outcome) - require.Contains(t, rr.ErrorMsg, "abstained") - require.Contains(t, rr.ErrorMsg, "too vague") -} - -func TestResponderLoopCapExhausted(t *testing.T) { - r := newResponderTestRunner(t) - sc := &scriptedClassifier{decisions: []responder.Decision{ - {Kind: responder.DecisionReply, Answer: "a"}, // always reply - }} - r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc } - - tc := &models.TestCase{ - TestID: "t1", - Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 2}}, - } - rr := r.executeRun(context.Background(), tc, 1) - - require.NotNil(t, rr.Responder) - require.Equal(t, models.ResponderOutcomeCapExhausted, rr.Responder.Outcome) - require.Equal(t, 2, rr.Responder.FollowupsSent) - require.NotEqual(t, models.StatusError, rr.Status) // graded normally -} -``` - -Add a `newResponderTestRunner(t)` helper in this file. Model it on the existing runner construction in `runner_test.go` — build an `*config.EvalConfig` from a minimal `models.EvalSpec` (skill name, `Config{TrialsPerTask:1, TimeoutSec:30, ModelID:"mock"}`), create a `MockEngine`, call `Initialize`, and return `NewEvalRunner(cfg, engine)`. Inspect `runner_test.go` for the exact helper names already used and reuse them rather than duplicating. - -- [ ] **Step 2: Run tests to verify they fail** - -Run: `go test ./internal/orchestration/ -run '^TestResponderLoop' -v` -Expected: FAIL — `executeResponderLoop` not wired; `rr.Responder` is nil. - -- [ ] **Step 3: Implement the loop and branch** - -In `internal/orchestration/runner.go`, change the follow-up branch in `executeRun`. Replace: - -```go - // Execute follow-up prompts if defined - if len(tc.Stimulus.FollowUps) > 0 { - r.executeFollowUps(ctx, tc, resp) - } -``` - -with: - -```go - // Drive multi-turn: responder loop takes precedence; otherwise static - // follow-ups. Validation guarantees these are mutually exclusive. - var responderInfo *models.ResponderInfo - if tc.Stimulus.Responder != nil { - responderInfo = r.executeResponderLoop(ctx, tc, resp) - } else if len(tc.Stimulus.FollowUps) > 0 { - r.executeFollowUps(ctx, tc, resp) - } -``` - -Then, in the `return models.RunResult{...}` at the end of `executeRun`, add the field: - -```go - Responder: responderInfo, -``` - -Now add the loop method (place it directly after `executeFollowUps`): - -```go -// executeResponderLoop drives a multi-turn run using an LLM-backed surrogate -// user. After each agent turn it classifies the agent's latest message and -// either replies (sending a new agent prompt), stops, or aborts on abstain. -// It mutates resp in place (mirroring executeFollowUps) and returns a summary. -func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse) *models.ResponderInfo { - cfg := *tc.Stimulus.Responder - classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID) - - info := &models.ResponderInfo{Outcome: models.ResponderOutcomeCompleted} - left := cfg.MaxFollowups - lastWasReply := false - - for left > 0 { - decision, err := classifier.Classify(ctx, resp.FinalOutput) - if err != nil { - resp.ErrorMsg = fmt.Sprintf("responder error: %v", err) - info.Outcome = models.ResponderOutcomeError - info.Reason = err.Error() - return info - } - - switch decision.Kind { - case responder.DecisionStop: - info.Outcome = models.ResponderOutcomeStopped - return info - - case responder.DecisionAbstain: - resp.ErrorMsg = fmt.Sprintf("responder abstained: %s", decision.Reason) - info.Outcome = models.ResponderOutcomeAbstained - info.Reason = decision.Reason - return info - - case responder.DecisionReply: - if !r.sendResponderReply(ctx, tc, resp, decision.Answer, info.FollowupsSent+1) { - // sendResponderReply set resp.ErrorMsg; stop the loop. - info.Outcome = models.ResponderOutcomeError - info.Reason = resp.ErrorMsg - return info - } - info.FollowupsSent++ - left-- - lastWasReply = true - } - } - - if lastWasReply { - info.Outcome = models.ResponderOutcomeCapExhausted - slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions", - "test", tc.DisplayName, "max_followups", cfg.MaxFollowups) - } - return info -} - -// sendResponderReply sends one responder answer to the agent session, reusing -// the session and workspace, and merges the agent's response into resp. It -// returns false (and sets resp.ErrorMsg) on failure. -func (r *EvalRunner) sendResponderReply(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse, answer string, turn int) bool { - followReq, err := r.buildExecutionRequest(tc) - if err != nil { - resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) - return false - } - followReq.Message = answer - followReq.SessionID = resp.SessionID - followReq.WorkspaceDir = resp.WorkspaceDir - - if r.verbose { - r.notifyProgress(ProgressEvent{ - EventType: EventAgentPrompt, - TestName: tc.DisplayName, - Details: map[string]any{"message": answer, "responder_reply": turn}, - }) - } - - timeout, err := r.executionTimeout(tc) - if err != nil { - resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err) - return false - } - followCtx, cancelFollow := context.WithTimeout(ctx, timeout) - followResp, err := r.engine.Execute(followCtx, followReq) - cancelFollow() - if err != nil { - resp.ErrorMsg = fmt.Sprintf("responder reply %d failed: %v", turn, err) - return false - } - if followResp.ErrorMsg != "" { - resp.ErrorMsg = fmt.Sprintf("responder reply %d: %s", turn, followResp.ErrorMsg) - return false - } - - resp.Events = append(resp.Events, followResp.Events...) - resp.ToolCalls = append(resp.ToolCalls, followResp.ToolCalls...) - resp.SkillInvocations = append(resp.SkillInvocations, followResp.SkillInvocations...) - resp.DurationMs += followResp.DurationMs - resp.FinalOutput = followResp.FinalOutput - resp.WorkspaceFiles = followResp.WorkspaceFiles - if followResp.Usage != nil { - if resp.Usage == nil { - resp.Usage = followResp.Usage - } else { - resp.Usage = models.AggregateUsageStats([]*models.UsageStats{resp.Usage, followResp.Usage}) - } - } - return true -} -``` - -Add `"log/slog"` to the import block if it is not already imported. - -- [ ] **Step 4: Run tests to verify they pass** - -Run: `go test ./internal/orchestration/ -run '^TestResponderLoop' -v` -Expected: PASS (reply-then-stop, abstain→error, cap-exhausted). - -- [ ] **Step 5: Run the full orchestration + models + responder suites** - -Run: `go test ./internal/orchestration/ ./internal/models/ ./internal/responder/` -Expected: PASS. - -- [ ] **Step 6: Commit** - -```bash -git add internal/orchestration/runner.go internal/orchestration/responder_loop_test.go -git commit -m "feat: drive interactive skills via responder loop #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 8: JSON schema — add `responder` to the task `inputs` definition - -**Files:** -- Modify: `schemas/task.schema.json` -- Test: `internal/validation/schema_test.go` (add a case) — or rely on the existing doc-examples test once docs are added in Task 9. - -- [ ] **Step 1: Write the failing test** - -Add to `internal/validation/schema_test.go` a test that a task with `inputs.responder` validates. Match the file's existing helper for validating task bytes (look for `ValidateTaskBytes`): - -```go -func TestTaskSchemaAcceptsResponder(t *testing.T) { - yaml := []byte(` -id: t1 -name: Configure agent -inputs: - prompt: "add agent" - responder: - instructions: "be research-agent; abstain if unknown" - max_followups: 8 -`) - errs := ValidateTaskBytes(yaml) - require.Empty(t, errs) -} -``` - -(If the exported helper has a different name/signature, use the one already exercised by neighbouring tests in this file.) - -- [ ] **Step 2: Run test to verify it fails** - -Run: `go test ./internal/validation/ -run '^TestTaskSchemaAcceptsResponder$' -v` -Expected: FAIL — `inputs.responder` rejected because the `inputs` `$def` has `additionalProperties: false`. - -- [ ] **Step 3: Add `responder` to the schema** - -In `schemas/task.schema.json`, inside the `$defs.inputs.properties` object (the block containing `prompt`, `prompt_file`, `context`, `files`, `repos`, `workdir`, `environment`), add: - -```json - "responder": { - "type": "object", - "additionalProperties": false, - "required": ["instructions", "max_followups"], - "description": "LLM-backed surrogate user that answers the skill's follow-up questions. Mutually exclusive with follow_up_prompts.", - "properties": { - "model": { - "type": "string", - "description": "Model used for the responder LLM. Defaults to the eval-level config.model." - }, - "instructions": { - "type": "string", - "minLength": 1, - "description": "Describes the target configuration the responder represents and the rule for abstaining." - }, - "max_followups": { - "type": "integer", - "minimum": 1, - "description": "Maximum number of responder replies before the loop stops." - } - } - } -``` - -Insert it as a sibling of `environment` (add a trailing comma to `environment` as needed to keep valid JSON). - -- [ ] **Step 4: Run test to verify it passes** - -Run: `go test ./internal/validation/ -run '^TestTaskSchemaAcceptsResponder$' -v` -Expected: PASS. Also run `go test ./internal/validation/` to ensure no regressions. - -- [ ] **Step 5: Commit** - -```bash -git add schemas/task.schema.json internal/validation/schema_test.go -git commit -m "feat: add inputs.responder to task JSON schema #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 9: Documentation — README and site - -**Files:** -- Modify: `README.md` -- Modify: `site/src/content/docs/guides/eval-yaml.mdx` -- Modify: `site/src/content/docs/reference/schema.mdx` - -- [ ] **Step 1: Add a README section** - -In `README.md`, near where multi-turn / `follow_up_prompts` is described (search for `follow_up_prompts`), add a "Responder (interactive skills)" subsection with this example and explanation: - -````markdown -#### Responder (interactive skills) - -For skills that ask follow-up questions, configure a `responder` — an LLM that -plays the user and answers the skill's questions. It is mutually exclusive with -`follow_up_prompts`. - -```yaml -inputs: - prompt: "Add a new agent to my application" - responder: - model: gpt-4o # optional; defaults to config.model - instructions: | - The agent you want is "research-agent" with system instructions - "Search the web and summarise findings", tools web_search + url_fetch, - and no handoffs. Answer the skill's questions consistently with this. - If you genuinely can't infer an answer, abstain. - max_followups: 8 -``` - -After each agent turn the responder either **replies** (the answer is sent back, -continuing the conversation), **stops** (the agent is done), or **abstains** — -which fails the run with a distinct `abstained` outcome, signalling the brief is -too vague. Each task carries its own responder, so the same skill can be tested -against several target configurations. -```` - -- [ ] **Step 2: Add the eval-yaml guide entry** - -In `site/src/content/docs/guides/eval-yaml.mdx`, after the `follow_up_prompts` section (search for it), add an analogous `responder` section using the same YAML example and explanation as Step 1, in the file's existing prose style. - -- [ ] **Step 3: Add the schema reference entry** - -In `site/src/content/docs/reference/schema.mdx`, after the `### follow_up_prompts` section, add: - -````markdown -### responder - -**Type:** object -**Required:** no - -An LLM-backed surrogate user that answers the skill's follow-up questions during -a multi-turn run. Mutually exclusive with `follow_up_prompts`. - -| Field | Type | Required | Description | -|----------------|---------|----------|--------------------------------------------------------| -| `model` | string | no | Responder model. Defaults to the eval-level `config.model`. | -| `instructions` | string | yes | Target configuration the responder represents + abstain rule. | -| `max_followups`| integer | yes | Max responder replies before the loop stops (`>= 1`). | - -```yaml -inputs: - prompt: "Add a new agent to my application" - responder: - instructions: "Be research-agent with tools web_search; abstain if unknown." - max_followups: 8 -``` - -The responder classifies each agent message as **reply**, **stop**, or -**abstain**. An abstain marks the run as an error with outcome `abstained`, -distinct from model timeouts or network errors. If `max_followups` is reached -while the agent is still asking questions, the loop stops with outcome -`cap_exhausted` and graders evaluate the final state. -```` - -- [ ] **Step 4: Build the docs site to verify** - -Run: `cd site && npm run build` -Expected: build succeeds with no MDX errors. - -- [ ] **Step 5: Commit** - -```bash -git add README.md site/src/content/docs/guides/eval-yaml.mdx site/src/content/docs/reference/schema.mdx -git commit -m "docs: document inputs.responder for interactive skills #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 10: Dashboard — surface responder outcome - -**Files:** -- Modify: dashboard source under `web/` (locate where `RunResult` / run details are rendered). -- Test: `web/` Playwright tests if the changed view has e2e coverage. - -- [ ] **Step 1: Locate where run results are rendered** - -Run: `grep -rn "final_output\|error_msg\|workspace_dir" web/src 2>/dev/null | head` -Identify the component that renders per-run details (the same one that shows `error_msg` / status). - -- [ ] **Step 2: Render `responder` when present** - -In that component, when a run has a `responder` object, display its `outcome` -(and `reason` when present) — e.g. a small badge/label: `Responder: abstained — -brief too vague` or `Responder: cap_exhausted (3 replies)`. Follow the existing -styling/markup conventions in the surrounding component. Keep it read-only; no -new data fetching is required since `responder` is already in the results JSON. - -- [ ] **Step 3: Run the dashboard e2e tests (if the view is covered)** - -Run: `cd web && npx playwright test --project=chromium` -Expected: PASS. If a screenshot snapshot for this view changed intentionally, -regenerate: `cd web && npx playwright test e2e/screenshots.spec.ts --project=chromium`. - -- [ ] **Step 4: Commit** - -```bash -git add web/ -git commit -m "feat: surface responder outcome in dashboard #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Task 11: Full verification - -**Files:** none (verification only). - -- [ ] **Step 1: Build everything** - -Run: `go build ./...` -Expected: success. - -- [ ] **Step 2: Run the full Go test suite** - -Run: `go test ./...` -Expected: PASS. - -- [ ] **Step 3: Lint** - -Run: `golangci-lint run` -Expected: no issues. Fix any lint findings in the files you touched. - -- [ ] **Step 4: Smoke-test with the mock engine (optional but recommended)** - -Create a temporary eval + task using `executor: mock` with an `inputs.responder` -block, run `./waza run -v -o /tmp/responder-results.json`, and -confirm the run completes and `responder` appears in the results JSON. Remove -the temp files afterward. - -- [ ] **Step 5: Final commit (if any verification fixes were made)** - -```bash -git add -A -git commit -m "chore: responder feature verification fixes #303" -m "Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" -``` - ---- - -## Self-Review Notes (for the planner) - -- **Spec coverage:** config surface (Task 1), validation + mutual exclusivity (Task 2), responder package with persistent session + reply/stop/abstain + no-decision error (Tasks 3–4), runner loop + cap-exhaustion + abstain→StatusError (Task 7), `ResponderInfo` reporting (Task 5), schema (Task 8), docs (Task 9), dashboard (Task 10). All design sections map to a task. -- **Type consistency:** `ResponderConfig{Model, Instructions, MaxFollowups}`, `Decision{Kind, Answer, Reason}`, `DecisionReply/Stop/Abstain`, `Classifier.Classify`, `responderClassifier` interface, `ResponderInfo{FollowupsSent, Outcome, Reason}`, and the `ResponderOutcome*` constants are used identically across tasks. -- **Known integration point to verify during execution:** the exact `NewEvalRunner` receiver/return shape (Task 6 Step 1.4) and the existing runner test helper names (Task 7 Step 1) — inspect `runner.go` / `runner_test.go` and adapt the shown code to the real signatures rather than assuming. From f539f0b2171e3216dbc820b89315c14a57213b3e Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Mon, 1 Jun 2026 13:31:23 -0400 Subject: [PATCH 17/17] fix(responder): surface tool errors and drop dead cap-exhausted flag Addresses three review comments on PR #304: * Reject duplicate decision tool calls in the same turn instead of letting handler order silently pick the winner. The recorder now returns an error on the second call and Classify surfaces it. * Propagate mapstructure decode failures from each tool handler so malformed arguments become a 'responder tool call invalid' error rather than a fabricated empty reply/abstain. * Drop the unused lastWasReply flag and the dead initial ResponderOutcomeCompleted seed in the responder loop. The loop can only exit normally after a reply, so the post-loop branch unconditionally records cap_exhausted. Removed the now-unused ResponderOutcomeCompleted constant. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- internal/models/outcome.go | 1 - internal/orchestration/runner.go | 16 +++---- internal/responder/responder.go | 48 +++++++++++++++++++- internal/responder/responder_test.go | 67 ++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 11 deletions(-) diff --git a/internal/models/outcome.go b/internal/models/outcome.go index a47540ee..3af74da9 100644 --- a/internal/models/outcome.go +++ b/internal/models/outcome.go @@ -22,7 +22,6 @@ const ( // Responder outcome values recorded on RunResult.Responder.Outcome. const ( - ResponderOutcomeCompleted = "completed" ResponderOutcomeStopped = "stopped" ResponderOutcomeAbstained = "abstained" ResponderOutcomeCapExhausted = "cap_exhausted" diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go index 3a8fe3ac..710e2a9a 100644 --- a/internal/orchestration/runner.go +++ b/internal/orchestration/runner.go @@ -1314,9 +1314,8 @@ func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCa } }() - info := &models.ResponderInfo{Outcome: models.ResponderOutcomeCompleted} + info := &models.ResponderInfo{} left := cfg.MaxFollowups - lastWasReply := false for left > 0 { decision, err := classifier.Classify(ctx, resp.FinalOutput) @@ -1346,15 +1345,16 @@ func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCa } info.FollowupsSent++ left-- - lastWasReply = true } } - if lastWasReply { - info.Outcome = models.ResponderOutcomeCapExhausted - slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions", - "test", tc.DisplayName, "max_followups", cfg.MaxFollowups) - } + // Reaching this point means the loop only exited via successful replies + // (Stop, Abstain, and error paths all return early), and validation + // guarantees MaxFollowups >= 1, so a reply must have run on the final + // iteration. The agent is still asking, but we've spent our budget. + info.Outcome = models.ResponderOutcomeCapExhausted + slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions", + "test", tc.DisplayName, "max_followups", cfg.MaxFollowups) return info } diff --git a/internal/responder/responder.go b/internal/responder/responder.go index f6f59cfe..e38315ab 100644 --- a/internal/responder/responder.go +++ b/internal/responder/responder.go @@ -53,9 +53,12 @@ type sessionDeleter interface { } // decisionRecorder captures the single decision tool the responder LLM calls. +// err is set if a handler-level failure (malformed arguments or a duplicate +// decision call) must be surfaced rather than silently swallowed. type decisionRecorder struct { decision Decision set bool + err error } func (d *decisionRecorder) tools() []copilot.Tool { @@ -74,10 +77,16 @@ func (d *decisionRecorder) tools() []copilot.Tool { "required": []string{"answer"}, }, Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + if err := d.guardDuplicate(toolRespond); err != nil { + return copilot.ToolResult{}, err + } var args struct { Answer string `mapstructure:"answer"` } - _ = mapstructure.Decode(inv.Arguments, &args) + if err := mapstructure.Decode(inv.Arguments, &args); err != nil { + d.recordErr(fmt.Errorf("decode %s arguments: %w", toolRespond, err)) + return copilot.ToolResult{}, err + } d.decision = Decision{Kind: DecisionReply, Answer: args.Answer} d.set = true return copilot.ToolResult{}, nil @@ -91,6 +100,9 @@ func (d *decisionRecorder) tools() []copilot.Tool { "properties": map[string]any{}, }, Handler: func(copilot.ToolInvocation) (copilot.ToolResult, error) { + if err := d.guardDuplicate(toolStop); err != nil { + return copilot.ToolResult{}, err + } d.decision = Decision{Kind: DecisionStop} d.set = true return copilot.ToolResult{}, nil @@ -110,10 +122,16 @@ func (d *decisionRecorder) tools() []copilot.Tool { "required": []string{"reason"}, }, Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) { + if err := d.guardDuplicate(toolAbstain); err != nil { + return copilot.ToolResult{}, err + } var args struct { Reason string `mapstructure:"reason"` } - _ = mapstructure.Decode(inv.Arguments, &args) + if err := mapstructure.Decode(inv.Arguments, &args); err != nil { + d.recordErr(fmt.Errorf("decode %s arguments: %w", toolAbstain, err)) + return copilot.ToolResult{}, err + } d.decision = Decision{Kind: DecisionAbstain, Reason: args.Reason} d.set = true return copilot.ToolResult{}, nil @@ -122,6 +140,26 @@ func (d *decisionRecorder) tools() []copilot.Tool { } } +// guardDuplicate enforces the "call exactly one decision tool, exactly once" +// contract advertised in each tool description. If the model calls a second +// decision tool in the same turn, the handler refuses rather than letting +// invocation order silently pick the winner. +func (d *decisionRecorder) guardDuplicate(name string) error { + if !d.set { + return nil + } + err := fmt.Errorf("responder called %s after a decision was already recorded", name) + d.recordErr(err) + return err +} + +// recordErr captures the first handler-level failure so Classify can surface it. +func (d *decisionRecorder) recordErr(err error) { + if d.err == nil { + d.err = err + } +} + // Classifier maintains a persistent surrogate-user session and classifies each // agent message into a Decision. type Classifier struct { @@ -170,6 +208,12 @@ func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decisio if resp != nil && resp.SessionID != "" { c.sessionID = resp.SessionID } + // A handler-level failure (malformed tool arguments, or the model calling + // more than one decision tool) takes precedence: surfacing it as an error + // is more useful than silently returning a possibly-bogus decision. + if rec.err != nil { + return Decision{}, fmt.Errorf("responder tool call invalid: %w", rec.err) + } if err != nil { if rec.set { return rec.decision, nil diff --git a/internal/responder/responder_test.go b/internal/responder/responder_test.go index c7cf0e31..1d557208 100644 --- a/internal/responder/responder_test.go +++ b/internal/responder/responder_test.go @@ -202,6 +202,73 @@ func TestCloseWithoutDeleterIsNoop(t *testing.T) { require.NoError(t, c.Close(context.Background())) } +func TestDecisionToolsRejectDuplicateCall(t *testing.T) { + d := &decisionRecorder{} + tools := d.tools() + respond := findTool(t, tools, toolRespond) + stop := findTool(t, tools, toolStop) + + _, err := respond.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "first"}, + }) + require.NoError(t, err) + + // A second decision call must be rejected rather than silently + // overwriting the first decision. + _, err = stop.Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + require.Error(t, err) + require.Error(t, d.err) + // The first decision is preserved so callers can see what was recorded. + require.Equal(t, DecisionReply, d.decision.Kind) + require.Equal(t, "first", d.decision.Answer) +} + +func TestDecisionToolsRejectMalformedArgs(t *testing.T) { + d := &decisionRecorder{} + respond := findTool(t, d.tools(), toolRespond) + + // answer must be a string; passing a non-string triggers a decode error + // that the handler surfaces instead of recording an empty reply. + _, err := respond.Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": map[string]any{"nested": true}}, + }) + require.Error(t, err) + require.Error(t, d.err) + require.False(t, d.set) +} + +func TestClassifyDuplicateDecisionIsError(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + // The model calls reply first, then stop in the same turn. + _, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": "a"}, + }) + _, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}}) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.Error(t, err) + require.Contains(t, err.Error(), "responder tool call invalid") +} + +func TestClassifyMalformedArgsIsError(t *testing.T) { + exec := &fakeExecutor{ + respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) { + _, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{ + Arguments: map[string]any{"answer": 42}, + }) + return &execution.ExecutionResponse{SessionID: "resp-1"}, nil + }, + } + c := New(exec, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o") + _, err := c.Classify(context.Background(), "Q?") + require.Error(t, err) + require.Contains(t, err.Error(), "responder tool call invalid") +} + func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool { t.Helper() for _, tl := range tools {