diff --git a/README.md b/README.md
index dc02e6c56..ea3d42403 100644
--- a/README.md
+++ b/README.md
@@ -1193,6 +1193,34 @@ Waza automatically removes each worktree on engine shutdown (`git worktree remov
 
 **Out of scope today** (tracked separately): HTTPS / SSH clone strategies, submodules, Git LFS, and auto-detecting "the repo this test is running in" without an explicit `source`.
 
+## Responder (interactive skills)
+
+For skills that ask follow-up questions, configure a `responder` — an LLM that plays the user and answers the skill's questions. It is mutually exclusive with `follow_up_prompts`.
+
+```yaml
+# task.yaml
+inputs:
+  prompt: "Add a new agent to my application"
+  responder:
+    model: gpt-4o          # optional; defaults to config.model
+    instructions: |
+      The agent you want is "research-agent" with system instructions
+      "Search the web and summarise findings", tools web_search + url_fetch,
+      and no handoffs. Answer the skill's questions consistently with this.
+      If you genuinely can't infer an answer, abstain.
+    max_followups: 8
+```
+
+After each agent turn the responder does one of three things: it **replies** (the answer is sent back to the agent, continuing the conversation), **stops** (the agent is done and has no further questions), or **abstains** — which marks the run as an error with a distinct `abstained` outcome, signalling that the brief is too vague to answer from. If `max_followups` replies have been sent and the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state. The outcome is recorded per run under `responder.outcome` in the results. Each task carries its own responder, so the same skill can be tested against several target configurations.
+
+**Fields** (under `inputs.responder`):
+
+| Field           | Required | Description |
+|---|---|---|
+| `instructions`  | yes | The target configuration the responder represents and the rule for abstaining. |
+| `max_followups` | yes | Maximum number of responder replies before the loop stops (`>= 1`). |
+| `model`         | no  | Model used for the responder LLM. Defaults to the eval-level `config.model`. |
+
 ## CI/CD Integration
 
 Waza is designed to work seamlessly with CI/CD pipelines.
diff --git a/docs/plans/2026-05-29-responder-interactive-skills.md b/docs/plans/2026-05-29-responder-interactive-skills.md
new file mode 100644
index 000000000..c2a2e7524
--- /dev/null
+++ b/docs/plans/2026-05-29-responder-interactive-skills.md
@@ -0,0 +1,345 @@
+# Design: Responder — driving interactive skills via a surrogate LLM
+
+- **Issue:** [#303](https://github.com/microsoft/waza/issues/303)
+- **Status:** Approved design, ready for implementation planning
+- **Date:** 2026-05-29
+
+## Problem
+
+A growing category of Agent Skills is inherently multi-turn: when the agent
+doesn't have everything it needs from the initial prompt, it pauses to ask the
+user follow-up questions before completing the task.
+
+**Concrete example — `configure-agent`.** Invoked with "Add a new agent to my
+application", the skill must gather a name, system instructions, the set of
+tools, and any handoffs before it can generate the agent definition. None of
+these can be inferred from the initial prompt, and the structured Q&A *is* part
+of the skill's value.
+
+Today, evaluating such a skill in waza forces a bad trade-off: either pre-bake
+every answer into the initial prompt (collapsing the skill into a degenerate
+one-shot version that no longer tests what ships) or evaluate manually. The
+existing static `follow_up_prompts` mechanism only works when the questions —
+and their order — are known in advance, which defeats the purpose of testing the
+skill's adaptive questioning.
+
+## Goal
+
+Add a **responder**: an LLM-backed surrogate user, configured per task, that
+answers the skill's follow-up questions consistently with a described target
+configuration. Each time an agent turn produces chat text, the responder
+classifies that text into one of three decisions:
+
+- **Reply** → send the responder's answer back as a new user prompt, decrement a
+  follow-up budget, and continue.
+- **Stop** → the agent is done (no further questions); exit the loop cleanly.
+- **Abstain** → the responder explicitly could not answer. Abort the run with a
+  distinct failure classification, signalling that the brief/skill is too vague
+  — *not* a transient model timeout or network blip.
+
+## Scope
+
+### In scope
+
+- Per-task `responder` config under `inputs` (sibling to `follow_up_prompts`).
+- A responder component that maintains a persistent surrogate-user session and
+  classifies each agent message via structured tool-calling.
+- Runner integration: a responder-driven follow-up loop reusing the existing
+  agent session and workspace.
+- Distinct result tagging for abstain and cap-exhaustion, surfaced in logs,
+  results JSON, and the dashboard.
+- Schema, validation, tests, and documentation.
+
+### Out of scope (possible follow-ups)
+
+- Eval-level responder defaults shared across tasks (each task is self-contained
+  for now; if many tasks share one target config, the block repeats).
+- Per-field override/merge semantics between eval-level and task-level config.
+
+## User-facing surface
+
+The responder is configured per task under `inputs`, alongside the existing
+`follow_up_prompts` field. The two are mutually exclusive.
+
+```yaml
+inputs:
+  prompt: "Add a new agent to my application"
+  responder:
+    model: gpt-4o            # optional; defaults to config.model
+    instructions: |
+      You are configuring a new agent inside an agentic application.
+      The agent you want to create has:
+        - name: research-agent
+        - system instructions: "Search the web and summarise findings on the
+          topic the user provides."
+        - tools: web_search, url_fetch
+        - handoffs: none
+      Answer the skill's questions consistently with this configuration,
+      regardless of the order in which the skill asks for each piece.
+      If you genuinely can't infer an answer from the above, abstain.
+    max_followups: 8
+```
+
+Because the responder lives per task, the same skill can be exercised against
+several target configurations (a research agent, a customer-support agent, a
+triage agent with handoffs) by giving each task its own `responder.instructions`
+— this is exactly the robustness testing the issue calls for, achieved without
+any eval-level override machinery.
+
+### Configuration fields
+
+| Field          | Required | Default          | Notes                                   |
+|----------------|----------|------------------|-----------------------------------------|
+| `model`        | no       | `config.model`   | Model used for the responder LLM.       |
+| `instructions` | yes      | —                | Describes the target config + abstain rule. |
+| `max_followups`| yes      | —                | Must be `>= 1`. Caps responder replies. |
+
+## Architecture
+
+The design reuses two patterns already proven in the codebase:
+
+1. **LLM-backed classification via structured tool-calling**, as used by the
+   prompt grader (`internal/graders/prompt_grader.go`), against the narrow
+   `Executor` interface (`Execute(ctx, *ExecutionRequest)`).
+2. **Multi-turn agent follow-ups via session + workspace reuse**, as used by the
+   existing static follow-up loop (`executeFollowUps` in
+   `internal/orchestration/runner.go`), which resumes the agent session by
+   passing `SessionID` and `WorkspaceDir` on each `Execute`.
+
+The responder owns classification; the runner owns the loop and all agent
+follow-up plumbing (per-turn timeout, event/usage/tool-call merging).
+
+```mermaid
+sequenceDiagram
+    participant R as Runner (executeResponderLoop)
+    participant A as Agent session (engine)
+    participant C as Responder Classifier
+    participant S as Responder session (engine)
+
+    R->>A: initial prompt (Execute)
+    A-->>R: agent chat text (FinalOutput)
+    loop while budget > 0
+        R->>C: Classify(agentMessage)
+        C->>S: agent question (Execute, persistent session + decision tools)
+        S-->>C: tool call: respond / stop / abstain
+        C-->>R: Decision
+        alt reply
+            R->>A: follow-up = answer (Execute, reuse SessionID + WorkspaceDir)
+            A-->>R: agent chat text
+            Note over R: budget--
+        else stop
+            Note over R: outcome = stopped; break
+        else abstain
+            Note over R: outcome = abstained (StatusError); break
+        end
+    end
+    Note over R: budget exhausted while still replying → outcome = cap_exhausted
+    R->>R: run graders against final state
+```
+
+### Component 1: Config model (`internal/models`)
+
+A new `ResponderConfig` carried on `TaskStimulus` (the `inputs` block):
+
+```go
+type ResponderConfig struct {
+    Model        string `yaml:"model,omitempty"        json:"model,omitempty"`
+    Instructions string `yaml:"instructions"           json:"instructions"`
+    MaxFollowups int    `yaml:"max_followups"          json:"max_followups"`
+}
+
+// TaskStimulus gains:
+//   Responder *ResponderConfig `yaml:"responder,omitempty" json:"responder,omitempty"`
+```
+
+**Validation** (in `TestCase.Validate`, surfaced by `LoadTestCase`):
+
+- If `Responder != nil`:
+  - `Instructions` must be non-empty.
+  - `MaxFollowups >= 1`.
+  - `FollowUps` must be empty (mutual exclusivity; clear error message naming
+    both fields).
+
+### Component 2: Responder package (`internal/responder`)
+
+```go
+type DecisionKind int
+const (
+    DecisionReply DecisionKind = iota
+    DecisionStop
+    DecisionAbstain
+)
+
+type Decision struct {
+    Kind   DecisionKind
+    Answer string // set when Kind == DecisionReply
+    Reason string // set when Kind == DecisionAbstain
+}
+
+// Executor is the narrow execution surface the responder needs (same shape as
+// graders.Executor), enabling unit tests with a fake executor.
+type Executor interface {
+    Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error)
+}
+
+type Classifier struct {
+    exec         Executor
+    model        string
+    instructions string
+    sessionID    string // empty until the first Classify creates the session
+}
+
+func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier
+func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error)
+```
+
+`Classify` behaviour:
+
+- **Persistent session.** The first call creates the responder session (no
+  resume `SessionID`); the returned `SessionID` is stored and passed on every
+  subsequent call so the responder accumulates the back-and-forth like a real
+  user. The runner closes the classifier when the run ends, which deletes the session promptly rather than leaving it for engine `Shutdown`.
+- **First message** carries the responder `instructions` as a preamble plus the
+  agent's first question and a directive to answer by calling exactly one
+  decision tool. **Later messages** carry only the agent's latest question
+  (instructions persist in session context).
+- **Structured output** via three tools whose handlers capture the decision:
+  - `respond(answer: string)` → `DecisionReply`
+  - `stop()` → `DecisionStop`
+  - `abstain(reason: string)` → `DecisionAbstain`
+- Request uses `NoSkills: true`, `MessageMode: MessageModeEnqueue`,
+  `Streaming: true`. The responder session does **not** use the agent's
+  workspace.
+- If no decision tool is called (responder malfunction), `Classify` returns an
+  error — distinct from abstain.
+
+### Component 3: Runner loop (`internal/orchestration/runner.go`)
+
+In `executeRun`, after the initial `Execute`:
+
+- If `tc.Stimulus.Responder != nil` → `executeResponderLoop`.
+- Else if `len(tc.Stimulus.FollowUps) > 0` → existing `executeFollowUps`.
+
+`executeResponderLoop` mirrors `executeFollowUps` plumbing (build request via
+`buildExecutionRequest`, set `Message`/`SessionID`/`WorkspaceDir`, apply per-turn
+timeout, merge `Events`/`ToolCalls`/`SkillInvocations`/`DurationMs`/`FinalOutput`/
+`WorkspaceFiles`/`Usage` into `resp`). Pseudocode:
+
+```
+classifier := responder.New(r.engine, *tc.Stimulus.Responder, r.spec.Config.ModelID)
+left := tc.Stimulus.Responder.MaxFollowups
+sent := 0
+outcome := "completed"
+for left > 0 {
+    decision, err := classifier.Classify(ctx, resp.FinalOutput)
+    if err != nil { resp.ErrorMsg = "responder error: " + err; outcome = "error"; break }
+    switch decision.Kind {
+    case Reply:
+        // send agent follow-up using decision.Answer (reuse SessionID + WorkspaceDir)
+        // merge follow-up response into resp; on error set resp.ErrorMsg + break
+        sent++; left--
+        log: responder replied (turn sent, budget left)
+    case Stop:
+        outcome = "stopped"; goto done
+    case Abstain:
+        resp.ErrorMsg = "responder abstained: " + decision.Reason
+        outcome = "abstained"; goto done
+    }
+}
+if left == 0 { // left only reaches 0 after a reply consumed the last budget slot
+    outcome = "cap_exhausted"
+    log warning: responder budget exhausted while agent still asking
+}
+done:
+attach ResponderInfo{FollowupsSent: sent, Outcome: outcome, Reason: ...} to the run
+```
+
+Verbose mode emits per-turn progress events (reusing the existing
+`EventAgentPrompt` / `EventAgentResponse` style) so `-v` runs show the
+responder's answers and the agent's replies.
+
+### Component 4: Results & reporting (`internal/models/outcome.go`)
+
+```go
+type ResponderInfo struct {
+    FollowupsSent int    `json:"followups_sent"`
+    Outcome       string `json:"outcome"` // completed|stopped|abstained|cap_exhausted|error
+    Reason        string `json:"reason,omitempty"`
+}
+
+// RunResult gains:
+//   Responder *ResponderInfo `json:"responder,omitempty"`
+```
+
+Status mapping:
+
+| Responder outcome | `RunResult.Status` | `ErrorMsg`                         | Notes |
+|-------------------|--------------------|------------------------------------|-------|
+| `completed`       | unchanged (graded) | —                                  | Agent finished; graders decide pass/fail. |
+| `stopped`         | unchanged (graded) | —                                  | Responder signalled done.            |
+| `abstained`       | `StatusError`      | `responder abstained: <reason>`    | Distinct, filterable; separate from timeouts/network errors. |
+| `cap_exhausted`   | unchanged (graded) | —                                  | Logged + surfaced; graders judge the end state. |
+| `error`           | `StatusError`      | `responder error: <msg>`           | Responder malfunction (no decision / session failure). |
+
+Because abstain reuses `StatusError` but is tagged via `Responder.Outcome`,
+reports and the dashboard can distinguish a vague-brief abstain from a genuine
+error. The dashboard (`web/`) surfaces `responder.outcome` (and reason) so
+abstain and cap-exhaustion are visible per run.
+
+## Error handling & edge cases
+
+- **No decision tool called** → `Classify` error → run `error` outcome
+  (`StatusError`), distinct from abstain.
+- **Responder session creation/Execute failure** → propagated as run `error`.
+- **Agent follow-up Execute failure** → mirrors `executeFollowUps`: set
+  `resp.ErrorMsg`, stop the loop.
+- **`max_followups` exhausted while agent still asking** → `cap_exhausted`; loop
+  stops, run proceeds to grading, warning logged.
+- **Mutual exclusivity** of `responder` and `follow_up_prompts` enforced at load
+  time with a clear error.
+- **Context cancellation / task timeout** honoured on every responder and agent
+  turn via the existing per-turn timeout pattern.
+
+## Testing
+
+- **`internal/responder`** — fake `Executor` invoking decision-tool handlers:
+  reply / stop / abstain / no-decision-error; persistent-session resumption
+  (second `Classify` passes the stored `SessionID`); first-vs-later message
+  shape (instructions preamble only on first call); model defaulting.
+- **`internal/orchestration`** — mock engine + injectable classifier (or fake
+  executor): reply → agent follow-up sent with reused session/workspace; stop;
+  abstain → `StatusError` + `Responder.Outcome == "abstained"`; cap exhaustion →
+  graded + `Responder.Outcome == "cap_exhausted"`; mutual-exclusivity rejection.
+- **`internal/models`** — validation: missing instructions, `max_followups < 1`,
+  both `responder` and `follow_up_prompts` set.
+- **Schema** — `internal/validation` and `internal/projectconfig` parity tests
+  for the new `responder` field.
+- All existing tests remain green; `go test ./...` and `golangci-lint run` pass.
+
+## Documentation
+
+Per `AGENTS.md`:
+
+- `README.md` — responder section + YAML example in the eval/inputs docs.
+- `site/src/content/docs/` — eval-YAML reference entry for `inputs.responder`
+  and a short guide on testing interactive skills; build with `npm run build`.
+- Schema files kept in sync.
+- Dashboard (`web/`) — surface `responder.outcome`/`reason`; regenerate
+  screenshots if UI changes.
+- Reference issue #303 in commits; update tracking issue #66 if applicable.
+
+## Rationale
+
+- **Per-task placement** mirrors `follow_up_prompts`, keeps each task
+  self-contained, makes mutual-exclusivity checking local, and directly serves
+  the "vary the target config across tasks" use case — without any eval-level
+  override/merge complexity.
+- **Runner owns the loop, responder owns classification** keeps the responder
+  small and unit-testable, and reuses the battle-tested agent follow-up plumbing
+  rather than duplicating it.
+- **Persistent responder session** models a real user who remembers prior
+  answers, avoiding contradictory or repeated responses across turns.
+- **Abstain as tagged `StatusError`** satisfies the issue's requirement that a
+  vague-brief abstain be reportable separately from transient errors, without
+  introducing a new top-level status value that every report/consumer would need
+  to learn.
diff --git a/internal/execution/copilot.go b/internal/execution/copilot.go
index bc994a496..581853b32 100644
--- a/internal/execution/copilot.go
+++ b/internal/execution/copilot.go
@@ -574,6 +574,25 @@ func (e *CopilotEngine) doShutdown(ctx context.Context) error {
 	return nil
 }
 
+// DeleteSession removes a persistent session created via Execute (with
+// EphemeralSession=false) and stops tracking it for shutdown cleanup. It is
+// used by callers that own a long-lived session, such as the responder, to
+// tear it down promptly rather than waiting for engine Shutdown.
+func (e *CopilotEngine) DeleteSession(ctx context.Context, sessionID string) error {
+	if sessionID == "" {
+		return nil
+	}
+	e.sessionsMu.Lock()
+	delete(e.sessions, sessionID)
+	e.sessionsMu.Unlock()
+
+	e.usageCollectorsMu.Lock()
+	delete(e.usageCollectors, sessionID)
+	e.usageCollectorsMu.Unlock()
+
+	return e.client.DeleteSession(ctx, sessionID)
+}
+
 // SessionUsage returns the final usage stats for a session. Call after Shutdown()
 // to get data from session.shutdown events (ModelMetrics, TotalPremiumRequests).
 // When BYOK was active for this session, the returned stats include sanitized
diff --git a/internal/models/outcome.go b/internal/models/outcome.go
index c1fba4cb9..3af74da9f 100644
--- a/internal/models/outcome.go
+++ b/internal/models/outcome.go
@@ -20,6 +20,14 @@ const (
 	StatusNA Status = "n/a"
 )
 
+// Responder outcome values recorded on RunResult.Responder.Outcome.
+const (
+	ResponderOutcomeStopped      = "stopped"
+	ResponderOutcomeAbstained    = "abstained"
+	ResponderOutcomeCapExhausted = "cap_exhausted"
+	ResponderOutcomeError        = "error"
+)
+
 // GraderKind identifies the type of grader (e.g. regex, file, code).
 type GraderKind string
 
@@ -62,6 +70,17 @@ func AllGraderKinds() []string {
 	return names
 }
 
+// ResponderInfo records the outcome of a responder-driven multi-turn run.
+type ResponderInfo struct {
+	// FollowupsSent is the number of responder answers sent to the agent.
+	FollowupsSent int `json:"followups_sent"`
+	// Outcome is one of: completed, stopped, abstained, cap_exhausted, error.
+	Outcome string `json:"outcome"`
+	// Reason holds the responder's reason when Outcome == "abstained" or an
+	// error message when Outcome == "error".
+	Reason string `json:"reason,omitempty"`
+}
+
 // EvaluationOutcome represents the complete result of an evaluation run
 type EvaluationOutcome struct {
 	RunID           string                   `json:"eval_id"`
@@ -158,6 +177,7 @@ type RunResult struct {
 	SkillInvocations []SkillInvocation        `json:"skill_invocations,omitempty"`
 	Usage            *UsageStats              `json:"usage,omitempty"`
 	WorkspaceDir     string                   `json:"workspace_dir,omitempty"`
+	Responder        *ResponderInfo           `json:"responder,omitempty"`
 }
 
 type GraderResults struct {
diff --git a/internal/models/outcome_test.go b/internal/models/outcome_test.go
index 6a97a5934..47f2ec02f 100644
--- a/internal/models/outcome_test.go
+++ b/internal/models/outcome_test.go
@@ -1,6 +1,7 @@
 package models
 
 import (
+	"encoding/json"
 	"math"
 	"testing"
 
@@ -226,3 +227,23 @@ func TestAggregateUsageStats_AllNil(t *testing.T) {
 func TestAggregateUsageStats_Empty(t *testing.T) {
 	require.Nil(t, AggregateUsageStats(nil))
 }
+
+func TestResponderInfoSerializes(t *testing.T) {
+	rr := RunResult{
+		RunNumber: 1,
+		Status:    StatusError,
+		Responder: &ResponderInfo{
+			FollowupsSent: 3,
+			Outcome:       "abstained",
+			Reason:        "brief too vague",
+		},
+	}
+	data, err := json.Marshal(rr)
+	require.NoError(t, err)
+	require.Contains(t, string(data), `"responder"`)
+	require.Contains(t, string(data), `"outcome":"abstained"`)
+
+	data2, err := json.Marshal(RunResult{RunNumber: 1, Status: StatusPassed})
+	require.NoError(t, err)
+	require.NotContains(t, string(data2), `"responder"`)
+}
diff --git a/internal/models/testcase.go b/internal/models/testcase.go
index 15b3e2206..958b161f3 100644
--- a/internal/models/testcase.go
+++ b/internal/models/testcase.go
@@ -38,6 +38,22 @@ type TaskStimulus struct {
 	WorkDir     string            `yaml:"workdir,omitempty" json:"workdir,omitempty"`
 	Environment map[string]string `yaml:"environment,omitempty" json:"environment,omitempty"`
 	FollowUps   []string          `yaml:"follow_up_prompts,omitempty" json:"follow_ups,omitempty"`
+	Responder   *ResponderConfig  `yaml:"responder,omitempty" json:"responder,omitempty"`
+}
+
+// ResponderConfig configures an LLM-backed surrogate user that answers a
+// skill's follow-up questions during a multi-turn run. It is mutually
+// exclusive with TaskStimulus.FollowUps.
+type ResponderConfig struct {
+	// Model is the model used for the responder LLM. Optional; when empty the
+	// eval-level config.model is used.
+	Model string `yaml:"model,omitempty" json:"model,omitempty"`
+	// Instructions describe the target configuration the responder represents
+	// and the rule for abstaining. Required.
+	Instructions string `yaml:"instructions" json:"instructions"`
+	// MaxFollowups caps how many times the responder may reply before the loop
+	// stops. Required; must be >= 1.
+	MaxFollowups int `yaml:"max_followups" json:"max_followups"`
 }
 
 // ResourceRef points to a file or inline content
@@ -319,6 +335,27 @@ func (tc *TestCase) Validate() error {
 		}
 		return fmt.Errorf("timeout_seconds must be at least 1, got %d", *tc.TimeoutSec)
 	}
+
+	if r := tc.Stimulus.Responder; r != nil {
+		name := tc.TestID
+		if name == "" {
+			name = tc.DisplayName
+		}
+		prefix := "test case"
+		if name != "" {
+			prefix = fmt.Sprintf("test case %q", name)
+		}
+		if strings.TrimSpace(r.Instructions) == "" {
+			return fmt.Errorf("%s: responder.instructions is required", prefix)
+		}
+		if r.MaxFollowups < 1 {
+			return fmt.Errorf("%s: responder.max_followups must be at least 1, got %d", prefix, r.MaxFollowups)
+		}
+		if len(tc.Stimulus.FollowUps) > 0 {
+			return fmt.Errorf("%s: inputs.responder and inputs.follow_up_prompts are mutually exclusive; use one or the other", prefix)
+		}
+	}
+
 	return nil
 }
 
diff --git a/internal/models/testcase_test.go b/internal/models/testcase_test.go
index e57bf553c..00f82fddd 100644
--- a/internal/models/testcase_test.go
+++ b/internal/models/testcase_test.go
@@ -5,6 +5,8 @@ import (
 	"path/filepath"
 	"strings"
 	"testing"
+
+	"github.com/stretchr/testify/require"
 )
 
 func TestLoadTestCase_ShouldTriggerField(t *testing.T) {
@@ -215,3 +217,80 @@ instruction_files:
 		t.Errorf("Expected second instruction file 'docs/task.instructions.md', got %q", tc.InstructionFiles[1])
 	}
 }
+
+func TestResponderConfigParsesUnderInputs(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "task.yaml")
+	yaml := `
+id: configure-agent
+name: Configure a research agent
+inputs:
+  prompt: "Add a new agent to my application"
+  responder:
+    model: gpt-4o
+    instructions: |
+      The agent you want is research-agent with tools web_search.
+      If you can't infer an answer, abstain.
+    max_followups: 8
+`
+	require.NoError(t, os.WriteFile(path, []byte(yaml), 0o600))
+
+	tc, err := LoadTestCase(path)
+	require.NoError(t, err)
+	require.NotNil(t, tc.Stimulus.Responder)
+	require.Equal(t, "gpt-4o", tc.Stimulus.Responder.Model)
+	require.Equal(t, 8, tc.Stimulus.Responder.MaxFollowups)
+	require.Contains(t, tc.Stimulus.Responder.Instructions, "research-agent")
+}
+
+func TestResponderValidationRejectsMissingInstructions(t *testing.T) {
+	tc := &TestCase{
+		TestID: "t1",
+		Stimulus: TaskStimulus{
+			Message:   "go",
+			Responder: &ResponderConfig{MaxFollowups: 3},
+		},
+	}
+	err := tc.Validate()
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "instructions")
+}
+
+func TestResponderValidationRejectsZeroMaxFollowups(t *testing.T) {
+	tc := &TestCase{
+		TestID: "t1",
+		Stimulus: TaskStimulus{
+			Message:   "go",
+			Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 0},
+		},
+	}
+	err := tc.Validate()
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "max_followups")
+}
+
+func TestResponderValidationRejectsBothResponderAndFollowUps(t *testing.T) {
+	tc := &TestCase{
+		TestID: "t1",
+		Stimulus: TaskStimulus{
+			Message:   "go",
+			FollowUps: []string{"next"},
+			Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2},
+		},
+	}
+	err := tc.Validate()
+	require.Error(t, err)
+	require.Contains(t, err.Error(), "follow_up_prompts")
+	require.Contains(t, err.Error(), "responder")
+}
+
+func TestResponderValidationAcceptsValidConfig(t *testing.T) {
+	tc := &TestCase{
+		TestID: "t1",
+		Stimulus: TaskStimulus{
+			Message:   "go",
+			Responder: &ResponderConfig{Instructions: "x", MaxFollowups: 2},
+		},
+	}
+	require.NoError(t, tc.Validate())
+}
diff --git a/internal/orchestration/responder_loop_test.go b/internal/orchestration/responder_loop_test.go
new file mode 100644
index 000000000..5b61bf544
--- /dev/null
+++ b/internal/orchestration/responder_loop_test.go
@@ -0,0 +1,106 @@
+package orchestration
+
+import (
+	"context"
+	"testing"
+
+	"github.com/microsoft/waza/internal/config"
+	"github.com/microsoft/waza/internal/execution"
+	"github.com/microsoft/waza/internal/models"
+	"github.com/microsoft/waza/internal/responder"
+	"github.com/stretchr/testify/require"
+)
+
+// scriptedClassifier returns a queued sequence of decisions, repeating the last.
+type scriptedClassifier struct {
+	decisions []responder.Decision
+	idx       int
+	calls     int
+}
+
+func (s *scriptedClassifier) Classify(_ context.Context, _ string) (responder.Decision, error) {
+	s.calls++
+	d := s.decisions[s.idx]
+	if s.idx < len(s.decisions)-1 {
+		s.idx++
+	}
+	return d, nil
+}
+
+func (s *scriptedClassifier) Close(_ context.Context) error { return nil }
+
+func newResponderTestRunner(t *testing.T) *EvalRunner {
+	t.Helper()
+	spec := &models.EvalSpec{
+		SpecIdentity: models.SpecIdentity{Name: "test-benchmark"},
+		SkillName:    "my-skill",
+		Config: models.Config{
+			EngineType: "mock",
+			ModelID:    "gpt-4",
+			TimeoutSec: 120,
+		},
+	}
+	cfg := config.NewEvalConfig(spec)
+	engine := execution.NewMockEngine("gpt-4")
+	require.NoError(t, engine.Initialize(context.Background()))
+	t.Cleanup(func() { require.NoError(t, engine.Shutdown(context.Background())) })
+	return NewEvalRunner(cfg, engine, WithSkipGraders())
+}
+
+func TestResponderLoopReplyThenStop(t *testing.T) {
+	r := newResponderTestRunner(t)
+	sc := &scriptedClassifier{decisions: []responder.Decision{
+		{Kind: responder.DecisionReply, Answer: "research-agent"},
+		{Kind: responder.DecisionStop},
+	}}
+	r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc }
+
+	tc := &models.TestCase{
+		TestID:   "t1",
+		Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}},
+	}
+	rr := r.executeRun(context.Background(), tc, 1)
+
+	require.NotNil(t, rr.Responder)
+	require.Equal(t, models.ResponderOutcomeStopped, rr.Responder.Outcome)
+	require.Equal(t, 1, rr.Responder.FollowupsSent)
+}
+
+func TestResponderLoopAbstainMarksError(t *testing.T) {
+	r := newResponderTestRunner(t)
+	sc := &scriptedClassifier{decisions: []responder.Decision{
+		{Kind: responder.DecisionAbstain, Reason: "too vague"},
+	}}
+	r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc }
+
+	tc := &models.TestCase{
+		TestID:   "t1",
+		Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 5}},
+	}
+	rr := r.executeRun(context.Background(), tc, 1)
+
+	require.Equal(t, models.StatusError, rr.Status)
+	require.NotNil(t, rr.Responder)
+	require.Equal(t, models.ResponderOutcomeAbstained, rr.Responder.Outcome)
+	require.Contains(t, rr.ErrorMsg, "abstained")
+	require.Contains(t, rr.ErrorMsg, "too vague")
+}
+
+func TestResponderLoopCapExhausted(t *testing.T) {
+	r := newResponderTestRunner(t)
+	sc := &scriptedClassifier{decisions: []responder.Decision{
+		{Kind: responder.DecisionReply, Answer: "a"},
+	}}
+	r.newClassifier = func(models.ResponderConfig, string) responderClassifier { return sc }
+
+	tc := &models.TestCase{
+		TestID:   "t1",
+		Stimulus: models.TaskStimulus{Message: "add agent", Responder: &models.ResponderConfig{Instructions: "x", MaxFollowups: 2}},
+	}
+	rr := r.executeRun(context.Background(), tc, 1)
+
+	require.NotNil(t, rr.Responder)
+	require.Equal(t, models.ResponderOutcomeCapExhausted, rr.Responder.Outcome)
+	require.Equal(t, 2, rr.Responder.FollowupsSent)
+	require.NotEqual(t, models.StatusError, rr.Status)
+}
diff --git a/internal/orchestration/runner.go b/internal/orchestration/runner.go
index 97c9f35d1..710e2a9a8 100644
--- a/internal/orchestration/runner.go
+++ b/internal/orchestration/runner.go
@@ -3,6 +3,7 @@ package orchestration
 import (
 	"context"
 	"fmt"
+	"log/slog"
 	"math"
 	"os"
 	"path/filepath"
@@ -19,6 +20,7 @@ import (
 	"github.com/microsoft/waza/internal/graders"
 	"github.com/microsoft/waza/internal/hooks"
 	"github.com/microsoft/waza/internal/models"
+	"github.com/microsoft/waza/internal/responder"
 	"github.com/microsoft/waza/internal/template"
 	"github.com/microsoft/waza/internal/transcript"
 	"github.com/microsoft/waza/internal/utils"
@@ -26,6 +28,14 @@ import (
 	copilot "github.com/github/copilot-sdk/go"
 )
 
+// responderClassifier classifies an agent message into a responder decision
+// and tears down its session when the run finishes. Implemented by
+// *responder.Classifier; faked in tests.
+type responderClassifier interface {
+	Classify(ctx context.Context, agentMessage string) (responder.Decision, error)
+	Close(ctx context.Context) error
+}
+
 // EvalRunner orchestrates the execution of tests.
 //
 // Deprecated alias: TestRunner is provided for backward compatibility.
@@ -34,6 +44,10 @@ type EvalRunner struct {
 	engine  execution.AgentEngine
 	verbose bool
 
+	// newClassifier builds a responder classifier for a task. Overridable in
+	// tests; defaults to a responder backed by the runner's engine.
+	newClassifier func(cfg models.ResponderConfig, defaultModel string) responderClassifier
+
 	// Task filtering
 	taskFilters []string
 
@@ -136,6 +150,9 @@ func NewEvalRunner(cfg *config.EvalConfig, engine execution.AgentEngine, opts ..
 		verbose:   cfg.Verbose(),
 		listeners: []ProgressListener{},
 	}
+	r.newClassifier = func(cfg models.ResponderConfig, defaultModel string) responderClassifier {
+		return responder.New(r.engine, cfg, defaultModel)
+	}
 	for _, o := range opts {
 		o(r)
 	}
@@ -1072,8 +1089,12 @@ func (r *EvalRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum
 		})
 	}
 
-	// Execute follow-up prompts if defined
-	if len(tc.Stimulus.FollowUps) > 0 {
+	// Drive multi-turn: responder loop takes precedence; otherwise static
+	// follow-ups. Validation guarantees these are mutually exclusive.
+	var responderInfo *models.ResponderInfo
+	if tc.Stimulus.Responder != nil {
+		responderInfo = r.executeResponderLoop(ctx, tc, resp)
+	} else if len(tc.Stimulus.FollowUps) > 0 {
 		r.executeFollowUps(ctx, tc, resp)
 	}
 
@@ -1153,6 +1174,7 @@ func (r *EvalRunner) executeRun(ctx context.Context, tc *models.TestCase, runNum
 		ErrorMsg:         resp.ErrorMsg,
 		SkillInvocations: skillInvocations,
 		WorkspaceDir:     resp.WorkspaceDir,
+		Responder:        responderInfo,
 	}
 }
 
@@ -1274,6 +1296,122 @@ func (r *EvalRunner) executeFollowUps(ctx context.Context, tc *models.TestCase,
 	}
 }
 
+// executeResponderLoop drives a multi-turn run using an LLM-backed surrogate
+// user. Each iteration classifies resp.FinalOutput and then sends the reply to
+// the agent, stops, or fails the run (abstain, error, or an unrecognized kind).
+// It mutates resp in place (mirroring executeFollowUps) and returns a summary.
+func (r *EvalRunner) executeResponderLoop(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse) *models.ResponderInfo {
+	cfg := *tc.Stimulus.Responder
+	classifier := r.newClassifier(cfg, r.cfg.Spec().Config.ModelID)
+	defer func() {
+		// Detached context: teardown must still run if ctx was canceled mid-run.
+		cleanupCtx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+		defer cancel()
+		if err := classifier.Close(cleanupCtx); err != nil {
+			slog.WarnContext(ctx, "failed to clean up responder session",
+				"test", tc.DisplayName, "error", err)
+		}
+	}()
+
+	info := &models.ResponderInfo{}
+	left := cfg.MaxFollowups
+	for left > 0 {
+		decision, err := classifier.Classify(ctx, resp.FinalOutput)
+		if err != nil {
+			resp.ErrorMsg = fmt.Sprintf("responder error: %v", err)
+			info.Outcome = models.ResponderOutcomeError
+			info.Reason = err.Error()
+			return info
+		}
+
+		switch decision.Kind {
+		case responder.DecisionStop:
+			info.Outcome = models.ResponderOutcomeStopped
+			return info
+		case responder.DecisionAbstain:
+			resp.ErrorMsg = fmt.Sprintf("responder abstained: %s", decision.Reason)
+			info.Outcome = models.ResponderOutcomeAbstained
+			info.Reason = decision.Reason
+			return info
+		case responder.DecisionReply:
+			if !r.sendResponderReply(ctx, tc, resp, decision.Answer, info.FollowupsSent+1) {
+				info.Outcome = models.ResponderOutcomeError
+				info.Reason = resp.ErrorMsg
+				return info
+			}
+			info.FollowupsSent++
+			left--
+		default:
+			// Nothing above spends budget for an unrecognized kind, so fail instead of spinning forever.
+			info.Reason = fmt.Sprintf("unknown decision kind %d", decision.Kind)
+			resp.ErrorMsg = "responder error: " + info.Reason
+			info.Outcome = models.ResponderOutcomeError
+			return info
+		}
+	}
+
+	// Only successful replies fall out of the loop; every other path returns
+	// early. The agent's response to the last reply is not classified here.
+	info.Outcome = models.ResponderOutcomeCapExhausted
+	slog.WarnContext(ctx, "responder budget exhausted while agent still asking questions",
+		"test", tc.DisplayName, "max_followups", cfg.MaxFollowups)
+	return info
+}
+
+// sendResponderReply delivers one responder answer to the agent over the
+// existing session and workspace, then folds the agent's response into resp.
+// On any failure it records the cause in resp.ErrorMsg and reports false.
+func (r *EvalRunner) sendResponderReply(ctx context.Context, tc *models.TestCase, resp *execution.ExecutionResponse, answer string, turn int) bool {
+	req, err := r.buildExecutionRequest(tc)
+	if err != nil {
+		resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err)
+		return false
+	}
+	// Point the request at the conversation that is already in progress.
+	req.Message, req.SessionID, req.WorkspaceDir = answer, resp.SessionID, resp.WorkspaceDir
+
+	if r.verbose {
+		details := map[string]any{"message": answer, "responder_reply": turn}
+		r.notifyProgress(ProgressEvent{EventType: EventAgentPrompt, TestName: tc.DisplayName, Details: details})
+	}
+
+	limit, err := r.executionTimeout(tc)
+	if err != nil {
+		resp.ErrorMsg = fmt.Sprintf("responder reply %d setup failed: %v", turn, err)
+		return false
+	}
+
+	// Bound this turn on its own; the timer is released as soon as Execute returns.
+	turnCtx, release := context.WithTimeout(ctx, limit)
+	reply, err := r.engine.Execute(turnCtx, req)
+	release()
+	switch {
+	case err != nil:
+		resp.ErrorMsg = fmt.Sprintf("responder reply %d failed: %v", turn, err)
+		return false
+	case reply.ErrorMsg != "":
+		resp.ErrorMsg = fmt.Sprintf("responder reply %d: %s", turn, reply.ErrorMsg)
+		return false
+	}
+
+	// History and duration accumulate; output and workspace files reflect the latest turn only.
+	resp.Events = append(resp.Events, reply.Events...)
+	resp.ToolCalls = append(resp.ToolCalls, reply.ToolCalls...)
+	resp.SkillInvocations = append(resp.SkillInvocations, reply.SkillInvocations...)
+	resp.DurationMs += reply.DurationMs
+	resp.FinalOutput = reply.FinalOutput
+	resp.WorkspaceFiles = reply.WorkspaceFiles
+	switch {
+	case reply.Usage == nil:
+		// The turn reported no usage; keep the running total as is.
+	case resp.Usage == nil:
+		resp.Usage = reply.Usage
+	default:
+		resp.Usage = models.AggregateUsageStats([]*models.UsageStats{resp.Usage, reply.Usage})
+	}
+	return true
+}
+
 func (r *EvalRunner) loadResources(tc *models.TestCase) []execution.ResourceFile {
 	var resources []execution.ResourceFile
 
diff --git a/internal/responder/responder.go b/internal/responder/responder.go
new file mode 100644
index 000000000..e38315ab5
--- /dev/null
+++ b/internal/responder/responder.go
@@ -0,0 +1,256 @@
+// Package responder implements an LLM-backed surrogate user that answers an
+// interactive skill's follow-up questions during a multi-turn evaluation run.
+package responder
+
+import (
+	"context"
+	"errors"
+	"fmt"
+
+	copilot "github.com/github/copilot-sdk/go"
+	"github.com/go-viper/mapstructure/v2"
+	"github.com/microsoft/waza/internal/execution"
+	"github.com/microsoft/waza/internal/models"
+)
+
+// DecisionKind enumerates the ways the responder can classify an agent
+// message. Values are assigned with iota, so DecisionReply is the zero value.
+type DecisionKind int
+
+const (
+	// DecisionReply means the responder answered the agent's question.
+	DecisionReply DecisionKind = iota
+	// DecisionStop means the agent is done and no further input is needed.
+	DecisionStop
+	// DecisionAbstain means the responder could not answer from its brief.
+	DecisionAbstain
+)
+
+// Decision is what Classify returns for one agent message; Kind says which of the other fields is meaningful.
+type Decision struct {
+	Kind   DecisionKind
+	Answer string // set when Kind == DecisionReply
+	Reason string // set when Kind == DecisionAbstain
+}
+
+const (
+	toolRespond = "responder_reply"   // handler records DecisionReply
+	toolStop    = "responder_stop"    // handler records DecisionStop
+	toolAbstain = "responder_abstain" // handler records DecisionAbstain
+)
+
+// Executor is the only execution capability the Classifier needs: running one
+// request and returning its response. The concrete AgentEngine satisfies it, and tests supply a fake.
+type Executor interface {
+	Execute(ctx context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error)
+}
+
+// sessionDeleter is an optional capability for explicitly tearing down a
+// persistent session. *execution.CopilotEngine implements it; for executors
+// that do not (e.g. the mock), Close's type assertion fails and it is a no-op.
+type sessionDeleter interface {
+	DeleteSession(ctx context.Context, sessionID string) error
+}
+
+// decisionRecorder captures the single decision tool the responder LLM calls
+// during one Classify call (Classify allocates a fresh recorder each time).
+// Handler-level failures are kept in err so Classify can surface them.
+type decisionRecorder struct {
+	decision Decision // last decision recorded by a tool handler
+	set      bool     // true once any handler has recorded a decision
+	err      error    // first handler-level failure (bad arguments or duplicate call)
+}
+
+// tools returns the three decision tools offered to the responder LLM. Each
+// handler records its decision on d; a second decision call, undecodable
+// arguments, or an empty reply are recorded in d.err and returned as errors.
+func (d *decisionRecorder) tools() []copilot.Tool {
+	return []copilot.Tool{
+		{
+			Name:        toolRespond,
+			Description: "Answer the agent's question as the user. Call this exactly once with your answer.",
+			Parameters: map[string]any{
+				"type": "object",
+				"properties": map[string]any{
+					"answer": map[string]any{"type": "string", "description": "Your reply to the agent's question, consistent with your configuration."},
+				},
+				"required": []string{"answer"},
+			},
+			Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) {
+				if err := d.guardDuplicate(toolRespond); err != nil {
+					return copilot.ToolResult{}, err
+				}
+				var args struct {
+					Answer string `mapstructure:"answer"`
+				}
+				if err := mapstructure.Decode(inv.Arguments, &args); err != nil {
+					d.recordErr(fmt.Errorf("decode %s arguments: %w", toolRespond, err))
+					return copilot.ToolResult{}, err
+				}
+				if args.Answer == "" {
+					// answer is required; a missing one decodes to "" and would be sent to the agent as an empty message.
+					err := fmt.Errorf("%s called with an empty answer", toolRespond)
+					d.recordErr(err)
+					return copilot.ToolResult{}, err
+				}
+				d.decision = Decision{Kind: DecisionReply, Answer: args.Answer}
+				d.set = true
+				return copilot.ToolResult{}, nil
+			},
+		},
+		{
+			Name:        toolStop,
+			Description: "Signal that the agent has finished and needs no further input. Call this when there is no question to answer.",
+			Parameters:  map[string]any{"type": "object", "properties": map[string]any{}},
+			Handler: func(copilot.ToolInvocation) (copilot.ToolResult, error) {
+				if err := d.guardDuplicate(toolStop); err != nil {
+					return copilot.ToolResult{}, err
+				}
+				d.decision = Decision{Kind: DecisionStop}
+				d.set = true
+				return copilot.ToolResult{}, nil
+			},
+		},
+		{
+			Name:        toolAbstain,
+			Description: "Signal that you cannot answer the agent's question from your configuration. Call this only when the information is genuinely missing.",
+			Parameters: map[string]any{
+				"type": "object",
+				"properties": map[string]any{
+					"reason": map[string]any{"type": "string", "description": "Why you cannot answer."},
+				},
+				"required": []string{"reason"},
+			},
+			Handler: func(inv copilot.ToolInvocation) (copilot.ToolResult, error) {
+				if err := d.guardDuplicate(toolAbstain); err != nil {
+					return copilot.ToolResult{}, err
+				}
+				var args struct {
+					Reason string `mapstructure:"reason"`
+				}
+				if err := mapstructure.Decode(inv.Arguments, &args); err != nil {
+					d.recordErr(fmt.Errorf("decode %s arguments: %w", toolAbstain, err))
+					return copilot.ToolResult{}, err
+				}
+				d.decision = Decision{Kind: DecisionAbstain, Reason: args.Reason}
+				d.set = true
+				return copilot.ToolResult{}, nil
+			},
+		},
+	}
+}
+
+// guardDuplicate enforces the "call exactly one decision tool, exactly once"
+// contract advertised in each tool description. If the model calls a second
+// decision tool in the same turn, the handler refuses rather than letting
+// invocation order silently pick the winner.
+func (d *decisionRecorder) guardDuplicate(name string) error {
+	if !d.set {
+		return nil
+	}
+	err := fmt.Errorf("responder called %s after a decision was already recorded", name)
+	d.recordErr(err)
+	return err
+}
+
+// recordErr keeps only the first handler-level failure (later ones are dropped); Classify wraps and returns it.
+func (d *decisionRecorder) recordErr(err error) {
+	if d.err == nil {
+		d.err = err
+	}
+}
+
+// Classifier maintains a persistent surrogate-user session and classifies each
+// agent message into a Decision. Its sessionID is mutated by Classify and Close.
+type Classifier struct {
+	exec         Executor
+	model        string
+	instructions string
+	sessionID    string // empty until the first Classify creates the session
+}
+
+// New builds a Classifier for cfg, falling back to defaultModel when cfg.Model is empty.
+func New(exec Executor, cfg models.ResponderConfig, defaultModel string) *Classifier {
+	c := &Classifier{
+		exec:         exec,
+		model:        cfg.Model,
+		instructions: cfg.Instructions,
+	}
+	if c.model == "" {
+		c.model = defaultModel
+	}
+	return c
+}
+
+// Classify forwards the agent's latest message to the responder LLM and
+// reports which decision tool it called. The session is created (and seeded
+// with the instructions) on the first call and resumed on later ones.
+func (c *Classifier) Classify(ctx context.Context, agentMessage string) (Decision, error) {
+	recorder := &decisionRecorder{}
+
+	req := &execution.ExecutionRequest{
+		ModelID:     c.model,
+		Message:     c.buildMessage(agentMessage),
+		Tools:       recorder.tools(),
+		MessageMode: execution.MessageModeEnqueue,
+		Streaming:   true,
+		SessionID:   c.sessionID,
+		NoSkills:    true,
+		// Keep the session alive between turns: it has to be resumable, and the
+		// instructions are only sent on the first turn. Close deletes it
+		// explicitly; an ephemeral session would be gone after the first turn
+		// and resume would break.
+		EphemeralSession:     false,
+		SkipWorkspaceCapture: true,
+	}
+
+	result, execErr := c.exec.Execute(ctx, req)
+	// Remember the session whenever one is reported, even alongside an error.
+	if result != nil && result.SessionID != "" {
+		c.sessionID = result.SessionID
+	}
+
+	switch {
+	case recorder.err != nil:
+		// A handler-level failure (malformed arguments or more than one decision
+		// call) outranks everything else, including a decision already recorded.
+		return Decision{}, fmt.Errorf("responder tool call invalid: %w", recorder.err)
+	case execErr != nil && !recorder.set:
+		// Execution failed and no decision tool ran.
+		return Decision{}, fmt.Errorf("responder execution failed: %w", execErr)
+	case !recorder.set:
+		return Decision{}, errors.New("responder did not call a decision tool")
+	}
+
+	// A decision was recorded; it is returned even if Execute itself reported an error.
+	return recorder.decision, nil
+}
+
+// Close deletes the persistent responder session, if any. The stored session
+// id is cleared before deletion is attempted, so repeated calls are harmless.
+// Executors without explicit session deletion make this a no-op.
+func (c *Classifier) Close(ctx context.Context) error {
+	id := c.sessionID
+	c.sessionID = ""
+	if id == "" {
+		return nil
+	}
+	deleter, ok := c.exec.(sessionDeleter)
+	if !ok {
+		return nil
+	}
+	return deleter.DeleteSession(ctx, id)
+}
+
+func (c *Classifier) buildMessage(agentMessage string) string {
+	if c.sessionID == "" {
+		return fmt.Sprintf(
+			"%s\n\nYou are role-playing as the user. The agent just said:\n\n%s\n\n"+
+				"Respond by calling exactly one tool: %s to answer, %s if the agent is finished and needs nothing, or %s if you genuinely cannot answer from your configuration.",
+			c.instructions, agentMessage, toolRespond, toolStop, toolAbstain,
+		)
+	}
+	return fmt.Sprintf(
+		"The agent just said:\n\n%s\n\nRespond by calling exactly one tool (%s, %s, or %s).",
+		agentMessage, toolRespond, toolStop, toolAbstain,
+	)
+}
diff --git a/internal/responder/responder_test.go b/internal/responder/responder_test.go
new file mode 100644
index 000000000..1d557208d
--- /dev/null
+++ b/internal/responder/responder_test.go
@@ -0,0 +1,281 @@
+package responder
+
+import (
+	"context"
+	"testing"
+
+	copilot "github.com/github/copilot-sdk/go"
+	"github.com/microsoft/waza/internal/execution"
+	"github.com/microsoft/waza/internal/models"
+	"github.com/stretchr/testify/require"
+)
+
+func TestDecisionToolsRecordReply(t *testing.T) {
+	rec := &decisionRecorder{}
+	all := rec.tools()
+	require.Len(t, all, 3) // one tool per decision kind
+
+	replyTool := findTool(t, all, toolRespond)
+	_, err := replyTool.Handler(copilot.ToolInvocation{
+		Arguments: map[string]any{"answer": "research-agent"},
+	})
+	require.NoError(t, err)
+	require.True(t, rec.set)
+	require.Equal(t, DecisionReply, rec.decision.Kind)
+	require.Equal(t, "research-agent", rec.decision.Answer)
+}
+
+func TestDecisionToolsRecordStop(t *testing.T) {
+	rec := &decisionRecorder{}
+	stopTool := findTool(t, rec.tools(), toolStop)
+	_, err := stopTool.Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+	require.NoError(t, err)
+	require.True(t, rec.set)
+	require.Equal(t, DecisionStop, rec.decision.Kind)
+}
+
+func TestDecisionToolsRecordAbstain(t *testing.T) {
+	rec := &decisionRecorder{}
+	abstainTool := findTool(t, rec.tools(), toolAbstain)
+	_, err := abstainTool.Handler(copilot.ToolInvocation{
+		Arguments: map[string]any{"reason": "brief too vague"},
+	})
+	require.NoError(t, err)
+	require.True(t, rec.set)
+	require.Equal(t, DecisionAbstain, rec.decision.Kind)
+	require.Equal(t, "brief too vague", rec.decision.Reason)
+}
+
+type fakeExecutor struct { // scripted Executor used by the Classifier tests
+	calls   []*execution.ExecutionRequest
+	respond func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error)
+}
+
+func (f *fakeExecutor) Execute(_ context.Context, req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+	f.calls = append(f.calls, req) // keep every request so tests can inspect them afterwards
+	return f.respond(req)          // the per-test callback plays the LLM, including invoking tool handlers
+}
+
+func TestClassifyReply(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, handlerErr := findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{
+				Arguments: map[string]any{"answer": "research-agent"},
+			})
+			require.NoError(t, handlerErr)
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "be research-agent", MaxFollowups: 5}, "gpt-4o")
+	got, err := classifier.Classify(context.Background(), "What is the agent name?")
+	require.NoError(t, err)
+	require.Equal(t, DecisionReply, got.Kind)
+	require.Equal(t, "research-agent", got.Answer)
+}
+
+func TestClassifyAbstain(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, _ = findTool(t, req.Tools, toolAbstain).Handler(copilot.ToolInvocation{
+				Arguments: map[string]any{"reason": "no info"},
+			})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	got, err := classifier.Classify(context.Background(), "Q?")
+	require.NoError(t, err)
+	require.Equal(t, DecisionAbstain, got.Kind)
+	require.Equal(t, "no info", got.Reason)
+}
+
+func TestClassifyNoDecisionToolIsError(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil // succeeds without calling any tool
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.Error(t, classifyErr)
+}
+
+func TestClassifyUsesDefaultModelWhenUnset(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			require.Equal(t, "default-model", req.ModelID) // cfg.Model is empty, so the fallback applies
+			_, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "default-model")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.NoError(t, classifyErr)
+}
+
+func TestClassifyPersistsSession(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{
+				Arguments: map[string]any{"answer": "a"},
+			})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "INSTR", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q1?")
+	require.NoError(t, classifyErr)
+	_, classifyErr = classifier.Classify(context.Background(), "Q2?")
+	require.NoError(t, classifyErr)
+
+	require.Len(t, fake.calls, 2)
+	require.Empty(t, fake.calls[0].SessionID) // first turn creates the session
+	require.Contains(t, fake.calls[0].Message, "INSTR")
+	require.Contains(t, fake.calls[0].Message, "Q1?")
+	require.Equal(t, "resp-1", fake.calls[1].SessionID) // second turn resumes it
+	require.NotContains(t, fake.calls[1].Message, "INSTR")
+	require.Contains(t, fake.calls[1].Message, "Q2?")
+}
+
+func TestClassifyUsesPersistentSession(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.NoError(t, classifyErr)
+
+	require.Len(t, fake.calls, 1)
+	require.False(t, fake.calls[0].EphemeralSession,
+		"responder must use a persistent (non-ephemeral) session so it can be resumed across turns")
+}
+
+// deletingExecutor is a fakeExecutor that also satisfies sessionDeleter, recording each session id it is asked to delete.
+type deletingExecutor struct {
+	fakeExecutor
+	deleted []string
+}
+
+func (d *deletingExecutor) DeleteSession(_ context.Context, sessionID string) error {
+	d.deleted = append(d.deleted, sessionID) // record only; deletion always succeeds
+	return nil
+}
+
+func TestCloseDeletesSession(t *testing.T) {
+	fake := &deletingExecutor{}
+	fake.respond = func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+		_, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+		return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.NoError(t, classifyErr)
+
+	require.NoError(t, classifier.Close(context.Background()))
+	require.Equal(t, []string{"resp-1"}, fake.deleted)
+
+	// A second Close must not delete again: the session id was cleared by the first.
+	require.NoError(t, classifier.Close(context.Background()))
+	require.Equal(t, []string{"resp-1"}, fake.deleted)
+}
+
+func TestCloseWithoutSessionIsNoop(t *testing.T) {
+	fake := &deletingExecutor{}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	require.NoError(t, classifier.Close(context.Background())) // Classify never ran, so no session exists
+	require.Empty(t, fake.deleted)
+}
+
+func TestCloseWithoutDeleterIsNoop(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.NoError(t, classifyErr)
+	require.NoError(t, classifier.Close(context.Background())) // fakeExecutor has no DeleteSession
+}
+
+func TestDecisionToolsRejectDuplicateCall(t *testing.T) {
+	rec := &decisionRecorder{}
+	all := rec.tools()
+	replyTool := findTool(t, all, toolRespond)
+	stopTool := findTool(t, all, toolStop)
+
+	_, err := replyTool.Handler(copilot.ToolInvocation{
+		Arguments: map[string]any{"answer": "first"},
+	})
+	require.NoError(t, err)
+
+	// Once a decision exists, any further decision call has to fail instead
+	// of quietly replacing it.
+	_, err = stopTool.Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+	require.Error(t, err)
+	require.Error(t, rec.err)
+	// The original decision stays intact for callers to inspect.
+	require.Equal(t, DecisionReply, rec.decision.Kind)
+	require.Equal(t, "first", rec.decision.Answer)
+}
+
+func TestDecisionToolsRejectMalformedArgs(t *testing.T) {
+	rec := &decisionRecorder{}
+	replyTool := findTool(t, rec.tools(), toolRespond)
+
+	// A non-string answer cannot be decoded; the handler must report that
+	// failure rather than record a reply with an empty answer.
+	_, err := replyTool.Handler(copilot.ToolInvocation{
+		Arguments: map[string]any{"answer": map[string]any{"nested": true}},
+	})
+	require.Error(t, err)
+	require.Error(t, rec.err)
+	require.False(t, rec.set)
+}
+
+func TestClassifyDuplicateDecisionIsError(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			// Simulate a model that replies and then also stops within one turn.
+			_, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{
+				Arguments: map[string]any{"answer": "a"},
+			})
+			_, _ = findTool(t, req.Tools, toolStop).Handler(copilot.ToolInvocation{Arguments: map[string]any{}})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.Error(t, classifyErr)
+	require.Contains(t, classifyErr.Error(), "responder tool call invalid")
+}
+
+func TestClassifyMalformedArgsIsError(t *testing.T) {
+	fake := &fakeExecutor{
+		respond: func(req *execution.ExecutionRequest) (*execution.ExecutionResponse, error) {
+			_, _ = findTool(t, req.Tools, toolRespond).Handler(copilot.ToolInvocation{
+				Arguments: map[string]any{"answer": 42}, // wrong type: answer must be a string
+			})
+			return &execution.ExecutionResponse{SessionID: "resp-1"}, nil
+		},
+	}
+	classifier := New(fake, models.ResponderConfig{Instructions: "x", MaxFollowups: 5}, "gpt-4o")
+	_, classifyErr := classifier.Classify(context.Background(), "Q?")
+	require.Error(t, classifyErr)
+	require.Contains(t, classifyErr.Error(), "responder tool call invalid")
+}
+
+func findTool(t *testing.T, tools []copilot.Tool, name string) copilot.Tool {
+	t.Helper()
+	for i := range tools {
+		if tools[i].Name == name {
+			return tools[i]
+		}
+	}
+	t.Fatalf("tool %q not found", name)
+	return copilot.Tool{} // unreachable after Fatalf; satisfies the compiler
+}
diff --git a/internal/validation/schema_test.go b/internal/validation/schema_test.go
index ea7ea2ad4..a345392f4 100644
--- a/internal/validation/schema_test.go
+++ b/internal/validation/schema_test.go
@@ -125,6 +125,19 @@ inputs:
 	require.Empty(t, errs, "task with instruction_files should have no errors")
 }
 
+func TestValidateTaskBytes_Responder(t *testing.T) {
+	taskYAML := `id: task-1
+name: Configure agent
+inputs:
+  prompt: "add agent"
+  responder:
+    instructions: "be research-agent; abstain if unknown"
+    max_followups: 8
+`
+	problems := ValidateTaskBytes([]byte(taskYAML))
+	require.Empty(t, problems, "task with inputs.responder should have no errors")
+}
+
 func TestValidateTaskBytes_Invalid(t *testing.T) {
 	errs := ValidateTaskBytes([]byte(invalidTaskYAML))
 	require.NotEmpty(t, errs, "invalid task should have errors")
diff --git a/internal/webapi/additional_test.go b/internal/webapi/additional_test.go
index 693bffaf1..511ef63e6 100644
--- a/internal/webapi/additional_test.go
+++ b/internal/webapi/additional_test.go
@@ -270,6 +270,49 @@ func TestOutcomeToDetailMapsStatsTranscriptAndDigest(t *testing.T) {
 	}
 }
 
+func TestOutcomeToDetailMapsResponder(t *testing.T) {
+	// Responder summary attached to the single run of the single task below.
+	summary := &models.ResponderInfo{
+		FollowupsSent: 2,
+		Outcome:       models.ResponderOutcomeAbstained,
+		Reason:        "too vague",
+	}
+
+	src := &models.EvaluationOutcome{
+		RunID:     "responder-run",
+		BenchName: "bench-responder",
+		Setup:     models.OutcomeSetup{ModelID: "gpt-4o"},
+		Digest:    models.OutcomeDigest{TotalTests: 1, Succeeded: 1},
+		TestOutcomes: []models.TestOutcome{{
+			DisplayName: "task-with-responder",
+			Status:      models.StatusPassed,
+			Runs:        []models.RunResult{{Responder: summary}},
+		}},
+	}
+
+	detail := outcomeToDetail(src)
+
+	if len(detail.Tasks) != 1 {
+		t.Fatalf("expected 1 task, got %d", len(detail.Tasks))
+	}
+	got := detail.Tasks[0].Responder
+	if got == nil {
+		t.Fatal("expected responder")
+	}
+
+	// Each field of the summary must survive the mapping into the API type;
+	// the outcome is compared against its wire value.
+	if got.Outcome != "abstained" {
+		t.Errorf("expected outcome abstained, got %q", got.Outcome)
+	}
+	if got.FollowupsSent != 2 {
+		t.Errorf("expected 2 followups sent, got %d", got.FollowupsSent)
+	}
+	if got.Reason != "too vague" {
+		t.Errorf("expected reason too vague, got %q", got.Reason)
+	}
+}
+
 func TestOutcomeToDetailNoTasks(t *testing.T) {
 	detail := outcomeToDetail(&models.EvaluationOutcome{
 		RunID:     "empty",
diff --git a/internal/webapi/store.go b/internal/webapi/store.go
index 1cbd3ec3f..032d4558e 100644
--- a/internal/webapi/store.go
+++ b/internal/webapi/store.go
@@ -211,6 +211,13 @@ func outcomeToDetail(o *models.EvaluationOutcome) *RunDetail {
 			}
 			tr.Transcript = mapTranscriptEvents(run.Transcript)
 			tr.SessionDigest = mapSessionDigest(&run.SessionDigest)
+			if run.Responder != nil {
+				tr.Responder = &ResponderInfoResponse{
+					FollowupsSent: run.Responder.FollowupsSent,
+					Outcome:       run.Responder.Outcome,
+					Reason:        run.Responder.Reason,
+				}
+			}
 		}
 		if tr.GraderResults == nil {
 			tr.GraderResults = []GraderResult{}
diff --git a/internal/webapi/types.go b/internal/webapi/types.go
index 59de4b5bc..0998edbeb 100644
--- a/internal/webapi/types.go
+++ b/internal/webapi/types.go
@@ -33,6 +33,7 @@ type TaskResult struct {
 	GraderResults []GraderResult              `json:"graderResults"`
 	Transcript    []TranscriptEventResponse   `json:"transcript,omitempty"`
 	SessionDigest *SessionDigestResponse      `json:"sessionDigest,omitempty"`
+	Responder     *ResponderInfoResponse      `json:"responder,omitempty"`
 	BootstrapCI   *ConfidenceIntervalResponse `json:"bootstrapCI,omitempty"`
 	IsSignificant *bool                       `json:"isSignificant,omitempty"`
 }
@@ -68,6 +69,13 @@ type SessionDigestResponse struct {
 	Errors        []string `json:"errors"`
 }
 
+// ResponderInfoResponse is the JSON API form of a run's responder summary; reason is omitted from the output when empty.
+type ResponderInfoResponse struct {
+	FollowupsSent int    `json:"followupsSent"`
+	Outcome       string `json:"outcome"`
+	Reason        string `json:"reason,omitempty"`
+}
+
 // GraderResult is a single grader/validator result.
 type GraderResult struct {
 	Name    string  `json:"name"`
diff --git a/schemas/task.schema.json b/schemas/task.schema.json
index 332259c52..9722c7776 100644
--- a/schemas/task.schema.json
+++ b/schemas/task.schema.json
@@ -135,6 +135,28 @@
             "type": "string"
           },
           "description": "Environment variables set during task execution."
+        },
+        "responder": {
+          "type": "object",
+          "additionalProperties": false,
+          "required": ["instructions", "max_followups"],
+          "description": "LLM-backed surrogate user that answers the skill's follow-up questions. Mutually exclusive with follow_up_prompts.",
+          "properties": {
+            "model": {
+              "type": "string",
+              "description": "Model used for the responder LLM. Defaults to the eval-level config.model."
+            },
+            "instructions": {
+              "type": "string",
+              "minLength": 1,
+              "description": "Describes the target configuration the responder represents and the rule for abstaining."
+            },
+            "max_followups": {
+              "type": "integer",
+              "minimum": 1,
+              "description": "Maximum number of responder replies before the loop stops."
+            }
+          }
         }
       }
     },
diff --git a/site/src/content/docs/guides/eval-yaml.mdx b/site/src/content/docs/guides/eval-yaml.mdx
index 33d5c00f9..05eb4a489 100644
--- a/site/src/content/docs/guides/eval-yaml.mdx
+++ b/site/src/content/docs/guides/eval-yaml.mdx
@@ -390,6 +390,25 @@ inputs:
 
 This is useful for evaluating multi-turn conversations where each step builds on the previous one. Graders run only after all prompts (initial + follow-ups) have completed, so the final output reflects the full conversation.
 
+### Responder (Interactive Skills)
+
+For skills that ask follow-up questions, configure a `responder` — an LLM that plays the user and answers the skill's questions. It is mutually exclusive with `follow_up_prompts`.
+
+```yaml
+inputs:
+  prompt: "Add a new agent to my application"
+  responder:
+    model: gpt-4o          # optional; defaults to config.model
+    instructions: |
+      The agent you want is "research-agent" with system instructions
+      "Search the web and summarise findings", tools web_search + url_fetch,
+      and no handoffs. Answer the skill's questions consistently with this.
+      If you genuinely can't infer an answer, abstain.
+    max_followups: 8
+```
+
+After each agent turn the responder does one of three things: it **replies** (the answer is sent back, continuing the conversation), **stops** (the agent is done), or **abstains** — which fails the run with a distinct `abstained` outcome, signalling the brief is too vague. If `max_followups` is reached while the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state. Each task carries its own responder, so the same skill can be tested against several target configurations.
+
 Prompt supports templating:
 
 ```yaml
diff --git a/site/src/content/docs/reference/schema.mdx b/site/src/content/docs/reference/schema.mdx
index 3dbd09868..2c976197f 100644
--- a/site/src/content/docs/reference/schema.mdx
+++ b/site/src/content/docs/reference/schema.mdx
@@ -506,6 +506,29 @@ inputs:
 
 Graders evaluate only the final state after all prompts complete. If any follow-up fails, remaining prompts are skipped and the run is marked as an error.
 
+### responder
+
+**Type:** object  
+**Required:** no
+
+An LLM-backed surrogate user that answers the skill's follow-up questions during a multi-turn run. Mutually exclusive with `follow_up_prompts`.
+
+| Field          | Type    | Required | Description                                            |
+|----------------|---------|----------|--------------------------------------------------------|
+| `model`        | string  | no       | Responder model. Defaults to the eval-level `config.model`. |
+| `instructions` | string  | yes      | Target configuration the responder represents + abstain rule. |
+| `max_followups`| integer | yes      | Max responder replies before the loop stops (`>= 1`).  |
+
+```yaml
+inputs:
+  prompt: "Add a new agent to my application"
+  responder:
+    instructions: "Be research-agent with tools web_search; abstain if unknown."
+    max_followups: 8
+```
+
+The responder classifies each agent message as **reply**, **stop**, or **abstain**. Abstaining marks the run as an error with outcome `abstained`, distinct from model timeouts or network errors. If `max_followups` is reached while the agent is still asking questions, the loop stops with outcome `cap_exhausted` and graders evaluate the final state.
+
 ### files
 
 **Type:** array  
diff --git a/web/dist/index.html b/web/dist/index.html
index 4f675d7e2..40fc3d369 100644
--- a/web/dist/index.html
+++ b/web/dist/index.html
@@ -4,7 +4,7 @@
     <meta charset="UTF-8" />
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>waza — eval dashboard</title>
-    <script type="module" crossorigin src="/assets/index-BeEA5A4U.js"></script>
+    <script type="module" crossorigin src="/assets/index-B-4TIZTS.js"></script>
     <link rel="stylesheet" crossorigin href="/assets/index-DUl_5EUI.css">
   </head>
   <body class="bg-zinc-900 text-zinc-100 antialiased">
diff --git a/web/src/api/client.ts b/web/src/api/client.ts
index ec6427a4f..48ad99398 100644
--- a/web/src/api/client.ts
+++ b/web/src/api/client.ts
@@ -59,6 +59,12 @@ export interface SessionDigest {
   errors: string[];
 }
 
+export interface ResponderInfo { // Responder summary attached to a TaskResult.
+  followupsSent: number; // Shown by ResponderBadge as the reply count.
+  outcome: string; // Free-form label; ResponderBadge special-cases "abstained", "error" and "cap_exhausted".
+  reason?: string; // Optional detail appended to the badge text when non-empty.
+}
+
 export interface TaskResult {
   name: string;
   outcome: string;
@@ -68,6 +74,7 @@ export interface TaskResult {
   graderResults: GraderResult[];
   transcript?: TranscriptEvent[];
   sessionDigest?: SessionDigest;
+  responder?: ResponderInfo;
   bootstrapCI?: BootstrapCI;
   isSignificant?: boolean;
 }
diff --git a/web/src/components/RunDetail.tsx b/web/src/components/RunDetail.tsx
index c6dcdcaec..00236cb4b 100644
--- a/web/src/components/RunDetail.tsx
+++ b/web/src/components/RunDetail.tsx
@@ -9,7 +9,7 @@ import {
   Download,
 } from "lucide-react";
 import { useRunDetail } from "../hooks/useApi";
-import type { TaskResult, GraderResult } from "../api/client";
+import type { TaskResult, GraderResult, ResponderInfo } from "../api/client";
 import {
   formatDuration,
   formatCost,
@@ -96,6 +96,30 @@ function SignificanceBadge({ isSignificant }: { isSignificant?: boolean }) {
   );
 }
 
+function ResponderBadge({ responder }: { responder?: ResponderInfo }) { // Pill showing the responder outcome, reply count and optional reason.
+  if (!responder) return null; // Render nothing when no responder summary is passed.
+
+  let className =
+    "inline-flex items-center rounded-full px-2 py-0.5 text-xs font-medium";
+  if (responder.outcome === "abstained" || responder.outcome === "error") { // "abstained" and "error" get the red style.
+    className += " bg-red-500/10 text-red-400";
+  } else if (responder.outcome === "cap_exhausted") { // "cap_exhausted" gets the yellow style.
+    className += " bg-yellow-500/10 text-yellow-400";
+  } else { // Every other outcome falls back to the neutral grey style.
+    className += " bg-zinc-700 text-zinc-300";
+  }
+
+  const replyLabel = responder.followupsSent === 1 ? "reply" : "replies"; // Singular only for exactly one reply.
+  const reason = responder.reason ? ` — ${responder.reason}` : ""; // A missing or empty reason adds no suffix.
+
+  return (
+    <span className={className} data-testid="responder-badge">
+      Responder: {responder.outcome} ({responder.followupsSent} {replyLabel})
+      {reason}
+    </span>
+  );
+}
+
 function CIRange({ lower, upper }: { lower: number; upper: number }) {
   return (
     <span
@@ -151,6 +175,7 @@ function TaskRow({ task }: { task: TaskResult }) {
               <ChevronRight className="h-4 w-4 text-zinc-500" />
             )}
             <span className="font-medium text-zinc-100">{task.name}</span>
+            <ResponderBadge responder={task.responder} />
           </span>
         </td>
         <td className="px-4 py-3">