diff --git a/crates/mcptest-config/schemas/v1.json b/crates/mcptest-config/schemas/v1.json index 80fe1be..71721de 100644 --- a/crates/mcptest-config/schemas/v1.json +++ b/crates/mcptest-config/schemas/v1.json @@ -575,9 +575,55 @@ } } } + }, + "input_responder": { + "$ref": "#/$defs/InputResponder", + "description": "Suite-level default elicitation answer source (SEP-2322). Applies to any tool test whose tool returns an InputRequiredResult, unless the test declares its own `input_responses` or `input_responder`." } }, "$defs": { + "InputResponder": { + "title": "Elicitation answer source", + "description": "A dynamic source the runner uses to answer a 2026-07-28 InputRequiredResult elicitation (SEP-2322). Only the `rest` provider exists today.", + "type": "object", + "additionalProperties": false, + "required": [ + "rest" + ], + "properties": { + "rest": { + "$ref": "#/$defs/RestInputResponder" + } + } + }, + "RestInputResponder": { + "title": "REST elicitation responder", + "description": "The runner POSTs each elicitation ({ tool, arguments, requestState, inputRequests }) to `url` and reads back { inputResponses: [{ id, value }] }.", + "type": "object", + "additionalProperties": false, + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "minLength": 1, + "description": "Endpoint the runner POSTs each elicitation to." + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Static headers sent on every POST (for example an Authorization token). Values are redacted in diagnostics." + }, + "timeout_ms": { + "type": "integer", + "minimum": 0, + "description": "Per-request timeout in milliseconds. Defaults to 30000 when omitted." + } + } + }, "CalibrationCheck": { "title": "Calibration check", "description": "One judge-calibration check. 
Reads a labels file (JSONL or a YAML array of {confidence, correct}) and exposes the computed metrics as assertion targets: `ece`, `brier`, and (when `reliability` and `observed_positive_rate` are both given) `corrected_rate` plus its Wald 95% interval `corrected_rate_low` / `corrected_rate_high`. The `expect:` reuses the standard assertion grammar (target plus matcher); omit it to apply the defaults `ece <= 0.10`, `brier <= 0.25`, and `corrected_rate <= observed_positive_rate`. The target names are runtime-resolved free strings, documented rather than schema-enforced.", @@ -1856,6 +1902,14 @@ "minLength": 1, "description": "Name of a fixture defined under `fixtures.errors[]` that the runner will inject in place of a real tool call. The schema does not enforce that the name resolves to a declared fixture; that cross-reference check happens in the loader (and will be wired up by the runner in a future release)." }, + "input_responses": { + "type": "object", + "description": "Static answers for a 2026-07-28 InputRequiredResult elicitation (SEP-2322): each inputRequest id maps to the answer value. When the tool returns an input-required result, the runner satisfies it from this map and retries. Mutually exclusive with `input_responder`.", + "additionalProperties": true + }, + "input_responder": { + "$ref": "#/$defs/InputResponder" + }, "cache": { "$ref": "#/$defs/CacheDirective" }, diff --git a/crates/mcptest-core/src/executor/dispatch.rs b/crates/mcptest-core/src/executor/dispatch.rs index 61921fb..238732c 100644 --- a/crates/mcptest-core/src/executor/dispatch.rs +++ b/crates/mcptest-core/src/executor/dispatch.rs @@ -85,12 +85,25 @@ pub(super) async fn apply_response_transform( /// same assertable envelope the offline `inject_error` path produces instead /// of failing the test hard. An unexpected error (no error assertion) still /// fails loudly with the server's message, preserving the safety net. 
/// What the executor needs to satisfy an `InputRequiredResult`
/// elicitation: the resolved answer source plus the test/server labels
/// for the REST payload and the trace (WOR-1383).
///
/// Every field is a borrow with lifetime `'a`, so a value of this type
/// cannot outlive the responder and the name strings it points at.
pub(super) struct ElicitSetup<'a> {
    /// The resolved answer source (static map or REST endpoint).
    pub responder: &'a crate::executor::elicitation::Responder,
    /// Test name, for the trace and the REST request body.
    pub test_name: &'a str,
    /// Server name, for the trace and the REST request body.
    pub server: &'a str,
}
}, Some(setup)) = (action, elicit) { + let base = retry_base.expect("retry_base is cloned whenever elicit is set"); + return resolve_input_required( + client, + json!({ "result": resolved }), + id, + base, + setup, + label, + ) + .await; + } Ok((json!({ "result": resolved }), id)) } // A live JSON-RPC error becomes an assertable `result.error` envelope @@ -204,6 +236,142 @@ async fn resolve_task_handle( )) } +/// Drive the `InputRequiredResult` retry loop for a tools/call (SEP-2322, +/// WOR-1383). +/// +/// On entry `envelope` is `{"result": }`. While it is an +/// input-required result, the runner resolves answers from `setup`, +/// retries with `build_retry_params` (off the transformed `base_params`), +/// polls any task handle each round, and loops until a final result or +/// the round cap. Returns the final `{"result": ...}` envelope and the +/// final request id (so response-header assertions inspect the final +/// response). Every round emits a redacted `mcptest_core::elicitation` +/// trace event: ids and counts, never `requestState` or answer values. 
+async fn resolve_input_required( + client: &crate::protocol::Client, + initial_envelope: Value, + initial_id: i64, + base_params: Value, + setup: &ElicitSetup<'_>, + label: &str, +) -> Result<(Value, i64), String> { + use crate::executor::elicitation::{ElicitCallContext, DEFAULT_MAX_ELICITATION_ROUNDS}; + use crate::protocol::elicitation::{build_retry_params, recognize_input_required}; + + let mut envelope = initial_envelope; + let mut params = base_params; + let mut last_id = initial_id; + for round in 0..DEFAULT_MAX_ELICITATION_ROUNDS { + let Some(irr) = recognize_input_required(&envelope) else { + if round > 0 { + tracing::info!( + target: "mcptest_core::elicitation", + event = "elicitation.completed", + test = setup.test_name, server = setup.server, tool = label, + rounds = round, request_id = last_id, + "elicitation resolved", + ); + } + return Ok((envelope, last_id)); + }; + let request_ids: Vec<&str> = irr.input_requests.iter().map(|r| r.id.as_str()).collect(); + let required_count = irr.input_requests.iter().filter(|r| r.required).count(); + tracing::info!( + target: "mcptest_core::elicitation", + event = "elicitation.round_started", + test = setup.test_name, server = setup.server, tool = label, + round = round + 1, responder_kind = setup.responder.kind(), + input_request_ids = ?request_ids, required_count, + optional_count = irr.input_requests.len() - required_count, + request_state_hash = %short_hash(irr.request_state.as_str()), + "elicitation round started", + ); + let ctx = ElicitCallContext { + test_name: setup.test_name.to_string(), + server: setup.server.to_string(), + tool: label.to_string(), + arguments: params + .get("arguments") + .cloned() + .unwrap_or_else(|| json!({})), + request_state: irr.request_state.clone(), + round: round + 1, + }; + let started = std::time::Instant::now(); + let answers = match setup.responder.answer(&irr.input_requests, &ctx).await { + Ok(answers) => answers, + Err(err) => { + tracing::warn!( + target: 
"mcptest_core::elicitation", + event = "elicitation.failed", + test = setup.test_name, server = setup.server, tool = label, + round = round + 1, error_kind = elicitation_error_kind(&err), + "elicitation could not be answered", + ); + return Err(format!("tools/call `{label}` elicitation: {err}")); + } + }; + tracing::debug!( + target: "mcptest_core::elicitation", + event = "elicitation.answer_resolved", + test = setup.test_name, server = setup.server, tool = label, + round = round + 1, answered = answers.len(), + duration_ms = started.elapsed().as_millis() as u64, + "answers resolved", + ); + let retry = build_retry_params(¶ms, &irr.request_state, &answers); + let (id, raw) = client + .request_with_id("tools/call".to_string(), retry.clone()) + .await + .map_err(|e| format!("tools/call retry for `{label}` failed: {e}"))?; + last_id = id; + let resolved = resolve_task_handle(client, raw, label).await?; + envelope = json!({ "result": resolved }); + params = retry; + tracing::debug!( + target: "mcptest_core::elicitation", + event = "elicitation.retry_dispatched", + test = setup.test_name, server = setup.server, tool = label, + round = round + 1, request_id = last_id, + "retry dispatched", + ); + } + if recognize_input_required(&envelope).is_some() { + tracing::warn!( + target: "mcptest_core::elicitation", + event = "elicitation.failed", + test = setup.test_name, server = setup.server, tool = label, + error_kind = "max_rounds", + "server kept eliciting past the round cap", + ); + return Err(format!( + "tools/call `{label}` elicitation: server kept eliciting after {DEFAULT_MAX_ELICITATION_ROUNDS} rounds; aborting" + )); + } + Ok((envelope, last_id)) +} + +/// Short, run-stable hash of an opaque `requestState`, so a trace can +/// correlate the rounds of one elicitation without logging the token. 
/// Derive a short correlation tag from an opaque `requestState`.
///
/// The token is fed through the standard library's `DefaultHasher` and the
/// low 32 bits of the digest are rendered as eight zero-padded lowercase
/// hex digits, so a trace can tie the rounds of one elicitation together
/// without ever logging the token itself.
fn short_hash(value: &str) -> String {
    use std::hash::{Hash, Hasher};
    let mut state = std::collections::hash_map::DefaultHasher::new();
    value.hash(&mut state);
    // Truncating to `u32` keeps exactly the low 32 bits of the digest.
    let low_bits = state.finish() as u32;
    format!("{:08x}", low_bits)
}
+ let (base_envelope, _id) = call_server(client, &base_action, None, ctx, false, None).await?; let base_result = inner_result(base_envelope); let mut pairs = Vec::with_capacity(spec.relations.len()); for relation in &spec.relations { @@ -298,7 +468,7 @@ pub(super) async fn run_metamorphic( tool: tool.to_string(), args: followup_args, }; - let (followup_envelope, _id) = call_server(client, &action, None, ctx, false).await?; + let (followup_envelope, _id) = call_server(client, &action, None, ctx, false, None).await?; let followup_result = inner_result(followup_envelope); pairs.push((relation.clone(), base_result.clone(), followup_result)); } diff --git a/crates/mcptest-core/src/executor/elicitation.rs b/crates/mcptest-core/src/executor/elicitation.rs index c03a0cc..315134f 100644 --- a/crates/mcptest-core/src/executor/elicitation.rs +++ b/crates/mcptest-core/src/executor/elicitation.rs @@ -16,11 +16,16 @@ //! 4. Stops after a max-rounds cap so a misbehaving server cannot //! loop forever. +use std::collections::BTreeMap; +use std::time::Duration; + use serde_json::Value; +use crate::network::ProxyConfig; use crate::protocol::elicitation::{ build_retry_params, recognize_input_required, InputRequest, InputResponse, RequestState, }; +use crate::suite::RestInputResponderSpec; /// Default cap on the number of elicitation rounds the runner will /// follow before giving up. Five matches the SEP-2322 example flow @@ -87,6 +92,21 @@ pub enum ElicitationError { /// can point at "non-interactive mode without fixture" rather /// than at a specific missing id. NoFixture, + /// An answer's value did not match the `kind` the server's + /// [`InputRequest`] declared (`string` / `number` / `boolean`). + /// Catches an author typo (a number where a string is required) + /// and a misbehaving REST responder alike. + WrongKind { + /// The `InputRequest::id` whose answer was wrong-typed. + request_id: String, + /// The `kind` the request declared. 
+ expected: String, + }, + /// The REST responder could not be reached or returned an + /// unusable reply (non-2xx, invalid JSON, missing/duplicate/unknown + /// ids). Carries a redacted human-readable reason; never the + /// response body or any header value. + Rest(String), } impl std::fmt::Display for ElicitationError { @@ -103,8 +123,49 @@ impl std::fmt::Display for ElicitationError { f, "server returned InputRequiredResult but no inputResponses fixture was supplied" ), + Self::WrongKind { + request_id, + expected, + } => write!( + f, + "answer for inputRequest `{request_id}` is not a {expected}" + ), + Self::Rest(reason) => write!(f, "REST elicitation responder: {reason}"), + } + } +} + +/// Check that one answer's JSON type matches the `kind` its +/// [`InputRequest`] declared. Applied to both static and REST answers so +/// a wrong-typed answer fails the same way regardless of source. Unknown +/// `kind` values (future-reserved) are not constrained. +fn answer_matches_kind(kind: &str, value: &Value) -> bool { + match kind { + "string" => value.is_string(), + "number" => value.is_number(), + "boolean" => value.is_boolean(), + _ => true, + } +} + +/// Validate every answer against its request's `kind`. The answers are +/// matched to requests by id; an answer with no matching request is +/// ignored here (the REST path rejects unknown ids separately). +pub fn validate_answer_kinds( + requests: &[InputRequest], + answers: &[InputResponse], +) -> Result<(), ElicitationError> { + for answer in answers { + if let Some(request) = requests.iter().find(|r| r.id == answer.id) { + if !answer_matches_kind(&request.kind, &answer.value) { + return Err(ElicitationError::WrongKind { + request_id: request.id.clone(), + expected: request.kind.clone(), + }); + } } } + Ok(()) } /// Decide whether the runner has every answer it needs. 
@@ -191,6 +252,191 @@ where Ok(response) } +/// Live answer source for an `InputRequiredResult` elicitation, built +/// from the parsed suite spec at plan-build time (WOR-1383). `Static` +/// answers from a pre-supplied map; `Rest` POSTs each elicitation to an +/// HTTP endpoint. +#[derive(Debug, Clone)] +pub enum Responder { + /// Deterministic answers from the suite's `input_responses` map. + Static(InputResponseFixture), + /// Dynamic answers from a REST endpoint. + Rest(RestResponder), +} + +/// REST elicitation responder: POSTs each elicitation and reads back the +/// `inputResponses`. The `reqwest::Client` is built once at plan time. +#[derive(Debug, Clone)] +pub struct RestResponder { + url: String, + headers: BTreeMap, + client: reqwest::Client, +} + +/// Context for one elicitation round: the call being retried plus the +/// round index. A responder may use it to answer; the runner also stamps +/// it (redacted) into the trace. +#[derive(Debug, Clone)] +pub struct ElicitCallContext { + /// Test name (for the trace and the REST payload). + pub test_name: String, + /// Server name the test targets. + pub server: String, + /// Tool being called. + pub tool: String, + /// The call's `arguments` object. + pub arguments: Value, + /// The opaque `requestState` echoed on the retry. + pub request_state: RequestState, + /// 1-based elicitation round. + pub round: u8, +} + +impl Responder { + /// Resolve answers for one elicitation round. + pub async fn answer( + &self, + requests: &[InputRequest], + ctx: &ElicitCallContext, + ) -> Result, ElicitationError> { + match self { + Responder::Static(fixture) => { + let answers = collect_responses(requests, fixture)?; + validate_answer_kinds(requests, &answers)?; + Ok(answers) + } + Responder::Rest(rest) => rest.answer(requests, ctx).await, + } + } + + /// `static` or `rest`, for the trace `responder_kind` field. 
+ pub fn kind(&self) -> &'static str { + match self { + Responder::Static(_) => "static", + Responder::Rest(_) => "rest", + } + } + + /// True when answers come from live IO (REST). Such a test is + /// treated like one with a `transform` at the cache gate: its result + /// is not cached, since a dynamic answer could differ next run. + pub fn is_dynamic(&self) -> bool { + matches!(self, Responder::Rest(_)) + } +} + +impl RestResponder { + /// Build the live responder from its spec and the run-wide proxy. + /// Fails fast at plan-build time on a bad url / header / proxy rather + /// than deferring the error to the first elicitation. + pub fn from_spec(spec: &RestInputResponderSpec, proxy: &ProxyConfig) -> Result { + let timeout = Duration::from_millis(spec.timeout_ms.unwrap_or(30_000)); + let builder = proxy + .apply(reqwest::Client::builder().timeout(timeout)) + .map_err(|e| format!("input_responder proxy: {e}"))?; + let client = builder + .build() + .map_err(|e| format!("input_responder client: {e}"))?; + Ok(Self { + url: spec.url.clone(), + headers: spec.headers.clone(), + client, + }) + } + + async fn answer( + &self, + requests: &[InputRequest], + ctx: &ElicitCallContext, + ) -> Result, ElicitationError> { + let body = serde_json::json!({ + "test": ctx.test_name, + "server": ctx.server, + "tool": ctx.tool, + "arguments": ctx.arguments, + "round": ctx.round, + "requestState": ctx.request_state.as_str(), + "inputRequests": requests, + }); + let mut request = self.client.post(&self.url).json(&body); + for (name, value) in &self.headers { + request = request.header(name, value); + } + let response = request.send().await.map_err(|e| { + ElicitationError::Rest(format!("POST failed: {}", redact_send_error(&e))) + })?; + let status = response.status(); + if !status.is_success() { + return Err(ElicitationError::Rest(format!( + "endpoint returned HTTP {}", + status.as_u16() + ))); + } + let parsed: Value = response + .json() + .await + .map_err(|_| 
ElicitationError::Rest("response body was not valid JSON".to_string()))?; + let raw = parsed + .get("inputResponses") + .and_then(Value::as_array) + .ok_or_else(|| { + ElicitationError::Rest("response is missing an `inputResponses` array".to_string()) + })?; + let answers: Vec = serde_json::from_value(Value::Array(raw.clone())) + .map_err(|_| { + ElicitationError::Rest( + "`inputResponses` entries must be `{ id, value }` objects".to_string(), + ) + })?; + validate_rest_answers(requests, &answers)?; + Ok(answers) + } +} + +/// Validate a REST responder's answers: no duplicate ids, no unknown +/// ids, every required id answered, and every answer well-typed. +fn validate_rest_answers( + requests: &[InputRequest], + answers: &[InputResponse], +) -> Result<(), ElicitationError> { + let mut seen = std::collections::HashSet::new(); + for answer in answers { + if !seen.insert(answer.id.as_str()) { + return Err(ElicitationError::Rest(format!( + "duplicate answer for id `{}`", + answer.id + ))); + } + if !requests.iter().any(|r| r.id == answer.id) { + return Err(ElicitationError::Rest(format!( + "answer for unknown id `{}`", + answer.id + ))); + } + } + for request in requests { + if request.required && !answers.iter().any(|a| a.id == request.id) { + return Err(ElicitationError::Rest(format!( + "no answer for required id `{}`", + request.id + ))); + } + } + validate_answer_kinds(requests, answers) +} + +/// Categorize a reqwest send error without leaking the URL, headers, or +/// any token into diagnostics. 
+fn redact_send_error(error: &reqwest::Error) -> &'static str { + if error.is_timeout() { + "request timed out" + } else if error.is_connect() { + "could not connect to the endpoint" + } else { + "transport error" + } +} + #[cfg(test)] mod tests { use super::*; @@ -313,4 +559,130 @@ mod tests { .expect("plain response passes through"); assert_eq!(out, plain); } + + fn elicit_ctx() -> ElicitCallContext { + ElicitCallContext { + test_name: "t".into(), + server: "s".into(), + tool: "book_flight".into(), + arguments: json!({}), + request_state: RequestState("tok".into()), + round: 1, + } + } + + #[test] + fn validate_answer_kinds_rejects_a_wrong_typed_answer() { + let requests = vec![req("dest", "string", true)]; + let answers = vec![InputResponse { + id: "dest".into(), + value: json!(42), + }]; + let err = validate_answer_kinds(&requests, &answers).unwrap_err(); + assert!( + matches!(err, ElicitationError::WrongKind { ref request_id, ref expected } if request_id == "dest" && expected == "string") + ); + } + + #[tokio::test] + async fn static_responder_rejects_a_wrong_typed_static_answer() { + let requests = vec![req("dest", "string", true)]; + let responder = Responder::Static(InputResponseFixture::from_map([( + "dest".to_string(), + json!(42), + )])); + let err = responder + .answer(&requests, &elicit_ctx()) + .await + .unwrap_err(); + assert!(matches!(err, ElicitationError::WrongKind { .. 
})); + assert_eq!(responder.kind(), "static"); + assert!(!responder.is_dynamic()); + } + + #[tokio::test] + async fn rest_responder_resolves_answers_from_the_endpoint() { + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/answer")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "inputResponses": [{ "id": "dest", "value": "Denver" }] + }))) + .mount(&server) + .await; + let spec = RestInputResponderSpec { + url: format!("{}/answer", server.uri()), + headers: Default::default(), + timeout_ms: Some(3000), + }; + let responder = + Responder::Rest(RestResponder::from_spec(&spec, &ProxyConfig::default()).unwrap()); + assert_eq!(responder.kind(), "rest"); + assert!(responder.is_dynamic()); + let answers = responder + .answer(&[req("dest", "string", true)], &elicit_ctx()) + .await + .expect("responder answers"); + assert_eq!(answers.len(), 1); + assert_eq!(answers[0].id, "dest"); + assert_eq!(answers[0].value, json!("Denver")); + } + + #[tokio::test] + async fn rest_responder_fails_on_an_http_error() { + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/answer")) + .respond_with(ResponseTemplate::new(500)) + .mount(&server) + .await; + let spec = RestInputResponderSpec { + url: format!("{}/answer", server.uri()), + headers: Default::default(), + timeout_ms: Some(3000), + }; + let responder = + Responder::Rest(RestResponder::from_spec(&spec, &ProxyConfig::default()).unwrap()); + let err = responder + .answer(&[req("dest", "string", true)], &elicit_ctx()) + .await + .unwrap_err(); + assert!(matches!(err, ElicitationError::Rest(_)), "{err:?}"); + } + + #[tokio::test] + async fn rest_responder_rejects_an_unknown_answer_id() { + use wiremock::matchers::{method, path}; + use wiremock::{Mock, 
MockServer, ResponseTemplate}; + + let server = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/answer")) + .respond_with(ResponseTemplate::new(200).set_body_json(json!({ + "inputResponses": [{ "id": "other", "value": "x" }] + }))) + .mount(&server) + .await; + let spec = RestInputResponderSpec { + url: format!("{}/answer", server.uri()), + headers: Default::default(), + timeout_ms: Some(3000), + }; + let responder = + Responder::Rest(RestResponder::from_spec(&spec, &ProxyConfig::default()).unwrap()); + let err = responder + .answer(&[req("dest", "string", true)], &elicit_ctx()) + .await + .unwrap_err(); + match err { + ElicitationError::Rest(reason) => assert!(reason.contains("unknown id"), "{reason}"), + other => panic!("expected a Rest error, got {other:?}"), + } + } } diff --git a/crates/mcptest-core/src/executor/mod.rs b/crates/mcptest-core/src/executor/mod.rs index 1bde580..a5b02f9 100644 --- a/crates/mcptest-core/src/executor/mod.rs +++ b/crates/mcptest-core/src/executor/mod.rs @@ -39,7 +39,7 @@ use crate::transform::TransformContext; use dispatch::{ agent_transform_context, apply_response_transform, assertions_expect_error, call_server, - run_metamorphic, synthesize_error_envelope, transform_context, + run_metamorphic, synthesize_error_envelope, transform_context, ElicitSetup, }; /// Concrete MCP executor consumed by `Runner::run_all`. @@ -157,6 +157,12 @@ pub struct ToolPlan { /// the executor runs the taxonomy-keyed bad-request probes and gates on /// clean rejection. pub negative_path: Option, + /// Optional answer source for a 2026-07-28 `InputRequiredResult` + /// elicitation (WOR-1383). Resolved at plan-build time from the test's + /// `input_responses` / `input_responder` or the suite-level default; the + /// executor uses it to satisfy an elicitation and retry. `None` when the + /// test declares no source (an eliciting tool then fails fast). 
+ pub responder: Option, } /// Agent-test plan: a serialized [`crate::matchers`]-addressable trace @@ -363,12 +369,20 @@ impl McpExecutor { // routed into an assertable envelope instead of failing the call // (WOR-1357); otherwise an unexpected error still fails hard. let expects_error = assertions_expect_error(&tool.expectations); + // Thread the resolved elicitation answer source so a tool that returns + // an InputRequiredResult is satisfied and retried (WOR-1383). + let elicit = tool.responder.as_ref().map(|responder| ElicitSetup { + responder, + test_name, + server: &tool.server, + }); let (raw, request_id) = call_server( &server.client, &tool.action, tool.transform.as_ref(), &ctx, expects_error, + elicit.as_ref(), ) .await .map_err(TestOutcome::fail)?; @@ -1302,6 +1316,7 @@ mod tests { metamorphic: None, fuzz: None, negative_path: None, + responder: None, } } diff --git a/crates/mcptest-core/src/suite/elicitation_parse.rs b/crates/mcptest-core/src/suite/elicitation_parse.rs new file mode 100644 index 0000000..ee3328c --- /dev/null +++ b/crates/mcptest-core/src/suite/elicitation_parse.rs @@ -0,0 +1,211 @@ +//! Parse the elicitation answer-source config (WOR-1383). +//! +//! A 2026-07-28 server can answer a `tools/call` with an +//! `InputRequiredResult` (SEP-2322); the runner satisfies it from a +//! configured answer source and retries. Two shapes are parsed here: +//! +//! - a test-level `input_responses:` static map (`id` to value) or +//! `input_responder:` provider (mutually exclusive), and +//! - a suite-level `input_responder:` default. +//! +//! Pulled out of [`super::test_parse`] so that module stays under the +//! AGENTS module-size ceiling. The runtime responder (which holds a +//! `reqwest::Client`) is built from these plain specs at plan-build time, +//! not here. 
+ +use std::collections::BTreeMap; + +use serde_yaml::Value as YamlValue; + +use super::test_parse::yaml_to_json; +use super::types::{InputResponderSpec, InputResponseSpec, RestInputResponderSpec}; +use super::ParseError; + +/// Parse a tool test's `input_responses` (static map) and +/// `input_responder` (provider). The two are mutually exclusive: a test +/// declares at most one answer source. +pub(super) fn parse_test_input_config( + entry: &YamlValue, + path: &str, +) -> Result<(Option, Option), ParseError> { + let responses = entry + .get("input_responses") + .map(|node| parse_input_responses(node, path)) + .transpose()?; + let responder = entry + .get("input_responder") + .map(|node| parse_input_responder(node, path)) + .transpose()?; + if responses.is_some() && responder.is_some() { + return Err(ParseError::BadShape { + path: path.to_string(), + field: "input_responder", + expected: "absent when `input_responses` is set (declare one elicitation answer source, not both)", + }); + } + Ok((responses, responder)) +} + +/// Parse the optional suite-level `input_responder:` default. +pub(super) fn parse_suite_input_responder( + root: &YamlValue, +) -> Result, ParseError> { + root.get("input_responder") + .map(|node| parse_input_responder(node, "/input_responder")) + .transpose() +} + +/// Parse `input_responses:` into a `{ id: value }` map. Values are +/// coerced through the shared YAML-to-JSON path so they line up with the +/// wire `inputResponses[].value`. 
+fn parse_input_responses(node: &YamlValue, path: &str) -> Result { + let map = node.as_mapping().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responses", + expected: "a mapping of inputRequest id to answer value", + })?; + let mut values = BTreeMap::new(); + for (key, value) in map { + let id = key.as_str().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responses", + expected: "string keys (inputRequest ids)", + })?; + values.insert(id.to_string(), yaml_to_json(value)?); + } + Ok(InputResponseSpec { values }) +} + +/// Parse `input_responder: { rest: { ... } }`. Only the `rest` provider +/// exists today; an unknown provider key is a load-time error. +fn parse_input_responder(node: &YamlValue, path: &str) -> Result { + let map = node.as_mapping().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder", + expected: "a mapping with a `rest` provider", + })?; + let rest = map + .get(YamlValue::from("rest")) + .ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder", + expected: "a `rest` provider block", + })?; + Ok(InputResponderSpec::Rest(parse_rest(rest, path)?)) +} + +fn parse_rest(node: &YamlValue, path: &str) -> Result { + let url = node + .get("url") + .and_then(|u| u.as_str()) + .ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder.rest.url", + expected: "string (the endpoint the runner POSTs elicitations to)", + })? + .to_string(); + let headers = node + .get("headers") + .map(|h| parse_headers(h, path)) + .transpose()? 
+ .unwrap_or_default(); + let timeout_ms = match node.get("timeout_ms") { + Some(t) => Some(t.as_u64().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder.rest.timeout_ms", + expected: "a non-negative integer (milliseconds)", + })?), + None => None, + }; + Ok(RestInputResponderSpec { + url, + headers, + timeout_ms, + }) +} + +fn parse_headers(node: &YamlValue, path: &str) -> Result, ParseError> { + let map = node.as_mapping().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder.rest.headers", + expected: "a mapping of header name to value", + })?; + let mut headers = BTreeMap::new(); + for (key, value) in map { + let name = key.as_str().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder.rest.headers", + expected: "string header names", + })?; + let value = value.as_str().ok_or(ParseError::BadShape { + path: path.to_string(), + field: "input_responder.rest.headers", + expected: "string header values", + })?; + headers.insert(name.to_string(), value.to_string()); + } + Ok(headers) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_yaml::from_str; + + #[test] + fn parses_static_input_responses_into_a_value_map() { + let entry: YamlValue = + from_str("input_responses:\n destination: Denver\n seats: 2\n").unwrap(); + let (responses, responder) = parse_test_input_config(&entry, "/tools/0").unwrap(); + assert!(responder.is_none()); + let spec = responses.expect("input_responses parsed"); + assert_eq!(spec.values["destination"], serde_json::json!("Denver")); + assert_eq!(spec.values["seats"], serde_json::json!(2)); + } + + #[test] + fn parses_rest_responder_with_headers_and_timeout() { + let entry: YamlValue = from_str( + "input_responder:\n rest:\n url: https://x/answer\n timeout_ms: 5000\n headers:\n Authorization: Bearer t\n", + ) + .unwrap(); + let (responses, responder) = parse_test_input_config(&entry, "/tools/0").unwrap(); + assert!(responses.is_none()); + match 
responder.expect("responder parsed") { + InputResponderSpec::Rest(rest) => { + assert_eq!(rest.url, "https://x/answer"); + assert_eq!(rest.timeout_ms, Some(5000)); + assert_eq!(rest.headers["Authorization"], "Bearer t"); + } + } + } + + #[test] + fn rejects_a_test_that_sets_both_sources() { + let entry: YamlValue = + from_str("input_responses:\n a: b\ninput_responder:\n rest:\n url: https://x\n") + .unwrap(); + let err = parse_test_input_config(&entry, "/tools/0").unwrap_err(); + assert!(matches!( + err, + ParseError::BadShape { + field: "input_responder", + .. + } + )); + } + + #[test] + fn rest_provider_requires_a_url() { + let entry: YamlValue = from_str("input_responder:\n rest:\n headers: {}\n").unwrap(); + assert!(parse_test_input_config(&entry, "/tools/0").is_err()); + } + + #[test] + fn suite_level_responder_is_optional() { + let none: YamlValue = from_str("servers: {}\n").unwrap(); + assert!(parse_suite_input_responder(&none).unwrap().is_none()); + let some: YamlValue = from_str("input_responder:\n rest:\n url: https://x\n").unwrap(); + assert!(parse_suite_input_responder(&some).unwrap().is_some()); + } +} diff --git a/crates/mcptest-core/src/suite/fixtures_parse.rs b/crates/mcptest-core/src/suite/fixtures_parse.rs index c39395d..a2ba31c 100644 --- a/crates/mcptest-core/src/suite/fixtures_parse.rs +++ b/crates/mcptest-core/src/suite/fixtures_parse.rs @@ -116,6 +116,8 @@ mod tests { metamorphic: None, fuzz: None, negative_path: None, + input_responses: None, + input_responder: None, } } diff --git a/crates/mcptest-core/src/suite/mod.rs b/crates/mcptest-core/src/suite/mod.rs index 641bc00..119f9ea 100644 --- a/crates/mcptest-core/src/suite/mod.rs +++ b/crates/mcptest-core/src/suite/mod.rs @@ -33,6 +33,7 @@ mod compositions_parse; mod context_rot_parse; mod derived_parse; pub mod duration; +mod elicitation_parse; mod eval_parse; pub mod faults; mod faults_parse; @@ -305,6 +306,7 @@ pub fn parse(value: &YamlValue) -> Result { let compositions = 
compositions_parse::parse(value)?; let faults = faults_parse::parse(value)?; let security = security_parse::parse(value)?; + let input_responder = elicitation_parse::parse_suite_input_responder(value)?; // Fold the suite-level `defaultTest` baseline into every test // and agent test after parsing so the merge rule lives in one place. if let Some(default) = &default_test { @@ -345,6 +347,7 @@ pub fn parse(value: &YamlValue) -> Result { compositions, faults, security, + input_responder, }) } diff --git a/crates/mcptest-core/src/suite/test_parse.rs b/crates/mcptest-core/src/suite/test_parse.rs index aa78242..20dedd5 100644 --- a/crates/mcptest-core/src/suite/test_parse.rs +++ b/crates/mcptest-core/src/suite/test_parse.rs @@ -71,6 +71,8 @@ pub(super) fn parse_tool_test(entry: &YamlValue, path: &str) -> Result Result Result Result, + /// Optional suite-level default elicitation answer source. Applies to + /// any tool test whose tool returns an `InputRequiredResult`, unless the + /// test overrides it with its own `input_responses` / `input_responder` + /// (WOR-1383). `None` when the suite declares none. + pub input_responder: Option, } /// Parsed view of the top-level `security:` block. @@ -436,6 +441,46 @@ pub struct TypedTest { /// set, the executor runs the taxonomy-keyed bad-request probes and gates on /// clean rejection (WOR-1239). pub negative_path: Option, + /// Optional static answers for a 2026-07-28 `InputRequiredResult` + /// elicitation (SEP-2322). When the tool answers a `tools/call` with an + /// input-required result, the runner satisfies it from this map and + /// retries (WOR-1383). Mutually exclusive with [`Self::input_responder`]. + pub input_responses: Option, + /// Optional per-test elicitation answer source that overrides the + /// suite-level [`ResolvedSuite::input_responder`]. Mutually exclusive with + /// [`Self::input_responses`] (WOR-1383). 
+    pub input_responder: Option<InputResponderSpec>,
+}
+
+/// Static answers for an `InputRequiredResult` elicitation: each
+/// [`crate::protocol::elicitation::InputRequest`] id resolves to a value
+/// (WOR-1383).
+#[derive(Debug, Clone, PartialEq)]
+pub struct InputResponseSpec {
+    /// `inputRequest.id` to the answer value.
+    pub values: BTreeMap<String, serde_json::Value>,
+}
+
+/// Parsed (not yet live) configuration for a dynamic elicitation answer
+/// source. The runtime responder that holds a `reqwest::Client` is built
+/// from this at plan-build time (WOR-1383).
+#[derive(Debug, Clone, PartialEq)]
+pub enum InputResponderSpec {
+    /// Answer elicitations by POSTing them to a REST endpoint.
+    Rest(RestInputResponderSpec),
+}
+
+/// REST elicitation responder config: the runner POSTs the `inputRequests`
+/// to `url` and reads back `inputResponses` (WOR-1383).
+#[derive(Debug, Clone, PartialEq)]
+pub struct RestInputResponderSpec {
+    /// Endpoint the runner POSTs each elicitation to.
+    pub url: String,
+    /// Static headers sent on every POST (for example an `Authorization`
+    /// token). Values are redacted in diagnostics.
+    pub headers: BTreeMap<String, String>,
+    /// Per-request timeout in milliseconds. Defaults to 30s when unset.
+    pub timeout_ms: Option<u64>,
 }

 /// What a test invokes against the server.
diff --git a/crates/mcptest-core/tests/inject_error_offline.rs b/crates/mcptest-core/tests/inject_error_offline.rs index 0bf4f6d..c65f26d 100644 --- a/crates/mcptest-core/tests/inject_error_offline.rs +++ b/crates/mcptest-core/tests/inject_error_offline.rs @@ -53,6 +53,7 @@ async fn injected_error_runs_offline_against_an_empty_pool() { metamorphic: None, fuzz: None, negative_path: None, + responder: None, }); let mut plans: HashMap> = HashMap::new(); @@ -99,6 +100,7 @@ async fn injected_error_envelope_carries_the_named_message_for_text_assertions() metamorphic: None, fuzz: None, negative_path: None, + responder: None, }); let mut plans: HashMap> = HashMap::new(); @@ -148,6 +150,7 @@ async fn injected_error_envelope_failing_assertion_returns_a_failure_outcome() { metamorphic: None, fuzz: None, negative_path: None, + responder: None, }); let mut plans: HashMap> = HashMap::new(); diff --git a/crates/mcptest/schemas/v1.json b/crates/mcptest/schemas/v1.json index 80fe1be..71721de 100644 --- a/crates/mcptest/schemas/v1.json +++ b/crates/mcptest/schemas/v1.json @@ -575,9 +575,55 @@ } } } + }, + "input_responder": { + "$ref": "#/$defs/InputResponder", + "description": "Suite-level default elicitation answer source (SEP-2322). Applies to any tool test whose tool returns an InputRequiredResult, unless the test declares its own `input_responses` or `input_responder`." } }, "$defs": { + "InputResponder": { + "title": "Elicitation answer source", + "description": "A dynamic source the runner uses to answer a 2026-07-28 InputRequiredResult elicitation (SEP-2322). 
Only the `rest` provider exists today.", + "type": "object", + "additionalProperties": false, + "required": [ + "rest" + ], + "properties": { + "rest": { + "$ref": "#/$defs/RestInputResponder" + } + } + }, + "RestInputResponder": { + "title": "REST elicitation responder", + "description": "The runner POSTs each elicitation ({ tool, arguments, requestState, inputRequests }) to `url` and reads back { inputResponses: [{ id, value }] }.", + "type": "object", + "additionalProperties": false, + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "minLength": 1, + "description": "Endpoint the runner POSTs each elicitation to." + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Static headers sent on every POST (for example an Authorization token). Values are redacted in diagnostics." + }, + "timeout_ms": { + "type": "integer", + "minimum": 0, + "description": "Per-request timeout in milliseconds. Defaults to 30000 when omitted." + } + } + }, "CalibrationCheck": { "title": "Calibration check", "description": "One judge-calibration check. Reads a labels file (JSONL or a YAML array of {confidence, correct}) and exposes the computed metrics as assertion targets: `ece`, `brier`, and (when `reliability` and `observed_positive_rate` are both given) `corrected_rate` plus its Wald 95% interval `corrected_rate_low` / `corrected_rate_high`. The `expect:` reuses the standard assertion grammar (target plus matcher); omit it to apply the defaults `ece <= 0.10`, `brier <= 0.25`, and `corrected_rate <= observed_positive_rate`. The target names are runtime-resolved free strings, documented rather than schema-enforced.", @@ -1856,6 +1902,14 @@ "minLength": 1, "description": "Name of a fixture defined under `fixtures.errors[]` that the runner will inject in place of a real tool call. 
The schema does not enforce that the name resolves to a declared fixture; that cross-reference check happens in the loader (and will be wired up by the runner in a future release)." }, + "input_responses": { + "type": "object", + "description": "Static answers for a 2026-07-28 InputRequiredResult elicitation (SEP-2322): each inputRequest id maps to the answer value. When the tool returns an input-required result, the runner satisfies it from this map and retries. Mutually exclusive with `input_responder`.", + "additionalProperties": true + }, + "input_responder": { + "$ref": "#/$defs/InputResponder" + }, "cache": { "$ref": "#/$defs/CacheDirective" }, diff --git a/crates/mcptest/src/cli/handlers/run_live_plans.rs b/crates/mcptest/src/cli/handlers/run_live_plans.rs index a56fe22..32a676d 100644 --- a/crates/mcptest/src/cli/handlers/run_live_plans.rs +++ b/crates/mcptest/src/cli/handlers/run_live_plans.rs @@ -88,7 +88,13 @@ pub(super) async fn build_all_plans(ctx: BuildPlansCtx<'_>) -> Result = Vec::new(); let mut plans: HashMap> = HashMap::new(); - build_tool_plans(ctx.suite, ctx.selected_set, &mut specs, &mut plans); + build_tool_plans( + ctx.suite, + ctx.selected_set, + ctx.proxy, + &mut specs, + &mut plans, + )?; let mut last_real_provider = None; let mut last_real_model = None; @@ -162,12 +168,13 @@ pub(super) async fn build_all_plans(ctx: BuildPlansCtx<'_>) -> Result, + proxy: &mcptest_core::network::ProxyConfig, specs: &mut Vec, plans: &mut std::collections::HashMap< String, std::sync::Arc, >, -) { +) -> Result<()> { use mcptest_core::executor::{PlannedTest, Scoring, TargetTransport, ToolPlan}; use mcptest_core::runner::TestSpec; use std::sync::Arc; @@ -221,7 +228,44 @@ fn build_tool_plans( fuzz: t.fuzz.clone(), // Thread the test's negative-path block through. negative_path: t.negative_path.clone(), + // Resolve the elicitation answer source (test override beats the + // suite default). 
REST construction can fail on a bad url / + // proxy, which fails plan building rather than the first round. + responder: resolve_responder(t, suite, proxy)?, })), ); } + Ok(()) +} + +/// Resolve a test's effective elicitation answer source at plan-build +/// time (WOR-1383). Precedence: the test's `input_responses` (static), +/// then its `input_responder`, then the suite-level `input_responder`. +/// A REST responder builds a proxy-aware client here so a bad endpoint +/// fails the run setup, not the first elicitation. +fn resolve_responder( + test: &mcptest_core::suite::TypedTest, + suite: &mcptest_core::suite::ResolvedSuite, + proxy: &mcptest_core::network::ProxyConfig, +) -> Result> { + use mcptest_core::executor::elicitation::{InputResponseFixture, Responder, RestResponder}; + use mcptest_core::suite::InputResponderSpec; + + if let Some(spec) = &test.input_responses { + return Ok(Some(Responder::Static(InputResponseFixture::from_map( + spec.values.clone(), + )))); + } + let spec = test + .input_responder + .as_ref() + .or(suite.input_responder.as_ref()); + match spec { + Some(InputResponderSpec::Rest(rest)) => { + let responder = RestResponder::from_spec(rest, proxy) + .map_err(|e| anyhow::anyhow!("test `{}` input_responder: {e}", test.name))?; + Ok(Some(Responder::Rest(responder))) + } + None => Ok(None), + } } diff --git a/crates/mcptest/src/commands/compliance_suite.rs b/crates/mcptest/src/commands/compliance_suite.rs index 3608289..e6b30ef 100644 --- a/crates/mcptest/src/commands/compliance_suite.rs +++ b/crates/mcptest/src/commands/compliance_suite.rs @@ -197,6 +197,8 @@ fn plan_suite_checks( fuzz: None, // The compliance probe runs no negative-path probes. negative_path: None, + // Compliance probes call raw protocol methods, not eliciting tools. 
+ responder: None, })), ); } @@ -424,6 +426,8 @@ mod tests { metamorphic: None, fuzz: None, negative_path: None, + input_responses: None, + input_responder: None, } } diff --git a/docs-site/llms-full.sha256 b/docs-site/llms-full.sha256 index 4a33165..2621e85 100644 --- a/docs-site/llms-full.sha256 +++ b/docs-site/llms-full.sha256 @@ -1 +1 @@ -d548e01bb72a340818804cd92e8cd199159eac92440dc1f8aef5e9c8587da7e9 +feb626f4b48e0bd7bfc21723f7a4a30e4d890d6f36cd9bfaff71ff149461161a diff --git a/docs/elicitation.md b/docs/elicitation.md index 9d1fdb3..b690735 100644 --- a/docs/elicitation.md +++ b/docs/elicitation.md @@ -92,41 +92,95 @@ All wire types use camelCase serde renames so they match the spec on the wire. `InputRequest.required` defaults to `true` when the server omits the field (matches the SEP-2322 default). +## Configuring the answer source + +The runner has no interactive user, so a tool test that elicits must declare +where its answers come from. Two sources are available; a test picks at most +one, and the suite can set a default. + +**Static answers** (`input_responses`, deterministic and offline): + +```yaml +tools: + - name: "book a flight" + server: api + tool: book_flight + args: {} + input_responses: + destination: "Denver" + expect: + - target: result.content[0].text + matcher: { cel: 'value == "booked a flight to Denver"' } +``` + +**REST endpoint** (`input_responder`, dynamic). The runner POSTs each +elicitation and reads the answers back, so an external service (LLM-backed, +business logic, or a human-in-the-loop UI) can answer: + +```yaml +# suite-level default for every eliciting test +input_responder: + rest: + url: "https://elicit.example/answer" + timeout_ms: 30000 # optional, default 30000 + headers: # optional; values are redacted in diagnostics + Authorization: "Bearer ${ELICIT_TOKEN}" +``` + +The runner POSTs `{ test, server, tool, arguments, round, requestState, +inputRequests }` and expects `{ "inputResponses": [ { "id", "value" }, ... ] }`. 
+It rejects a non-2xx status, invalid JSON, a missing/duplicate/unknown id, a +missing required id, and an answer whose value does not match the request's +`kind`. A bad url, header, or proxy fails the run at setup, not the first round. + +**Precedence:** a test's `input_responses`, then its `input_responder`, then the +suite-level `input_responder`, then fail-fast (a test that elicits with no source +fails with a clear message rather than hanging). `input_responses` and +`input_responder` are mutually exclusive on one test. A test that answers from a +REST endpoint is treated like one with a `transform`: its result is not cached. + +**Observability.** Every round emits a structured `tracing` event on target +`mcptest_core::elicitation` (`round_started`, `answer_resolved`, +`retry_dispatched`, `completed`, `failed`) carrying the test/server/tool, round, +responder kind, the input-request ids and required/optional counts, the request +id, and a stable short hash of the `requestState`. The runner never logs the raw +`requestState`, the answer values, or the REST headers. + ## What's shipped * The typed wire model and the recognizer (`mcptest_core::protocol::elicitation`). * `build_retry_params` for constructing the retry envelope from a validated answer list. -* Runner-side retry loop (`mcptest_core::executor::elicitation`): - `run_elicitation_chain(initial_response, initial_params, - fixture, max_rounds, next_response)` walks the elicitation chain - until the server returns a non-elicitation result or the cap - fires. `collect_responses(requests, fixture)` enforces the - required-vs-optional rule and skips optional requests that have - no fixture entry. -* `InputResponseFixture` for non-interactive answer sources, plus - `DEFAULT_MAX_ELICITATION_ROUNDS = 5` so a misbehaving server - cannot loop forever. 
+* Live runner integration: a `tools/call` that returns an + `InputRequiredResult` is resolved from the test's configured answer + source and retried until a final result, transparently, so a plain + tool test asserts the resolved result. Task handles are polled each + round, the final request id surfaces for response-header assertions, + and the round cap (`DEFAULT_MAX_ELICITATION_ROUNDS = 5`) stops a + misbehaving server. +* Two answer sources: a static `input_responses` map and a `rest` + `input_responder` (see [Configuring the answer source](#configuring-the-answer-source)), + resolved at plan-build time with a clear precedence and fail-fast. +* The pure helpers (`mcptest_core::executor::elicitation`): + `collect_responses` (required-vs-optional rule), + `build_retry_envelope`, `run_elicitation_chain`, and + `validate_answer_kinds` (uniform `string`/`number`/`boolean` typing + for both sources). * Tests pin every recognition branch, the camelCase wire keys, the - `required` default, the params-non-object defensive path, plus - the executor's pure retry loop (one-round happy path, max-rounds - cap, pass-through). + `required` default, the pure retry loop, the static and REST + responders (including REST error handling), and an offline + end-to-end run against the mock's input-required tool. ## Planned follow-up -* YAML surface: declare an `inputResponses:` fixture on a tool - test so a suite can satisfy elicitation deterministically. The - library accepts the fixture today; the YAML field + loader - wiring lands once the wire shape is reviewed. -* Cassette extension: record the elicitation round-trip so a - replay reproduces it byte-stable, even when the request state is - opaque. The current cassette format captures one - request/response per exchange; multi-turn capture extends the - schema with an explicit elicitation-turn array. 
-* Interactive prompt: read answers from stdin in TTY mode, - validate against the requested `kind`, fall through to the - fixture in non-interactive mode. +* Cassette extension: record the elicitation round-trip so a replay + reproduces it byte-stable. Owned by the cassette-format work, not + this surface. +* `tasks/update`: answering a task that pauses in `input_required` + (today such a task fails with a clear message). +* Interactive prompt: read answers from stdin in TTY mode as a third + answer source, validating against the requested `kind`. ## Cross-references diff --git a/docs/llms-full.txt b/docs/llms-full.txt index f427f44..334d9e1 100644 --- a/docs/llms-full.txt +++ b/docs/llms-full.txt @@ -3058,6 +3058,37 @@ report is clean). Omit `expect:` to apply the default gate, which fails on any crash, hang, protocol violation, or leak. To fuzz every tool a server exposes without writing a suite, use `mcptest fuzz` (see the CLI reference). +### `input_responses` / `input_responder` (optional, on tool tests) + +When a tool answers a `tools/call` with a 2026-07-28 `InputRequiredResult` +(SEP-2322), the runner satisfies the elicitation from the test's configured +answer source and retries until a final result. Declare one source: + +- `input_responses` (object): static answers, an `inputRequest` id to a value. + Deterministic and offline. +- `input_responder` (object): a dynamic source. Today the only provider is + `rest: { url, headers?, timeout_ms? }`: the runner POSTs each elicitation and + reads the answers back. + +```yaml +tools: + - name: booking resolves its elicited destination + server: api + tool: book_flight + args: {} + input_responses: + destination: "Denver" +``` + +A suite-level `input_responder:` sets a default for every eliciting test. +Precedence: a test's `input_responses`, then its `input_responder`, then the +suite-level `input_responder`, then a fast failure (a test that elicits with no +source fails rather than hanging). 
`input_responses` and `input_responder` are +mutually exclusive on one test. See +[elicitation.md](./elicitation.md#configuring-the-answer-source) for the REST +request/response contract, the precedence and redaction rules, and the trace +events. + ## `resources` block Type: array of objects, optional, default `[]`. diff --git a/docs/reference/config.md b/docs/reference/config.md index 8c916b4..75891ed 100644 --- a/docs/reference/config.md +++ b/docs/reference/config.md @@ -329,6 +329,8 @@ Per-entry fields: | `expect` | array or object | no | `[]` | Assertions. See [`expect`](#expect-block). | | `timeout_ms` | integer >= 1 | no | `-` | Per-test timeout override. | | `inject_error` | string | no | `-` | Fixture name from `fixtures.errors[]`. See [`fixtures`](#fixtures). | +| `input_responses` | object | no | `-` | Static answers for an `InputRequiredResult` elicitation (`id` to value). Mutually exclusive with `input_responder`. See [elicitation.md](../elicitation.md#configuring-the-answer-source). | +| `input_responder` | object | no | `-` | Dynamic elicitation answer source (`rest: { url, headers?, timeout_ms? }`). Overrides the suite-level default. See [elicitation.md](../elicitation.md#configuring-the-answer-source). | | `cache` | enum | no | `auto` | One of `auto`, `always`, `never`. See [Cache directive](#cache-directive). | | `effects` | array | no | `[]` | One or more of `external`, `local`, `filesystem`. | | `server_version` | string | no | `-` | Version pin required for HTTP-transport cache eligibility. | diff --git a/docs/yaml-reference.md b/docs/yaml-reference.md index dcd1333..6848108 100644 --- a/docs/yaml-reference.md +++ b/docs/yaml-reference.md @@ -925,6 +925,37 @@ report is clean). Omit `expect:` to apply the default gate, which fails on any crash, hang, protocol violation, or leak. To fuzz every tool a server exposes without writing a suite, use `mcptest fuzz` (see the CLI reference). 
+### `input_responses` / `input_responder` (optional, on tool tests) + +When a tool answers a `tools/call` with a 2026-07-28 `InputRequiredResult` +(SEP-2322), the runner satisfies the elicitation from the test's configured +answer source and retries until a final result. Declare one source: + +- `input_responses` (object): static answers, an `inputRequest` id to a value. + Deterministic and offline. +- `input_responder` (object): a dynamic source. Today the only provider is + `rest: { url, headers?, timeout_ms? }`: the runner POSTs each elicitation and + reads the answers back. + +```yaml +tools: + - name: booking resolves its elicited destination + server: api + tool: book_flight + args: {} + input_responses: + destination: "Denver" +``` + +A suite-level `input_responder:` sets a default for every eliciting test. +Precedence: a test's `input_responses`, then its `input_responder`, then the +suite-level `input_responder`, then a fast failure (a test that elicits with no +source fails rather than hanging). `input_responses` and `input_responder` are +mutually exclusive on one test. See +[elicitation.md](./elicitation.md#configuring-the-answer-source) for the REST +request/response contract, the precedence and redaction rules, and the trace +events. + ## `resources` block Type: array of objects, optional, default `[]`. diff --git a/examples/async-tasks.yml b/examples/async-tasks.yml index 366defa..ab31c51 100644 --- a/examples/async-tasks.yml +++ b/examples/async-tasks.yml @@ -42,13 +42,17 @@ tools: matcher: cel: 'value == "report ready"' - # An input-required tool's first call elicits input: the result carries the - # inputRequests list and the opaque requestState token the client must echo. - - name: "an input-required tool first elicits input" + # An input-required tool answers its first call with an InputRequiredResult. 
+ # The runner satisfies it from the static `input_responses` map and retries + # with the elicited value, so the test sees the resolved result, not the + # intermediate elicitation. + - name: "an input-required tool is resolved from static answers" server: elicit_server tool: book_flight args: {} + input_responses: + destination: "Denver" expect: - - target: "result" + - target: "result.content[0].text" matcher: - cel: 'value.inputRequests.size() >= 1 && value.requestState == "trip-token-1"' + cel: 'value == "booked a flight to Denver"' diff --git a/schemas/v1.json b/schemas/v1.json index 80fe1be..71721de 100644 --- a/schemas/v1.json +++ b/schemas/v1.json @@ -575,9 +575,55 @@ } } } + }, + "input_responder": { + "$ref": "#/$defs/InputResponder", + "description": "Suite-level default elicitation answer source (SEP-2322). Applies to any tool test whose tool returns an InputRequiredResult, unless the test declares its own `input_responses` or `input_responder`." } }, "$defs": { + "InputResponder": { + "title": "Elicitation answer source", + "description": "A dynamic source the runner uses to answer a 2026-07-28 InputRequiredResult elicitation (SEP-2322). Only the `rest` provider exists today.", + "type": "object", + "additionalProperties": false, + "required": [ + "rest" + ], + "properties": { + "rest": { + "$ref": "#/$defs/RestInputResponder" + } + } + }, + "RestInputResponder": { + "title": "REST elicitation responder", + "description": "The runner POSTs each elicitation ({ tool, arguments, requestState, inputRequests }) to `url` and reads back { inputResponses: [{ id, value }] }.", + "type": "object", + "additionalProperties": false, + "required": [ + "url" + ], + "properties": { + "url": { + "type": "string", + "minLength": 1, + "description": "Endpoint the runner POSTs each elicitation to." 
+ }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Static headers sent on every POST (for example an Authorization token). Values are redacted in diagnostics." + }, + "timeout_ms": { + "type": "integer", + "minimum": 0, + "description": "Per-request timeout in milliseconds. Defaults to 30000 when omitted." + } + } + }, "CalibrationCheck": { "title": "Calibration check", "description": "One judge-calibration check. Reads a labels file (JSONL or a YAML array of {confidence, correct}) and exposes the computed metrics as assertion targets: `ece`, `brier`, and (when `reliability` and `observed_positive_rate` are both given) `corrected_rate` plus its Wald 95% interval `corrected_rate_low` / `corrected_rate_high`. The `expect:` reuses the standard assertion grammar (target plus matcher); omit it to apply the defaults `ece <= 0.10`, `brier <= 0.25`, and `corrected_rate <= observed_positive_rate`. The target names are runtime-resolved free strings, documented rather than schema-enforced.", @@ -1856,6 +1902,14 @@ "minLength": 1, "description": "Name of a fixture defined under `fixtures.errors[]` that the runner will inject in place of a real tool call. The schema does not enforce that the name resolves to a declared fixture; that cross-reference check happens in the loader (and will be wired up by the runner in a future release)." }, + "input_responses": { + "type": "object", + "description": "Static answers for a 2026-07-28 InputRequiredResult elicitation (SEP-2322): each inputRequest id maps to the answer value. When the tool returns an input-required result, the runner satisfies it from this map and retries. Mutually exclusive with `input_responder`.", + "additionalProperties": true + }, + "input_responder": { + "$ref": "#/$defs/InputResponder" + }, "cache": { "$ref": "#/$defs/CacheDirective" },