Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions crates/mcptest-config/schemas/v1.json
Original file line number Diff line number Diff line change
Expand Up @@ -575,9 +575,55 @@
}
}
}
},
"input_responder": {
"$ref": "#/$defs/InputResponder",
"description": "Suite-level default elicitation answer source (SEP-2322). Applies to any tool test whose tool returns an InputRequiredResult, unless the test declares its own `input_responses` or `input_responder`."
}
},
"$defs": {
"InputResponder": {
"title": "Elicitation answer source",
"description": "A dynamic source the runner uses to answer a 2026-07-28 InputRequiredResult elicitation (SEP-2322). Only the `rest` provider exists today.",
"type": "object",
"additionalProperties": false,
"required": [
"rest"
],
"properties": {
"rest": {
"$ref": "#/$defs/RestInputResponder"
}
}
},
"RestInputResponder": {
"title": "REST elicitation responder",
"description": "The runner POSTs each elicitation ({ tool, arguments, requestState, inputRequests }) to `url` and reads back { inputResponses: [{ id, value }] }.",
"type": "object",
"additionalProperties": false,
"required": [
"url"
],
"properties": {
"url": {
"type": "string",
"minLength": 1,
"description": "Endpoint the runner POSTs each elicitation to."
},
"headers": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Static headers sent on every POST (for example an Authorization token). Values are redacted in diagnostics."
},
"timeout_ms": {
"type": "integer",
"minimum": 0,
"description": "Per-request timeout in milliseconds. Defaults to 30000 when omitted."
}
}
},
"CalibrationCheck": {
"title": "Calibration check",
"description": "One judge-calibration check. Reads a labels file (JSONL or a YAML array of {confidence, correct}) and exposes the computed metrics as assertion targets: `ece`, `brier`, and (when `reliability` and `observed_positive_rate` are both given) `corrected_rate` plus its Wald 95% interval `corrected_rate_low` / `corrected_rate_high`. The `expect:` reuses the standard assertion grammar (target plus matcher); omit it to apply the defaults `ece <= 0.10`, `brier <= 0.25`, and `corrected_rate <= observed_positive_rate`. The target names are runtime-resolved free strings, documented rather than schema-enforced.",
Expand Down Expand Up @@ -1856,6 +1902,14 @@
"minLength": 1,
"description": "Name of a fixture defined under `fixtures.errors[]` that the runner will inject in place of a real tool call. The schema does not enforce that the name resolves to a declared fixture; that cross-reference check happens in the loader (and will be wired up by the runner in a future release)."
},
"input_responses": {
"type": "object",
"description": "Static answers for a 2026-07-28 InputRequiredResult elicitation (SEP-2322): each inputRequest id maps to the answer value. When the tool returns an input-required result, the runner satisfies it from this map and retries. Mutually exclusive with `input_responder`.",
"additionalProperties": true
},
"input_responder": {
"$ref": "#/$defs/InputResponder"
},
"cache": {
"$ref": "#/$defs/CacheDirective"
},
Expand Down
174 changes: 172 additions & 2 deletions crates/mcptest-core/src/executor/dispatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,25 @@ pub(super) async fn apply_response_transform(
/// same assertable envelope the offline `inject_error` path produces instead
/// of failing the test hard. An unexpected error (no error assertion) still
/// fails loudly with the server's message, preserving the safety net.
// NOTE(review): the `///` lines immediately above this comment read like the
// tail of `call_server`'s documentation; positioned here they attach to this
// struct rather than to the function. Consider moving the struct above that
// doc comment — confirm against the full file.
/// What the executor needs to satisfy an `InputRequiredResult`
/// elicitation: the resolved answer source plus the test/server labels
/// for the REST payload and the trace (WOR-1383).
///
/// All fields are borrowed for `'a`; the setup owns nothing. It is passed
/// as `Option<&ElicitSetup>` to `call_server`, where `None` disables
/// elicitation handling for that call.
pub(super) struct ElicitSetup<'a> {
/// The resolved answer source (static map or REST endpoint). The retry
/// loop calls `answer` on it once per round and reports its `kind()` in
/// the trace.
pub responder: &'a crate::executor::elicitation::Responder,
/// Test name, for the trace and the REST request body.
pub test_name: &'a str,
/// Server name, for the trace and the REST request body.
pub server: &'a str,
}

pub(super) async fn call_server(
client: &crate::protocol::Client,
action: &Action,
transform: Option<&TransformSpec>,
ctx: &TransformContext,
expects_error: bool,
elicit: Option<&ElicitSetup<'_>>,
) -> Result<(Value, i64), String> {
let (method, mut params, label) = match action {
Action::ToolCall { tool, args } => (
Expand Down Expand Up @@ -121,6 +134,9 @@ pub(super) async fn call_server(
target = %label,
"dispatching action"
);
// Keep the transformed params as the base for elicitation retries (only
// when a responder is configured, to avoid a clone on the common path).
let retry_base = elicit.map(|_| params.clone());
match client.request_with_id(method.to_string(), params).await {
Ok((id, raw)) => {
tracing::trace!(
Expand All @@ -139,6 +155,22 @@ pub(super) async fn call_server(
} else {
raw
};
// A tools/call can also answer with an InputRequiredResult; when a
// responder is configured, satisfy it and retry until a final
// result, returning the final request id for header assertions
// (WOR-1383). `retry_base` holds the already-transformed params
// (`params` itself was moved into the request above).
if let (Action::ToolCall { .. }, Some(setup)) = (action, elicit) {
let base = retry_base.expect("retry_base is cloned whenever elicit is set");
return resolve_input_required(
client,
json!({ "result": resolved }),
id,
base,
setup,
label,
)
.await;
}
Ok((json!({ "result": resolved }), id))
}
// A live JSON-RPC error becomes an assertable `result.error` envelope
Expand Down Expand Up @@ -204,6 +236,142 @@ async fn resolve_task_handle(
))
}

/// Drive the `InputRequiredResult` retry loop for a tools/call (SEP-2322,
/// WOR-1383).
///
/// On entry `initial_envelope` is `{"result": <first result>}` and
/// `initial_id` is the id of the request that produced it. While the
/// current envelope is recognized as an input-required result, the runner:
///
/// 1. asks `setup.responder` for answers to the pending input requests,
/// 2. builds the retry params with `build_retry_params` (starting from the
///    transformed `base_params`, then from the previous retry's params),
/// 3. re-sends `tools/call` and resolves any task handle in the reply,
///
/// and repeats, for at most `DEFAULT_MAX_ELICITATION_ROUNDS` retries.
///
/// Returns the final `{"result": ...}` envelope together with the id of the
/// last request sent, so response-header assertions inspect the final
/// response. If the initial envelope is not input-required it is returned
/// untouched with `initial_id`.
///
/// Every round emits redacted `mcptest_core::elicitation` trace events:
/// ids, counts and a short hash of `requestState`, never the token itself
/// or any answer value. `elicitation.completed` is emitted whenever at
/// least one round ran and a final result was reached, including when that
/// happens on the last permitted round.
///
/// # Errors
///
/// Returns a message string when the responder cannot answer, when a retry
/// request fails, when resolving a task handle fails, or when the server is
/// still eliciting after the round cap.
async fn resolve_input_required(
    client: &crate::protocol::Client,
    initial_envelope: Value,
    initial_id: i64,
    base_params: Value,
    setup: &ElicitSetup<'_>,
    label: &str,
) -> Result<(Value, i64), String> {
    use crate::executor::elicitation::{ElicitCallContext, DEFAULT_MAX_ELICITATION_ROUNDS};
    use crate::protocol::elicitation::{build_retry_params, recognize_input_required};

    let mut envelope = initial_envelope;
    let mut params = base_params;
    let mut last_id = initial_id;
    // Retries still allowed before the round cap is reached.
    let mut retries_left = DEFAULT_MAX_ELICITATION_ROUNDS;
    loop {
        let rounds_done = DEFAULT_MAX_ELICITATION_ROUNDS - retries_left;
        let Some(irr) = recognize_input_required(&envelope) else {
            // Single success exit: the completion event is emitted for every
            // resolved elicitation, even when the final result arrives on
            // the last permitted round. A call that never elicited
            // (`rounds_done == 0`) stays silent.
            if rounds_done > 0 {
                tracing::info!(
                    target: "mcptest_core::elicitation",
                    event = "elicitation.completed",
                    test = setup.test_name, server = setup.server, tool = label,
                    rounds = rounds_done, request_id = last_id,
                    "elicitation resolved",
                );
            }
            return Ok((envelope, last_id));
        };
        if retries_left == 0 {
            tracing::warn!(
                target: "mcptest_core::elicitation",
                event = "elicitation.failed",
                test = setup.test_name, server = setup.server, tool = label,
                error_kind = "max_rounds",
                "server kept eliciting past the round cap",
            );
            return Err(format!(
                "tools/call `{label}` elicitation: server kept eliciting after {DEFAULT_MAX_ELICITATION_ROUNDS} rounds; aborting"
            ));
        }
        retries_left -= 1;
        // 1-based round number used in traces and in the responder context.
        let round = rounds_done + 1;

        let request_ids: Vec<&str> = irr.input_requests.iter().map(|r| r.id.as_str()).collect();
        let required_count = irr.input_requests.iter().filter(|r| r.required).count();
        tracing::info!(
            target: "mcptest_core::elicitation",
            event = "elicitation.round_started",
            test = setup.test_name, server = setup.server, tool = label,
            round = round, responder_kind = setup.responder.kind(),
            input_request_ids = ?request_ids, required_count,
            optional_count = irr.input_requests.len() - required_count,
            request_state_hash = %short_hash(irr.request_state.as_str()),
            "elicitation round started",
        );
        let ctx = ElicitCallContext {
            test_name: setup.test_name.to_string(),
            server: setup.server.to_string(),
            tool: label.to_string(),
            // The responder sees the tool arguments of the call being
            // retried; an absent `arguments` key is presented as `{}`.
            arguments: params
                .get("arguments")
                .cloned()
                .unwrap_or_else(|| json!({})),
            request_state: irr.request_state.clone(),
            round,
        };
        let started = std::time::Instant::now();
        let answers = match setup.responder.answer(&irr.input_requests, &ctx).await {
            Ok(answers) => answers,
            Err(err) => {
                // Only the one-word kind goes to the trace; the full message
                // is returned to the caller.
                tracing::warn!(
                    target: "mcptest_core::elicitation",
                    event = "elicitation.failed",
                    test = setup.test_name, server = setup.server, tool = label,
                    round = round, error_kind = elicitation_error_kind(&err),
                    "elicitation could not be answered",
                );
                return Err(format!("tools/call `{label}` elicitation: {err}"));
            }
        };
        tracing::debug!(
            target: "mcptest_core::elicitation",
            event = "elicitation.answer_resolved",
            test = setup.test_name, server = setup.server, tool = label,
            round = round, answered = answers.len(),
            // u128 -> u64: a millisecond count cannot realistically exceed
            // u64, so the narrowing cast is lossless in practice.
            duration_ms = started.elapsed().as_millis() as u64,
            "answers resolved",
        );
        let retry = build_retry_params(&params, &irr.request_state, &answers);
        let (id, raw) = client
            .request_with_id("tools/call".to_string(), retry.clone())
            .await
            .map_err(|e| format!("tools/call retry for `{label}` failed: {e}"))?;
        last_id = id;
        let resolved = resolve_task_handle(client, raw, label).await?;
        envelope = json!({ "result": resolved });
        // The params just sent become the base for the next round.
        params = retry;
        tracing::debug!(
            target: "mcptest_core::elicitation",
            event = "elicitation.retry_dispatched",
            test = setup.test_name, server = setup.server, tool = label,
            round = round, request_id = last_id,
            "retry dispatched",
        );
    }
}

/// Eight-hex-digit fingerprint of an opaque `requestState`.
///
/// Lets a trace correlate the rounds of one elicitation without logging the
/// token itself. The value is the low 32 bits of the standard library's
/// `DefaultHasher` output for `value`, zero-padded to eight lowercase hex
/// digits. It is a correlation aid only, not a cryptographic digest.
fn short_hash(value: &str) -> String {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut state = DefaultHasher::new();
    value.hash(&mut state);
    // Keep only the low 32 bits so the label stays short in log lines.
    let low_bits = state.finish() & 0xffff_ffff;
    format!("{low_bits:08x}")
}

/// Map an elicitation failure to the fixed `error_kind` label written to the
/// failure trace.
///
/// Only the variant is inspected; variant payloads (and therefore the
/// message body) never reach the log. The match is exhaustive, so adding a
/// variant to `ElicitationError` forces a label to be chosen here.
fn elicitation_error_kind(error: &crate::executor::elicitation::ElicitationError) -> &'static str {
    use crate::executor::elicitation::ElicitationError as Failure;

    match error {
        Failure::Rest(_) => "rest",
        Failure::WrongKind { .. } => "wrong_kind",
        Failure::NoFixture => "no_fixture",
        Failure::MaxRoundsExceeded { .. } => "max_rounds",
        Failure::MissingResponse { .. } => "missing_response",
    }
}

/// What a single `tasks/get` poll tells the runner to do next.
enum PollOutcome {
/// Non-terminal: poll again.
Expand Down Expand Up @@ -289,7 +457,9 @@ pub(super) async fn run_metamorphic(
tool: tool.to_string(),
args: base_args.clone(),
};
let (base_envelope, _id) = call_server(client, &base_action, None, ctx, false).await?;
// Metamorphic follow-up calls do not participate in elicitation (v1):
// the primary call already resolved any InputRequiredResult (WOR-1383).
let (base_envelope, _id) = call_server(client, &base_action, None, ctx, false, None).await?;
let base_result = inner_result(base_envelope);
let mut pairs = Vec::with_capacity(spec.relations.len());
for relation in &spec.relations {
Expand All @@ -298,7 +468,7 @@ pub(super) async fn run_metamorphic(
tool: tool.to_string(),
args: followup_args,
};
let (followup_envelope, _id) = call_server(client, &action, None, ctx, false).await?;
let (followup_envelope, _id) = call_server(client, &action, None, ctx, false, None).await?;
let followup_result = inner_result(followup_envelope);
pairs.push((relation.clone(), base_result.clone(), followup_result));
}
Expand Down
Loading
Loading