From 575f2277d8699c3ad38522d5268ad07735c999a2 Mon Sep 17 00:00:00 2001 From: Rick Crawford Date: Wed, 17 Jun 2026 22:05:33 -0700 Subject: [PATCH] policy: `mcptest policy simulate` OSS governance gate (WOR-1421) A deterministic, offline policy simulator: it reads a small declarative policy file plus saved mcptest artifacts (run report, judge certification, conformance report, security report, model-compat diff, evidence artifact), extracts named facts from each, evaluates the policy rules against them, applies expiring waivers, and prints a pass/fail/warn verdict with a deterministic exit code. No network, no live run, so a team can gate a release locally before adopting heavier enterprise policy infrastructure. - mcptest-core policy.rs (pure): PolicyFile/PolicyRule (one comparator each: max/min/equals/one_of; severity fail|warn) + PolicyWaiver (rule/owner/reason/ expiry/issue), seven facts_from_* extractors over serde_json::Value, and evaluate() producing a PolicyOutcome. A failing rule with an active waiver is Waived; an expired waiver fails closed; a missing fact is Unevaluated and fails closed (a missing input never silently passes). - mcptest policy simulate (cli/args/policy.rs + handlers/policy.rs): reads the policy YAML and whichever --artifact files are supplied, evaluates, renders pretty/json. Dry-run always exits 0; --gate exits 1 on fail. - examples/policy/policy.yml worked example (skipped by the examples gate, not a run suite), docs/policy-simulator.md with the full fact catalog, cli-reference + command-groups + help template, llms regen. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- crates/mcptest-core/src/lib.rs | 4 + crates/mcptest-core/src/policy.rs | 827 ++++++++++++++++++++ crates/mcptest/src/cli/args/mod.rs | 2 + crates/mcptest/src/cli/args/policy.rs | 62 ++ crates/mcptest/src/cli/handlers/mod.rs | 2 + crates/mcptest/src/cli/handlers/policy.rs | 133 ++++ crates/mcptest/src/cli/mod.rs | 4 + crates/mcptest/tests/cli_policy_simulate.rs | 179 +++++ docs-site/llms-full.sha256 | 2 +- docs/SUMMARY.md | 1 + docs/cli-reference.md | 50 +- docs/llms-full.txt | 50 +- docs/policy-simulator.md | 193 +++++ examples/policy/policy.yml | 30 + scripts/check-examples.sh | 2 + 15 files changed, 1538 insertions(+), 3 deletions(-) create mode 100644 crates/mcptest-core/src/policy.rs create mode 100644 crates/mcptest/src/cli/args/policy.rs create mode 100644 crates/mcptest/src/cli/handlers/policy.rs create mode 100644 crates/mcptest/tests/cli_policy_simulate.rs create mode 100644 docs/policy-simulator.md create mode 100644 examples/policy/policy.yml diff --git a/crates/mcptest-core/src/lib.rs b/crates/mcptest-core/src/lib.rs index 7bb50d14..36c2bcaf 100644 --- a/crates/mcptest-core/src/lib.rs +++ b/crates/mcptest-core/src/lib.rs @@ -85,6 +85,9 @@ //! - [`plugins`]: subprocess plugin protocol. Spawns a polyglot //! plugin binary, exchanges newline-delimited JSON over stdin/stdout, //! and caches the process for the run lifetime. +//! - [`policy`]: offline governance policy simulator. Turns saved artifacts +//! into a flat fact map and evaluates declarative rules plus expiring +//! waivers into a pass / warn / fail verdict, with no network access. //! - [`subprocess`]: one-shot JSON subprocess helper shared by //! the transform step and the context-aware hooks. Spawns a command, //! writes one JSON value to stdin, and parses one JSON value from stdout. 
@@ -125,6 +128,7 @@ pub mod migration; pub mod model_compat; pub mod network; pub mod plugins; +pub mod policy; pub mod profiles; pub mod propose; pub mod protocol; diff --git a/crates/mcptest-core/src/policy.rs b/crates/mcptest-core/src/policy.rs new file mode 100644 index 00000000..5c6a95fb --- /dev/null +++ b/crates/mcptest-core/src/policy.rs @@ -0,0 +1,827 @@ +//! OSS governance policy simulator. +//! +//! This module turns saved mcptest artifacts (a run report, a judge +//! certification, a conformance score, a security scan, and so on) into a flat +//! map of named "facts", then evaluates a small declarative policy of rules and +//! expiring waivers against those facts. It exists so a team can gate a release +//! on the artifacts it already produces without standing up an external policy +//! service. It is deliberately tiny: comparators are `max`/`min`/`equals`/ +//! `one_of` rather than a general expression DSL, because a governance gate +//! that nobody can read is a governance gate nobody trusts. + +use std::collections::BTreeMap; + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +/// A fact value extracted from an artifact: a number, a boolean, or text. +/// +/// Facts are the single currency the evaluator understands. Keeping the type +/// closed (three variants, no nesting) is what lets the comparators stay +/// trivial and the report stay readable. +#[derive(Debug, Clone, PartialEq, Serialize)] +#[serde(untagged)] +pub enum FactValue { + /// A numeric fact, for example a count of failed tests. + Num(f64), + /// A boolean fact, for example whether a judge is certified. + Bool(bool), + /// A textual fact, for example a conformance badge such as `T1`. + Text(String), +} + +impl FactValue { + /// The fact as a number, when it is numeric. Used by `max`/`min` rules. + pub fn as_num(&self) -> Option { + match self { + FactValue::Num(n) => Some(*n), + _ => None, + } + } + + /// The fact as a boolean, when it is one. 
Used by `equals: true|false`. + pub fn as_bool(&self) -> Option { + match self { + FactValue::Bool(b) => Some(*b), + _ => None, + } + } + + /// The fact as text, when it is textual. Used by `one_of` and string + /// `equals`. + pub fn as_text(&self) -> Option<&str> { + match self { + FactValue::Text(s) => Some(s.as_str()), + _ => None, + } + } + + /// Render the fact for a report line. Numbers print without a trailing + /// `.0` when integral so `run.failed = 2` reads naturally. + fn render(&self) -> String { + match self { + FactValue::Num(n) => { + if n.fract() == 0.0 { + format!("{}", *n as i64) + } else { + format!("{n}") + } + } + FactValue::Bool(b) => b.to_string(), + FactValue::Text(s) => s.clone(), + } + } +} + +/// The flat fact map an evaluation runs against. Keyed by dotted fact name +/// (for example `run.failed`) so rules cite a stable identifier. +pub type Facts = BTreeMap; + +/// Severity of a policy rule: a failed `fail` rule fails the gate; a failed +/// `warn` rule only warns. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum RuleSeverity { + /// A failing rule fails the overall verdict (the default). + #[default] + Fail, + /// A failing rule only warns and never fails the gate. + Warn, +} + +/// One declarative policy rule: name a fact and one comparator. Exactly one of +/// `max`/`min`/`equals`/`one_of` must be set. +#[derive(Debug, Clone, Deserialize)] +pub struct PolicyRule { + /// Stable rule identifier, cited in the report and matched by waivers. + pub id: String, + /// Optional human description shown when a rule is authored verbosely. + #[serde(default)] + pub description: Option, + /// The fact name this rule constrains, for example `run.failed`. + pub fact: String, + /// Upper bound: the fact (a number) must be `<= max`. + #[serde(default)] + pub max: Option, + /// Lower bound: the fact (a number) must be `>= min`. 
+ #[serde(default)] + pub min: Option, + /// Exact match against a JSON literal (bool, number, or string). + #[serde(default)] + pub equals: Option, + /// Membership: the fact (rendered to text) must be one of these. + #[serde(default)] + pub one_of: Option>, + /// Whether a failure fails the gate or only warns. + #[serde(default)] + pub severity: RuleSeverity, +} + +/// A waiver suppressing one rule's failure until it expires. An expired waiver +/// does not suppress (fail closed) and is itself reported. +#[derive(Debug, Clone, Deserialize)] +pub struct PolicyWaiver { + /// The rule id this waiver suppresses. + pub rule: String, + /// Who owns the waiver, so reviewers know whom to ask. + pub owner: String, + /// Why the failure is tolerated, captured for the audit trail. + pub reason: String, + /// RFC 3339 UTC expiry (for example `2026-12-31T00:00:00Z`). + pub expiry: String, + /// Optional tracking reference (for example a GitHub issue id). + #[serde(default)] + pub issue: Option, +} + +/// The parsed policy file. +#[derive(Debug, Clone, Deserialize)] +pub struct PolicyFile { + /// Schema version of the policy document, reserved for future migrations. + pub version: String, + /// The rules evaluated against the facts. + pub rules: Vec, + /// Waivers that may suppress specific rule failures until they expire. + #[serde(default)] + pub waivers: Vec, +} + +/// Per-rule status after evaluation. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum RuleStatus { + /// The rule's comparator was satisfied. + Pass, + /// The rule failed and no waiver applied (fail severity). + Fail, + /// The rule failed and no waiver applied (warn severity). + Warn, + /// Failed a `fail` rule but an active waiver suppressed it. + Waived, + /// Failed and the waiver that would suppress it has expired (fail closed). + ExpiredWaiver, + /// The fact the rule references was not available (artifact not provided or + /// malformed). 
Treated as a failure so a missing input never silently passes. + Unevaluated, +} + +/// One rule's evaluated outcome, the cited evidence row. +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct RuleOutcome { + /// The rule id this row reports on. + pub id: String, + /// The fact the rule referenced. + pub fact: String, + /// The resolved status for this rule. + pub status: RuleStatus, + /// The observed fact value rendered for the report, when available. + #[serde(skip_serializing_if = "Option::is_none")] + pub observed: Option, + /// One-line human explanation. + pub detail: String, +} + +/// Overall verdict. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)] +#[serde(rename_all = "lowercase")] +pub enum Verdict { + /// Every rule passed (or was waived). + Pass, + /// No fail-severity rule failed, but at least one warn rule did. + Warn, + /// At least one fail-severity rule failed, expired its waiver, or could + /// not be evaluated. + Fail, +} + +/// The full simulation result. +#[derive(Debug, Clone, PartialEq, Serialize)] +pub struct PolicyOutcome { + /// The overall verdict the gate keys off. + pub verdict: Verdict, + /// Per-rule outcomes, in policy order, as cited evidence. + pub rules: Vec, +} + +impl PolicyOutcome { + /// True when the overall verdict is `Fail` (any fail-severity rule failed, + /// hit an expired waiver, or could not be evaluated). + pub fn failed(&self) -> bool { + self.verdict == Verdict::Fail + } +} + +/// Evaluate a policy against a fact map at a given wall-clock instant. +/// +/// `now_epoch` is Unix epoch seconds, supplied by the caller so the evaluation +/// stays pure and tests can pin time. Each rule is resolved independently, then +/// the overall verdict is the worst per-rule status: any fail / expired-waiver +/// / unevaluated rule makes the verdict `Fail`, otherwise any warn makes it +/// `Warn`, otherwise `Pass`. Waived and passing rules never fail the gate. 
+pub fn evaluate(policy: &PolicyFile, facts: &Facts, now_epoch: i64) -> PolicyOutcome { + let mut rules = Vec::with_capacity(policy.rules.len()); + for rule in &policy.rules { + rules.push(evaluate_rule(rule, facts, &policy.waivers, now_epoch)); + } + + let verdict = if rules.iter().any(|r| { + matches!( + r.status, + RuleStatus::Fail | RuleStatus::ExpiredWaiver | RuleStatus::Unevaluated + ) + }) { + Verdict::Fail + } else if rules.iter().any(|r| r.status == RuleStatus::Warn) { + Verdict::Warn + } else { + Verdict::Pass + }; + + PolicyOutcome { verdict, rules } +} + +/// Resolve a single rule against the facts and waivers. Split out of +/// [`evaluate`] to keep that function short and to keep all the per-rule +/// branching in one place. +fn evaluate_rule( + rule: &PolicyRule, + facts: &Facts, + waivers: &[PolicyWaiver], + now_epoch: i64, +) -> RuleOutcome { + let comparator_count = rule.max.is_some() as u8 + + rule.min.is_some() as u8 + + rule.equals.is_some() as u8 + + rule.one_of.is_some() as u8; + if comparator_count != 1 { + return RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::Unevaluated, + observed: None, + detail: format!("rule {} must set exactly one comparator", rule.id), + }; + } + + let Some(fact) = facts.get(&rule.fact) else { + return RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::Unevaluated, + observed: None, + detail: format!( + "fact `{}` not available (artifact not provided?)", + rule.fact + ), + }; + }; + + let check = check_comparator(rule, fact); + match check { + ComparatorResult::Unevaluated(detail) => RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::Unevaluated, + observed: Some(fact.clone()), + detail, + }, + ComparatorResult::Passed(detail) => RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::Pass, + observed: Some(fact.clone()), + detail, + }, + ComparatorResult::Failed(detail) => 
resolve_failure(rule, fact, waivers, now_epoch, detail), + } +} + +/// Decide the status of a rule that failed its comparator: an active waiver +/// downgrades it to `Waived`, an expired or unparseable waiver fails closed as +/// `ExpiredWaiver`, and no waiver yields the rule's own severity. +fn resolve_failure( + rule: &PolicyRule, + fact: &FactValue, + waivers: &[PolicyWaiver], + now_epoch: i64, + fail_detail: String, +) -> RuleOutcome { + if let Some(waiver) = waivers.iter().find(|w| w.rule == rule.id) { + let expiry = rfc3339_to_epoch(&waiver.expiry); + let active = expiry.map(|e| now_epoch <= e).unwrap_or(false); + if active { + let issue = waiver + .issue + .as_ref() + .map(|i| format!(" ({i})")) + .unwrap_or_default(); + let until = waiver.expiry.get(0..10).unwrap_or(waiver.expiry.as_str()); + return RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::Waived, + observed: Some(fact.clone()), + detail: format!("waived by {} until {until}{issue}", waiver.owner), + }; + } + let until = waiver.expiry.get(0..10).unwrap_or(waiver.expiry.as_str()); + return RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status: RuleStatus::ExpiredWaiver, + observed: Some(fact.clone()), + detail: format!("waiver for {} expired {until}", rule.id), + }; + } + + let status = if rule.severity == RuleSeverity::Fail { + RuleStatus::Fail + } else { + RuleStatus::Warn + }; + RuleOutcome { + id: rule.id.clone(), + fact: rule.fact.clone(), + status, + observed: Some(fact.clone()), + detail: fail_detail, + } +} + +/// The raw verdict of applying a rule's single comparator to a fact, before +/// waivers or severity are considered. +enum ComparatorResult { + /// The comparator held; detail is a human description of why. + Passed(String), + /// The comparator did not hold; detail explains the breach. 
+ Failed(String), + /// The fact's type did not match the comparator (for example a `max` rule + /// over a textual fact); detail explains the mismatch. + Unevaluated(String), +} + +/// Apply the one set comparator on `rule` to `fact`. The caller has already +/// verified exactly one comparator is set. +fn check_comparator(rule: &PolicyRule, fact: &FactValue) -> ComparatorResult { + if let Some(max) = rule.max { + return match fact.as_num() { + Some(n) if n <= max => { + ComparatorResult::Passed(format!("{} = {} within max {max}", rule.fact, n)) + } + Some(n) => ComparatorResult::Failed(format!("{} = {} exceeds max {max}", rule.fact, n)), + None => ComparatorResult::Unevaluated(format!( + "{} is not numeric, cannot apply max", + rule.fact + )), + }; + } + if let Some(min) = rule.min { + return match fact.as_num() { + Some(n) if n >= min => { + ComparatorResult::Passed(format!("{} = {} at least min {min}", rule.fact, n)) + } + Some(n) => ComparatorResult::Failed(format!("{} = {} below min {min}", rule.fact, n)), + None => ComparatorResult::Unevaluated(format!( + "{} is not numeric, cannot apply min", + rule.fact + )), + }; + } + if let Some(expected) = &rule.equals { + return check_equals(rule, fact, expected); + } + if let Some(allowed) = &rule.one_of { + let rendered = fact.render(); + return if allowed.iter().any(|v| v == &rendered) { + ComparatorResult::Passed(format!("{} = {rendered} is allowed", rule.fact)) + } else { + ComparatorResult::Failed(format!( + "{} = {rendered} not one of [{}]", + rule.fact, + allowed.join(", ") + )) + }; + } + // Unreachable: the caller guarantees one comparator is set. + ComparatorResult::Unevaluated(format!("rule {} has no comparator", rule.id)) +} + +/// Compare a fact against an `equals` JSON literal, choosing the comparison by +/// the literal's JSON type so `equals: true` and `equals: "T1"` both work. 
+fn check_equals(rule: &PolicyRule, fact: &FactValue, expected: &Value) -> ComparatorResult { + match expected { + Value::Bool(b) => match fact.as_bool() { + Some(actual) if actual == *b => { + ComparatorResult::Passed(format!("{} = {actual}, as expected", rule.fact)) + } + Some(actual) => { + ComparatorResult::Failed(format!("{} = {actual}, expected {b}", rule.fact)) + } + None => ComparatorResult::Unevaluated(format!( + "{} is not boolean, cannot compare to {b}", + rule.fact + )), + }, + Value::Number(num) => { + let want = num.as_f64(); + match (fact.as_num(), want) { + (Some(actual), Some(want)) if actual == want => { + ComparatorResult::Passed(format!("{} = {actual}, as expected", rule.fact)) + } + (Some(actual), Some(want)) => { + ComparatorResult::Failed(format!("{} = {actual}, expected {want}", rule.fact)) + } + _ => ComparatorResult::Unevaluated(format!( + "{} is not numeric, cannot compare", + rule.fact + )), + } + } + Value::String(s) => match fact.as_text() { + Some(actual) if actual == s => { + ComparatorResult::Passed(format!("{} = {actual}, as expected", rule.fact)) + } + Some(actual) => { + ComparatorResult::Failed(format!("{} = {actual}, expected {s}", rule.fact)) + } + None => ComparatorResult::Unevaluated(format!( + "{} is not text, cannot compare to {s}", + rule.fact + )), + }, + _ => ComparatorResult::Unevaluated(format!( + "rule {} equals must be a bool, number, or string", + rule.id + )), + } +} + +/// Parse the `YYYY-MM-DDTHH:MM:SS` prefix of an RFC 3339 timestamp as UTC into +/// Unix epoch seconds. +/// +/// Dependency-free on purpose so the simulator pulls no date crate. Fractional +/// seconds and timezone offsets are ignored, and inputs shorter than 19 +/// characters return `None`. Exposed as `pub` so the CLI handler and tests can +/// reuse the exact same parse the evaluator uses for waiver expiry. 
+pub fn rfc3339_to_epoch(ts: &str) -> Option { + if ts.len() < 19 { + return None; + } + let year: i64 = ts.get(0..4)?.parse().ok()?; + let month: i64 = ts.get(5..7)?.parse().ok()?; + let day: i64 = ts.get(8..10)?.parse().ok()?; + let hour: i64 = ts.get(11..13)?.parse().ok()?; + let min: i64 = ts.get(14..16)?.parse().ok()?; + let sec: i64 = ts.get(17..19)?.parse().ok()?; + // days_from_civil (Howard Hinnant) + let y = if month <= 2 { year - 1 } else { year }; + let era = if y >= 0 { y } else { y - 399 } / 400; + let yoe = y - era * 400; + let doy = (153 * (if month > 2 { month - 3 } else { month + 9 }) + 2) / 5 + day - 1; + let doe = yoe * 365 + yoe / 4 - yoe / 100 + doy; + let days = era * 146_097 + doe - 719_468; + Some(days * 86_400 + hour * 3_600 + min * 60 + sec) +} + +/// Read defensively into the facts map: insert a numeric fact from a JSON +/// `u64` field only when present, never panicking on a missing or wrong-typed +/// field. +fn insert_u64(facts: &mut Facts, key: &str, field: Option) { + if let Some(n) = field { + facts.insert(key.to_string(), FactValue::Num(n as f64)); + } +} + +/// Extract facts from a `mcptest run --reporter json` report. +/// +/// Reads the `summary` object for the run tallies. Each field is optional so a +/// trimmed report (for example one without `inconclusive`) simply yields fewer +/// facts rather than an error. +pub fn facts_from_run_report(v: &Value) -> Facts { + let mut facts = Facts::new(); + let summary = &v["summary"]; + insert_u64(&mut facts, "run.total", summary["total"].as_u64()); + insert_u64(&mut facts, "run.passed", summary["passed"].as_u64()); + insert_u64(&mut facts, "run.failed", summary["failed"].as_u64()); + insert_u64(&mut facts, "run.skipped", summary["skipped"].as_u64()); + insert_u64( + &mut facts, + "run.inconclusive", + summary["inconclusive"].as_u64(), + ); + facts +} + +/// Extract facts from a `mcptest judge certify` certification record. 
+/// +/// `now_epoch` lets the extractor compute `judge.expired` from the +/// certification's validity window so a stale certification can gate without +/// the policy author hardcoding a date. The expiry fact is only emitted when +/// the record carries a validity window. +pub fn facts_from_certification(v: &Value, now_epoch: i64) -> Facts { + let mut facts = Facts::new(); + if let Some(certified) = v["certified"].as_bool() { + facts.insert("judge.certified".to_string(), FactValue::Bool(certified)); + } + if let Some(ece) = v["metrics"]["ece"].as_f64() { + facts.insert("judge.ece".to_string(), FactValue::Num(ece)); + } + if let Some(brier) = v["metrics"]["brier"].as_f64() { + facts.insert("judge.brier".to_string(), FactValue::Num(brier)); + } + if let Some(valid_until) = v["validity_window"]["valid_until"].as_str() { + if let Some(epoch) = rfc3339_to_epoch(valid_until) { + facts.insert( + "judge.expired".to_string(), + FactValue::Bool(now_epoch > epoch), + ); + } + } + facts +} + +/// Extract facts from a `mcptest conformance run` report. +/// +/// The badge and per-tier tallies let a policy gate on, say, "tier 1 or 2" or +/// "every MUST passed". The tier is read flexibly because some report shapes +/// nest it under an object and some expose it as a bare string. 
+pub fn facts_from_conformance(v: &Value) -> Facts { + let mut facts = Facts::new(); + if let Some(badge) = v["badge"].as_str() { + facts.insert( + "conformance.badge".to_string(), + FactValue::Text(badge.to_string()), + ); + } + insert_u64( + &mut facts, + "conformance.must_passed", + v["must"]["passed"].as_u64(), + ); + insert_u64( + &mut facts, + "conformance.must_total", + v["must"]["total"].as_u64(), + ); + insert_u64( + &mut facts, + "conformance.should_passed", + v["should"]["passed"].as_u64(), + ); + insert_u64( + &mut facts, + "conformance.should_total", + v["should"]["total"].as_u64(), + ); + let tier = v["tier"] + .as_str() + .or_else(|| v["tier"]["tier"].as_str()) + .map(|s| s.to_string()); + if let Some(tier) = tier { + facts.insert("conformance.tier".to_string(), FactValue::Text(tier)); + } + facts +} + +/// Extract facts from a `mcptest security` report. +/// +/// Tallies findings by severity so a policy can gate on, for example, +/// "no critical findings". The five severity counts are always emitted (zero +/// when absent) so a `max: 0` rule never lands as `Unevaluated` just because a +/// clean scan had no findings of that severity. 
+pub fn facts_from_security(v: &Value) -> Facts { + let mut facts = Facts::new(); + let findings = v["findings"] + .as_array() + .or_else(|| v.as_array()) + .cloned() + .unwrap_or_default(); + let mut critical = 0u64; + let mut high = 0u64; + let mut medium = 0u64; + let mut low = 0u64; + let mut info = 0u64; + for finding in &findings { + match finding["severity"].as_str() { + Some("critical") => critical += 1, + Some("high") => high += 1, + Some("medium") => medium += 1, + Some("low") => low += 1, + Some("info") => info += 1, + _ => {} + } + } + facts.insert( + "security.critical_count".to_string(), + FactValue::Num(critical as f64), + ); + facts.insert( + "security.high_count".to_string(), + FactValue::Num(high as f64), + ); + facts.insert( + "security.medium_count".to_string(), + FactValue::Num(medium as f64), + ); + facts.insert("security.low_count".to_string(), FactValue::Num(low as f64)); + facts.insert( + "security.info_count".to_string(), + FactValue::Num(info as f64), + ); + facts.insert( + "security.total_findings".to_string(), + FactValue::Num(findings.len() as f64), + ); + facts +} + +/// Extract facts from a `mcptest model-compat diff` report. +/// +/// The per-bucket tallies let a policy gate on model drift, for example +/// "no model regressed to fail". +pub fn facts_from_model_compat(v: &Value) -> Facts { + let mut facts = Facts::new(); + let summary = &v["summary"]; + insert_u64(&mut facts, "model_compat.total", summary["total"].as_u64()); + insert_u64(&mut facts, "model_compat.pass", summary["pass"].as_u64()); + insert_u64(&mut facts, "model_compat.drift", summary["drift"].as_u64()); + insert_u64(&mut facts, "model_compat.fail", summary["fail"].as_u64()); + facts +} + +/// Extract facts from an evidence artifact. +/// +/// Surfaces the reproducibility and origin flags so a policy can require a +/// reproducible, verifiable-origin run before a release. 
+pub fn facts_from_evidence(v: &Value) -> Facts { + let mut facts = Facts::new(); + if let Some(reproducible) = v["reproducible"].as_bool() { + facts.insert( + "evidence.reproducible".to_string(), + FactValue::Bool(reproducible), + ); + } + if let Some(unverifiable) = v["unverifiable_origin"].as_bool() { + facts.insert( + "evidence.unverifiable_origin".to_string(), + FactValue::Bool(unverifiable), + ); + } + facts +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn policy_from(yaml: &str) -> PolicyFile { + serde_yaml::from_str(yaml).expect("policy fixture parses") + } + + #[test] + fn a_satisfied_max_rule_passes() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: no-fail\n fact: run.failed\n max: 0\n", + ); + let facts = facts_from_run_report(&json!({"summary": {"failed": 0}})); + let outcome = evaluate(&policy, &facts, 0); + assert_eq!(outcome.verdict, Verdict::Pass); + assert_eq!(outcome.rules[0].status, RuleStatus::Pass); + } + + #[test] + fn an_exceeded_max_rule_fails_the_verdict() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: no-fail\n fact: run.failed\n max: 0\n", + ); + let facts = facts_from_run_report(&json!({"summary": {"failed": 2}})); + let outcome = evaluate(&policy, &facts, 0); + assert_eq!(outcome.verdict, Verdict::Fail); + assert_eq!(outcome.rules[0].status, RuleStatus::Fail); + assert!(outcome.rules[0].detail.contains("exceeds max 0")); + } + + #[test] + fn equals_true_passes_and_false_fails() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: certified\n fact: judge.certified\n equals: true\n", + ); + let pass = facts_from_certification(&json!({"certified": true}), 0); + assert_eq!(evaluate(&policy, &pass, 0).verdict, Verdict::Pass); + let fail = facts_from_certification(&json!({"certified": false}), 0); + let outcome = evaluate(&policy, &fail, 0); + assert_eq!(outcome.verdict, Verdict::Fail); + assert!(outcome.rules[0].detail.contains("expected true")); + } + + #[test] 
+ fn judge_expired_is_computed_from_validity_window() { + let cert = json!({ + "certified": true, + "validity_window": {"valid_until": "2023-11-14T22:13:20Z"} + }); + // now after expiry -> expired true. + let after = facts_from_certification(&cert, 1_700_000_001); + assert_eq!(after.get("judge.expired"), Some(&FactValue::Bool(true))); + // now before expiry -> expired false. + let before = facts_from_certification(&cert, 1_600_000_000); + assert_eq!(before.get("judge.expired"), Some(&FactValue::Bool(false))); + } + + #[test] + fn an_active_waiver_suppresses_a_failure() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: no-fail\n fact: run.failed\n max: 0\nwaivers:\n - rule: no-fail\n owner: alice\n reason: known\n expiry: \"2099-01-01T00:00:00Z\"\n issue: GH-123\n", + ); + let facts = facts_from_run_report(&json!({"summary": {"failed": 2}})); + let outcome = evaluate(&policy, &facts, 1_700_000_000); + assert_eq!(outcome.rules[0].status, RuleStatus::Waived); + assert_eq!(outcome.verdict, Verdict::Pass); + assert!(outcome.rules[0].detail.contains("alice")); + assert!(outcome.rules[0].detail.contains("GH-123")); + } + + #[test] + fn an_expired_waiver_does_not_suppress_and_fails_closed() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: no-fail\n fact: run.failed\n max: 0\nwaivers:\n - rule: no-fail\n owner: alice\n reason: known\n expiry: \"2000-01-01T00:00:00Z\"\n", + ); + let facts = facts_from_run_report(&json!({"summary": {"failed": 2}})); + let outcome = evaluate(&policy, &facts, 1_700_000_000); + assert_eq!(outcome.rules[0].status, RuleStatus::ExpiredWaiver); + assert_eq!(outcome.verdict, Verdict::Fail); + assert!(outcome.rules[0].detail.contains("expired")); + } + + #[test] + fn a_missing_fact_is_unevaluated_and_fails() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: no-fail\n fact: run.failed\n max: 0\n", + ); + let facts = Facts::new(); + let outcome = evaluate(&policy, &facts, 0); + 
assert_eq!(outcome.rules[0].status, RuleStatus::Unevaluated); + assert_eq!(outcome.verdict, Verdict::Fail); + assert!(outcome.rules[0].detail.contains("not available")); + } + + #[test] + fn a_warn_severity_failure_warns_but_does_not_fail() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: tier\n fact: conformance.badge\n one_of: [T1, T2]\n severity: warn\n", + ); + let facts = facts_from_conformance(&json!({"badge": "T3"})); + let outcome = evaluate(&policy, &facts, 0); + assert_eq!(outcome.rules[0].status, RuleStatus::Warn); + assert_eq!(outcome.verdict, Verdict::Warn); + } + + #[test] + fn one_of_on_conformance_badge_matches_membership() { + let policy = policy_from( + "version: \"1.0\"\nrules:\n - id: tier\n fact: conformance.badge\n one_of: [T1, T2]\n", + ); + let pass = facts_from_conformance(&json!({"badge": "T1"})); + assert_eq!(evaluate(&policy, &pass, 0).verdict, Verdict::Pass); + let fail = facts_from_conformance(&json!({"badge": "F"})); + assert_eq!(evaluate(&policy, &fail, 0).verdict, Verdict::Fail); + } + + #[test] + fn security_counts_tally_from_a_findings_array() { + let report = json!({ + "findings": [ + {"severity": "critical"}, + {"severity": "high"}, + {"severity": "high"}, + {"severity": "low"} + ] + }); + let facts = facts_from_security(&report); + assert_eq!( + facts.get("security.critical_count"), + Some(&FactValue::Num(1.0)) + ); + assert_eq!(facts.get("security.high_count"), Some(&FactValue::Num(2.0))); + assert_eq!( + facts.get("security.total_findings"), + Some(&FactValue::Num(4.0)) + ); + // A clean severity still emits a zero count. 
+ assert_eq!( + facts.get("security.medium_count"), + Some(&FactValue::Num(0.0)) + ); + } + + #[test] + fn rfc3339_parses_known_epochs() { + assert_eq!(rfc3339_to_epoch("1970-01-01T00:00:00Z"), Some(0)); + assert_eq!( + rfc3339_to_epoch("2023-11-14T22:13:20Z"), + Some(1_700_000_000) + ); + assert_eq!(rfc3339_to_epoch("short"), None); + } +} diff --git a/crates/mcptest/src/cli/args/mod.rs b/crates/mcptest/src/cli/args/mod.rs index 7f0ab06a..bbd6d970 100644 --- a/crates/mcptest/src/cli/args/mod.rs +++ b/crates/mcptest/src/cli/args/mod.rs @@ -32,6 +32,7 @@ pub mod migrate; pub mod mock; pub mod model_compat; pub mod pipe; +pub mod policy; pub mod prompt; pub mod propose; pub mod record; @@ -86,6 +87,7 @@ pub use model_compat::{ ModelCompatDiffFormatCli, ModelCompatRunArgs, }; pub use pipe::{OnBudgetExceededCli, PipeArgs, PipeFormat}; +pub use policy::{PolicyArgs, PolicyCommand, PolicyFormat, PolicySimulateArgs}; pub use prompt::PromptArgs; pub use propose::ProposeArgs; pub use record::RecordArgs; diff --git a/crates/mcptest/src/cli/args/policy.rs b/crates/mcptest/src/cli/args/policy.rs new file mode 100644 index 00000000..2c4165f4 --- /dev/null +++ b/crates/mcptest/src/cli/args/policy.rs @@ -0,0 +1,62 @@ +//! Arguments for `mcptest policy`. +use std::path::PathBuf; + +use clap::{Args, Subcommand, ValueEnum}; + +/// Arguments for `mcptest policy`. +#[derive(Debug, Args)] +pub struct PolicyArgs { + /// The `policy` subcommand to run. + #[command(subcommand)] + pub command: PolicyCommand, +} + +/// Subcommands for `mcptest policy`. +#[derive(Debug, Subcommand)] +pub enum PolicyCommand { + /// Evaluate a policy against saved mcptest artifacts. + Simulate(PolicySimulateArgs), +} + +/// Arguments for `mcptest policy simulate`. +#[derive(Debug, Args)] +pub struct PolicySimulateArgs { + /// Policy file (declarative YAML rules + waivers). + #[arg(long, value_name = "FILE")] + pub policy: PathBuf, + /// Run report JSON from `mcptest run --reporter json`. 
+ #[arg(long = "run-report", value_name = "FILE")] + pub run_report: Option<PathBuf>, + /// Judge certification record from `mcptest judge certify`. + #[arg(long = "judge-cert", value_name = "FILE")] + pub judge_cert: Option<PathBuf>, + /// Conformance report JSON from `mcptest conformance run`. + #[arg(long = "conformance-report", value_name = "FILE")] + pub conformance_report: Option<PathBuf>, + /// Security report JSON from `mcptest security`. + #[arg(long, value_name = "FILE")] + pub security: Option<PathBuf>, + /// Model-compat diff JSON from `mcptest model-compat diff`. + #[arg(long = "model-compat", value_name = "FILE")] + pub model_compat: Option<PathBuf>, + /// Evidence artifact JSON from `mcptest evidence`. + #[arg(long, value_name = "FILE")] + pub evidence: Option<PathBuf>, + /// Exit non-zero when the policy fails (CI gate). Off by default (dry-run + /// always exits 0 and just prints the verdict). + #[arg(long)] + pub gate: bool, + /// Output format. + #[arg(long, value_name = "FORMAT", default_value = "pretty")] + pub format: PolicyFormat, +} + +/// Output format for `policy simulate`. +#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum)] +#[value(rename_all = "lowercase")] +pub enum PolicyFormat { + /// Human-readable verdict and per-rule lines. + Pretty, + /// The full [`mcptest_core::policy::PolicyOutcome`] as pretty JSON. 
+ Json, +} diff --git a/crates/mcptest/src/cli/handlers/mod.rs b/crates/mcptest/src/cli/handlers/mod.rs index 3f9edf6a..135f76b0 100644 --- a/crates/mcptest/src/cli/handlers/mod.rs +++ b/crates/mcptest/src/cli/handlers/mod.rs @@ -39,6 +39,7 @@ pub(crate) mod migrate; pub(crate) mod mock; pub(crate) mod model_compat; pub(crate) mod pipe; +pub(crate) mod policy; pub(crate) mod prompt; pub(crate) mod propose; pub(crate) mod readiness; @@ -103,6 +104,7 @@ pub(crate) use migrate::migrate_command; pub(crate) use mock::mock_command; pub(crate) use model_compat::model_compat_command; pub(crate) use pipe::pipe_command; +pub(crate) use policy::policy_command; pub(crate) use prompt::prompt_command; pub(crate) use propose::propose_command; pub(crate) use record::record_command; diff --git a/crates/mcptest/src/cli/handlers/policy.rs b/crates/mcptest/src/cli/handlers/policy.rs new file mode 100644 index 00000000..2e51f0a6 --- /dev/null +++ b/crates/mcptest/src/cli/handlers/policy.rs @@ -0,0 +1,133 @@ +//! Handler for `mcptest policy <subcommand>`. +use std::path::Path; +use std::time::{SystemTime, UNIX_EPOCH}; + +use anyhow::{Context, Result}; +use mcptest_core::policy::{ + evaluate, facts_from_certification, facts_from_conformance, facts_from_evidence, + facts_from_model_compat, facts_from_run_report, facts_from_security, Facts, PolicyFile, + PolicyOutcome, RuleStatus, Verdict, +}; + +use crate::cli::args::{PolicyArgs, PolicyCommand, PolicyFormat, PolicySimulateArgs}; + +/// Dispatch `mcptest policy` to its one subcommand. +/// +/// Kept as a thin router so adding a second `policy` verb later is a one-line +/// match arm, matching the shape of the other grouped subcommands. +pub(crate) fn policy_command(args: PolicyArgs) -> Result<i32> { + match args.command { + PolicyCommand::Simulate(a) => simulate_command(a), + } +} + +/// Run `mcptest policy simulate`. 
+/// +/// Loads the declarative policy, builds a fact map from whichever artifacts the +/// caller passed, evaluates the policy at the current instant, and prints the +/// verdict. The exit code is the governance signal: a dry-run always exits 0 so +/// the verdict is informational, while `--gate` exits 1 on a failing verdict so +/// CI can block on it. Errors if the clock reads before the Unix epoch. +fn simulate_command(args: PolicySimulateArgs) -> Result<i32> { + let body = std::fs::read_to_string(&args.policy) + .with_context(|| format!("failed to read policy {}", args.policy.display()))?; + let policy: PolicyFile = serde_yaml::from_str(&body) + .with_context(|| format!("failed to parse policy {}", args.policy.display()))?; + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs() as i64) + .context("system clock is before the Unix epoch; cannot evaluate waiver expiry")?; + + let facts = collect_facts(&args, now)?; + let outcome = evaluate(&policy, &facts, now); + + match args.format { + PolicyFormat::Json => println!("{}", serde_json::to_string_pretty(&outcome)?), + PolicyFormat::Pretty => print_pretty(&outcome), + } + + if args.gate && outcome.failed() { + Ok(1) + } else { + Ok(0) + } +} + +/// Read and extend the fact map from every artifact path the caller supplied. +/// +/// Pulled out of [`simulate_command`] so that function stays short and the set +/// of artifact-to-extractor wirings reads as one table. 
+fn collect_facts(args: &PolicySimulateArgs, now: i64) -> Result<Facts> { + let mut facts = Facts::new(); + if let Some(path) = &args.run_report { + facts.extend(facts_from_run_report(&read_json(path)?)); + } + if let Some(path) = &args.judge_cert { + facts.extend(facts_from_certification(&read_json(path)?, now)); + } + if let Some(path) = &args.conformance_report { + facts.extend(facts_from_conformance(&read_json(path)?)); + } + if let Some(path) = &args.security { + facts.extend(facts_from_security(&read_json(path)?)); + } + if let Some(path) = &args.model_compat { + facts.extend(facts_from_model_compat(&read_json(path)?)); + } + if let Some(path) = &args.evidence { + facts.extend(facts_from_evidence(&read_json(path)?)); + } + Ok(facts) +} + +/// Print the verdict and one cited line per rule in a human-readable block. +fn print_pretty(outcome: &PolicyOutcome) { + let verdict = match outcome.verdict { + Verdict::Pass => "pass", + Verdict::Warn => "warn", + Verdict::Fail => "fail", + }; + println!("policy: {verdict}"); + for rule in &outcome.rules { + println!( + " [{}] {} ({}): {}", + tag(rule.status), + rule.id, + rule.fact, + rule.detail + ); + } +} + +/// Map a rule status to the upper-case tag shown in the pretty report. +fn tag(status: RuleStatus) -> &'static str { + match status { + RuleStatus::Pass => "PASS", + RuleStatus::Fail => "FAIL", + RuleStatus::Warn => "WARN", + RuleStatus::Waived => "WAIVED", + RuleStatus::ExpiredWaiver => "EXPIRED-WAIVER", + RuleStatus::Unevaluated => "UNEVALUATED", + } +} + +/// Read a JSON artifact from disk into a [`serde_json::Value`], attaching the +/// path to any read or parse error so a missing or malformed input is obvious. 
+fn read_json(path: &Path) -> Result<serde_json::Value> { + let body = std::fs::read_to_string(path) + .with_context(|| format!("failed to read artifact {}", path.display()))?; + serde_json::from_str(&body) + .with_context(|| format!("failed to parse artifact {}", path.display())) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn read_json_reports_the_path_on_a_missing_file() { + let err = read_json(Path::new("/nope/does-not-exist.json")).unwrap_err(); + assert!(err.to_string().contains("does-not-exist.json")); + } +} diff --git a/crates/mcptest/src/cli/mod.rs b/crates/mcptest/src/cli/mod.rs index c7410e29..52d43f34 100644 --- a/crates/mcptest/src/cli/mod.rs +++ b/crates/mcptest/src/cli/mod.rs @@ -77,6 +77,7 @@ Commands: evidence Aggregate a run into a portable, signable evidence artifact sbom Print the SBOM baked into the binary at build time matchers List every matcher the YAML schema accepts + policy Evaluate a policy against saved artifacts schema Emit the JSON Schema for the YAML config Plumbing: @@ -233,6 +234,8 @@ pub enum Command { Cache(CacheArgs), /// Run a declarative multi-step tool-call pipeline. Pipe(PipeArgs), + /// Evaluate a policy against saved artifacts (a local governance gate). + Policy(PolicyArgs), /// Scan a tools/list snapshot, or drive the live red-team lane with /// `security redteam`. 
Security(SecurityArgs), @@ -313,6 +316,7 @@ where Command::Propose(args) => handlers::propose_command(args), Command::Cache(args) => handlers::cache_command(args), Command::Pipe(args) => handlers::pipe_command(args), + Command::Policy(args) => handlers::policy_command(args), Command::Security(args) => handlers::security_command(args, &cli.global), Command::WebBotAuth(args) => handlers::web_bot_auth_command(args), Command::Sbom(args) => handlers::sbom_command(args), diff --git a/crates/mcptest/tests/cli_policy_simulate.rs b/crates/mcptest/tests/cli_policy_simulate.rs new file mode 100644 index 00000000..61c94028 --- /dev/null +++ b/crates/mcptest/tests/cli_policy_simulate.rs @@ -0,0 +1,179 @@ +//! Integration tests for `mcptest policy simulate`. +//! +//! Drives the binary over a saved run report and a small policy, asserting the +//! verdict text and the exit code (the governance signal): a dry-run always +//! exits 0, `--gate` exits 1 on a failing verdict, an active waiver suppresses +//! a failure, and an expired waiver fails closed. + +use assert_cmd::Command; +use predicates::prelude::*; +use tempfile::tempdir; + +fn mcptest() -> Command { + Command::cargo_bin("mcptest").expect("binary builds") +} + +fn write(path: &std::path::Path, body: &str) { + std::fs::write(path, body).unwrap(); +} + +/// A clean run report: three tests, none failed. +fn clean_run_report() -> &'static str { + "{\"summary\":{\"total\":3,\"passed\":3,\"failed\":0,\"skipped\":0,\"duration_ms\":1}}" +} + +/// A failing run report: one of three tests failed. +fn failing_run_report() -> &'static str { + "{\"summary\":{\"total\":3,\"passed\":2,\"failed\":1,\"skipped\":0,\"duration_ms\":1}}" +} + +/// A minimal policy that only forbids failed tests. 
+fn no_fail_policy() -> &'static str { + "version: \"1.0\"\nrules:\n - id: no-failed-tests\n fact: run.failed\n max: 0\n" +} + +#[test] +fn a_passing_policy_over_a_run_report_succeeds() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, clean_run_report()); + write(&policy, no_fail_policy()); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + ]) + .assert() + .success() + .stdout(predicate::str::contains("pass")); +} + +#[test] +fn a_failing_policy_with_gate_exits_one() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, failing_run_report()); + write(&policy, no_fail_policy()); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + "--gate", + ]) + .assert() + .code(1) + .stdout(predicate::str::contains("no-failed-tests")) + .stdout(predicate::str::contains("FAIL")); +} + +#[test] +fn a_failing_policy_without_gate_still_exits_zero() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, failing_run_report()); + write(&policy, no_fail_policy()); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + ]) + .assert() + .success() + .stdout(predicate::str::contains("fail")) + .stdout(predicate::str::contains("FAIL")); +} + +#[test] +fn an_active_waiver_keeps_the_gate_green() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, failing_run_report()); + write( + &policy, + "version: \"1.0\"\nrules:\n - id: no-failed-tests\n fact: run.failed\n max: 0\nwaivers:\n - rule: 
no-failed-tests\n owner: alice\n reason: known flake\n expiry: \"2099-01-01T00:00:00Z\"\n", + ); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + "--gate", + ]) + .assert() + .success() + .stdout(predicate::str::contains("WAIVED")); +} + +#[test] +fn an_expired_waiver_fails_closed_under_gate() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, failing_run_report()); + write( + &policy, + "version: \"1.0\"\nrules:\n - id: no-failed-tests\n fact: run.failed\n max: 0\nwaivers:\n - rule: no-failed-tests\n owner: alice\n reason: stale\n expiry: \"2000-01-01T00:00:00Z\"\n", + ); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + "--gate", + ]) + .assert() + .code(1) + .stdout(predicate::str::contains("EXPIRED-WAIVER")) + .stdout(predicate::str::contains("expired")); +} + +#[test] +fn json_format_emits_a_verdict_field() { + let dir = tempdir().unwrap(); + let report = dir.path().join("run.json"); + let policy = dir.path().join("policy.yml"); + write(&report, clean_run_report()); + write(&policy, no_fail_policy()); + + mcptest() + .args([ + "policy", + "simulate", + "--policy", + policy.to_str().unwrap(), + "--run-report", + report.to_str().unwrap(), + "--format", + "json", + ]) + .assert() + .success() + .stdout(predicate::str::contains("\"verdict\"")); +} diff --git a/docs-site/llms-full.sha256 b/docs-site/llms-full.sha256 index 6c4b1a9c..1a89d0dd 100644 --- a/docs-site/llms-full.sha256 +++ b/docs-site/llms-full.sha256 @@ -1 +1 @@ -cc16f70b1ae3d29472c79d8d0d226155dd092de48d11ec9e3275d382f18797c1 +88562f76c5d5964f98034d6063f192117bd81a97a73771063d719ff7461dde8d diff --git a/docs/SUMMARY.md b/docs/SUMMARY.md index 3f8c6d9c..7f872568 100644 --- a/docs/SUMMARY.md +++ b/docs/SUMMARY.md @@ -110,6 +110,7 @@ - 
[Software Bill of Materials (mcptest sbom)](./sbom.md) - [Portable run evidence (mcptest evidence)](./evidence.md) - [Session ledger (mcptest ledger)](./session-ledger.md) +- [Policy simulator (mcptest policy simulate)](./policy-simulator.md) # Advanced and deep dives diff --git a/docs/cli-reference.md b/docs/cli-reference.md index 4be8194d..90d0a805 100644 --- a/docs/cli-reference.md +++ b/docs/cli-reference.md @@ -40,7 +40,7 @@ further down. | Agent integration | [`mcp-server`](#mcp-server), [`skill`](#skill), [`prompt`](#prompt), [`propose`](#propose) | | Explore a server | [`inspect`](#inspect), [`tools`](#tools-resources-prompts-capabilities), [`resources`](#tools-resources-prompts-capabilities), [`prompts`](#tools-resources-prompts-capabilities), [`capabilities`](#tools-resources-prompts-capabilities), [`discover`](#discover), [`generate`](#generate), [`mock`](#mock) | | Quality and security | [`compliance`](#compliance), [`conformance`](#conformance), [`security`](#security), [`coverage`](#coverage), [`eval`](#eval), [`judge`](#judge), [`fuzz`](#fuzz), [`lint`](#lint), [`schema-lint`](#schema-lint), [`model-compat`](#model-compat) | -| Reports and artifacts | [`report`](#report), [`diff`](#diff), [`baseline`](#baseline), [`ledger`](#ledger), [`evidence`](#evidence), [`sbom`](#sbom), [`matchers`](#matchers), [`schema`](#schema) | +| Reports and artifacts | [`report`](#report), [`diff`](#diff), [`baseline`](#baseline), [`ledger`](#ledger), [`evidence`](#evidence), [`policy`](#policy), [`sbom`](#sbom), [`matchers`](#matchers), [`schema`](#schema) | | Plumbing | [`completions`](#completions), [`cache`](#cache), [`login`](#login), [`migrate`](#migrate), [`record`](#record), [`distill`](#distill), [`pipe`](#pipe), [`web-bot-auth`](#web-bot-auth) | One command is deliberately absent from `--help`: [`exec`](#exec) is the @@ -829,6 +829,54 @@ is a separate step. **Status.** Working (`certify`). 
+### `policy` + +Evaluate a declarative governance policy against saved mcptest artifacts: a +local OSS gate that answers "would this release pass our quality, security, and +judge-certification requirements" without any external service. v1 ships one +subcommand: + +```sh +mcptest policy simulate --policy policy.yml + [--run-report run.json] [--judge-cert cert.json] + [--conformance-report conf.json] [--security sec.json] + [--model-compat diff.json] [--evidence ev.json] + [--gate] [--format pretty|json] +``` + +**Description.** The policy file declares `rules` (each names a `fact` and one +comparator: `max`, `min`, `equals`, or `one_of`, with `severity: fail` or +`warn`) and optional `waivers` (each references a rule and carries an `owner`, +`reason`, RFC 3339 `expiry`, and `issue`). `simulate` reads whichever artifact +files are supplied, extracts named facts from each (for example `run.failed`, +`judge.certified`, `judge.expired`, `conformance.badge`, +`security.critical_count`, `model_compat.fail`, `evidence.reproducible`), +evaluates every rule, and prints a per-rule verdict with the observed value. +Waivers suppress a failing rule until they expire; an expired waiver fails +closed, and a rule whose fact is missing is reported `unevaluated` and fails +closed too. See [policy-simulator.md](policy-simulator.md) for the full fact +catalog. + +**Arguments.** + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `--policy <FILE>` | path | The declarative policy YAML. Required. | +| `--run-report <FILE>` | path | Run report JSON from `mcptest run --reporter json`. | +| `--judge-cert <FILE>` | path | Judge certification record from `mcptest judge certify`. | +| `--conformance-report <FILE>` | path | Conformance report JSON from `mcptest conformance run`. | +| `--security <FILE>` | path | Security report JSON from `mcptest security`. | +| `--model-compat <FILE>` | path | Model-compat diff JSON from `mcptest model-compat diff`. 
| +| `--evidence <FILE>` | path | Evidence artifact JSON from `mcptest evidence`. | +| `--gate` | flag | Exit non-zero when the policy fails. Off by default (dry-run prints the verdict and always exits 0). | +| `--format <FORMAT>` | enum | `pretty` (default) or `json`. | + +**Exit codes.** Dry-run always exits `0`. With `--gate`: `0` when the verdict is +pass or warn, `1` when any fail-severity rule fails, hits an expired waiver, or +references a fact that was not supplied. + +**Status.** Working (`simulate`). + ### `conformance` Score a running MCP server against the vendored SEP corpus, diff --git a/docs/llms-full.txt b/docs/llms-full.txt index aef66395..b6511ddb 100644 --- a/docs/llms-full.txt +++ b/docs/llms-full.txt @@ -5264,7 +5264,7 @@ further down. | Agent integration | [`mcp-server`](#mcp-server), [`skill`](#skill), [`prompt`](#prompt), [`propose`](#propose) | | Explore a server | [`inspect`](#inspect), [`tools`](#tools-resources-prompts-capabilities), [`resources`](#tools-resources-prompts-capabilities), [`prompts`](#tools-resources-prompts-capabilities), [`capabilities`](#tools-resources-prompts-capabilities), [`discover`](#discover), [`generate`](#generate), [`mock`](#mock) | | Quality and security | [`compliance`](#compliance), [`conformance`](#conformance), [`security`](#security), [`coverage`](#coverage), [`eval`](#eval), [`judge`](#judge), [`fuzz`](#fuzz), [`lint`](#lint), [`schema-lint`](#schema-lint), [`model-compat`](#model-compat) | -| Reports and artifacts | [`report`](#report), [`diff`](#diff), [`baseline`](#baseline), [`ledger`](#ledger), [`evidence`](#evidence), [`sbom`](#sbom), [`matchers`](#matchers), [`schema`](#schema) | +| Reports and artifacts | [`report`](#report), [`diff`](#diff), [`baseline`](#baseline), [`ledger`](#ledger), [`evidence`](#evidence), [`policy`](#policy), [`sbom`](#sbom), [`matchers`](#matchers), [`schema`](#schema) | | Plumbing | [`completions`](#completions), [`cache`](#cache), [`login`](#login), [`migrate`](#migrate), 
[`record`](#record), [`distill`](#distill), [`pipe`](#pipe), [`web-bot-auth`](#web-bot-auth) | One command is deliberately absent from `--help`: [`exec`](#exec) is the @@ -6053,6 +6053,54 @@ is a separate step. **Status.** Working (`certify`). +### `policy` + +Evaluate a declarative governance policy against saved mcptest artifacts: a +local OSS gate that answers "would this release pass our quality, security, and +judge-certification requirements" without any external service. v1 ships one +subcommand: + +```sh +mcptest policy simulate --policy policy.yml + [--run-report run.json] [--judge-cert cert.json] + [--conformance-report conf.json] [--security sec.json] + [--model-compat diff.json] [--evidence ev.json] + [--gate] [--format pretty|json] +``` + +**Description.** The policy file declares `rules` (each names a `fact` and one +comparator: `max`, `min`, `equals`, or `one_of`, with `severity: fail` or +`warn`) and optional `waivers` (each references a rule and carries an `owner`, +`reason`, RFC 3339 `expiry`, and `issue`). `simulate` reads whichever artifact +files are supplied, extracts named facts from each (for example `run.failed`, +`judge.certified`, `judge.expired`, `conformance.badge`, +`security.critical_count`, `model_compat.fail`, `evidence.reproducible`), +evaluates every rule, and prints a per-rule verdict with the observed value. +Waivers suppress a failing rule until they expire; an expired waiver fails +closed, and a rule whose fact is missing is reported `unevaluated` and fails +closed too. See [policy-simulator.md](policy-simulator.md) for the full fact +catalog. + +**Arguments.** + +| Argument | Type | Description | +| -------- | ---- | ----------- | +| `--policy <FILE>` | path | The declarative policy YAML. Required. | +| `--run-report <FILE>` | path | Run report JSON from `mcptest run --reporter json`. | +| `--judge-cert <FILE>` | path | Judge certification record from `mcptest judge certify`. 
| +| `--conformance-report <FILE>` | path | Conformance report JSON from `mcptest conformance run`. | +| `--security <FILE>` | path | Security report JSON from `mcptest security`. | +| `--model-compat <FILE>` | path | Model-compat diff JSON from `mcptest model-compat diff`. | +| `--evidence <FILE>` | path | Evidence artifact JSON from `mcptest evidence`. | +| `--gate` | flag | Exit non-zero when the policy fails. Off by default (dry-run prints the verdict and always exits 0). | +| `--format <FORMAT>` | enum | `pretty` (default) or `json`. | + +**Exit codes.** Dry-run always exits `0`. With `--gate`: `0` when the verdict is +pass or warn, `1` when any fail-severity rule fails, hits an expired waiver, or +references a fact that was not supplied. + +**Status.** Working (`simulate`). + ### `conformance` Score a running MCP server against the vendored SEP corpus, diff --git a/docs/policy-simulator.md b/docs/policy-simulator.md new file mode 100644 index 00000000..df03478e --- /dev/null +++ b/docs/policy-simulator.md @@ -0,0 +1,193 @@ +# Policy simulator + +`mcptest policy simulate` is a local governance gate. It reads a small +declarative policy file plus whichever mcptest artifacts you already have on +disk, extracts named facts from each artifact, evaluates the policy rules +against those facts, applies any waivers, and prints a pass / warn / fail +verdict with a cited line per rule. It runs entirely offline: no network, no +hosted collector, no live server. The same artifacts your pipeline already +produces become the inputs to a release gate you can read in one sitting. + +This is the open-source, single-developer half of governance. It does not store +results, manage approvals across a team, or enforce anything centrally; it +evaluates a policy you commit next to your tests and returns an exit code your +CI can block on. 
+ +## The command + +```sh +mcptest policy simulate --policy policy.yml \ + --run-report run.json \ + --judge-cert cert.json \ + --conformance-report conformance.json \ + --security security.json \ + --model-compat model-compat.json \ + --evidence evidence.json +``` + +Only `--policy` is required. Pass whichever artifact flags you have; each one +adds its facts to the pool the rules evaluate against. A rule whose fact has no +backing artifact is reported as `unevaluated` and fails the gate, so a missing +input never silently passes (more on that below). + +| Flag | Artifact | Produced by | +| ---- | -------- | ----------- | +| `--policy` | The policy file itself (YAML) | You author it. See `examples/policy/policy.yml`. | +| `--run-report` | Run report (JSON) | `mcptest run --reporter json --output run.json` | +| `--judge-cert` | Judge certification record (JSON) | `mcptest judge certify --output cert.json` | +| `--conformance-report` | Conformance score (JSON) | `mcptest conformance run --format json` | +| `--security` | Security scan (JSON) | `mcptest security tools-list.json --format json` | +| `--model-compat` | Model-compatibility diff (JSON) | `mcptest model-compat diff --format json` | +| `--evidence` | Evidence artifact (JSON) | `mcptest evidence run.json --out evidence.json` | +| `--gate` | (no artifact) | Turns a failing verdict into a non-zero exit code. | +| `--format` | (no artifact) | `pretty` (default) or `json`. | + +## The policy file + +A policy is a tiny YAML document: a version string, a list of rules, and an +optional list of waivers. There is no expression language on purpose. Each rule +names exactly one fact and one comparator, so anyone reading the file can see at +a glance what gates the release. + +```yaml +version: "1.0" +rules: + - id: no-failed-tests + description: Every test in the run must pass. + fact: run.failed + max: 0 + - id: judge-certified + description: The grading judge must be certified. 
+ fact: judge.certified + equals: true + - id: conformance-tier + description: The server must reach conformance tier 1 or 2. + fact: conformance.badge + one_of: [T1, T2] + severity: warn +waivers: + - rule: conformance-tier + owner: platform-team + reason: A known SHOULD gap tracked upstream. + expiry: "2099-01-01T00:00:00Z" + issue: GH-1234 +``` + +The full worked example lives at `examples/policy/policy.yml`. + +### Rules + +Each rule has: + +- `id`: a stable identifier, cited in the report and matched by waivers. +- `description` (optional): a human note shown when you author the rule. +- `fact`: the fact name the rule constrains (see the catalog below). +- exactly one comparator: + - `max`: the fact (a number) must be less than or equal to this value. + - `min`: the fact (a number) must be greater than or equal to this value. + - `equals`: the fact must equal this literal. The comparison follows the + literal's type: `equals: true` compares a boolean fact, `equals: 0` a + numeric fact, `equals: "T1"` a textual fact. + - `one_of`: the fact (rendered to text) must be one of the listed values. +- `severity` (optional, defaults to `fail`): `fail` means a failing rule fails + the gate; `warn` means a failing rule only warns and never fails the gate. + +Setting zero or more than one comparator on a rule makes it `unevaluated`, which +fails the gate, so a malformed rule is loud rather than silent. + +### Waivers + +A waiver suppresses one rule's failure until it expires: + +- `rule`: the rule id it suppresses. +- `owner`: who owns the waiver, so reviewers know whom to ask. +- `reason`: why the failure is tolerated, captured for the audit trail. +- `expiry`: an RFC 3339 UTC timestamp, for example `2026-12-31T00:00:00Z`. +- `issue` (optional): a tracking reference such as a GitHub issue id. + +Waivers fail closed. While a waiver is active, a failing rule is reported as +`waived` and does not fail the gate. 
Once the expiry passes (or if the expiry +does not parse), the waiver no longer suppresses anything: the rule is reported +as `expired-waiver` and the gate fails. A waiver is a dated promise to fix +something, not a permanent exception, and the simulator enforces the date. + +## The fact catalog + +Each artifact flag contributes a fixed set of facts. A fact that an artifact +does not carry is simply absent (and any rule referencing it is `unevaluated`), +except the five security severity counts, which are always emitted (zero when a +clean scan had no findings of that severity) so a `max: 0` rule on them works +even against a clean scan. + +| Fact | Type | Source artifact | +| ---- | ---- | --------------- | +| `run.total` | number | `--run-report` | +| `run.passed` | number | `--run-report` | +| `run.failed` | number | `--run-report` | +| `run.skipped` | number | `--run-report` | +| `run.inconclusive` | number | `--run-report` (when present) | +| `judge.certified` | boolean | `--judge-cert` | +| `judge.ece` | number | `--judge-cert` (expected calibration error) | +| `judge.brier` | number | `--judge-cert` (Brier score) | +| `judge.expired` | boolean | `--judge-cert` (computed from the certification's validity window versus the current time) | +| `conformance.badge` | text | `--conformance-report` (`T1` / `T2` / `T3` / `F`) | +| `conformance.must_passed` | number | `--conformance-report` (MUST checks passed) | +| `conformance.must_total` | number | `--conformance-report` (MUST checks total) | +| `conformance.should_passed` | number | `--conformance-report` (SHOULD checks passed) | +| `conformance.should_total` | number | `--conformance-report` (SHOULD checks total) | +| `conformance.tier` | text | `--conformance-report` | +| `security.critical_count` | number | `--security` | +| `security.high_count` | number | `--security` | +| `security.medium_count` | number | `--security` | +| `security.low_count` | number | `--security` | +| `security.info_count` | number | 
`--security` | +| `security.total_findings` | number | `--security` | +| `model_compat.total` | number | `--model-compat` | +| `model_compat.pass` | number | `--model-compat` | +| `model_compat.drift` | number | `--model-compat` | +| `model_compat.fail` | number | `--model-compat` | +| `evidence.reproducible` | boolean | `--evidence` | +| `evidence.unverifiable_origin` | boolean | `--evidence` | + +## Verdict and exit codes + +Every rule resolves to one of: `pass`, `fail`, `warn`, `waived`, +`expired-waiver`, or `unevaluated`. The overall verdict is the worst outcome +across all rules: + +- `fail` if any rule is `fail`, `expired-waiver`, or `unevaluated`. +- otherwise `warn` if any rule is `warn`. +- otherwise `pass`. + +`waived` and `pass` rules never fail the gate. + +Exit codes follow the verdict only when you ask for a gate: + +- Without `--gate` the command is a dry run: it always exits 0 and just prints + the verdict. This is useful for showing the report without blocking. +- With `--gate` the command exits 1 when the verdict is `fail`, and 0 + otherwise. This is what you wire into CI. + +Two behaviors are worth restating because they keep the gate honest: + +- An expired waiver fails closed. A stale promise to fix something stops + suppressing the failure the moment it expires. +- A missing fact is `unevaluated` and fails. If a rule references a fact and you + did not pass the artifact that provides it, the gate fails rather than + pretending the check passed. + +## How it relates to evidence and judge certification + +The simulator sits one layer above the artifacts. The evidence artifact +(`mcptest evidence`) bundles a run's metadata into one portable, signable file; +the policy simulator can read that file and gate on its `reproducible` and +`unverifiable_origin` flags alongside everything else. 
Judge certification +(`mcptest judge certify`) proves a grading judge is calibrated before its +verdict may gate; the simulator turns that proof into a gate condition with the +`judge.certified` and `judge.expired` facts, so a release can require that the +judge behind its evals was certified and that the certification has not gone +stale. In short, the other commands produce trustworthy artifacts; the policy +simulator decides, locally and reproducibly, whether those artifacts clear the +bar you set. + +See `examples/policy/policy.yml` for a starting point you can copy and trim. diff --git a/examples/policy/policy.yml b/examples/policy/policy.yml new file mode 100644 index 00000000..f955fc99 --- /dev/null +++ b/examples/policy/policy.yml @@ -0,0 +1,30 @@ +# A starter governance policy for `mcptest policy simulate`. +version: "1.0" +rules: + - id: no-failed-tests + description: Every test in the run must pass. + fact: run.failed + max: 0 + - id: judge-certified + description: The grading judge must be certified. + fact: judge.certified + equals: true + - id: judge-not-expired + description: The judge certification must not have expired. + fact: judge.expired + equals: false + - id: conformance-tier + description: The server must reach conformance tier 1 or 2. + fact: conformance.badge + one_of: [T1, T2] + severity: warn + - id: no-critical-security + description: No critical security findings. + fact: security.critical_count + max: 0 +waivers: + - rule: conformance-tier + owner: platform-team + reason: A known SHOULD gap tracked upstream. 
+ expiry: "2099-01-01T00:00:00Z" + issue: GH-1234 diff --git a/scripts/check-examples.sh b/scripts/check-examples.sh index de46ee6c..51663139 100755 --- a/scripts/check-examples.sh +++ b/scripts/check-examples.sh @@ -48,6 +48,8 @@ skip_reason() { echo "expected-failures baseline used by the coverage suite, not a run suite" ;; "examples/pipe-search-then-update.yml") echo "pipe plan (schemas/pipe/v0.json), not a run suite" ;; + "examples/policy/policy.yml") + echo "governance policy file for 'mcptest policy simulate', not a run suite" ;; *) echo "" ;; esac }