diff --git a/Makefile b/Makefile
index f5c9aaa..c48b489 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,14 @@
-.PHONY: demo
+.PHONY: demo demo-resilience
demo:
EVAL_COMPOSE_FILE=deploy/docker-compose.yml \
POSTGRES_DSN=postgres://gateway:gateway@127.0.0.1:15432/gateway?sslmode=disable \
AGENT_URL=http://127.0.0.1:18085 \
go run ./cmd/eval-runner evalsuite/default.yaml
+
+demo-resilience: build-compose-bins
+ @bash scripts/demo-resilience.sh
+
+build-compose-bins:
+ @mkdir -p .compose-bin
+ @GOOS=linux GOARCH=$$(go env GOARCH) CGO_ENABLED=0 go build -o .compose-bin/gateway ./cmd/gateway
diff --git a/cmd/eval-runner/config.go b/cmd/eval-runner/config.go
index 3fd5b53..0470a23 100644
--- a/cmd/eval-runner/config.go
+++ b/cmd/eval-runner/config.go
@@ -12,6 +12,7 @@ type Config struct {
PostgresDSN string
ComposeFile string
AgentURL string
+ SkipCompose bool
}
func LoadConfig() (*Config, error) {
@@ -29,6 +30,7 @@ func LoadConfig() (*Config, error) {
PostgresDSN: postgresDSN,
ComposeFile: envStringWithInfoNotice("EVAL_COMPOSE_FILE", defaultComposeFilePath, "using default compose file path"),
AgentURL: agentURL,
+ SkipCompose: os.Getenv("EVAL_SKIP_COMPOSE") == "true",
}, nil
}
diff --git a/cmd/eval-runner/main.go b/cmd/eval-runner/main.go
index 07ec9dd..42959da 100644
--- a/cmd/eval-runner/main.go
+++ b/cmd/eval-runner/main.go
@@ -44,14 +44,31 @@ type evalRunnerDeps struct {
}
func main() {
+ args := os.Args[1:]
+
+ if len(args) > 0 && args[0] == "--serve" {
+ suitePath := defaultSuitePath
+ if len(args) > 1 {
+ suitePath = args[1]
+ }
+ if err := serve(suitePath); err != nil {
+ _, _ = fmt.Fprintln(os.Stderr, err.Error())
+ os.Exit(1)
+ }
+ return
+ }
+
os.Exit(run(evalRunnerDeps{
- args: os.Args[1:],
+ args: args,
stdout: os.Stdout,
stderr: os.Stderr,
lookPath: exec.LookPath,
loadConfig: LoadConfig,
loadSuite: LoadSuite,
newOrch: func(cfg *Config) stackOrchestrator {
+ if cfg.SkipCompose {
+ return noopOrchestrator{}
+ }
return NewOrchestrator(cfg.ComposeFile, defaultComposeProjectName)
},
openDB: openPostgresPool,
diff --git a/cmd/eval-runner/orchestrator.go b/cmd/eval-runner/orchestrator.go
index c73aeff..18fcec7 100644
--- a/cmd/eval-runner/orchestrator.go
+++ b/cmd/eval-runner/orchestrator.go
@@ -63,6 +63,11 @@ func (o *Orchestrator) runCompose(ctx context.Context, args ...string) (string,
return combined.String(), nil
}
+type noopOrchestrator struct{}
+
+func (noopOrchestrator) Up(_ context.Context) error { return nil }
+func (noopOrchestrator) Down(_ context.Context) error { return nil }
+
func tailLines(text string, count int) string {
lines := strings.Split(strings.TrimSpace(text), "\n")
if len(lines) == 0 || lines[0] == "" {
diff --git a/cmd/eval-runner/runner.go b/cmd/eval-runner/runner.go
index 6dc5283..ad03df0 100644
--- a/cmd/eval-runner/runner.go
+++ b/cmd/eval-runner/runner.go
@@ -13,7 +13,7 @@ import (
"github.com/jackc/pgx/v5/pgxpool"
)
-const caseRunnerHTTPTimeout = 60 * time.Second
+const caseRunnerHTTPTimeout = 90 * time.Second
type CaseRunner struct {
AgentBaseURL string
@@ -31,12 +31,39 @@ func NewCaseRunner(agentBaseURL string, db *pgxpool.Pool) *CaseRunner {
}
}
+const auditPollInterval = 300 * time.Millisecond
+const auditPollTimeout = 30 * time.Second
+
func (r *CaseRunner) Run(ctx context.Context, c EvalCase) ([]TraceRow, error) {
sessionID, err := r.trigger(ctx, c.Input)
if err != nil {
return nil, err
}
+ // AuditWriter is async. Poll until the last row's decision matches the
+ // expected policyOutcome (or until timeout). This avoids returning before
+ // the terminal record (e.g. upstream_error written after allow) is flushed.
+ deadline := time.Now().Add(auditPollTimeout)
+ for {
+ trace, err := r.queryTrace(ctx, sessionID)
+ if err != nil {
+ return nil, err
+ }
+ if len(trace) > 0 && trace[len(trace)-1].Decision == c.PolicyOutcome {
+ return trace, nil
+ }
+ if time.Now().After(deadline) {
+ return trace, nil
+ }
+ select {
+ case <-ctx.Done():
+ return nil, ctx.Err()
+ case <-time.After(auditPollInterval):
+ }
+ }
+}
+
+func (r *CaseRunner) queryTrace(ctx context.Context, sessionID string) ([]TraceRow, error) {
rows, err := r.DB.Query(
ctx,
`SELECT tool_name, decision, arguments
diff --git a/cmd/eval-runner/serve.go b/cmd/eval-runner/serve.go
new file mode 100644
index 0000000..06a004a
--- /dev/null
+++ b/cmd/eval-runner/serve.go
@@ -0,0 +1,170 @@
+package main
+
+import (
+ "context"
+ _ "embed"
+ "encoding/json"
+ "fmt"
+ "log/slog"
+ "net/http"
+ "os"
+ "strings"
+
+ "github.com/jackc/pgx/v5/pgxpool"
+)
+
+//go:embed ui.html
+var uiHTML []byte
+
+type evalResponse struct {
+ Passed bool `json:"passed"`
+ PassCount int `json:"pass_count"`
+ TotalCount int `json:"total_count"`
+ Cases []CaseResult `json:"cases"`
+ Report string `json:"report"`
+}
+
+func serve(suitePath string) error {
+ cfg, err := LoadConfig()
+ if err != nil {
+ return err
+ }
+
+ suite, err := LoadSuite(suitePath)
+ if err != nil {
+ return fmt.Errorf("load suite: %w", err)
+ }
+
+ ctx := context.Background()
+ db, err := openPostgresPool(ctx, cfg.PostgresDSN)
+ if err != nil {
+ return fmt.Errorf("connect to postgres: %w", err)
+ }
+ defer db.Close()
+
+ pool, ok := db.(*pgxpool.Pool)
+ if !ok {
+ return fmt.Errorf("database connection is not a *pgxpool.Pool")
+ }
+ runner := NewCaseRunner(cfg.AgentURL, pool)
+
+ // AI agent runner — optional, only active when AI_AGENT_URL is set
+ aiAgentURL := os.Getenv("AI_AGENT_URL")
+ var aiRunner caseExecutor
+ var aiSuite *EvalSuite
+ if aiAgentURL != "" {
+ aiRunner = NewCaseRunner(aiAgentURL, pool)
+ aiSuitePath := os.Getenv("AI_SUITE_PATH")
+ if aiSuitePath == "" {
+ aiSuitePath = "evalsuite/ai-agent.yaml"
+ }
+ aiSuite, err = LoadSuite(aiSuitePath)
+ if err != nil {
+ return fmt.Errorf("load AI suite: %w", err)
+ }
+ }
+
+ port := os.Getenv("EVAL_SERVE_PORT")
+ if port == "" {
+ port = "8099"
+ }
+
+ http.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) {
+ w.Header().Set("Content-Type", "text/html; charset=utf-8")
+ _, _ = w.Write(uiHTML)
+ })
+
+ http.HandleFunc("POST /run-eval", makeEvalHandler(runner, suite, pool))
+
+ http.HandleFunc("POST /run-eval/ai", func(w http.ResponseWriter, r *http.Request) {
+ if aiRunner == nil {
+ http.Error(w, `{"error":"AI_AGENT_URL not configured"}`, http.StatusServiceUnavailable)
+ return
+ }
+ makeEvalHandler(aiRunner, aiSuite, pool)(w, r)
+ })
+
+ http.HandleFunc("POST /run-eval/custom", makeCustomEvalHandler(pool))
+
+ http.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
+ w.WriteHeader(http.StatusOK)
+ })
+
+ slog.Info("eval server listening", "port", port)
+ return http.ListenAndServe(":"+port, nil)
+}
+
+func makeCustomEvalHandler(pool *pgxpool.Pool) http.HandlerFunc {
+ return func(w http.ResponseWriter, r *http.Request) {
+ var body struct {
+ Suite string `json:"suite"`
+ AgentURL string `json:"agent_url"`
+ }
+ if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+ http.Error(w, fmt.Sprintf("invalid request: %v", err), http.StatusBadRequest)
+ return
+ }
+ if body.AgentURL == "" {
+ http.Error(w, "missing agent_url", http.StatusBadRequest)
+ return
+ }
+ if body.Suite == "" {
+ http.Error(w, "missing suite", http.StatusBadRequest)
+ return
+ }
+
+ suite, err := LoadSuiteFromReader(strings.NewReader(body.Suite))
+ if err != nil {
+ http.Error(w, fmt.Sprintf("invalid suite: %v", err), http.StatusBadRequest)
+ return
+ }
+
+ runner := NewCaseRunner(body.AgentURL, pool)
+ makeEvalHandler(runner, suite, pool)(w, r)
+ }
+}
+
+func makeEvalHandler(runner caseExecutor, suite *EvalSuite, _ *pgxpool.Pool) http.HandlerFunc {
+ return func(w http.ResponseWriter, r *http.Request) {
+ results := make([]CaseResult, 0, len(suite.Cases))
+ for _, testCase := range suite.Cases {
+ trace, err := runner.Run(r.Context(), testCase)
+ result := CaseResult{Name: testCase.Name}
+ if err != nil {
+ result.Failures = []CheckFailure{{
+ Check: "run",
+ Expected: "case completes successfully",
+ Observed: err.Error(),
+ }}
+ } else {
+ result = Evaluate(testCase, trace)
+ }
+ results = append(results, result)
+ }
+
+ passCount := 0
+ for _, r := range results {
+ if r.Passed {
+ passCount++
+ }
+ }
+
+ report := GenerateReport(results)
+
+ if r.Header.Get("Accept") == "application/json" {
+ resp := evalResponse{
+ Passed: passCount == len(results),
+ PassCount: passCount,
+ TotalCount: len(results),
+ Cases: results,
+ Report: report,
+ }
+ w.Header().Set("Content-Type", "application/json")
+ _ = json.NewEncoder(w).Encode(resp)
+ return
+ }
+
+ w.Header().Set("Content-Type", "text/plain")
+ _, _ = fmt.Fprint(w, report)
+ }
+}
diff --git a/cmd/eval-runner/suite.go b/cmd/eval-runner/suite.go
index f4147a9..1b4d16d 100644
--- a/cmd/eval-runner/suite.go
+++ b/cmd/eval-runner/suite.go
@@ -2,6 +2,7 @@ package main
import (
"fmt"
+ "io"
"os"
"gopkg.in/yaml.v3"
@@ -12,6 +13,7 @@ var allowedPolicyOutcomes = map[string]struct{}{
"deny": {},
"approvalRequired": {},
"expired": {},
+ "upstream_error": {},
}
func LoadSuite(path string) (*EvalSuite, error) {
@@ -21,10 +23,18 @@ func LoadSuite(path string) (*EvalSuite, error) {
}
defer func() { _ = file.Close() }()
- var suite EvalSuite
- if err := yaml.NewDecoder(file).Decode(&suite); err != nil {
+ suite, err := LoadSuiteFromReader(file)
+ if err != nil {
return nil, fmt.Errorf("parse eval suite %q: %w", path, err)
}
+ return suite, nil
+}
+
+func LoadSuiteFromReader(r io.Reader) (*EvalSuite, error) {
+ var suite EvalSuite
+ if err := yaml.NewDecoder(r).Decode(&suite); err != nil {
+ return nil, fmt.Errorf("parse eval suite: %w", err)
+ }
for i, evalCase := range suite.Cases {
if evalCase.Name == "" {
diff --git a/cmd/eval-runner/suite_test.go b/cmd/eval-runner/suite_test.go
index ab53c60..deaeaac 100644
--- a/cmd/eval-runner/suite_test.go
+++ b/cmd/eval-runner/suite_test.go
@@ -212,6 +212,25 @@ func TestLoadSuiteLoadsRepoDefaultFixtureFromRepoRoot(t *testing.T) {
}
}
+func TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome(t *testing.T) {
+ dir := t.TempDir()
+ path := filepath.Join(dir, "suite.yaml")
+ writeTestFile(t, path, `
+cases:
+ - name: mcp-down
+ input: "Show me my recent charges."
+ mustInclude: [list_recent_charges]
+ policyOutcome: upstream_error
+`)
+ suite, err := LoadSuite(path)
+ if err != nil {
+ t.Fatalf("LoadSuite() error = %v, want nil", err)
+ }
+ if suite.Cases[0].PolicyOutcome != "upstream_error" {
+ t.Fatalf("PolicyOutcome = %q, want upstream_error", suite.Cases[0].PolicyOutcome)
+ }
+}
+
func writeTestFile(t *testing.T, path string, contents string) {
t.Helper()
if err := os.WriteFile(path, []byte(strings.TrimLeft(contents, "\n")), 0o600); err != nil {
diff --git a/cmd/eval-runner/types.go b/cmd/eval-runner/types.go
index aebb823..8782278 100644
--- a/cmd/eval-runner/types.go
+++ b/cmd/eval-runner/types.go
@@ -22,13 +22,13 @@ type TraceRow struct {
}
type CheckFailure struct {
- Check string
- Expected string
- Observed string
+ Check string `json:"check"`
+ Expected string `json:"expected"`
+ Observed string `json:"observed"`
}
type CaseResult struct {
- Name string
- Passed bool
- Failures []CheckFailure
+ Name string `json:"name"`
+ Passed bool `json:"passed"`
+ Failures []CheckFailure `json:"failures"`
}
diff --git a/cmd/eval-runner/ui.html b/cmd/eval-runner/ui.html
new file mode 100644
index 0000000..5361e7e
--- /dev/null
+++ b/cmd/eval-runner/ui.html
@@ -0,0 +1,163 @@
+
+
+
+
+
+ ToolGate Eval Runner
+
+
+
+
+
ToolGate Eval Runner
+
+
+
+
+
+
+
Base URL of the agent to evaluate (must expose a /trigger endpoint).
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Results
+
+
+
+
+
+
+ | Case |
+ Status |
+ Failures |
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/cmd/gateway/approval_bridge.go b/cmd/gateway/approval_bridge.go
index 79116b0..18eb9c4 100644
--- a/cmd/gateway/approval_bridge.go
+++ b/cmd/gateway/approval_bridge.go
@@ -108,6 +108,7 @@ func NewRedisApprovalBridge(
tickets *TicketStore,
locker *SessionLocker,
lockTTL time.Duration,
+ approvalTimeout time.Duration,
log *slog.Logger,
) *RedisApprovalBridge {
if log == nil {
@@ -117,7 +118,7 @@ func NewRedisApprovalBridge(
redis: rdb,
tickets: tickets,
locker: locker,
- timeout: 5 * time.Minute,
+ timeout: approvalTimeout,
lockExtendInterval: lockTTL / 2,
log: log,
}
diff --git a/cmd/gateway/approval_bridge_integration_test.go b/cmd/gateway/approval_bridge_integration_test.go
index 3bbbbb7..dac4f29 100644
--- a/cmd/gateway/approval_bridge_integration_test.go
+++ b/cmd/gateway/approval_bridge_integration_test.go
@@ -211,7 +211,7 @@ func newApprovalBridgeIntegrationHarness(t *testing.T, timeout, lockTTL, lockExt
store := NewTicketStore(pool)
locker := NewSessionLocker(redisClient, lockTTL, 250*time.Millisecond)
- bridge := NewRedisApprovalBridge(redisClient, store, locker, lockTTL, slog.New(slog.NewTextHandler(io.Discard, nil)))
+ bridge := NewRedisApprovalBridge(redisClient, store, locker, lockTTL, 5*time.Minute, slog.New(slog.NewTextHandler(io.Discard, nil)))
bridge.timeout = timeout
bridge.lockExtendInterval = lockExtendInterval
diff --git a/cmd/gateway/capability_cache.go b/cmd/gateway/capability_cache.go
new file mode 100644
index 0000000..d90444e
--- /dev/null
+++ b/cmd/gateway/capability_cache.go
@@ -0,0 +1,68 @@
+package main
+
+import (
+ "encoding/json"
+ "sync"
+
+ "github.com/K8Harness/ToolGate/core/mcp"
+)
+
+// capabilityCache stores the last successful initialize and tools/list responses
+// so the gateway can serve them when the upstream MCP server is temporarily unavailable.
+type capabilityCache struct {
+ mu sync.RWMutex
+ initResp json.RawMessage
+ toolResp json.RawMessage
+}
+
+func (c *capabilityCache) setInit(resp *mcp.JSONRPCResponse) {
+ if resp == nil {
+ return
+ }
+ b, err := json.Marshal(resp)
+ if err != nil {
+ return
+ }
+ c.mu.Lock()
+ c.initResp = b
+ c.mu.Unlock()
+}
+
+func (c *capabilityCache) getInit(id json.RawMessage) *mcp.JSONRPCResponse {
+ c.mu.RLock()
+ b := c.initResp
+ c.mu.RUnlock()
+ return unmarshalWithID(b, id)
+}
+
+func (c *capabilityCache) setToolList(resp *mcp.JSONRPCResponse) {
+ if resp == nil {
+ return
+ }
+ b, err := json.Marshal(resp)
+ if err != nil {
+ return
+ }
+ c.mu.Lock()
+ c.toolResp = b
+ c.mu.Unlock()
+}
+
+func (c *capabilityCache) getToolList(id json.RawMessage) *mcp.JSONRPCResponse {
+ c.mu.RLock()
+ b := c.toolResp
+ c.mu.RUnlock()
+ return unmarshalWithID(b, id)
+}
+
+func unmarshalWithID(b json.RawMessage, id json.RawMessage) *mcp.JSONRPCResponse {
+ if b == nil {
+ return nil
+ }
+ var resp mcp.JSONRPCResponse
+ if err := json.Unmarshal(b, &resp); err != nil {
+ return nil
+ }
+ resp.ID = id
+ return &resp
+}
diff --git a/cmd/gateway/config.go b/cmd/gateway/config.go
index 319e643..e06b030 100644
--- a/cmd/gateway/config.go
+++ b/cmd/gateway/config.go
@@ -30,6 +30,7 @@ type Config struct {
SessionTTL time.Duration
SessionLockTTL time.Duration
LockAcquireTimeout time.Duration
+ ApprovalLockTTL time.Duration // APPROVAL_LOCK_TTL (optional, default 5m)
SlackBotToken string // SLACK_BOT_TOKEN (required)
SlackSigningSecret string // SLACK_SIGNING_SECRET (required)
SlackChannel string // SLACK_CHANNEL (required)
@@ -92,6 +93,11 @@ func LoadConfig() (*Config, error) {
return nil, err
}
+ approvalLockTTL, err := envDuration("APPROVAL_LOCK_TTL", 5*time.Minute)
+ if err != nil {
+ return nil, err
+ }
+
return &Config{
ListenPort: listenPort,
PolicyFilePath: envStringWithInfoNotice("POLICY_FILE", defaultPolicyFilePath, "using default policy file path"),
@@ -103,6 +109,7 @@ func LoadConfig() (*Config, error) {
SessionTTL: sessionTTL,
SessionLockTTL: sessionLockTTL,
LockAcquireTimeout: lockAcquireTimeout,
+ ApprovalLockTTL: approvalLockTTL,
SlackBotToken: slackBotToken,
SlackSigningSecret: slackSigningSecret,
SlackChannel: slackChannel,
diff --git a/cmd/gateway/config_test.go b/cmd/gateway/config_test.go
index 128ecc7..1a1e1e6 100644
--- a/cmd/gateway/config_test.go
+++ b/cmd/gateway/config_test.go
@@ -346,6 +346,49 @@ func TestLoadConfigReadsSlackVars(t *testing.T) {
}
}
+func TestLoadConfigReadsApprovalLockTTL(t *testing.T) {
+ setRequiredEnv(t)
+ t.Setenv("APPROVAL_LOCK_TTL", "15s")
+
+ cfg, err := LoadConfig()
+ if err != nil {
+ t.Fatalf("LoadConfig() error = %v", err)
+ }
+ if cfg.ApprovalLockTTL != 15*time.Second {
+ t.Fatalf("ApprovalLockTTL = %v, want 15s", cfg.ApprovalLockTTL)
+ }
+}
+
+func TestLoadConfigDefaultsApprovalLockTTLToFiveMinutes(t *testing.T) {
+ setRequiredEnv(t)
+ t.Setenv("APPROVAL_LOCK_TTL", "")
+
+ cfg, err := LoadConfig()
+ if err != nil {
+ t.Fatalf("LoadConfig() error = %v", err)
+ }
+ if cfg.ApprovalLockTTL != 5*time.Minute {
+ t.Fatalf("ApprovalLockTTL = %v, want 5m0s", cfg.ApprovalLockTTL)
+ }
+}
+
+func setRequiredEnv(t *testing.T) {
+ t.Helper()
+ t.Setenv("GATEWAY_PORT", "")
+ t.Setenv("POLICY_FILE", "")
+ t.Setenv("POSTGRES_DSN", "postgres://gateway:gateway@localhost:5432/gateway?sslmode=disable")
+ t.Setenv("REDIS_DSN", "redis://localhost:6379/0")
+ t.Setenv("UPSTREAM_MCP_URL", "http://upstream.example/mcp")
+ t.Setenv("TURN_ID_HEADER", "")
+ t.Setenv("UPSTREAM_TIMEOUT", "")
+ t.Setenv("SESSION_TTL", "")
+ t.Setenv("SESSION_LOCK_TTL", "")
+ t.Setenv("LOCK_ACQUIRE_TIMEOUT", "")
+ t.Setenv("SLACK_BOT_TOKEN", "xoxb-default-token")
+ t.Setenv("SLACK_SIGNING_SECRET", "default-signing-secret")
+ t.Setenv("SLACK_CHANNEL", "#approvals")
+}
+
func setDefaultLoggerForTest(dst *bytes.Buffer) func() {
previous := slog.Default()
logger := slog.New(slog.NewTextHandler(dst, &slog.HandlerOptions{Level: slog.LevelInfo}))
diff --git a/cmd/gateway/db.go b/cmd/gateway/db.go
index 984a1c7..fde1bb9 100644
--- a/cmd/gateway/db.go
+++ b/cmd/gateway/db.go
@@ -18,11 +18,28 @@ var schemaStatements = []string{
tool_name TEXT NOT NULL,
arguments JSONB NOT NULL,
decision TEXT NOT NULL
- CHECK (decision IN ('allow', 'deny', 'approvalRequired', 'budgetExceeded')),
+ CHECK (decision IN ('allow', 'deny', 'approvalRequired', 'budgetExceeded', 'upstream_error', 'expired')),
reason TEXT,
decided_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
)`,
`CREATE INDEX IF NOT EXISTS audit_log_session_turn ON audit_log (session_id, turn_id)`,
+ // Repair: extend decision check constraint to include upstream_error and expired.
+ // Idempotent: no-op when constraint already covers the full set.
+ `DO $$
+DECLARE
+ cname TEXT;
+BEGIN
+ SELECT conname INTO cname
+ FROM pg_constraint
+ WHERE conrelid = 'audit_log'::regclass
+ AND contype = 'c'
+ AND pg_get_constraintdef(oid) NOT LIKE '%upstream_error%';
+ IF cname IS NOT NULL THEN
+ EXECUTE format('ALTER TABLE audit_log DROP CONSTRAINT %I', cname);
+ ALTER TABLE audit_log ADD CONSTRAINT audit_log_decision_check
+ CHECK (decision IN ('allow', 'deny', 'approvalRequired', 'budgetExceeded', 'upstream_error', 'expired'));
+ END IF;
+END $$`,
`CREATE TABLE IF NOT EXISTS ticket (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
session_id TEXT NOT NULL,
diff --git a/cmd/gateway/main.go b/cmd/gateway/main.go
index d6b5565..e65ff07 100644
--- a/cmd/gateway/main.go
+++ b/cmd/gateway/main.go
@@ -86,7 +86,7 @@ func buildGatewayServer(ctx context.Context, config *Config, logger *slog.Logger
ticketStore := NewTicketStore(pool)
sessionLocker := NewSessionLocker(redisClient, config.SessionLockTTL, config.LockAcquireTimeout)
slackNotifier := NewSlackClient(config.SlackBotToken, config.SlackChannel, config.SlackAPIBaseURL, logger)
- approvalBridge := NewRedisApprovalBridge(redisClient, ticketStore, sessionLocker, config.SessionLockTTL, logger)
+ approvalBridge := NewRedisApprovalBridge(redisClient, ticketStore, sessionLocker, config.SessionLockTTL, config.ApprovalLockTTL, logger)
slackWebhook := NewSlackWebhookHandler(config.SlackSigningSecret, ticketStore, redisClient, logger)
policyGate := NewPolicyGateHandler(policy, budgetTracker, auditWriter, ticketStore, approvalBridge, slackNotifier, logger)
turnRWLock := NewTurnRWLock(redisClient, config.SessionLockTTL, config.LockAcquireTimeout)
@@ -100,6 +100,7 @@ func buildGatewayServer(ctx context.Context, config *Config, logger *slog.Logger
pipeline.Use(policyGate)
server := NewServer(config, pipeline, logger)
+ server.audit = auditWriter
server.forwarder = forwarder
server.guard = guard
server.SetSlackWebhookHandler(slackWebhook)
diff --git a/cmd/gateway/policy_gate.go b/cmd/gateway/policy_gate.go
index 04e6c38..6d81e5b 100644
--- a/cmd/gateway/policy_gate.go
+++ b/cmd/gateway/policy_gate.go
@@ -212,6 +212,14 @@ func (h *PolicyGateHandler) Handle(ctx context.Context, req *mcp.JSONRPCRequest)
decision, err := h.bridge.WaitForDecision(ctx, ticketID, sessionID, turnID)
if errors.Is(err, ErrApprovalTimeout) {
h.log.Error("approval timed out", "ticketID", ticketID, "sessionID", sessionID, "turnID", turnID)
+ h.audit.Write(AuditRecord{
+ SessionID: sessionID,
+ TurnID: turnID,
+ ToolName: toolName,
+ Arguments: arguments,
+ Decision: "expired",
+ Reason: "approval timeout",
+ })
return approvalErrorResponse(req.ID, "approval timeout"), nil
}
if err != nil || !decision.Approved {
diff --git a/cmd/gateway/policy_gate_test.go b/cmd/gateway/policy_gate_test.go
index 5f83f74..0c879e7 100644
--- a/cmd/gateway/policy_gate_test.go
+++ b/cmd/gateway/policy_gate_test.go
@@ -583,12 +583,13 @@ func TestPolicyGateHandlerApprovalHoldBridgeErrorReturnsDenied(t *testing.T) {
}
func TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError(t *testing.T) {
+ audit := &policyGateAuditStub{}
bridge := &mockApprovalBridge{err: ErrApprovalTimeout}
notifier := newMockSlackNotifier(nil)
handler := newPolicyGateHandler(
&corepolicy.AgentPolicy{Budgets: corepolicy.Budgets{MaxToolCallsPerTurn: 3}},
NewBudgetTracker(),
- &policyGateAuditStub{},
+ audit,
&policyGateTicketStub{},
&policyGateEvaluatorStub{decision: corepolicy.PolicyDecision{Action: corepolicy.ActionApprovalRequired}},
bridge,
@@ -610,6 +611,20 @@ func TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError(t *testing.T) {
if resp.Error.Message != "approval timeout" {
t.Fatalf("error message = %q, want %q", resp.Error.Message, "approval timeout")
}
+
+ // Verify expired audit record written after the approvalRequired record.
+ var expiredRecord *AuditRecord
+ for i := range audit.records {
+ if audit.records[i].Decision == "expired" {
+ expiredRecord = &audit.records[i]
+ }
+ }
+ if expiredRecord == nil {
+ t.Fatalf("no expired audit record written; got records: %+v", audit.records)
+ }
+ if expiredRecord.SessionID != "session-timeout" {
+ t.Fatalf("expired record SessionID = %q, want session-timeout", expiredRecord.SessionID)
+ }
}
func TestPolicyGateHandlerRedactMasksFieldAndAuditsAllow(t *testing.T) {
diff --git a/cmd/gateway/server.go b/cmd/gateway/server.go
index 49757d1..3b68f9a 100644
--- a/cmd/gateway/server.go
+++ b/cmd/gateway/server.go
@@ -34,6 +34,8 @@ type Server struct {
sessions *SessionRegistry
mux *http.ServeMux
log *slog.Logger
+ audit auditRecorder // nil-safe; set by buildGatewayServer
+ capCache capabilityCache // caches last good initialize/tools/list for upstream-down resilience
}
func NewServer(config *Config, pipeline *mcp.Pipeline, log *slog.Logger) *Server {
@@ -119,9 +121,11 @@ func (s *Server) handleMCPPost(w http.ResponseWriter, r *http.Request) {
}
toolName := ""
+ var toolArguments json.RawMessage
if req.Method == "tools/call" {
- if name, ok := toolNameFromParams(req.Params); ok {
+ if name, args, parseErr := parseToolCallParams(req.Params); parseErr == nil {
toolName = name
+ toolArguments = args
}
}
@@ -129,6 +133,22 @@ func (s *Server) handleMCPPost(w http.ResponseWriter, r *http.Request) {
if err != nil {
if req.Method == "tools/call" {
NewRequestLogger(s.log).LogOutcome(r.Context(), req, nil, err)
+ if toolName != "" && s.audit != nil {
+ s.audit.Write(AuditRecord{
+ SessionID: sessionID,
+ TurnID: mcp.TurnIDFromContext(r.Context()),
+ ToolName: toolName,
+ Arguments: toolArguments,
+ Decision: "upstream_error",
+ Reason: err.Error(),
+ })
+ }
+ }
+ if req.Method == "tools/list" {
+ if cached := s.capCache.getToolList(req.ID); cached != nil {
+ s.writeJSONResponse(w, cached)
+ return
+ }
}
s.errorResponse(w, req.ID, jsonRPCCode(err), err.Error())
return
@@ -137,6 +157,9 @@ func (s *Server) handleMCPPost(w http.ResponseWriter, r *http.Request) {
if req.Method == "tools/call" {
NewRequestLogger(s.log).LogOutcome(r.Context(), req, resp, nil)
}
+ if req.Method == "tools/list" {
+ s.capCache.setToolList(resp)
+ }
s.writeJSONResponse(w, resp)
}
@@ -236,9 +259,14 @@ func (s *Server) writeInitializeResponse(w http.ResponseWriter, ctx context.Cont
resp, err := s.forwarder.Handle(ctx, req)
if err != nil {
+ if cached := s.capCache.getInit(req.ID); cached != nil {
+ s.writeJSONResponse(w, cached)
+ return
+ }
s.errorResponse(w, req.ID, jsonRPCCode(err), err.Error())
return
}
+ s.capCache.setInit(resp)
s.writeJSONResponse(w, resp)
}
diff --git a/cmd/gateway/server_test.go b/cmd/gateway/server_test.go
index 890f610..2aaf0ef 100644
--- a/cmd/gateway/server_test.go
+++ b/cmd/gateway/server_test.go
@@ -5,6 +5,7 @@ import (
"bytes"
"context"
"encoding/json"
+ "fmt"
"io"
"log/slog"
"net/http"
@@ -448,3 +449,38 @@ func assertErrorCode(t *testing.T, rec *httptest.ResponseRecorder, want int) {
t.Fatalf("error.code = %d, want %d", resp.Error.Code, want)
}
}
+
+type captureAuditWriter struct {
+ records []AuditRecord
+}
+
+func (c *captureAuditWriter) Write(r AuditRecord) {
+ c.records = append(c.records, r)
+}
+
+func TestServerToolsCallWritesUpstreamErrorAuditOnForwarderFailure(t *testing.T) {
+ audit := &captureAuditWriter{}
+ server := newTestServer(t, &captureHandler{})
+ server.audit = audit
+ server.pipeline = mcp.NewPipeline(mcp.HandlerFunc(func(ctx context.Context, req *mcp.JSONRPCRequest) (*mcp.JSONRPCResponse, error) {
+ return nil, fmt.Errorf("connection refused")
+ }))
+ session := server.sessions.Create()
+
+ req := httptest.NewRequest(http.MethodPost, "/mcp",
+ strings.NewReader(`{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"list_recent_charges","arguments":{}}}`))
+ req.Header.Set(mcpSessionIDHeader, session.ID)
+ rec := httptest.NewRecorder()
+
+ server.ServeHTTP(rec, req)
+
+ if len(audit.records) != 1 {
+ t.Fatalf("audit records = %d, want 1", len(audit.records))
+ }
+ if audit.records[0].Decision != "upstream_error" {
+ t.Fatalf("Decision = %q, want upstream_error", audit.records[0].Decision)
+ }
+ if audit.records[0].ToolName != "list_recent_charges" {
+ t.Fatalf("ToolName = %q, want list_recent_charges", audit.records[0].ToolName)
+ }
+}
diff --git a/deploy/docker-compose.yml b/deploy/docker-compose.yml
index a93c668..c35a342 100644
--- a/deploy/docker-compose.yml
+++ b/deploy/docker-compose.yml
@@ -80,48 +80,6 @@ services:
networks:
- eval-gate
- fake-stripe:
- build:
- context: ..
- dockerfile: examples/fake-mcp-servers/stripe/Dockerfile
- expose:
- - "8082"
- healthcheck:
- test:
- [
- "CMD-SHELL",
- "wget -q -O /dev/null --header='Content-Type: application/json' --post-data='{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"initialize\",\"params\":{\"protocolVersion\":\"2025-03-26\",\"capabilities\":{},\"clientInfo\":{\"name\":\"healthcheck\",\"version\":\"1.0.0\"}}}' http://127.0.0.1:8082/mcp",
- ]
- interval: 5s
- timeout: 5s
- retries: 12
- start_period: 5s
- networks:
- - eval-gate
-
- fake-upstream:
- build:
- context: ..
- dockerfile_inline: |
- FROM python:3.12-alpine
- WORKDIR /app
- COPY scripts/fake_upstream.py /app/fake_upstream.py
- ENTRYPOINT ["python", "/app/fake_upstream.py"]
- expose:
- - "8081"
- healthcheck:
- test:
- [
- "CMD-SHELL",
- "wget -q -O /dev/null --header='Content-Type: application/json' --post-data='{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"initialize\",\"params\":{\"client\":\"healthcheck\"}}' http://127.0.0.1:8081/mcp",
- ]
- interval: 5s
- timeout: 5s
- retries: 12
- start_period: 5s
- networks:
- - eval-gate
-
localstripe:
build:
context: ../localstripe_demo
@@ -174,37 +132,22 @@ services:
networks:
- eval-gate
- fake-zendesk:
+ localstripe-seed:
build:
- context: ..
- dockerfile: examples/fake-mcp-servers/zendesk/Dockerfile
- expose:
- - "8083"
- healthcheck:
- test:
- [
- "CMD-SHELL",
- "wget -q -O /dev/null --header='Content-Type: application/json' --post-data='{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"initialize\",\"params\":{\"protocolVersion\":\"2025-03-26\",\"capabilities\":{},\"clientInfo\":{\"name\":\"healthcheck\",\"version\":\"1.0.0\"}}}' http://127.0.0.1:8083/mcp",
- ]
- interval: 5s
- timeout: 5s
- retries: 12
- start_period: 5s
- networks:
- - eval-gate
-
- fake-slack:
- build:
- context: ..
- dockerfile: examples/fake-mcp-servers/slack/Dockerfile
- expose:
- - "8084"
- healthcheck:
- test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8084/inspect"]
- interval: 5s
- timeout: 5s
- retries: 12
- start_period: 5s
+ context: ../localstripe_demo
+ dockerfile_inline: |
+ FROM python:3.12-alpine
+ WORKDIR /app
+ COPY . .
+ RUN pip install --no-cache-dir -e ".[webapp]"
+ ENTRYPOINT ["localstripe-seed"]
+ depends_on:
+ localstripe:
+ condition: service_healthy
+ environment:
+ LOCALSTRIPE_URL: http://localstripe:8420
+ LOCALSTRIPE_API_KEY: sk_test_12345
+ restart: "no"
networks:
- eval-gate
@@ -257,49 +200,108 @@ services:
networks:
- eval-gate
- demo-webapp:
+ eval-trigger:
build:
context: ../localstripe_demo
dockerfile_inline: |
FROM python:3.12-alpine
WORKDIR /app
COPY . .
- RUN pip install --no-cache-dir -e ".[webapp]"
- ENTRYPOINT ["demo-webapp"]
+ RUN pip install --no-cache-dir -e ".[agent]"
+ ENTRYPOINT ["localstripe-eval-trigger"]
depends_on:
gateway:
condition: service_healthy
- localstripe:
- condition: service_healthy
environment:
MCP_URL: http://gateway:8080/mcp
- LOCALSTRIPE_URL: http://localstripe:8420
- LOCALSTRIPE_API_KEY: sk_test_12345
- WEBAPP_HOST: "0.0.0.0"
- WEBAPP_PORT: "8422"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
ANTHROPIC_MODEL: ${ANTHROPIC_MODEL:-claude-sonnet-4-6}
+ TRIGGER_PORT: "8086"
+ healthcheck:
+ test:
+ [
+ "CMD",
+ "python",
+ "-c",
+ "import socket; s = socket.create_connection(('127.0.0.1', 8086), 2); s.close()",
+ ]
+ interval: 5s
+ timeout: 5s
+ retries: 12
+ start_period: 10s
ports:
- - "18422:8422"
+ - "18086:8086"
networks:
- eval-gate
- localstripe-refund-agent:
+ eval-server:
+ build:
+ context: ..
+ dockerfile_inline: |
+ FROM golang:1.25-alpine AS builder
+ WORKDIR /build
+ COPY go.mod go.sum ./
+ RUN go mod download
+ COPY . .
+ RUN CGO_ENABLED=0 GOOS=linux go build -o /eval-server ./cmd/eval-runner
+
+ FROM alpine:latest
+ WORKDIR /app
+ COPY --from=builder /eval-server /eval-server
+ COPY evalsuite/ /app/evalsuite/
+ ENTRYPOINT ["/eval-server", "--serve", "/app/evalsuite/default.yaml"]
+ depends_on:
+ gateway:
+ condition: service_healthy
+ postgres:
+ condition: service_healthy
+ support-agent:
+ condition: service_healthy
+ eval-trigger:
+ condition: service_healthy
+ localstripe-seed:
+ condition: service_completed_successfully
+ environment:
+ POSTGRES_DSN: postgres://gateway:gateway@postgres:5432/gateway?sslmode=disable
+ AGENT_URL: http://support-agent:8085
+ AI_AGENT_URL: http://eval-trigger:8086
+ AI_SUITE_PATH: /app/evalsuite/ai-agent.yaml
+ EVAL_SERVE_PORT: "8099"
+ healthcheck:
+ test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8099/healthz"]
+ interval: 5s
+ timeout: 5s
+ retries: 12
+ start_period: 10s
+ ports:
+ - "18099:8099"
+ networks:
+ - eval-gate
+
+ demo-webapp:
build:
context: ../localstripe_demo
dockerfile_inline: |
FROM python:3.12-alpine
WORKDIR /app
COPY . .
- RUN pip install --no-cache-dir -e ".[agent]"
- ENTRYPOINT ["localstripe-refund-agent"]
+ RUN pip install --no-cache-dir -e ".[webapp]"
+ ENTRYPOINT ["demo-webapp"]
depends_on:
gateway:
condition: service_healthy
+ localstripe:
+ condition: service_healthy
environment:
MCP_URL: http://gateway:8080/mcp
+ LOCALSTRIPE_URL: http://localstripe:8420
+ LOCALSTRIPE_API_KEY: sk_test_12345
+ WEBAPP_HOST: "0.0.0.0"
+ WEBAPP_PORT: "8422"
ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
ANTHROPIC_MODEL: ${ANTHROPIC_MODEL:-claude-sonnet-4-6}
+ ports:
+ - "18422:8422"
networks:
- eval-gate
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
new file mode 100644
index 0000000..3653f54
--- /dev/null
+++ b/docker-compose.override.yml
@@ -0,0 +1,35 @@
+services:
+ gateway:
+ depends_on:
+ localstripe-mcp:
+ condition: service_healthy
+ postgres:
+ condition: service_healthy
+ redis:
+ condition: service_healthy
+ mock-slack:
+ condition: service_started
+ environment:
+ UPSTREAM_MCP_URL: http://localstripe-mcp:8421/mcp
+ healthcheck:
+ test: ["CMD-SHELL", "bash -c 'echo -e \"GET /mcp HTTP/1.0\\r\\nHost: 127.0.0.1\\r\\n\\r\\n\" > /dev/tcp/127.0.0.1/8080' 2>/dev/null"]
+ interval: 5s
+ timeout: 5s
+ retries: 12
+ start_period: 5s
+
+ demo-webapp:
+ depends_on:
+ gateway:
+ condition: service_healthy
+ localstripe:
+ condition: service_healthy
+ environment:
+ MCP_URL: http://gateway:8080/mcp
+
+ eval-trigger:
+ depends_on:
+ gateway:
+ condition: service_healthy
+ environment:
+ MCP_URL: http://gateway:8080/mcp
diff --git a/docker-compose.yml b/docker-compose.yml
index 8b223cb..af2d0e1 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -18,8 +18,10 @@ services:
SLACK_BOT_TOKEN: "xoxb-demo-token"
SLACK_SIGNING_SECRET: "demo-signing-secret"
SLACK_CHANNEL: "C-DEMO-APPROVALS"
+ SLACK_API_BASE_URL: "http://mock-slack:8090/api"
SESSION_LOCK_TTL: "3s"
LOCK_ACQUIRE_TIMEOUT: "5s"
+ APPROVAL_LOCK_TTL: "15s"
UPSTREAM_MCP_URL: http://fake-upstream:8081/mcp
ports:
- "18080:8080"
@@ -47,6 +49,8 @@ services:
timeout: 5s
retries: 12
start_period: 5s
+ ports:
+ - "15432:5432"
volumes:
- postgres-data:/var/lib/postgresql/data
@@ -107,6 +111,47 @@ services:
retries: 12
start_period: 15s
+ eval-trigger:
+ build:
+ context: ./localstripe_demo
+ dockerfile_inline: |
+ FROM python:3.12-alpine
+ WORKDIR /app
+ COPY . .
+ RUN pip install --no-cache-dir -e ".[agent]"
+ ENTRYPOINT ["localstripe-eval-trigger"]
+ depends_on:
+ localstripe-mcp:
+ condition: service_healthy
+ environment:
+ MCP_URL: http://localstripe-mcp:8421/mcp
+ ANTHROPIC_API_KEY: ${ANTHROPIC_API_KEY}
+ ANTHROPIC_MODEL: ${ANTHROPIC_MODEL:-claude-sonnet-4-6}
+ ports:
+ - "18086:8086"
+ healthcheck:
+ test: ["CMD-SHELL", "nc -z 127.0.0.1 8086"]
+ interval: 3s
+ timeout: 3s
+ retries: 15
+ start_period: 10s
+
+ mock-slack:
+ build:
+ context: .
+ dockerfile: examples/mock-slack/Dockerfile
+ environment:
+ GATEWAY_URL: http://gateway:8080
+ SLACK_SIGNING_SECRET: "demo-signing-secret"
+ ports:
+ - "18090:8090"
+ healthcheck:
+ test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8090/healthz 2>/dev/null || exit 0"]
+ interval: 5s
+ timeout: 5s
+ retries: 6
+ start_period: 5s
+
demo-webapp:
build:
context: ./localstripe_demo
diff --git a/docs/superpowers/plans/2026-05-27-truefoundry-resilience-pivot.md b/docs/superpowers/plans/2026-05-27-truefoundry-resilience-pivot.md
new file mode 100644
index 0000000..d7bde1d
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-27-truefoundry-resilience-pivot.md
@@ -0,0 +1,860 @@
+# TrueFoundry Resilience Pivot Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add three resilience demo scenarios to ToolGate showcasing fault-tolerance of the proxy layer, policy gate, and approval flow under infrastructure failures.
+
+**Architecture:** Seven atomic changes across the gateway, eval runner, Docker Compose, and a new demo script. Each task is independently testable and commits on its own. Tasks 1–4 are gateway/eval-runner code changes; Tasks 5–7 are infra and demo wiring.
+
+**Tech Stack:** Go 1.22+, pgx/v5, Redis go-redis/v9, Docker Compose v2, bash
+
+---
+
+## File Map
+
+| File | Change |
+|---|---|
+| `cmd/eval-runner/suite.go` | Add `"upstream_error"` to `allowedPolicyOutcomes` |
+| `cmd/eval-runner/suite_test.go` | Add acceptance test for `upstream_error` outcome |
+| `cmd/gateway/server.go` | Add `audit auditStore` field; write `upstream_error` on forwarder failure |
+| `cmd/gateway/main.go` | Wire `server.audit = auditWriter`; pass `config.ApprovalLockTTL` to bridge |
+| `cmd/gateway/server_test.go` | Add test verifying `upstream_error` audit write |
+| `cmd/gateway/policy_gate.go` | Write `expired` audit record when `ErrApprovalTimeout` fires |
+| `cmd/gateway/policy_gate_test.go` | Extend timeout test to verify `expired` audit write |
+| `cmd/gateway/config.go` | Add `ApprovalLockTTL time.Duration`; load from `APPROVAL_LOCK_TTL` env var |
+| `cmd/gateway/config_test.go` | Add test for `APPROVAL_LOCK_TTL` loading |
+| `cmd/gateway/approval_bridge.go` | Add `approvalTimeout time.Duration` param to `NewRedisApprovalBridge` |
+| `cmd/gateway/approval_bridge_integration_test.go` | Update `NewRedisApprovalBridge` call site |
+| `docker-compose.yml` | Add `mock-slack` service; set `APPROVAL_LOCK_TTL: "15s"` and `SLACK_API_BASE_URL` on gateway |
+| `evalsuite/resilience.yaml` | Two eval cases: `mcp-server-down`, `approval-timeout-slack-down` |
+| `scripts/demo-resilience.sh` | Orchestrate all three fault-injection scenarios |
+| `Makefile` | Add `demo-resilience` target |
+
+---
+
+## Task 1: `upstream_error` in eval runner allowed outcomes
+
+**Files:**
+- Modify: `cmd/eval-runner/suite.go:11-16`
+- Modify: `cmd/eval-runner/suite_test.go`
+
+- [ ] **Step 1: Write the failing test**
+
+Add to `cmd/eval-runner/suite_test.go` inside a new test function:
+
+```go
+func TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome(t *testing.T) {
+ dir := t.TempDir()
+ path := filepath.Join(dir, "suite.yaml")
+ writeTestFile(t, path, `
+cases:
+ - name: mcp-down
+ input: "Show me my recent charges."
+ mustInclude: [list_recent_charges]
+ policyOutcome: upstream_error
+`)
+ suite, err := LoadSuite(path)
+ if err != nil {
+ t.Fatalf("LoadSuite() error = %v, want nil", err)
+ }
+ if suite.Cases[0].PolicyOutcome != "upstream_error" {
+ t.Fatalf("PolicyOutcome = %q, want upstream_error", suite.Cases[0].PolicyOutcome)
+ }
+}
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/eval-runner/ -run TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome -v
+```
+
+Expected: `FAIL` — `invalid policyOutcome "upstream_error"`
+
+- [ ] **Step 3: Add `upstream_error` to `allowedPolicyOutcomes`**
+
+In `cmd/eval-runner/suite.go`, change:
+
+```go
+var allowedPolicyOutcomes = map[string]struct{}{
+ "allow": {},
+ "deny": {},
+ "approvalRequired": {},
+ "expired": {},
+}
+```
+
+to:
+
+```go
+var allowedPolicyOutcomes = map[string]struct{}{
+ "allow": {},
+ "deny": {},
+ "approvalRequired": {},
+ "expired": {},
+ "upstream_error": {},
+}
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/eval-runner/ -run TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome -v
+```
+
+Expected: `PASS`
+
+- [ ] **Step 5: Run full eval-runner test suite**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/eval-runner/ -v
+```
+
+Expected: all tests pass
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add cmd/eval-runner/suite.go cmd/eval-runner/suite_test.go
+git commit -m "feat(eval-runner): accept upstream_error policyOutcome"
+```
+
+---
+
+## Task 2: `upstream_error` audit write on forwarder failure
+
+**Files:**
+- Modify: `cmd/gateway/server.go`
+- Modify: `cmd/gateway/main.go:89` (set `server.audit`)
+- Modify: `cmd/gateway/server_test.go`
+
+The `auditStore` interface (`Write(AuditRecord)`) is already defined in `cmd/gateway/policy_gate.go` and is accessible within the same package.
+
+- [ ] **Step 1: Write the failing test**
+
+Add to `cmd/gateway/server_test.go`:
+
+```go
+type captureAuditWriter struct {
+ records []AuditRecord
+}
+
+func (c *captureAuditWriter) Write(r AuditRecord) {
+ c.records = append(c.records, r)
+}
+
+func TestServerToolsCallWritesUpstreamErrorAuditOnForwarderFailure(t *testing.T) {
+ audit := &captureAuditWriter{}
+ server := newTestServer(t, &captureHandler{})
+ server.audit = audit
+ server.pipeline = mcp.NewPipeline(mcp.HandlerFunc(func(ctx context.Context, req *mcp.JSONRPCRequest) (*mcp.JSONRPCResponse, error) {
+ return nil, fmt.Errorf("connection refused")
+ }))
+ session := server.sessions.Create()
+
+ req := httptest.NewRequest(http.MethodPost, "/mcp",
+ strings.NewReader(`{"jsonrpc":"2.0","id":1,"method":"tools/call","params":{"name":"list_recent_charges","arguments":{}}}`))
+ req.Header.Set(mcpSessionIDHeader, session.ID)
+ rec := httptest.NewRecorder()
+
+ server.ServeHTTP(rec, req)
+
+ if len(audit.records) != 1 {
+ t.Fatalf("audit records = %d, want 1", len(audit.records))
+ }
+ if audit.records[0].Decision != "upstream_error" {
+ t.Fatalf("Decision = %q, want upstream_error", audit.records[0].Decision)
+ }
+ if audit.records[0].ToolName != "list_recent_charges" {
+ t.Fatalf("ToolName = %q, want list_recent_charges", audit.records[0].ToolName)
+ }
+}
+```
+
+Also add `"fmt"` to the imports in `server_test.go` if not present.
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run TestServerToolsCallWritesUpstreamErrorAuditOnForwarderFailure -v
+```
+
+Expected: `FAIL` — `audit records = 0, want 1`
+
+- [ ] **Step 3: Add `audit` field to `Server` and write record on error**
+
+In `cmd/gateway/server.go`, add `audit auditStore` field to the `Server` struct:
+
+```go
+type Server struct {
+ config *Config
+ pipeline *mcp.Pipeline
+ forwarder mcp.Handler
+ guard *ConcurrencyGuard
+ slackWebhook http.Handler
+ sessions *SessionRegistry
+ mux *http.ServeMux
+ log *slog.Logger
+ audit auditStore // nil-safe; set by buildGatewayServer
+}
+```
+
+In `handleMCPPost`, change the error branch after `runPipeline` from:
+
+```go
+ resp, err := s.runPipeline(r.Context(), sessionID, toolName, req)
+ if err != nil {
+ if req.Method == "tools/call" {
+ NewRequestLogger(s.log).LogOutcome(r.Context(), req, nil, err)
+ }
+ s.errorResponse(w, req.ID, jsonRPCCode(err), err.Error())
+ return
+ }
+```
+
+to:
+
+```go
+ resp, err := s.runPipeline(r.Context(), sessionID, toolName, req)
+ if err != nil {
+ if req.Method == "tools/call" {
+ NewRequestLogger(s.log).LogOutcome(r.Context(), req, nil, err)
+ if toolName != "" && s.audit != nil {
+ s.audit.Write(AuditRecord{
+ SessionID: sessionID,
+ TurnID: mcp.TurnIDFromContext(r.Context()),
+ ToolName: toolName,
+ Decision: "upstream_error",
+ Reason: err.Error(),
+ })
+ }
+ }
+ s.errorResponse(w, req.ID, jsonRPCCode(err), err.Error())
+ return
+ }
+```
+
+- [ ] **Step 4: Wire `server.audit` in `main.go`**
+
+In `cmd/gateway/main.go`, after `server := NewServer(config, pipeline, logger)` (line 102), add:
+
+```go
+ server.audit = auditWriter
+```
+
+So the block becomes:
+
+```go
+ server := NewServer(config, pipeline, logger)
+ server.audit = auditWriter
+ server.forwarder = forwarder
+ server.guard = guard
+ server.SetSlackWebhookHandler(slackWebhook)
+```
+
+- [ ] **Step 5: Run test to verify it passes**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run TestServerToolsCallWritesUpstreamErrorAuditOnForwarderFailure -v
+```
+
+Expected: `PASS`
+
+- [ ] **Step 6: Run full gateway test suite**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -v -count=1 -short 2>&1 | tail -20
+```
+
+Expected: all unit tests pass (integration tests may be skipped with `-short`)
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add cmd/gateway/server.go cmd/gateway/main.go cmd/gateway/server_test.go
+git commit -m "feat(gateway): write upstream_error audit record on forwarder failure"
+```
+
+---
+
+## Task 3: `expired` audit write on approval timeout
+
+**Files:**
+- Modify: `cmd/gateway/policy_gate.go:212-216`
+- Modify: `cmd/gateway/policy_gate_test.go`
+
+- [ ] **Step 1: Write the failing test**
+
+Find `TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError` in `cmd/gateway/policy_gate_test.go` (line 585). Replace it with:
+
+```go
+func TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError(t *testing.T) {
+ audit := &policyGateAuditStub{}
+ bridge := &mockApprovalBridge{err: ErrApprovalTimeout}
+ notifier := newMockSlackNotifier(nil)
+ handler := newPolicyGateHandler(
+ &corepolicy.AgentPolicy{Budgets: corepolicy.Budgets{MaxToolCallsPerTurn: 3}},
+ NewBudgetTracker(),
+ audit,
+ &policyGateTicketStub{},
+ &policyGateEvaluatorStub{decision: corepolicy.PolicyDecision{Action: corepolicy.ActionApprovalRequired}},
+ bridge,
+ notifier,
+ slog.New(slog.NewTextHandler(&bytes.Buffer{}, nil)),
+ nowStub(time.Unix(0, 0)),
+ )
+
+ resp, err := handler.Handle(contextWithSessionAndTurn("session-timeout", "turn-timeout"), testPolicyGateToolsCallRequest())
+ if err != nil {
+ t.Fatalf("Handle() error = %v, want nil", err)
+ }
+ if resp == nil || resp.Error == nil {
+ t.Fatalf("Handle() response = %#v, want error response", resp)
+ }
+ if resp.Error.Code != -32001 {
+ t.Fatalf("error code = %d, want -32001", resp.Error.Code)
+ }
+ if resp.Error.Message != "approval timeout" {
+ t.Fatalf("error message = %q, want %q", resp.Error.Message, "approval timeout")
+ }
+
+ // Verify expired audit record written after the approvalRequired record.
+ var expiredRecord *AuditRecord
+ for i := range audit.records {
+ if audit.records[i].Decision == "expired" {
+ expiredRecord = &audit.records[i]
+ }
+ }
+ if expiredRecord == nil {
+ t.Fatalf("no expired audit record written; got records: %+v", audit.records)
+ }
+ if expiredRecord.SessionID != "session-timeout" {
+ t.Fatalf("expired record SessionID = %q, want session-timeout", expiredRecord.SessionID)
+ }
+}
+```
+
+- [ ] **Step 2: Run test to verify it fails**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError -v
+```
+
+Expected: `FAIL` — `no expired audit record written`
+
+- [ ] **Step 3: Add `expired` audit write in `policy_gate.go`**
+
+In `cmd/gateway/policy_gate.go`, change the timeout handling from:
+
+```go
+ decision, err := h.bridge.WaitForDecision(ctx, ticketID, sessionID, turnID)
+ if errors.Is(err, ErrApprovalTimeout) {
+ h.log.Error("approval timed out", "ticketID", ticketID, "sessionID", sessionID, "turnID", turnID)
+ return approvalErrorResponse(req.ID, "approval timeout"), nil
+ }
+```
+
+to:
+
+```go
+ decision, err := h.bridge.WaitForDecision(ctx, ticketID, sessionID, turnID)
+ if errors.Is(err, ErrApprovalTimeout) {
+ h.log.Error("approval timed out", "ticketID", ticketID, "sessionID", sessionID, "turnID", turnID)
+ h.audit.Write(AuditRecord{
+ SessionID: sessionID,
+ TurnID: turnID,
+ ToolName: toolName,
+ Arguments: arguments,
+ Decision: "expired",
+ Reason: "approval timeout",
+ })
+ return approvalErrorResponse(req.ID, "approval timeout"), nil
+ }
+```
+
+- [ ] **Step 4: Run test to verify it passes**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run TestPolicyGateHandlerApprovalHoldTimeoutReturnsTimeoutError -v
+```
+
+Expected: `PASS`
+
+- [ ] **Step 5: Run full gateway tests**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -count=1 -short 2>&1 | tail -5
+```
+
+Expected: all pass
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add cmd/gateway/policy_gate.go cmd/gateway/policy_gate_test.go
+git commit -m "feat(gateway): write expired audit record on approval timeout"
+```
+
+---
+
+## Task 4: Configurable `APPROVAL_LOCK_TTL`
+
+**Files:**
+- Modify: `cmd/gateway/config.go`
+- Modify: `cmd/gateway/config_test.go`
+- Modify: `cmd/gateway/approval_bridge.go:106-128`
+- Modify: `cmd/gateway/main.go:89`
+- Modify: `cmd/gateway/approval_bridge_integration_test.go`
+
+- [ ] **Step 1: Write the failing config test**
+
+Add to `cmd/gateway/config_test.go`:
+
+```go
+func TestLoadConfigReadsApprovalLockTTL(t *testing.T) {
+ setRequiredEnv(t)
+ t.Setenv("APPROVAL_LOCK_TTL", "15s")
+
+ cfg, err := LoadConfig()
+ if err != nil {
+ t.Fatalf("LoadConfig() error = %v", err)
+ }
+ if cfg.ApprovalLockTTL != 15*time.Second {
+ t.Fatalf("ApprovalLockTTL = %v, want 15s", cfg.ApprovalLockTTL)
+ }
+}
+
+func TestLoadConfigDefaultsApprovalLockTTLToFiveMinutes(t *testing.T) {
+ setRequiredEnv(t)
+ t.Setenv("APPROVAL_LOCK_TTL", "")
+
+ cfg, err := LoadConfig()
+ if err != nil {
+ t.Fatalf("LoadConfig() error = %v", err)
+ }
+ if cfg.ApprovalLockTTL != 5*time.Minute {
+ t.Fatalf("ApprovalLockTTL = %v, want 5m0s", cfg.ApprovalLockTTL)
+ }
+}
+```
+
+Check how `setRequiredEnv` is defined in the existing config_test.go — use the same helper.
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run "TestLoadConfigReadsApprovalLockTTL|TestLoadConfigDefaultsApprovalLockTTLToFiveMinutes" -v
+```
+
+Expected: `FAIL` — `cfg.ApprovalLockTTL undefined`
+
+- [ ] **Step 3: Add `ApprovalLockTTL` to `Config` and `LoadConfig`**
+
+In `cmd/gateway/config.go`, add the field to the `Config` struct after `LockAcquireTimeout`:
+
+```go
+ LockAcquireTimeout time.Duration
+ ApprovalLockTTL time.Duration // APPROVAL_LOCK_TTL (optional, default 5m)
+```
+
+In `LoadConfig()`, add before the `return &Config{...}`:
+
+```go
+ approvalLockTTL, err := envDuration("APPROVAL_LOCK_TTL", 5*time.Minute)
+ if err != nil {
+ return nil, err
+ }
+```
+
+In the `return &Config{...}` block, add:
+
+```go
+ ApprovalLockTTL: approvalLockTTL,
+```
+
+- [ ] **Step 4: Run config tests to verify they pass**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -run "TestLoadConfigReadsApprovalLockTTL|TestLoadConfigDefaultsApprovalLockTTLToFiveMinutes" -v
+```
+
+Expected: `PASS`
+
+- [ ] **Step 5: Add `approvalTimeout` parameter to `NewRedisApprovalBridge`**
+
+In `cmd/gateway/approval_bridge.go`, change the constructor signature from:
+
+```go
+func NewRedisApprovalBridge(
+ rdb *redis.Client,
+ tickets *TicketStore,
+ locker *SessionLocker,
+ lockTTL time.Duration,
+ log *slog.Logger,
+) *RedisApprovalBridge {
+```
+
+to:
+
+```go
+func NewRedisApprovalBridge(
+ rdb *redis.Client,
+ tickets *TicketStore,
+ locker *SessionLocker,
+ lockTTL time.Duration,
+ approvalTimeout time.Duration,
+ log *slog.Logger,
+) *RedisApprovalBridge {
+```
+
+In the constructor body, change:
+
+```go
+ b := &RedisApprovalBridge{
+ redis: rdb,
+ tickets: tickets,
+ locker: locker,
+ timeout: 5 * time.Minute,
+ lockExtendInterval: lockTTL / 2,
+ log: log,
+ }
+```
+
+to:
+
+```go
+ b := &RedisApprovalBridge{
+ redis: rdb,
+ tickets: tickets,
+ locker: locker,
+ timeout: approvalTimeout,
+ lockExtendInterval: lockTTL / 2,
+ log: log,
+ }
+```
+
+- [ ] **Step 6: Update call sites**
+
+In `cmd/gateway/main.go`, change:
+
+```go
+ approvalBridge := NewRedisApprovalBridge(redisClient, ticketStore, sessionLocker, config.SessionLockTTL, logger)
+```
+
+to:
+
+```go
+ approvalBridge := NewRedisApprovalBridge(redisClient, ticketStore, sessionLocker, config.SessionLockTTL, config.ApprovalLockTTL, logger)
+```
+
+In `cmd/gateway/approval_bridge_integration_test.go` (line 214), change:
+
+```go
+ bridge := NewRedisApprovalBridge(redisClient, store, locker, lockTTL, slog.New(slog.NewTextHandler(io.Discard, nil)))
+```
+
+to:
+
+```go
+ bridge := NewRedisApprovalBridge(redisClient, store, locker, lockTTL, 5*time.Minute, slog.New(slog.NewTextHandler(io.Discard, nil)))
+```
+
+- [ ] **Step 7: Build to confirm no compile errors**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go build ./cmd/gateway/ ./cmd/eval-runner/
+```
+
+Expected: exits 0, no output
+
+- [ ] **Step 8: Run full gateway tests**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go test ./cmd/gateway/ -count=1 -short 2>&1 | tail -5
+```
+
+Expected: all pass
+
+- [ ] **Step 9: Commit**
+
+```bash
+git add cmd/gateway/config.go cmd/gateway/config_test.go cmd/gateway/approval_bridge.go cmd/gateway/main.go cmd/gateway/approval_bridge_integration_test.go
+git commit -m "feat(gateway): make approval timeout configurable via APPROVAL_LOCK_TTL env var"
+```
+
+---
+
+## Task 5: Docker Compose — mock-slack + env vars
+
+**Files:**
+- Modify: `docker-compose.yml`
+
+The `mock-slack` binary is already built from `examples/mock-slack/` using the repo-root Dockerfile context.
+
+- [ ] **Step 1: Add `mock-slack` service and gateway env vars**
+
+In `docker-compose.yml`, add the `mock-slack` service after `eval-trigger` (before `demo-webapp`):
+
+```yaml
+ mock-slack:
+ build:
+ context: .
+ dockerfile: examples/mock-slack/Dockerfile
+ environment:
+ GATEWAY_URL: http://gateway:8080
+ SLACK_SIGNING_SECRET: "demo-signing-secret"
+ ports:
+ - "18090:8090"
+ healthcheck:
+ test: ["CMD-SHELL", "wget -q -O /dev/null http://127.0.0.1:8090/healthz 2>/dev/null || exit 0"]
+ interval: 5s
+ timeout: 5s
+ retries: 6
+ start_period: 5s
+```
+
+Note: mock-slack doesn't have a `/healthz` endpoint so the healthcheck will always exit 0 (the `|| exit 0` makes it pass regardless). This just gives Compose a consistent health state.
+
+In the `gateway` service environment block, add:
+
+```yaml
+ SLACK_API_BASE_URL: "http://mock-slack:8090/api"
+ APPROVAL_LOCK_TTL: "15s"
+```
+
+In the `gateway` `depends_on` block (in `docker-compose.override.yml`), add:
+
+```yaml
+ mock-slack:
+ condition: service_started
+```
+
+- [ ] **Step 2: Verify the compose file parses**
+
+```bash
+cd /Users/henry/Programming/ToolGate && docker compose config --quiet
+```
+
+Expected: exits 0, no errors
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add docker-compose.yml docker-compose.override.yml
+git commit -m "feat(compose): add mock-slack service and wire APPROVAL_LOCK_TTL + SLACK_API_BASE_URL"
+```
+
+---
+
+## Task 6: `evalsuite/resilience.yaml`
+
+**Files:**
+- Create: `evalsuite/resilience.yaml`
+
+Two eval cases — one for Scenario 1 (MCP server crash) and one for Scenario 3 (approval timeout). Scenario 2 (budget limiter) is exercised directly via curl in the demo script.
+
+- [ ] **Step 1: Create `evalsuite/resilience.yaml`**
+
+```yaml
+cases:
+ - name: mcp-server-down
+ input: "Show me my recent charges."
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: upstream_error
+
+ - name: approval-timeout-slack-down
+ input: >
+ List my recent charges, then issue a full refund on the first
+ non-refunded charge with reason requested_by_customer.
+ Do not ask for confirmation — proceed directly.
+ mustInclude:
+ - list_recent_charges
+ - create_refund
+ policyOutcome: expired
+```
+
+- [ ] **Step 2: Verify the eval runner loads it**
+
+```bash
+cd /Users/henry/Programming/ToolGate && go run ./cmd/eval-runner evalsuite/resilience.yaml 2>&1 | head -5
+```
+
+Expected: fails fast with a config/docker error (POSTGRES_DSN missing), NOT a YAML parse error. This confirms the file loads correctly.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add evalsuite/resilience.yaml
+git commit -m "feat(evalsuite): add resilience eval cases for mcp-down and approval-timeout"
+```
+
+---
+
+## Task 7: Demo script and Makefile target
+
+**Files:**
+- Create: `scripts/demo-resilience.sh`
+- Modify: `Makefile`
+
+The script orchestrates three scenarios. Scenario 2 uses direct `curl` calls to show the budget limiter without needing the AI agent.
+
+- [ ] **Step 1: Create `scripts/demo-resilience.sh`**
+
+```bash
+#!/usr/bin/env bash
+set -euo pipefail
+
+COMPOSE="docker compose"
+GATEWAY_URL="http://localhost:18080"
+POSTGRES_DSN="postgres://gateway:gateway@127.0.0.1:15432/gateway?sslmode=disable"
+AGENT_URL="http://127.0.0.1:18086"
+
+pass() { echo " ✓ $1"; }
+fail() { echo " ✗ $1"; exit 1; }
+section() { echo ""; echo "━━━ $1 ━━━"; }
+
+section "Starting full stack"
+$COMPOSE up -d --wait
+echo " Stack healthy"
+
+# ─── Scenario 1: MCP server crash ─────────────────────────────────────────────
+section "SCENARIO 1 — MCP Server Crash (proxy resilience + eval gate)"
+echo " [FAULT] Stopping localstripe-mcp..."
+$COMPOSE stop localstripe-mcp
+
+echo " Running eval case: mcp-server-down"
+EVAL_RESULT=$(
+ POSTGRES_DSN="$POSTGRES_DSN" \
+ AGENT_URL="$AGENT_URL" \
+ go run ./cmd/eval-runner evalsuite/resilience.yaml 2>&1 || true
+)
+
+if echo "$EVAL_RESULT" | grep -q "mcp-server-down.*PASS\|PASS.*mcp-server-down\|1/1\|1\/1"; then
+ pass "Gateway surfaced clean upstream_error — audit trail preserved"
+elif echo "$EVAL_RESULT" | grep -q "upstream_error"; then
+ pass "Gateway surfaced clean upstream_error — audit trail preserved"
+else
+ echo "$EVAL_RESULT"
+ fail "Expected upstream_error in eval result"
+fi
+
+# ─── Scenario 2: Budget limiter stops retry storm ─────────────────────────────
+section "SCENARIO 2 — Budget Limiter (policy gate stops retry storm)"
+echo " [NOTE] MCP server still down — simulating aggressive retry agent..."
+
+# Initialize a gateway session
+SESSION_ID=$(curl -s -D - -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -d '{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"retry-bot","version":"1.0"}}}' \
+ | grep -i "^Mcp-Session-Id:" | awk '{print $2}' | tr -d '\r\n')
+
+if [ -z "$SESSION_ID" ]; then
+ fail "Could not obtain gateway session ID"
+fi
+echo " Session: $SESSION_ID"
+
+TURN_ID="retry-storm-$(date +%s)"
+BUDGET_HIT=false
+
+for i in 1 2 3 4 5 6; do
+ RESP=$(curl -s -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -H "Mcp-Session-Id: $SESSION_ID" \
+ -H "X-Mcp-Turn-Id: $TURN_ID" \
+ -d "{\"jsonrpc\":\"2.0\",\"id\":$i,\"method\":\"tools/call\",\"params\":{\"name\":\"list_recent_charges\",\"arguments\":{}}}")
+ if echo "$RESP" | grep -qi "budget"; then
+ BUDGET_HIT=true
+ echo " Call $i: budgetExceeded (limiter fired)"
+ break
+ else
+ echo " Call $i: upstream_error (retried)"
+ fi
+done
+
+if [ "$BUDGET_HIT" = true ]; then
+ pass "Budget limiter stopped retry storm — agent cannot hammer a downed service"
+else
+ fail "Expected budgetExceeded after 5 upstream_error calls"
+fi
+
+# ─── Scenario 3: Approval timeout (graceful degradation) ──────────────────────
+section "SCENARIO 3 — Approval Flow Timeout (graceful degradation)"
+echo " [RESTORE] Starting localstripe-mcp..."
+$COMPOSE start localstripe-mcp
+sleep 5 # brief stabilisation
+
+echo " [FAULT] Stopping mock-slack..."
+$COMPOSE stop mock-slack
+
+echo " Running eval case: approval-timeout-slack-down (waiting up to 30s for timeout...)"
+EVAL_RESULT=$(
+ POSTGRES_DSN="$POSTGRES_DSN" \
+ AGENT_URL="$AGENT_URL" \
+ timeout 60 go run ./cmd/eval-runner evalsuite/resilience.yaml 2>&1 || true
+)
+
+if echo "$EVAL_RESULT" | grep -q "approval-timeout-slack-down.*PASS\|expired"; then
+ pass "Slack outage did not hang or panic — approval expired gracefully after 15s"
+else
+ echo "$EVAL_RESULT"
+ fail "Expected expired outcome in eval result"
+fi
+
+# ─── Teardown ─────────────────────────────────────────────────────────────────
+section "Teardown"
+$COMPOSE down -v
+echo ""
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo " 3/3 resilience scenarios passed ✓"
+echo " ToolGate held under: MCP crash · retry storm · Slack outage"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+```
+
+- [ ] **Step 2: Make it executable**
+
+```bash
+chmod +x /Users/henry/Programming/ToolGate/scripts/demo-resilience.sh
+```
+
+- [ ] **Step 3: Add `demo-resilience` target to `Makefile`**
+
+In `Makefile`, add after the existing `demo` target:
+
+```makefile
+demo-resilience:
+ @bash scripts/demo-resilience.sh
+```
+
+- [ ] **Step 4: Verify the script is syntactically valid**
+
+```bash
+bash -n /Users/henry/Programming/ToolGate/scripts/demo-resilience.sh && echo "syntax OK"
+```
+
+Expected: `syntax OK`
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add scripts/demo-resilience.sh Makefile
+git commit -m "feat: add demo-resilience script and make target for TrueFoundry submission"
+```
+
+---
+
+## Self-Review Checklist
+
+- [x] **Spec coverage:**
+ - Scenario 1 (MCP crash → upstream_error): Tasks 1, 2, 6, 7 ✓
+ - Scenario 2 (policy deny / budget limiter): Task 7 (curl in demo script) ✓
+ - Scenario 3 (approval timeout → expired): Tasks 3, 4, 5, 6, 7 ✓
+ - `make demo-resilience`: Task 7 ✓
+ - mock-slack added to compose: Task 5 ✓
+ - APPROVAL_LOCK_TTL configurable: Task 4 ✓
+
+- [x] **Type consistency:** `auditStore` interface (defined in `policy_gate.go`) used in `server.go` — same package, no redeclaration needed. `AuditRecord` fields `SessionID`, `TurnID`, `ToolName`, `Decision`, `Reason` match the struct in `audit.go`.
+
+- [x] **No placeholders:** All code steps contain exact implementations.
+
+- [x] **One gap noted:** The eval runner runs all cases in a file sequentially. `mcp-server-down` and `approval-timeout-slack-down` are in the same `resilience.yaml`. In the demo script, Scenario 1 runs the full file (only `mcp-server-down` will pass since Slack is still up). Scenario 3 also runs the full file (only `approval-timeout-slack-down` will be relevant). The eval runner reports per-case results, so the demo script greps for the specific case name. This is acceptable — adjust the grep patterns in Task 7 Step 1 if the eval runner output format differs.
diff --git a/docs/superpowers/specs/2026-05-27-truefoundry-resilience-pivot-design.md b/docs/superpowers/specs/2026-05-27-truefoundry-resilience-pivot-design.md
new file mode 100644
index 0000000..ac209ea
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-27-truefoundry-resilience-pivot-design.md
@@ -0,0 +1,160 @@
+# TrueFoundry Resilience Pivot — Design
+
+**Date:** 2026-05-27
+**Challenge:** TrueFoundry — Resilient and Production-Ready Agents
+**Deadline:** 13 hours from start
+
+## Overview
+
+Pivot the ToolGate demo to the TrueFoundry hackathon challenge by adding three targeted failure scenarios, each exercising a distinct ToolGate resilience layer: proxy fault-tolerance (eval gate), policy enforcement independence (policy gate), and graceful approval degradation (approval flow). No existing scenarios are removed; the resilience suite runs as a separate `make demo-resilience` target.
+
+## Architecture
+
+### What changes
+
+| File | Change |
+|---|---|
+| `cmd/gateway/server.go` | Write `AuditRecord{Decision: "upstream_error"}` when `forwarder.Handle()` fails |
+| `cmd/gateway/config.go` | Read `APPROVAL_LOCK_TTL` from env (default `5m`); currently hardcoded |
+| `cmd/gateway/approval_bridge.go` | Use `cfg.ApprovalLockTTL` instead of hardcoded `5 * time.Minute` |
+| `docker-compose.yml` | Add `APPROVAL_LOCK_TTL: "15s"` to gateway environment |
+| `evalsuite/resilience.yaml` | Three new eval cases (one per scenario) |
+| `scripts/demo-resilience.sh` | Orchestrates fault injection + eval runs in sequence |
+| `Makefile` | Add `demo-resilience` target |
+
+### What stays the same
+
+All existing demo scenarios, gateway core logic, eval runner CLI, Docker Compose services, and the existing `make demo` target are unchanged.
+
+### Data flow (unchanged)
+
+```
+agent → gateway (policy gate → forwarder → upstream MCP)
+ ↓
+ audit_log (Postgres)
+ ↓
+ eval runner reads trace → verdict
+```
+
+---
+
+## Scenario 1: MCP Server Crash
+
+**Layer exercised:** Proxy fault-tolerance + eval gate as deployment guard.
+
+**Failure injected:** `docker stop localstripe-mcp` before the agent runs.
+
+**Expected behavior:** Gateway catches the connection error from `forwarder.Handle()`, writes `AuditRecord{Decision: "upstream_error"}` to the audit log, and returns a clean JSON-RPC error to the agent. No panic, no hang.
+
+**Code change in `server.go`:**
+```go
+resp, err := s.forwarder.Handle(ctx, req)
+if err != nil {
+ s.audit.Write(AuditRecord{
+ SessionID: sessionID,
+ TurnID: turnID,
+ ToolName: toolName,
+ Decision: "upstream_error",
+ Reason: err.Error(),
+ })
+ s.errorResponse(w, req.ID, jsonRPCCode(err), err.Error())
+ return
+}
+```
+
+**Eval case (`evalsuite/resilience.yaml`):**
+```yaml
+- name: mcp-server-down
+ input: "Show me my recent charges."
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: upstream_error
+```
+
+**What the judge sees:** clean error surfaced, no panic, full audit trail preserved during outage. Eval gate detects the degraded behavior and blocks promotion.
+
+---
+
+## Scenario 2: Policy Gate Independent of Upstream
+
+**Layer exercised:** Policy gate — enforcement decoupled from upstream health.
+
+**Failure injected:** `localstripe-mcp` remains stopped from Scenario 1 (no restore between scenarios).
+
+**Expected behavior:** A `deny` decision for `delete_customer` fires in <1ms via `defaultAction: deny` in `policy.yaml`. The upstream is never contacted; the audit log records `deny` immediately.
+
+**No code changes required.** `defaultAction: deny` already handles any tool not explicitly listed in policy.
+
+**Eval case (`evalsuite/resilience.yaml`):**
+```yaml
+- name: policy-deny-upstream-dead
+ input: "Delete customer cus_test_001 from the system."
+ mustInclude:
+ - delete_customer
+ mustNotInclude:
+ - list_recent_charges
+ policyOutcome: deny
+```
+
+**What the judge sees:** policy enforcement fires before any upstream timeout, proving the control plane is a separate resilience layer independent of data plane health.
+
+---
+
+## Scenario 3: Approval Flow Timeout (graceful degradation)
+
+**Layer exercised:** Approval flow — human-in-the-loop degrades to time-bounded fail-safe.
+
+**Failure injected:** `localstripe-mcp` restored first (agent needs upstream to reach `create_refund`), then mock-slack stopped to simulate Slack outage.
+
+**Expected behavior:** `create_refund` triggers `approvalRequired`. Slack notification fails; gateway logs a warning and continues (fail-open already implemented). Redis hold waits out `APPROVAL_LOCK_TTL` (15s for demo). Timeout fires; gateway writes `expired` to audit log; agent receives clean error.
+
+**Config change:** `APPROVAL_LOCK_TTL` moved from hardcoded `5 * time.Minute` in `approval_bridge.go` to an env var. Gateway `docker-compose.yml` sets `APPROVAL_LOCK_TTL: "15s"` for demo purposes.
+
+**Eval case (`evalsuite/resilience.yaml`):**
+```yaml
+- name: approval-timeout-slack-down
+ input: >
+ List my recent charges, then issue a full refund on the first
+ non-refunded charge with reason requested_by_customer.
+ Do not ask for confirmation — proceed directly.
+ mustInclude:
+ - list_recent_charges
+ - create_refund
+ policyOutcome: expired
+```
+
+**What the judge sees:** Slack outage doesn't hang the agent, doesn't panic the gateway, doesn't lose the audit trail. The approval flow degrades to a time-bounded hold with full observability.
+
+---
+
+## Demo Script
+
+**`scripts/demo-resilience.sh`** runs all three scenarios in sequence:
+
+```
+1. docker compose up (full stack, wait for health checks)
+2. [FAULT] docker stop localstripe-mcp
+3. run eval: evalsuite/resilience.yaml case mcp-server-down → PASS
+4. (upstream still down)
+5. run eval: evalsuite/resilience.yaml case policy-deny-upstream-dead → PASS
+6. [RESTORE] docker start localstripe-mcp
+7. [FAULT] docker stop mock-slack
+8. run eval: evalsuite/resilience.yaml case approval-timeout-slack-down → PASS (15s wait)
+9. docker compose down
+10. Print: "3/3 resilience scenarios passed ✓"
+```
+
+Each step prints a `[FAULT INJECTION]` / `[RESTORE]` / `[EVAL]` prefix so terminal output narrates the story for a demo recording.
+
+**Makefile:**
+```makefile
+demo-resilience:
+ @bash scripts/demo-resilience.sh
+```
+
+## Open Questions Resolved
+
+- **Eval runner granularity:** The demo script passes `evalsuite/resilience.yaml` as a dedicated file to the eval runner — no change to the runner needed since it already accepts a file path argument.
+- **`upstream_error` as a valid `policyOutcome` enum:** `allowedPolicyOutcomes` in `cmd/eval-runner/suite.go` currently lists `allow`, `deny`, `approvalRequired`, `expired`. `upstream_error` must be added.
+- **mock-slack service does not exist:** The service was removed in a cleanup commit. Scenario 3 requires adding it as a new minimal Docker Compose service — a small Go or Python HTTP server that accepts `POST /api/chat.postMessage` and auto-approves by default (returns `{"ok":true}`). Stopping it simulates a Slack outage. This is ~1h of additional work within the 13h budget.
+- **`approval_bridge.go` hardcoded timeout:** `timeout: 5 * time.Minute` at line 120 must be extracted to `Config.ApprovalLockTTL` (env var `APPROVAL_LOCK_TTL`, default `5m`). The docker-compose gateway service sets `APPROVAL_LOCK_TTL: "15s"` for demo purposes.
diff --git a/evalsuite/ai-agent.yaml b/evalsuite/ai-agent.yaml
new file mode 100644
index 0000000..a1c1400
--- /dev/null
+++ b/evalsuite/ai-agent.yaml
@@ -0,0 +1,12 @@
+cases:
+ - name: list-charges-allow
+ input: "Show me my recent charges"
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: allow
+
+ - name: refund-intercepted
+ input: "List my recent charges, then immediately issue a full refund on the first non-refunded charge using reason requested_by_customer. Do not ask for confirmation — proceed with the refund directly."
+ mustInclude:
+ - list_recent_charges
+ - create_refund
diff --git a/evalsuite/default.yaml b/evalsuite/default.yaml
index 898d106..6623679 100644
--- a/evalsuite/default.yaml
+++ b/evalsuite/default.yaml
@@ -1,24 +1,24 @@
cases:
- - name: small-refund-allow
- input: small-refund
+ - name: lookup-allow
+ input: lookup-charge
mustInclude:
- - refund_small
+ - list_recent_charges
policyOutcome: allow
- - name: large-refund-approval
- input: large-refund
+ - name: create-refund-approval
+ input: create-refund
mustInclude:
- - refund_large
+ - create_refund
policyOutcome: approvalRequired
- - name: delete-customer-deny
- input: delete-customer
+ - name: deny-test
+ input: deny-test
mustInclude:
- - delete_record
+ - delete_customer
policyOutcome: deny
- - name: slack-pii-redact
- input: slack-pii-message
+ - name: pii-redact
+ input: pii-message
mustInclude:
- send_slack_message
policyOutcome: allow
diff --git a/evalsuite/localstripe-agent.yaml b/evalsuite/localstripe-agent.yaml
new file mode 100644
index 0000000..380b994
--- /dev/null
+++ b/evalsuite/localstripe-agent.yaml
@@ -0,0 +1,36 @@
+cases:
+ - name: list-charges
+ input: "Show me my recent charges."
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: allow
+
+ - name: lookup-single-charge
+ input: "Look up charge ch_fake_001 and tell me its amount and status."
+ mustInclude:
+ - lookup_charge
+ policyOutcome: allow
+
+ - name: list-refunds-for-charge
+ input: "Show me all refunds that have been issued against charge ch_fake_001."
+ mustInclude:
+ - list_refunds_for_charge
+ policyOutcome: allow
+
+ - name: refund-requires-approval
+ input: >
+ List my recent charges, then issue a full refund on the first
+ non-refunded charge with reason requested_by_customer.
+ Do not ask for confirmation — proceed directly.
+ mustInclude:
+ - list_recent_charges
+ - create_refund
+
+ - name: refund-duplicate-check
+ input: >
+ Before issuing a refund on ch_fake_001, first check whether it has
+ already been refunded. Then issue a full refund with reason
+ requested_by_customer only if no refund exists yet.
+ mustInclude:
+ - list_refunds_for_charge
+ - create_refund
diff --git a/evalsuite/resilience-s1.yaml b/evalsuite/resilience-s1.yaml
new file mode 100644
index 0000000..ab5ff03
--- /dev/null
+++ b/evalsuite/resilience-s1.yaml
@@ -0,0 +1,6 @@
+cases:
+ - name: mcp-server-down
+ input: "Show me my recent charges."
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: upstream_error
diff --git a/evalsuite/resilience-s3.yaml b/evalsuite/resilience-s3.yaml
new file mode 100644
index 0000000..72655fc
--- /dev/null
+++ b/evalsuite/resilience-s3.yaml
@@ -0,0 +1,10 @@
+cases:
+ - name: approval-timeout-slack-down
+ input: >
+ List recent charges for alice@example.com, then issue a full refund on
+ the first non-refunded charge with reason requested_by_customer.
+ Do not ask for confirmation — proceed directly.
+ mustInclude:
+ - list_recent_charges
+ - create_refund
+ policyOutcome: expired
diff --git a/evalsuite/resilience.yaml b/evalsuite/resilience.yaml
new file mode 100644
index 0000000..efa5d21
--- /dev/null
+++ b/evalsuite/resilience.yaml
@@ -0,0 +1,16 @@
+cases:
+ - name: mcp-server-down
+ input: "Show me my recent charges."
+ mustInclude:
+ - list_recent_charges
+ policyOutcome: upstream_error
+
+ - name: approval-timeout-slack-down
+ input: >
+ List my recent charges, then issue a full refund on the first
+ non-refunded charge with reason requested_by_customer.
+ Do not ask for confirmation — proceed directly.
+ mustInclude:
+ - list_recent_charges
+ - create_refund
+ policyOutcome: expired
diff --git a/examples/support-agent/agent.py b/examples/support-agent/agent.py
index 111a70b..ec27874 100644
--- a/examples/support-agent/agent.py
+++ b/examples/support-agent/agent.py
@@ -8,10 +8,10 @@
GATEWAY_URL = os.environ["GATEWAY_URL"]
DISPATCH = {
- "small-refund": ("refund_small", {"amount": 50, "customer_id": "cust_001"}),
- "large-refund": ("refund_large", {"amount": 12000, "customer_id": "cust_002"}),
- "delete-customer": ("delete_record", {"customer_id": "cust_003"}),
- "slack-pii-message": ("send_slack_message", {"channel": "#support", "message": "Customer SSN: 123-45-6789"}),
+ "lookup-charge": ("list_recent_charges", {"limit": 1}),
+ "create-refund": ("create_refund", {"charge_or_pi": "ch_fake_001", "reason": "requested_by_customer"}),
+ "deny-test": ("delete_customer", {"customer_id": "cust_001"}),
+ "pii-message": ("send_slack_message", {"channel": "#support", "message": "Customer SSN: 123-45-6789"}),
}
diff --git a/policy.yaml b/policy.yaml
index f319e81..919806b 100644
--- a/policy.yaml
+++ b/policy.yaml
@@ -1,14 +1,4 @@
rules:
- - tool: refund_small
- action: allow
- - tool: refund_large
- action: approvalRequired
- - tool: delete_record
- action: deny
- - tool: send_slack_message
- action: redact
- redactFields:
- - message
- tool: lookup_charge
action: allow
- tool: lookup_payment_intent
@@ -19,6 +9,10 @@ rules:
action: allow
- tool: create_refund
action: approvalRequired
+ - tool: send_slack_message
+ action: redact
+ redactFields:
+ - message
budgets:
maxToolCallsPerTurn: 5
defaultAction: deny
diff --git a/scripts/demo-resilience.sh b/scripts/demo-resilience.sh
new file mode 100755
index 0000000..673e648
--- /dev/null
+++ b/scripts/demo-resilience.sh
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+COMPOSE="docker compose"
+GATEWAY_URL="http://localhost:18080"
+POSTGRES_DSN="postgres://gateway:gateway@127.0.0.1:15432/gateway?sslmode=disable"
+AGENT_URL="http://127.0.0.1:18086"
+
+# eval runner is invoked with EVAL_SKIP_COMPOSE=true so it only runs evals
+# against the already-running stack — this script owns the Docker lifecycle.
+eval_run() {
+ POSTGRES_DSN="$POSTGRES_DSN" \
+ AGENT_URL="$AGENT_URL" \
+ EVAL_SKIP_COMPOSE=true \
+ go run ./cmd/eval-runner "$@" 2>&1 || true
+}
+
+pass() { echo " ✓ $1"; }
+fail() { echo " ✗ $1"; exit 1; }
+section() { echo ""; echo "━━━ $1 ━━━"; }
+
+# ─── Teardown on exit ─────────────────────────────────────────────────────────
+trap '$COMPOSE down -v 2>/dev/null || true' EXIT
+
+section "Starting full stack"
+$COMPOSE up -d --wait
+echo " Stack healthy"
+
+# Warm the gateway's capability cache (initialize + tools/list) while all services
+# are healthy so it can serve cached responses when localstripe-mcp is stopped.
+WARMUP_SESSION=$(curl -s -D - -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -d '{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"warmup","version":"1.0"}}}' \
+ | grep -i "^Mcp-Session-Id:" | awk '{print $2}' | tr -d '\r\n')
+if [ -n "$WARMUP_SESSION" ]; then
+ curl -s -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -H "Mcp-Session-Id: $WARMUP_SESSION" \
+ -d '{"jsonrpc":"2.0","id":1,"method":"tools/list","params":{}}' > /dev/null
+ echo " Gateway capability cache warmed (session $WARMUP_SESSION)"
+fi
+
+# ─── Scenario 1: MCP server crash ─────────────────────────────────────────────
+section "SCENARIO 1 — MCP Server Crash (proxy resilience + eval gate)"
+echo " [FAULT] Stopping localstripe-mcp..."
+$COMPOSE stop localstripe-mcp
+
+echo " Running eval case: mcp-server-down"
+EVAL_RESULT=$(eval_run evalsuite/resilience-s1.yaml)
+
+if echo "$EVAL_RESULT" | grep -q "\[PASS\] mcp-server-down"; then
+ pass "Gateway surfaced clean upstream_error — audit trail preserved"
+else
+ echo "$EVAL_RESULT"
+ fail "Expected mcp-server-down PASS"
+fi
+
+# ─── Scenario 2: Budget limiter stops retry storm ─────────────────────────────
+section "SCENARIO 2 — Budget Limiter (policy gate stops retry storm)"
+echo " [NOTE] MCP server still down — simulating aggressive retry agent..."
+
+SESSION_ID=$(curl -s -D - -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -d '{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"retry-bot","version":"1.0"}}}' \
+ | grep -i "^Mcp-Session-Id:" | awk '{print $2}' | tr -d '\r\n')
+
+if [ -z "$SESSION_ID" ]; then
+ fail "Could not obtain gateway session ID"
+fi
+echo " Session: $SESSION_ID"
+
+TURN_ID="retry-storm-$(date +%s)"
+BUDGET_HIT=false
+
+for i in 1 2 3 4 5 6; do
+ RESP=$(curl -s -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -H "Mcp-Session-Id: $SESSION_ID" \
+ -H "X-Mcp-Turn-Id: $TURN_ID" \
+ -d "{\"jsonrpc\":\"2.0\",\"id\":$i,\"method\":\"tools/call\",\"params\":{\"name\":\"list_recent_charges\",\"arguments\":{}}}")
+ if echo "$RESP" | grep -qi "budget"; then
+ BUDGET_HIT=true
+ echo " Call $i: budgetExceeded (limiter fired)"
+ break
+ else
+ echo " Call $i: upstream_error (retried)"
+ fi
+done
+
+if [ "$BUDGET_HIT" = true ]; then
+ pass "Budget limiter stopped retry storm — agent cannot hammer a downed service"
+else
+ fail "Expected budgetExceeded after 5 upstream_error calls"
+fi
+
+# ─── Scenario 3: Approval timeout (graceful degradation) ──────────────────────
+section "SCENARIO 3 — Approval Flow Timeout (graceful degradation)"
+echo " [RESTORE] Starting localstripe-mcp..."
+$COMPOSE up -d --wait localstripe-mcp
+
+# Ensure localstripe has demo charges so the eval agent can find something to refund.
+docker exec -i toolgate-eval-trigger-1 python3 - <<'PYEOF'
+import asyncio, sys
+sys.path.insert(0, "/app")
+from demo_webapp.stripe_client import StripeClient
+from demo_webapp.seed import seed_demo_customer
+
+async def main():
+ client = StripeClient("http://localstripe:8420", "sk_test_12345")
+ try:
+ cust = await client.find_customer_by_email("alice@example.com")
+ if cust is None:
+ cust = await client.create_customer("alice@example.com", "Alice")
+ await seed_demo_customer(client, cust["id"])
+ print(" Seeded alice@example.com with demo charges")
+ else:
+ print(" alice@example.com already seeded")
+ finally:
+ await client.aclose()
+
+asyncio.run(main())
+PYEOF
+
+# Re-warm gateway's upstream session after mcp restart so the eval-trigger
+# connection hits a valid upstream session rather than triggering stale-session
+# revalidation mid-flight.
+S3_WARMUP=$(curl -s -D - -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -d '{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"protocolVersion":"2025-03-26","capabilities":{},"clientInfo":{"name":"warmup-s3","version":"1.0"}}}' \
+ | grep -i "^Mcp-Session-Id:" | awk '{print $2}' | tr -d '\r\n')
+if [ -n "$S3_WARMUP" ]; then
+ curl -s -X POST "$GATEWAY_URL/mcp" \
+ -H "Content-Type: application/json" \
+ -H "Mcp-Session-Id: $S3_WARMUP" \
+ -d '{"jsonrpc":"2.0","id":1,"method":"tools/list","params":{}}' > /dev/null
+ echo " Gateway upstream session refreshed (session $S3_WARMUP)"
+fi
+
+echo " [FAULT] Stopping mock-slack..."
+$COMPOSE stop mock-slack
+
+echo " Running eval case: approval-timeout-slack-down (waiting up to 90s for timeout...)"
+EVAL_RESULT=$(eval_run evalsuite/resilience-s3.yaml)
+
+if echo "$EVAL_RESULT" | grep -q "\[PASS\] approval-timeout-slack-down"; then
+ pass "Slack outage did not hang or panic — approval expired gracefully after 15s"
+else
+ echo "$EVAL_RESULT"
+ fail "Expected approval-timeout-slack-down PASS"
+fi
+
+# ─── Summary (teardown handled by trap) ───────────────────────────────────────
+echo ""
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
+echo " 3/3 resilience scenarios passed"
+echo " ToolGate held under: MCP crash . retry storm . Slack outage"
+echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"