Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
99ebae5
cleaned up fake infrastructure (fake mcp, fake upstream, old placehol…
Tom-Shuhong-Tang May 26, 2026
5329a0e
chore: update localstripe_demo to b2d7273 (eval-trigger service)
Tom-Shuhong-Tang May 26, 2026
48d1eff
fixed eval to run test
Tom-Shuhong-Tang May 28, 2026
cd00680
feat: add eval web UI and custom eval endpoint
Tom-Shuhong-Tang May 28, 2026
dfb44e1
chore: update localstripe_demo to 9fc10bc (seed entrypoint + demo cha…
Tom-Shuhong-Tang May 28, 2026
a35dbed
docs: add TrueFoundry resilience pivot design spec
henryqingmo May 28, 2026
4c608a0
docs: add TrueFoundry resilience pivot implementation plan
henryqingmo May 28, 2026
2c3fb8d
feat(eval-runner): accept upstream_error policyOutcome
henryqingmo May 28, 2026
50c0c74
feat(gateway): write upstream_error audit record on forwarder failure
henryqingmo May 28, 2026
489a0ec
feat(gateway): write expired audit record on approval timeout
henryqingmo May 28, 2026
33457b3
feat(gateway): make approval timeout configurable via APPROVAL_LOCK_T…
henryqingmo May 28, 2026
6b618eb
feat(compose): add mock-slack service and wire APPROVAL_LOCK_TTL + SL…
henryqingmo May 28, 2026
396a831
feat(evalsuite): add resilience eval cases for mcp-down and approval-…
henryqingmo May 28, 2026
56fdf00
feat: add demo-resilience script and make target for TrueFoundry subm…
henryqingmo May 28, 2026
0c471a8
fix(eval-runner): require policyOutcome field in eval cases
henryqingmo May 28, 2026
8f62c6f
Apply suggestion from @gemini-code-assist[bot]
henryqingmo May 28, 2026
d183066
Apply suggestion from @gemini-code-assist[bot]
henryqingmo May 28, 2026
aefa200
Apply suggestion from @gemini-code-assist[bot]
henryqingmo May 28, 2026
49bed8e
fix: resolve port conflict in demo-resilience script
henryqingmo May 28, 2026
d351d23
feat: cache initialize/tools-list responses for upstream-down resilience
henryqingmo May 28, 2026
86172cf
fix: warm gateway capability cache before fault injection
henryqingmo May 28, 2026
b47751b
fix: add eval-trigger healthcheck and Makefile build-compose-bins target
henryqingmo May 28, 2026
cb592c6
fix: poll audit_log until terminal decision record appears
henryqingmo May 28, 2026
21b0362
fix: allow upstream_error and expired in audit_log decision constraint
henryqingmo May 28, 2026
653291e
fix: remove timeout command (not available on macOS)
henryqingmo May 28, 2026
854456d
fix: wait for localstripe-mcp health before Scenario 3
henryqingmo May 28, 2026
7c4ebb3
fix: make demo-resilience 3/3 pass — seed data, per-scenario YAML, se…
henryqingmo May 28, 2026
f0e596c
merge: integrate remote Gemini suggestions, keep tested resilience fixes
henryqingmo May 28, 2026
c857c9d
Merge remote-tracking branch 'origin/main' into eval-gate2
henryqingmo May 28, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
.PHONY: demo
.PHONY: demo demo-resilience

# demo: run the eval-runner against evalsuite/default.yaml.
# The compose file path, Postgres DSN, and agent URL are passed to the runner
# through the environment variables set on the command line below.
demo:
EVAL_COMPOSE_FILE=deploy/docker-compose.yml \
POSTGRES_DSN=postgres://gateway:gateway@127.0.0.1:15432/gateway?sslmode=disable \
AGENT_URL=http://127.0.0.1:18085 \
go run ./cmd/eval-runner evalsuite/default.yaml

# demo-resilience: run scripts/demo-resilience.sh after build-compose-bins has
# been made. The leading @ keeps make from echoing the command.
demo-resilience: build-compose-bins
@bash scripts/demo-resilience.sh

# build-compose-bins: cross-compile ./cmd/gateway for Linux (same CPU
# architecture as the host Go toolchain, CGO disabled) into .compose-bin/gateway.
# Declared phony because the target names an action rather than a file it
# produces; without this, a file or directory called "build-compose-bins"
# would make the rule appear up to date and skip the build.
.PHONY: build-compose-bins
build-compose-bins:
	@mkdir -p .compose-bin
	@GOOS=linux GOARCH=$$(go env GOARCH) CGO_ENABLED=0 go build -o .compose-bin/gateway ./cmd/gateway
2 changes: 2 additions & 0 deletions cmd/eval-runner/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ type Config struct {
PostgresDSN string
ComposeFile string
AgentURL string
SkipCompose bool
}

func LoadConfig() (*Config, error) {
Expand All @@ -29,6 +30,7 @@ func LoadConfig() (*Config, error) {
PostgresDSN: postgresDSN,
ComposeFile: envStringWithInfoNotice("EVAL_COMPOSE_FILE", defaultComposeFilePath, "using default compose file path"),
AgentURL: agentURL,
SkipCompose: os.Getenv("EVAL_SKIP_COMPOSE") == "true",
}, nil
}

Expand Down
19 changes: 18 additions & 1 deletion cmd/eval-runner/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,31 @@ type evalRunnerDeps struct {
}

func main() {
args := os.Args[1:]

if len(args) > 0 && args[0] == "--serve" {
suitePath := defaultSuitePath
if len(args) > 1 {
suitePath = args[1]
}
if err := serve(suitePath); err != nil {
_, _ = fmt.Fprintln(os.Stderr, err.Error())
os.Exit(1)
}
return
}

os.Exit(run(evalRunnerDeps{
args: os.Args[1:],
args: args,
stdout: os.Stdout,
stderr: os.Stderr,
lookPath: exec.LookPath,
loadConfig: LoadConfig,
loadSuite: LoadSuite,
newOrch: func(cfg *Config) stackOrchestrator {
if cfg.SkipCompose {
return noopOrchestrator{}
}
return NewOrchestrator(cfg.ComposeFile, defaultComposeProjectName)
},
openDB: openPostgresPool,
Expand Down
5 changes: 5 additions & 0 deletions cmd/eval-runner/orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@ func (o *Orchestrator) runCompose(ctx context.Context, args ...string) (string,
return combined.String(), nil
}

// noopOrchestrator is an orchestrator whose lifecycle methods do nothing.
// main returns it from newOrch when cfg.SkipCompose is set, so the runner
// neither brings the compose stack up nor tears it down.
type noopOrchestrator struct{}

// Up reports success without starting anything.
func (noopOrchestrator) Up(context.Context) error {
	return nil
}

// Down reports success without stopping anything.
func (noopOrchestrator) Down(context.Context) error {
	return nil
}

func tailLines(text string, count int) string {
lines := strings.Split(strings.TrimSpace(text), "\n")
if len(lines) == 0 || lines[0] == "" {
Expand Down
29 changes: 28 additions & 1 deletion cmd/eval-runner/runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
"github.com/jackc/pgx/v5/pgxpool"
)

const caseRunnerHTTPTimeout = 60 * time.Second
const caseRunnerHTTPTimeout = 90 * time.Second

type CaseRunner struct {
AgentBaseURL string
Expand All @@ -31,12 +31,39 @@ func NewCaseRunner(agentBaseURL string, db *pgxpool.Pool) *CaseRunner {
}
}

const auditPollInterval = 300 * time.Millisecond
const auditPollTimeout = 30 * time.Second

// Run triggers the agent with the case input and returns the audit trace
// recorded for the resulting session.
//
// Audit rows are written asynchronously, so the trace is re-queried every
// auditPollInterval until its last row's decision equals c.PolicyOutcome.
// Once auditPollTimeout has elapsed the most recently read rows are returned
// as-is with a nil error, leaving it to the caller to judge the mismatch.
// A trigger or query failure is returned immediately, and cancellation of ctx
// while waiting between polls returns ctx.Err().
func (r *CaseRunner) Run(ctx context.Context, c EvalCase) ([]TraceRow, error) {
	sessionID, err := r.trigger(ctx, c.Input)
	if err != nil {
		return nil, err
	}

	giveUpAt := time.Now().Add(auditPollTimeout)
	for {
		rows, queryErr := r.queryTrace(ctx, sessionID)
		if queryErr != nil {
			return nil, queryErr
		}

		// The terminal record (e.g. upstream_error written after allow) is
		// expected to be the last row once the writer has flushed.
		settled := len(rows) > 0 && rows[len(rows)-1].Decision == c.PolicyOutcome
		if settled || time.Now().After(giveUpAt) {
			return rows, nil
		}

		select {
		case <-time.After(auditPollInterval):
			continue
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}
}

func (r *CaseRunner) queryTrace(ctx context.Context, sessionID string) ([]TraceRow, error) {
rows, err := r.DB.Query(
ctx,
`SELECT tool_name, decision, arguments
Expand Down
170 changes: 170 additions & 0 deletions cmd/eval-runner/serve.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package main

import (
"context"
_ "embed"
"encoding/json"
"fmt"
"log/slog"
"net/http"
"os"
"strings"

"github.com/jackc/pgx/v5/pgxpool"
)

//go:embed ui.html
var uiHTML []byte

// evalResponse is the JSON document makeEvalHandler writes when the client
// asks for JSON through the Accept header.
type evalResponse struct {
// Passed is true when every case passed (PassCount equals TotalCount).
Passed bool `json:"passed"`
// PassCount is the number of cases whose result has Passed set.
PassCount int `json:"pass_count"`
// TotalCount is the number of case results in Cases.
TotalCount int `json:"total_count"`
// Cases holds one result per case in the suite, in suite order.
Cases []CaseResult `json:"cases"`
// Report is the output of GenerateReport for the same results.
Report string `json:"report"`
}

func serve(suitePath string) error {
cfg, err := LoadConfig()
if err != nil {
return err
}

suite, err := LoadSuite(suitePath)
if err != nil {
return fmt.Errorf("load suite: %w", err)
}

ctx := context.Background()
db, err := openPostgresPool(ctx, cfg.PostgresDSN)
if err != nil {
return fmt.Errorf("connect to postgres: %w", err)
}
defer db.Close()

pool, ok := db.(*pgxpool.Pool)
if !ok {
return fmt.Errorf("database connection is not a *pgxpool.Pool")
}
runner := NewCaseRunner(cfg.AgentURL, pool)

// AI agent runner — optional, only active when AI_AGENT_URL is set
aiAgentURL := os.Getenv("AI_AGENT_URL")
var aiRunner caseExecutor
var aiSuite *EvalSuite
if aiAgentURL != "" {
aiRunner = NewCaseRunner(aiAgentURL, pool)
aiSuitePath := os.Getenv("AI_SUITE_PATH")
if aiSuitePath == "" {
aiSuitePath = "evalsuite/ai-agent.yaml"
}
aiSuite, err = LoadSuite(aiSuitePath)
if err != nil {
return fmt.Errorf("load AI suite: %w", err)
}
}

port := os.Getenv("EVAL_SERVE_PORT")
if port == "" {
port = "8099"
}

http.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
_, _ = w.Write(uiHTML)
})

http.HandleFunc("POST /run-eval", makeEvalHandler(runner, suite, pool))

http.HandleFunc("POST /run-eval/ai", func(w http.ResponseWriter, r *http.Request) {
if aiRunner == nil {
http.Error(w, `{"error":"AI_AGENT_URL not configured"}`, http.StatusServiceUnavailable)
return
}
makeEvalHandler(aiRunner, aiSuite, pool)(w, r)
})

http.HandleFunc("POST /run-eval/custom", makeCustomEvalHandler(pool))

http.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
})

slog.Info("eval server listening", "port", port)
return http.ListenAndServe(":"+port, nil)
Comment on lines +72 to +94

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

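Using http.HandleFunc registers handlers on the global http.DefaultServeMux, and passing nil to http.ListenAndServe serves that same global mux. This is a security risk: any package in the dependency tree can register routes on it (for example, importing net/http/pprof adds /debug/pprof/ handlers as a side effect), and those routes would then be exposed on this port.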

Consider using a local http.NewServeMux to isolate your routes.

	mux := http.NewServeMux()
	mux.HandleFunc("GET /", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "text/html; charset=utf-8")
		_, _ = w.Write(uiHTML)
	})

	mux.HandleFunc("POST /run-eval", makeEvalHandler(runner, suite, pool))

	mux.HandleFunc("POST /run-eval/ai", func(w http.ResponseWriter, r *http.Request) {
		if aiRunner == nil {
			http.Error(w, `{"error":"AI_AGENT_URL not configured"}`, http.StatusServiceUnavailable)
			return
		}
		makeEvalHandler(aiRunner, aiSuite, pool)(w, r)
	})

	mux.HandleFunc("POST /run-eval/custom", makeCustomEvalHandler(pool))

	mux.HandleFunc("GET /healthz", func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(http.StatusOK)
	})

	slog.Info("eval server listening", "port", port)
	return http.ListenAndServe(":"+port, mux)

}

// maxCustomEvalBodyBytes caps the size of a POST /run-eval/custom request
// body. The body carries a YAML suite inline, so 1 MiB leaves ample headroom.
const maxCustomEvalBodyBytes = 1 << 20

// makeCustomEvalHandler returns a handler that runs a caller-supplied suite
// against a caller-supplied agent.
//
// The request body is a JSON object with two required string fields:
// "suite" (the suite document, parsed with LoadSuiteFromReader) and
// "agent_url" (the base URL handed to NewCaseRunner). A malformed body, a
// missing field, or a suite that fails to load is answered with 400 Bad
// Request. On success the request is delegated to makeEvalHandler, which
// chooses the response format.
func makeCustomEvalHandler(pool *pgxpool.Pool) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		// The body comes from an arbitrary client; bound it so an oversized
		// or never-ending upload cannot be buffered into memory. Exceeding
		// the cap surfaces as a decode error below.
		r.Body = http.MaxBytesReader(w, r.Body, maxCustomEvalBodyBytes)

		var body struct {
			Suite    string `json:"suite"`
			AgentURL string `json:"agent_url"`
		}
		if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
			http.Error(w, fmt.Sprintf("invalid request: %v", err), http.StatusBadRequest)
			return
		}
		if body.AgentURL == "" {
			http.Error(w, "missing agent_url", http.StatusBadRequest)
			return
		}
		if body.Suite == "" {
			http.Error(w, "missing suite", http.StatusBadRequest)
			return
		}

		suite, err := LoadSuiteFromReader(strings.NewReader(body.Suite))
		if err != nil {
			http.Error(w, fmt.Sprintf("invalid suite: %v", err), http.StatusBadRequest)
			return
		}

		// A fresh runner per request: the agent URL is request-specific.
		runner := NewCaseRunner(body.AgentURL, pool)
		makeEvalHandler(runner, suite, pool)(w, r)
	}
}

// makeEvalHandler returns a handler that runs every case in suite through
// runner and writes the outcome.
//
// Cases run sequentially under the request's context. A case whose Run call
// fails is recorded as a failed result with a single "run" check failure
// carrying the error text; otherwise the trace is scored with Evaluate.
//
// The response is an evalResponse encoded as JSON when the request's Accept
// header lists application/json, and the plain-text GenerateReport output
// otherwise. The pool parameter is unused and kept for signature stability.
func makeEvalHandler(runner caseExecutor, suite *EvalSuite, _ *pgxpool.Pool) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		results := make([]CaseResult, 0, len(suite.Cases))
		passCount := 0
		for _, testCase := range suite.Cases {
			var result CaseResult
			trace, err := runner.Run(r.Context(), testCase)
			if err != nil {
				result = CaseResult{
					Name: testCase.Name,
					Failures: []CheckFailure{{
						Check:    "run",
						Expected: "case completes successfully",
						Observed: err.Error(),
					}},
				}
			} else {
				result = Evaluate(testCase, trace)
			}
			if result.Passed {
				passCount++
			}
			results = append(results, result)
		}

		report := GenerateReport(results)

		if acceptsJSON(r.Header.Values("Accept")) {
			resp := evalResponse{
				Passed:     passCount == len(results),
				PassCount:  passCount,
				TotalCount: len(results),
				Cases:      results,
				Report:     report,
			}
			w.Header().Set("Content-Type", "application/json")
			// The status line is already committed once encoding starts, so a
			// failure here (typically a dropped connection) can only be logged.
			if err := json.NewEncoder(w).Encode(resp); err != nil {
				slog.Warn("write eval response", "error", err)
			}
			return
		}

		w.Header().Set("Content-Type", "text/plain")
		// Best effort: a write failure means the client has gone away.
		_, _ = fmt.Fprint(w, report)
	}
}

// acceptsJSON reports whether any of the given Accept header values lists the
// application/json media type. Each value is split on commas, parameters such
// as ";q=0.9" are ignored, and the comparison is case-insensitive.
func acceptsJSON(acceptValues []string) bool {
	for _, value := range acceptValues {
		for _, mediaRange := range strings.Split(value, ",") {
			mediaType, _, _ := strings.Cut(mediaRange, ";")
			if strings.EqualFold(strings.TrimSpace(mediaType), "application/json") {
				return true
			}
		}
	}
	return false
}
14 changes: 12 additions & 2 deletions cmd/eval-runner/suite.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"fmt"
"io"
"os"

"gopkg.in/yaml.v3"
Expand All @@ -12,6 +13,7 @@ var allowedPolicyOutcomes = map[string]struct{}{
"deny": {},
"approvalRequired": {},
"expired": {},
"upstream_error": {},
}

func LoadSuite(path string) (*EvalSuite, error) {
Expand All @@ -21,10 +23,18 @@ func LoadSuite(path string) (*EvalSuite, error) {
}
defer func() { _ = file.Close() }()

var suite EvalSuite
if err := yaml.NewDecoder(file).Decode(&suite); err != nil {
suite, err := LoadSuiteFromReader(file)
if err != nil {
return nil, fmt.Errorf("parse eval suite %q: %w", path, err)
}
return suite, nil
}

func LoadSuiteFromReader(r io.Reader) (*EvalSuite, error) {
var suite EvalSuite
if err := yaml.NewDecoder(r).Decode(&suite); err != nil {
return nil, fmt.Errorf("parse eval suite: %w", err)
}

for i, evalCase := range suite.Cases {
if evalCase.Name == "" {
Expand Down
19 changes: 19 additions & 0 deletions cmd/eval-runner/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,25 @@ func TestLoadSuiteLoadsRepoDefaultFixtureFromRepoRoot(t *testing.T) {
}
}

// TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome writes a one-case suite whose
// policyOutcome is "upstream_error" to a temp file and checks that LoadSuite
// returns no error and preserves that value on the parsed case.
func TestLoadSuiteAcceptsUpstreamErrorPolicyOutcome(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "suite.yaml")
writeTestFile(t, path, `
cases:
- name: mcp-down
input: "Show me my recent charges."
mustInclude: [list_recent_charges]
policyOutcome: upstream_error
`)
suite, err := LoadSuite(path)
if err != nil {
t.Fatalf("LoadSuite() error = %v, want nil", err)
}
if suite.Cases[0].PolicyOutcome != "upstream_error" {
t.Fatalf("PolicyOutcome = %q, want upstream_error", suite.Cases[0].PolicyOutcome)
}
}

func writeTestFile(t *testing.T, path string, contents string) {
t.Helper()
if err := os.WriteFile(path, []byte(strings.TrimLeft(contents, "\n")), 0o600); err != nil {
Expand Down
12 changes: 6 additions & 6 deletions cmd/eval-runner/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ type TraceRow struct {
}

type CheckFailure struct {
Check string
Expected string
Observed string
Check string `json:"check"`
Expected string `json:"expected"`
Observed string `json:"observed"`
}

type CaseResult struct {
Name string
Passed bool
Failures []CheckFailure
Name string `json:"name"`
Passed bool `json:"passed"`
Failures []CheckFailure `json:"failures"`
}
Loading
Loading