130 changes: 130 additions & 0 deletions .github/workflows/zfp-bench.yml
@@ -0,0 +1,130 @@
name: zfp-bench

# Runs the Vigilo ScaBench regression suite on every push to the ZFP branches
# and on PRs into main. Fails the job if the valid-finding rate regresses more
# than 2% vs the recorded baseline.
#
# The bench runner uses `packages/bench`, which scores Vigilo against
# Code4rena ground truth. This workflow does NOT invoke live LLMs — it
# replays previously-cached audit outputs and re-scores them. Live-LLM
# regression is a separate nightly workflow (not shipped in this PR — see
# roadmap).

on:
  push:
    branches: [main, "zfp-*"]
  pull_request:
    branches: [main]
  workflow_dispatch:
    inputs:
      baseline_ref:
        description: "Git ref to compare against"
        required: false
        default: "main"

permissions:
  contents: read
  pull-requests: write

jobs:
  bench:
    runs-on: ubuntu-latest
    timeout-minutes: 25
    defaults:
      run:
        working-directory: packages/bench

    steps:
      - uses: actions/checkout@v5
        with:
          fetch-depth: 0

      - uses: oven-sh/setup-bun@v2
        with:
          bun-version: "1.3.12"

      - uses: actions/setup-node@v5
        with:
          node-version: "22"

      # bun install has a name conflict with the `install` script slot on this
      # bun version — use npm for dependency install.
      - name: install deps
        run: npm ci --no-audit --no-fund

      - name: typecheck
        run: npx tsc --noEmit

      - name: build bench runner
        run: npm run build

      - name: verify bench CLI
        run: node dist/cli.js --help

      # ── Replay-only regression (fast, no live LLM) ────────────────────────
      - name: run ScaBench replay
        id: bench
        run: |
          node dist/cli.js run \
            --dataset ./data/dataset.json \
            --baselines ./data/baselines \
            --out ./data/results-current.json \
            --mode replay \
            2>&1 | tee bench-output.log
          # Extract headline metrics for step summary
          node dist/cli.js summarize \
            --results ./data/results-current.json \
            --out ./data/summary.md \
            || echo "summary step skipped (no summarize subcommand)"

      - name: post summary
        if: always()
        run: |
          if [ -f ./data/summary.md ]; then
            cat ./data/summary.md >> "$GITHUB_STEP_SUMMARY"
          else
            echo "## Bench output" >> "$GITHUB_STEP_SUMMARY"
            echo '```' >> "$GITHUB_STEP_SUMMARY"
            tail -60 bench-output.log >> "$GITHUB_STEP_SUMMARY"
            echo '```' >> "$GITHUB_STEP_SUMMARY"
          fi

      - name: regression gate
        env:
          BENCH_MAX_REGRESSION_PCT: "2"
        run: |
          if [ ! -f ./data/baseline-summary.json ]; then
            echo "::notice::No baseline recorded yet — skipping regression gate"
            exit 0
          fi
          # --input-type=module is required: the inline script uses ESM imports
          node --input-type=module - <<'JS'
          import { readFileSync } from "node:fs"
          const maxRegressionPct = Number(process.env.BENCH_MAX_REGRESSION_PCT || "2")
          const base = JSON.parse(readFileSync("./data/baseline-summary.json", "utf8"))
          const curr = JSON.parse(readFileSync("./data/results-current.json", "utf8"))
          // Score shape depends on bench CLI output. Guard for missing fields.
          const baseRate = Number(base.validFindingRate ?? base.valid_rate ?? 0)
          const currRate = Number(curr.validFindingRate ?? curr.valid_rate ?? 0)
          if (!Number.isFinite(baseRate) || !Number.isFinite(currRate) || baseRate === 0) {
            console.log(`No usable baseline (base=${baseRate}, curr=${currRate}) — skipping gate`)
            process.exit(0)
          }
          const delta = ((currRate - baseRate) / baseRate) * 100
          console.log(`Baseline valid-rate: ${(baseRate * 100).toFixed(2)}%`)
          console.log(`Current valid-rate: ${(currRate * 100).toFixed(2)}%`)
          console.log(`Delta: ${delta >= 0 ? "+" : ""}${delta.toFixed(2)}%`)
          if (delta < -maxRegressionPct) {
            console.error(`::error::Valid-finding rate regressed ${delta.toFixed(2)}% (gate: -${maxRegressionPct}%)`)
            process.exit(1)
          }
          JS

      - name: upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: zfp-bench-results-${{ github.run_id }}
          path: |
            packages/bench/data/results-current.json
            packages/bench/data/summary.md
            packages/bench/bench-output.log
          retention-days: 30
1 change: 1 addition & 0 deletions .gitignore
@@ -36,3 +36,4 @@ coverage/
reference/
nul
.sisyphus/
.omc/
222 changes: 222 additions & 0 deletions docs/INSTALL-LOCAL.md
@@ -0,0 +1,222 @@
# Local Vigilo Development — pointing OpenCode / Claude Code at the local build

This guide wires a local Vigilo source tree (e.g. `zfp-overhaul` branch) into
an existing OpenCode / opencode-web3 / Claude Code session so you can iterate
on agents, skills, and routing without publishing to npm.

## Prerequisites

- `bun ≥ 1.3.12`
- `node ≥ 22`
- `forge ≥ 1.5`
- (optional) `slither`, `halmos`, `medusa`, `semgrep`, `aderyn`
- Live worktree at `/home/void/Vigilo-zfp` (or your chosen path)

## 1 — Build the plugin

```bash
cd /home/void/Vigilo-zfp/packages/opencode
npm ci              # the `install` script slot shadows `bun install` on bun 1.3
bun build.mjs # uses Bun.build() API (see note below)
npx tsc --noEmit # typecheck
```

### Note: bun script-name conflict

The `build` script in `package.json` and the `bun build` CLI subcommand
conflict on bun ≥ 1.3. This repo's `build.mjs` sidesteps the conflict by
using `Bun.build()` + `npx tsc` directly. Run `bun build.mjs`, not
`bun run build`.

## 2 — Option A: symlink into opencode-web3

```bash
# Back up your config
cp ~/.config/opencode-web3/opencode/opencode.json{,.bak}

# Edit opencode.json — replace "vigilo@latest" with local file reference
```

Replace the plugin line in `~/.config/opencode-web3/opencode/opencode.json`:

```diff
  "plugin": [
    "opencode-claude-auth",
    "opencode-openai-codex-auth",
-   "vigilo@latest"
+   "file:/home/void/Vigilo-zfp/packages/opencode"
  ],
```

Restart opencode-web3. The local build is now loaded.
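If you prefer not to hand-edit the file, the swap can be scripted with `jq`
(a sketch, assuming the `plugin` array shape shown above and that `jq` is
installed):

```shell
# Hypothetical helper: swap the published plugin for the local checkout.
cfg="$HOME/.config/opencode-web3/opencode/opencode.json"
cp "$cfg" "$cfg.bak"
jq '.plugin |= map(if . == "vigilo@latest"
                   then "file:/home/void/Vigilo-zfp/packages/opencode"
                   else . end)' "$cfg.bak" > "$cfg"
```

The `|=` update keeps every other plugin entry untouched, so the edit is safe
to re-run.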

## 3 — Option B: Claude Code plugin path

Claude Code auto-discovers agents from `packages/claude/agents/*.md`. Point
at the local plugin via `~/.claude/settings.json`:

```jsonc
{
  "extraKnownMarketplaces": {
    "vigilo-local": {
      "source": {
        "source": "local",
        "path": "/home/void/Vigilo-zfp/packages/claude"
      }
    }
  }
}
```

Then run `/plugin install vigilo@vigilo-local` from a Claude Code session.

## 4 — Verify new agents are registered

From an OpenCode / Claude Code session:

```
/agents list
```

Expected new agents (9):

- `verifier`
- `judge` (and `judge-gpt` variant once wired)
- `griller`
- `poc-generator`
- `patcher`
- `re-verifier`
- `economic-auditor`
- `invariant-tester`
- `dup-detector`

Plus existing: `vigilo`, `quaestor`, `explorator`, `speculator`, and the 8
specialist auditors.
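If any of the new agents are missing from the list, a quick filesystem check
(a sketch, assuming agent filenames match the agent names) narrows down
whether the definition file or the plugin wiring is at fault:

```shell
# Each agent should have a markdown definition in the linked plugin path
agents_dir=/home/void/Vigilo-zfp/packages/claude/agents
for a in verifier judge griller poc-generator patcher \
         re-verifier economic-auditor invariant-tester dup-detector; do
  [ -f "$agents_dir/$a.md" ] || echo "missing definition: $a"
done
```

No output means all nine definitions are present; otherwise the problem is in
the source tree, not the plugin config.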

## 5 — Run a smoke audit on alchemix-v3

```bash
cd /home/void/alchemix-v3

# Run the Phase 2.5 static pre-pass alone (no LLM cost)
/home/void/Vigilo-zfp/packages/claude/scripts/static-prepass.sh .
cat .vigilo/prepass.md

# Full audit (live LLMs — budget ~$3-8 per run for alchemix-v3 size)
# From opencode-web3 / Claude Code:
/audit
```

Expected pipeline:

1. Phase -1 classify → FULL_AUDIT
2. Phase 0 scope (scope.md already exists)
3. Phase 1 recon (explorator + speculator parallel)
4. Phase 1.5 risk-priority map
5. Phase 2 deep analysis (reentrancy + oracle + economic + … — parallel ≤3)
6. **Phase 2.5 static pre-pass** (parallel, non-blocking)
7. **Phase 3 ZFP pipeline** — PoC → verifier → dup-check → judge → griller →
patcher → re-verifier
8. Phase 4 quality review
9. Phase 5 report → `.vigilo/reports/`

## 6 — Compare to prior findings

alchemix-v3 already has a `.vigilo/` from a prior run. Snapshot it before
launching the ZFP audit, then snapshot the new output afterwards:

```bash
# Before the ZFP audit: preserve the prior run
cp -r .vigilo .vigilo.prior

# After the ZFP audit: snapshot the new output
cp -r .vigilo .vigilo.zfp

# Diff
diff -r .vigilo.prior/findings .vigilo.zfp/findings | head -60
```

Metrics to extract:

- New findings vs prior (potential improvement)
- Prior findings dropped by ZFP (potential FP rejection or quality gate)
- Severity distribution shift
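The first two metrics can be pulled with `comm` (a sketch, assuming findings
are stored one file per finding under `findings/`):

```shell
# New findings: present in the ZFP run but not the prior one (and vice versa)
comm -13 <(ls .vigilo.prior/findings | sort) \
         <(ls .vigilo.zfp/findings | sort) > new-findings.txt
comm -23 <(ls .vigilo.prior/findings | sort) \
         <(ls .vigilo.zfp/findings | sort) > dropped-findings.txt
wc -l new-findings.txt dropped-findings.txt
```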

## 7 — Configure the corpus (optional but recommended)

```bash
# Bootstrap ~/.vigilo-corpus/ with top-60 C4 + 60 Sherlock findings repos
python3 packages/claude/scripts/corpus-ingest.py --top-n 60 --workers 12

# Stats
packages/claude/scripts/corpus-stats.sh

# Test query
python3 packages/claude/scripts/dup-query.py \
--title "Reentrancy in withdraw" --protocol vault --k 5
```

## 8 — Configure pgvector (optional, v2 semantic dup-detect)

```bash
# pgvector container (already running if set up during install)
docker run -d --name vigilo-pgvector \
-e POSTGRES_PASSWORD=vigilo -e POSTGRES_DB=vigilo \
-p 5433:5432 pgvector/pgvector:pg17

# Initialize schema
packages/claude/scripts/corpus-bootstrap.sh --pgvector
```

Connection string: `postgres://postgres:vigilo@localhost:5433/vigilo`

## 9 — Troubleshooting

### "agent `verifier` not found"
- Check `/agents list` — if missing, verify plugin is loaded (`/plugin list`)
- Restart opencode session after changing config
- Confirm `packages/claude/agents/verifier.md` exists in the linked path

### Slither compile error
The default filter `(/|^)(test|mock|script|lib|node_modules)(/|$)` excludes
common test paths; nested test dirs (e.g. `src/test/`) match it as well, and
Foundry test files elsewhere are caught by the `\.t\.sol$` suffix rule. If
Slither still fails on `Type not found`, it is likely a project-specific
crytic-compile issue — configure `slither.config.json` at the project root.
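A minimal `slither.config.json` sketch that mirrors the default filter (the
`filter_paths` key name follows Slither's config format):

```json
{
  "filter_paths": "(/|^)(test|mock|script|lib|node_modules)(/|$)"
}
```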

### `bun install` fails with "Script not found"
Use `npm ci` or `npm install` — on bun ≥ 1.3 this repo's `install` script
slot shadows the `bun install` subcommand, so bun tries to run the script
instead of installing dependencies.

### OpenCode doesn't pick up local changes
- Rebuild: `cd packages/opencode && bun build.mjs`
- Clear OpenCode plugin cache (location depends on version)
- Restart opencode-web3

## 10 — Run benchmark locally

```bash
cd packages/bench
npm ci
npm run build
node dist/cli.js --help
node dist/cli.js run --dataset ./data/dataset.json --baselines ./data/baselines \
--out ./data/results-local.json --mode replay
```
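The CI regression gate reads `./data/baseline-summary.json`, which nothing
generates automatically. A sketch for recording the local replay run as the
baseline (assumes the results file exposes `validFindingRate` or
`valid_rate`, the same fields the gate checks):

```shell
# Record the replay run as the baseline the CI regression gate compares against
jq '{validFindingRate: (.validFindingRate // .valid_rate)}' \
  ./data/results-local.json > ./data/baseline-summary.json
cat ./data/baseline-summary.json
```

Commit the resulting file so the workflow's gate stops skipping itself.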

## 11 — Cost budgeting

Expected LLM spend per full audit with new ZFP pipeline:

| Role | Calls/finding | Model | Est. cost/call |
|------|---------------|-------|----------------|
| Specialist auditors | 1 | Sonnet 4.6 | $0.15 |
| poc-generator | 1–3 | gpt-5.2-codex high | $0.08 |
| verifier | 1 | Opus 4.6 xhigh | $0.40 |
| judge | 1 | Opus 4.6 xhigh | $0.20 |
| griller | 3 rounds | Opus 4.6 **max** | $0.60 × 3 |
| patcher | 1–2 | gpt-5.2-codex high | $0.05 |
| re-verifier | 1 | Opus 4.5 high | $0.15 |
| dup-detector | 1 | Haiku 4.5 | $0.01 |

Per **candidate finding**: ~$3 end-to-end. Per full audit (~10 candidates):
~$30. Rejected findings save griller cost (~$1.80 saved per reject).
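The per-finding figure can be sanity-checked from the table, taking midpoints
for the 1–3 and 1–2 call ranges:

```shell
awk 'BEGIN {
  # specialist + poc (2 calls) + verifier + judge + griller (3 rounds)
  # + patcher (1.5 calls) + re-verifier + dup-detector
  total = 0.15 + 2 * 0.08 + 0.40 + 0.20 + 3 * 0.60 + 1.5 * 0.05 + 0.15 + 0.01
  printf "per-finding estimate: $%.2f\n", total
}'
```

The griller's three Opus rounds account for well over half of the total,
which is why the gate below singles it out.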

Budget the griller carefully — it's the single most expensive role. Disable
via `--no-grill` flag if iterating on non-Critical findings.