From 39e287cbc99765998c446d9e5476b87a56cb35aa Mon Sep 17 00:00:00 2001 From: Asa Baylus Date: Sun, 3 May 2026 13:03:00 -0400 Subject: [PATCH 1/6] feat(assess): add Agent Maturity Assessment as a first-class command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds `teamhero assess` (CLI) and a new TUI subcommand alongside report / setup / doctor. Scores an engineering org against the 12-criterion Agent Maturity Assessment (4 weighted categories: engineering basics, knowledge & context 1.5×, AI governance & quality 1.25×, hiring), producing a weighted percentage, a raw /12, item-level evidence, top-3 fixes, strengths, and a maturity band marker. Pipeline: preflight (gh / GitHub MCP / git-only) → adjacent-repo detection → Phase-1 interview (7 questions, asked one at a time over a bidirectional JSON-lines protocol) → 12 deterministic evidence collectors → AI scoring (OpenAI Responses API + strict JSON schema, with tier-3 caps enforced post-hoc on items 2/3/9/11) → audit markdown matching the canonical template + sibling .json artifact + docs/audits/CONFIG.md round-trip. The TUI flow uses the same visual design as `teamhero report`: two-pane Bubble Tea progress display with monotonic progress bar, spinner-driven step list (✔/✖/○ icons), right-side configuration summary with AI/dry-run badge, and a tabbed Glamour-rendered preview (Audit / Evidence / JSON Data) that mirrors the report preview. Scope is configurable: local repo, GitHub org, or both. Headless mode accepts a JSON file of pre-supplied interview answers; interactive mode round-trips each question through huh forms one at a time. Rubric is hardcoded in src/services/maturity/rubric.ts (RUBRIC_VERSION participates in the cache key) so the binary doesn't depend on external skill files at runtime. 
Tests: 81 TS specs (rubric, scoring, interview, audit-writer, audit-store, evidence-collectors, adjacent-repos, maturity-prompts, stdin-interview, end-to-end dry-run) + Go specs for the TUI (progress state machine, summary panel, preview tab bar, config round-trip, runner glue). All passing. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.schema | 8 + CLAUDE.md | 23 + README.md | 99 +++ .../skills/agent-maturity-assessment/SKILL.md | 145 ++++ ...26-05-03-agent-maturity-assessment-plan.md | 204 +++++ docs/maturity-skill-ref/SKILL.md | 105 +++ .../maturity-skill-ref/references/criteria.md | 203 +++++ .../references/interview.md | 108 +++ .../references/output-template.md | 102 +++ .../references/preflight.md | 107 +++ justfile | 4 + scripts/run-assess.ts | 153 ++++ src/cli/index.ts | 18 +- src/core/types.ts | 41 + src/services/maturity/adjacent-repos.ts | 110 +++ src/services/maturity/ai-scorer.ts | 212 +++++ src/services/maturity/audit-store.ts | 134 ++++ src/services/maturity/audit-writer.ts | 223 ++++++ src/services/maturity/evidence-collectors.ts | 747 ++++++++++++++++++ src/services/maturity/fs-utils.ts | 122 +++ src/services/maturity/interview.ts | 122 +++ src/services/maturity/maturity-prompts.ts | 176 +++++ src/services/maturity/maturity.service.ts | 206 +++++ src/services/maturity/preflight.ts | 58 ++ src/services/maturity/rubric.ts | 331 ++++++++ src/services/maturity/scoring.ts | 123 +++ src/services/maturity/stdin-interview.ts | 135 ++++ src/services/maturity/types.ts | 193 +++++ tests/fixtures/maturity/teamhero-answers.json | 9 + tests/integration/maturity-end-to-end.spec.ts | 45 ++ .../services/maturity/adjacent-repos.spec.ts | 25 + .../services/maturity/audit-store.spec.ts | 80 ++ .../services/maturity/audit-writer.spec.ts | 185 +++++ .../maturity/evidence-collectors.spec.ts | 86 ++ .../unit/services/maturity/interview.spec.ts | 63 ++ .../maturity/maturity-prompts.spec.ts | 77 ++ tests/unit/services/maturity/rubric.spec.ts | 83 ++ 
tests/unit/services/maturity/scoring.spec.ts | 138 ++++ .../services/maturity/stdin-interview.spec.ts | 154 ++++ tui/assess.go | 146 ++++ tui/assess_config.go | 78 ++ tui/assess_config_test.go | 137 ++++ tui/assess_flags.go | 120 +++ tui/assess_preview.go | 366 +++++++++ tui/assess_preview_test.go | 100 +++ tui/assess_progress.go | 581 ++++++++++++++ tui/assess_progress_test.go | 205 +++++ tui/assess_protocol.go | 34 + tui/assess_runner.go | 166 ++++ tui/assess_runner_test.go | 56 ++ tui/assess_summary.go | 172 ++++ tui/assess_summary_test.go | 137 ++++ tui/assess_wizard.go | 252 ++++++ tui/main.go | 15 +- tui/protocol.go | 8 + 55 files changed, 7727 insertions(+), 3 deletions(-) create mode 100644 claude-plugin/skills/agent-maturity-assessment/SKILL.md create mode 100644 docs/2026-05-03-agent-maturity-assessment-plan.md create mode 100644 docs/maturity-skill-ref/SKILL.md create mode 100644 docs/maturity-skill-ref/references/criteria.md create mode 100644 docs/maturity-skill-ref/references/interview.md create mode 100644 docs/maturity-skill-ref/references/output-template.md create mode 100644 docs/maturity-skill-ref/references/preflight.md create mode 100644 scripts/run-assess.ts create mode 100644 src/services/maturity/adjacent-repos.ts create mode 100644 src/services/maturity/ai-scorer.ts create mode 100644 src/services/maturity/audit-store.ts create mode 100644 src/services/maturity/audit-writer.ts create mode 100644 src/services/maturity/evidence-collectors.ts create mode 100644 src/services/maturity/fs-utils.ts create mode 100644 src/services/maturity/interview.ts create mode 100644 src/services/maturity/maturity-prompts.ts create mode 100644 src/services/maturity/maturity.service.ts create mode 100644 src/services/maturity/preflight.ts create mode 100644 src/services/maturity/rubric.ts create mode 100644 src/services/maturity/scoring.ts create mode 100644 src/services/maturity/stdin-interview.ts create mode 100644 src/services/maturity/types.ts create mode 
100644 tests/fixtures/maturity/teamhero-answers.json create mode 100644 tests/integration/maturity-end-to-end.spec.ts create mode 100644 tests/unit/services/maturity/adjacent-repos.spec.ts create mode 100644 tests/unit/services/maturity/audit-store.spec.ts create mode 100644 tests/unit/services/maturity/audit-writer.spec.ts create mode 100644 tests/unit/services/maturity/evidence-collectors.spec.ts create mode 100644 tests/unit/services/maturity/interview.spec.ts create mode 100644 tests/unit/services/maturity/maturity-prompts.spec.ts create mode 100644 tests/unit/services/maturity/rubric.spec.ts create mode 100644 tests/unit/services/maturity/scoring.spec.ts create mode 100644 tests/unit/services/maturity/stdin-interview.spec.ts create mode 100644 tui/assess.go create mode 100644 tui/assess_config.go create mode 100644 tui/assess_config_test.go create mode 100644 tui/assess_flags.go create mode 100644 tui/assess_preview.go create mode 100644 tui/assess_preview_test.go create mode 100644 tui/assess_progress.go create mode 100644 tui/assess_progress_test.go create mode 100644 tui/assess_protocol.go create mode 100644 tui/assess_runner.go create mode 100644 tui/assess_runner_test.go create mode 100644 tui/assess_summary.go create mode 100644 tui/assess_summary_test.go create mode 100644 tui/assess_wizard.go diff --git a/.env.schema b/.env.schema index e1cbaf5..fdfd91c 100644 --- a/.env.schema +++ b/.env.schema @@ -42,6 +42,9 @@ VISIBLE_WINS_AI_MODEL= AI_DISCREPANCY_ANALYSIS_MODEL= # @type=string AI_TECHNICAL_WINS_MODEL= +# Override AI model for the Agent Maturity Assessment scorer (falls back to AI_MODEL). +# @type=string +MATURITY_AI_MODEL= # Custom OpenAI-compatible endpoint # @sensitive @type=url @@ -130,6 +133,11 @@ TECHNICAL_WINS_AUDIENCE= # @type=string TEAMHERO_TUI_PATH= +# Set to "1" when a GitHub MCP server is connected so `teamhero assess` chooses +# Tier 2 evidence fidelity instead of falling back to git-only. 
+# @type=enum(1,) +TEAMHERO_GITHUB_MCP= + # Override GitHub OAuth App client ID (development/testing only) # @type=string GITHUB_OAUTH_CLIENT_ID= diff --git a/CLAUDE.md b/CLAUDE.md index 6b25093..b65ee98 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -103,6 +103,29 @@ teamhero report --headless --foreground --flush-cache loc # Force re-fetch LOC - Scan for leaked secrets: `npx varlock scan` (or `npx varlock scan --staged` in pre-commit) - The `.env.schema` is safe for AI tools — it contains types and descriptions but never secret values +## Maturity Assessment (`teamhero assess`) + +- The 12-item rubric is **hardcoded** in `src/services/maturity/rubric.ts`. + `RUBRIC_VERSION` is part of the cache key — bump it when rubric text or + scoring math changes so audits don't surface stale results. +- The 7 Phase-1 interview questions in `src/services/maturity/interview.ts` + are **verbatim from references/interview.md** — the wording is calibrated, + do not paraphrase. The skill rule "ask one question at a time, wait for + the answer" is enforced by the bidirectional JSON-lines protocol — don't + batch them. +- Tier-3 (git-only) audits **must cap items 2, 3, 9, 11 at 0.5** even when the + AI awards 1.0. `ai-scorer.ts::applyTier3Caps` enforces this post-hoc. +- `scripts/run-assess.ts` is bidirectional: stdin stays open after the initial + config line so the Go TUI can write `interview-answer` events. + `RunAssessServiceRunner` in `tui/assess_runner.go` keeps the stdin pipe + open intentionally — do not close it. +- The `MaturityProvider`, `InterviewTransport`, `AuditStore` ports live in + `src/core/types.ts` (same rule as every other port). Concrete value types + live in `src/services/maturity/types.ts`. +- `docs/maturity-skill-ref/` is a **reference copy** of the upstream skill + (extracted from the original zip) — kept for human readers. The canonical + rubric is the TS code, not these files. 
+ ## Landing Changes - Always use `/land` to commit, push, and open PRs — never do these steps manually. diff --git a/README.md b/README.md index 61c64e6..d6587ca 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,104 @@ Or set `OPENAI_SERVICE_TIER=flex` in `~/.config/teamhero/.env`. --- +## Run a maturity assessment + +Score an engineering organization (or a single repo) against the 12-criterion +**Agent Maturity Assessment** — reproducible dev environments, integration +cadence, testability, observability, design discipline, deep modules, +repo-local agent context, sanctioned AI tooling, human review, evals, +blast-radius controls, and judgment under AI augmentation. + +The audit produces a weighted percentage, a raw `/12` score, item-level +evidence sentences, the top-3 fixes, and strengths to preserve. Output lands +in the current directory as `teamhero-maturity--.md` plus a +JSON sidecar with the full data. + +**Bands:** **Excellent** (90%+) · **Healthy** (75–89%) · **Functional but +slow** (60–74%) · **Significant dysfunction** (40–59%) · **Triage** (<40%). + +### Interactive TUI + +```bash +teamhero assess +``` + +The wizard asks for scope (local repo / GitHub org / both), then walks you +through the 7 Phase-1 interview questions one at a time (AI tooling, hiring, +DORA visibility, design discipline, evals, blast-radius red-teaming, adjacent +repos). Each question has a small set of pre-written answer options plus a +free-text "Other" choice; "I don't know" maps the linked criterion to `n/a`. + +### Headless / scripted + +```bash +# Audit the current repo (no interview — uses CONFIG.md or "unknown") +teamhero assess --headless --path . + +# Audit with pre-supplied interview answers +teamhero assess --headless --path . \ + --interview-answers ./answers.json + +# Org-wide audit +teamhero assess --headless --target-org acme \ + --interview-answers ./answers.json + +# Smoke test without an OpenAI call (placeholder scores) +teamhero assess --headless --path . 
--dry-run +``` + +`answers.json` shape — keys map to question IDs, value is verbatim text or +`"unknown"`: + +```json +{ + "q1": "Company-paid Claude with policy", + "q2": "AI allowed; interviewers trained", + "q3": "DORA via Grafana", + "q4": "Consistent ADR step before agent code", + "q5": "LLMs in dev loop, retro-tracked", + "q6": "unknown", + "q7": "No" +} +``` + +### Useful flags + +| Flag | Purpose | +|------|---------| +| `--scope-mode {org\|local-repo\|both}` | Override scope (auto-inferred from other flags) | +| `--evidence-tier {auto\|gh\|github-mcp\|git-only}` | Pin the evidence tier; default auto-detects | +| `--audit-output <path>` | Override the markdown output path | +| `--audit-output-format {markdown\|json\|both}` | Default: `both` | +| `--dry-run` | Skip the AI scorer; emit a placeholder audit | +| `--show-assess-config` | Print saved configuration as JSON and exit | + +Run `teamhero assess --help` for the full list. + +### How the score is built + +1. **Preflight** — auto-detects evidence tier (`gh` CLI authed → Tier 1, + GitHub MCP available → Tier 2, otherwise → Tier 3 git+filesystem only). +2. **Adjacent repos** — scans the local repo for workflow `uses:`, Terraform + module sources, submodules, and README cross-refs to find sibling repos + that should be in scope. +3. **Interview** — captures the 7 Phase-1 answers (interactively, from + `--interview-answers`, or from `docs/audits/CONFIG.md` if it exists in + the repo). Persists the confirmed answers back to `CONFIG.md` after + every successful run so re-audits can confirm-or-refresh. +4. **Evidence** — 12 deterministic detectors run against the local repo + (test files, CI workflows, dependency manifests, ADRs, agent context + files, CODEOWNERS, OIDC vs. long-lived secrets, Terraform IaC, etc.). +5. **AI scoring** — OpenAI Responses API with a strict JSON schema returns + per-item scores, ≤25-word evidence sentences, top-3 fixes, and + strengths. 
Tier-3 audits cap items 2/3/9/11 at 0.5 because the + GitHub-side evidence isn't observable. +6. **Output** — markdown rendered against the canonical template + + matching `.json` with the full artifact (rubric version, evidence + facts, category subtotals). + +--- + ## Learn more - [Configuration Reference](docs/CONFIG_FORMAT.md) — all settings, credentials, and user identity mapping @@ -153,6 +251,7 @@ just # List all available recipes | `just test-all` | Run all tests (TypeScript + Go) | | `just lint` | Format and lint (Biome) | | `just report` | Run a report | +| `just assess` | Run a maturity assessment | | `just reset` | Clean all build artifacts | ### Secure credential setup with varlock diff --git a/claude-plugin/skills/agent-maturity-assessment/SKILL.md b/claude-plugin/skills/agent-maturity-assessment/SKILL.md new file mode 100644 index 0000000..e0ccd30 --- /dev/null +++ b/claude-plugin/skills/agent-maturity-assessment/SKILL.md @@ -0,0 +1,145 @@ +--- +name: agent-maturity-assessment +description: Run the Agent Maturity Assessment via Team Hero — a 12-criterion diagnostic for engineering organization readiness in the AI-agentic coding era. Items score 0/0.5/1 across four weighted categories (engineering basics 1.0×, knowledge & context 1.5×, AI governance & quality 1.25×, hiring 1.0×), producing a weighted percentage and a raw /12. Use whenever the user wants to audit, diagnose, or score an engineering organization, team, repo, or recently acquired company for AI readiness. Trigger on phrases like "agent maturity", "agent readiness", "AI maturity", "engineering org health", "engineering maturity", "audit the team", "score this repo", "diagnose dev experience", "is this team ready for AI", "is this team modern", "how healthy is this org", or any onboarding-era assessment. Produces a scored audit with item-level evidence, category subtotals, weighted overall score, top fixes, and strengths to preserve. 
+--- + +# Run an Agent Maturity Assessment via Team Hero + +Team Hero ships a first-class implementation of the Agent Maturity Assessment. +It scores a 12-criterion diagnostic across four weighted categories using a +hybrid pipeline: deterministic detectors gather evidence from the local repo / +GitHub / Asana, a Phase-1 interview captures the org-level signals that aren't +visible in code, and an AI scorer (OpenAI Responses API + strict JSON schema) +produces the final scores, evidence sentences (≤25 words each), top-3 fixes, +and strengths to preserve. + +## Detect runtime mode + +1. **Binary mode (preferred when available)** — If `teamhero` (or `teamhero-tui`) + is installed and the user has `OPENAI_API_KEY` configured, use Path A. +2. **Pure-Claude fallback** — Otherwise fall back to the standalone + `anthropic-skills:agent-maturity-assessment` skill if available, or to + running the rubric manually using the references below. + +```bash +teamhero --version 2>/dev/null || teamhero-tui --version 2>/dev/null +``` + +## Path A — Team Hero binary mode + +### Step 1: Ensure credentials + +```bash +teamhero doctor # confirms ~/.config/teamhero/.env is healthy +``` + +If `OPENAI_API_KEY` is missing, ask the user to run `teamhero setup` (or write +the key into `~/.config/teamhero/.env`). + +### Step 2: Pick the scope + +Ask the user one of: +- A **local repo path** they want audited (default: `cwd`). +- A **GitHub org** name — for an org-wide audit. +- **Both** — when the user wants to assess an org and a representative checkout. + +### Step 3: Run the assessment + +Headless invocation (preferred when running on behalf of the user): + +```bash +# Local repo audit, no interview, dry-run for a quick smoke test +teamhero assess --headless --path . --dry-run + +# Real audit against a local repo with interview answers in a JSON file +teamhero assess --headless --path . 
\ + --interview-answers /path/to/answers.json \ + --audit-output ./audit.md + +# Org-wide audit +teamhero assess --headless --target-org acme \ + --interview-answers /path/to/answers.json +``` + +Interactive (the user walks through scope + the 7 Phase-1 questions one at a +time in the TUI): + +```bash +teamhero assess +``` + +### Step 4: Surface the result + +The runner emits two files: +- `.md` — full audit using the canonical template (per-category + tables, summary, maturity-scale row marker, top-3 fixes, strengths, + adjacent repos consulted, notes for re-audit). +- `.json` — full data including item scores, evidence facts, + rubric version, and tier. + +Read the markdown back to the user — do not just say "done." Highlight the +band (Excellent / Healthy / Functional but slow / Significant dysfunction / +Triage), the weighted percentage, and the top-3 fixes. + +### Notes on the interview + +Phase-1 has 7 questions about org-level facts the repo can't answer (AI +tooling, hiring, DORA visibility, design discipline, evals, blast-radius +red-teaming, adjacent repos). The skill's invariant is **one question at a +time** — do not pre-answer or batch them. In `--headless` mode, supply +`--interview-answers ` with shape: + +```json +{ + "q1": "Company-paid Claude with policy", + "q2": "AI allowed in interviews, interviewers trained", + "q3": "DORA tracked via Grafana", + "q4": "Consistent ADR step before agent code", + "q5": "LLMs in dev loop, tracked in retro metrics", + "q6": "Worst-case red-teamed, rollbacks documented", + "q7": "unknown" +} +``` + +Use `"unknown"` (or `"I don't know"`) to mark a question as unanswered — the +linked criterion will be scored `n/a` and excluded from numerator and max. + +### Tier behavior + +The runner auto-detects the evidence tier (`gh` CLI authenticated → Tier 1, +GitHub MCP available → Tier 2, git+filesystem only → Tier 3). 
At Tier 3, +items 2, 3, 9, and 11 are capped at 0.5 because GitHub-side evidence is +needed to award 1.0 confidently. + +Override with `--evidence-tier {auto|gh|github-mcp|git-only}` when needed. + +## Path B — Pure-Claude fallback + +If the binary is not available, defer to the standalone skill bundle (e.g., +`anthropic-skills:agent-maturity-assessment`) which contains the same rubric, +interview, output template, and preflight references but runs entirely from +within Claude. + +## Reference: the rubric + +The Team Hero implementation hardcodes the rubric at +`src/services/maturity/rubric.ts` (RUBRIC_VERSION export). The 12 items map +to 4 categories: + +| # | Item | Category | Weight | +|---|------|----------|--------| +| 1 | Reproducible dev environments | A. Engineering basics | 1.0× | +| 2 | Sub-day integration cadence with measured outcomes | A. Engineering basics | 1.0× | +| 3 | Testability and the agent inner loop | A. Engineering basics | 1.0× | +| 4 | Observability before features | A. Engineering basics | 1.0× | +| 5 | Design discipline as a first-class practice | B. Knowledge & context | 1.5× | +| 6 | Codebase composed of deep modules | B. Knowledge & context | 1.5× | +| 7 | Repo-local agent context | B. Knowledge & context | 1.5× | +| 8 | Sanctioned, governed AI tooling | C. AI governance & quality | 1.25× | +| 9 | Human review on every PR | C. AI governance & quality | 1.25× | +| 10 | Evals for AI-touched code paths | C. AI governance & quality | 1.25× | +| 11 | Blast-radius controls for agent actions | C. AI governance & quality | 1.25× | +| 12 | Interviews assess judgment under AI augmentation | D. Hiring | 1.0× | + +Maximum weighted score: 14.5. Bands: 90%+ Excellent · 75–89% Healthy · +60–74% Functional but slow · 40–59% Significant dysfunction · <40% Triage. 
diff --git a/docs/2026-05-03-agent-maturity-assessment-plan.md b/docs/2026-05-03-agent-maturity-assessment-plan.md new file mode 100644 index 0000000..7726317 --- /dev/null +++ b/docs/2026-05-03-agent-maturity-assessment-plan.md @@ -0,0 +1,204 @@ +# Plan — First-class Agent Maturity Assessment in Team Hero + +## Context + +Team Hero today produces a weekly developer-contribution **report**. We're adding a sibling deliverable: an **Agent Maturity Assessment** — a 12-criterion diagnostic that scores an engineering organization for AI-agentic-coding readiness, producing a weighted % and raw /12 score, item-level evidence, top fixes, strengths, and a maturity band. + +The complete skill (rubric, interview questions, output template, preflight tier system, multi-repo handling) was extracted from `C:\Users\Asa\Desktop\agent-maturity-assessment.zip` and is the source of truth for the rubric content. Reference copies live at `docs/maturity-skill-ref/` for review during implementation; the implementation will hardcode the rubric in TS so the binary doesn't depend on those reference files at runtime. + +This is a first-class feature: a new `assess` top-level command in both the CLI (`teamhero assess …`) and the Go TUI (sibling to `report` / `setup` / `doctor`), with interactive (wizard) and headless modes, hybrid scoring (deterministic detectors + AI judgment), JSON + markdown output, caching, and an updatable on-disk audit history. + +## Scope decisions (already confirmed) + +- **Rubric source:** hardcoded in TS (`src/services/maturity/rubric.ts`) — single source of truth, versioned with the code, includes a `RUBRIC_VERSION` so cached results invalidate when criteria change. +- **Inputs (all four):** GitHub org/repos (reuses Team Hero's existing GitHub fetchers), local repo path (`--path`), free-text questionnaire (the 7 Phase-1 interview questions, presented one at a time in the TUI), Asana signals (reuse existing Asana adapter for Q5 dev-loop tracking signals). 
+- **Output shape:** new top-level command. `teamhero-maturity--.md` + matching `.json`. Results are also appended to `docs/audits/CONFIG.md` (interview answers) and `docs/audits/-.md` (the audit) when run inside a repo. +- **Scoring:** hybrid. Deterministic detectors run first against the local repo + GitHub data; an AI pass (OpenAI Responses API, `text.format.json_schema` strict) takes the deterministic evidence + interview answers and produces final scores, evidence sentences (≤25 words per the template), top-3 fixes, and strengths. + +## Architecture + +The existing pattern: TS CLI → spawns Go TUI binary → Go TUI either runs interactive wizard or invokes the TS service via JSON-lines stdin/stdout. We follow that pattern exactly. + +``` +teamhero assess [flags] (TS Commander wrapper, src/cli/index.ts) + ↓ spawns +tui/teamhero-tui assess [flags] (Go subcommand, tui/main.go + tui/assess.go) + ↓ wizard or headless → marshal AssessConfig as JSON + ↓ subprocess: scripts/run-assess.ts (or compiled service binary) + ↓ JSON-lines events on stdout: progress | interview-question | interview-answer | result | error +src/services/maturity/maturity.service.ts (orchestrator) + ├── PreflightProbe (gh / GitHub MCP / git-only tier detection) + ├── EvidenceCollector (deterministic detectors per item) + ├── AdjacentRepoDetector (multi-repo scope) + ├── InterviewCoordinator (round-trips questions through TUI) + ├── AIScorer (OpenAI Responses API, strict JSON schema) + └── AuditWriter (markdown + JSON output, CONFIG.md update) +``` + +A novel piece: the **interview round-trip**. Today the TUI is upstream of the service (it spawns it). For the maturity assessment, the service needs answers from the human *during* its run — one question at a time, blocking. We add two new event types to the JSON-lines protocol: `interview-question` (service → TUI) and `interview-answer` (TUI → service), the latter sent over the existing stdin channel. 
The Go side renders each question via a `huh` prompt with the suggested option set + free-text override, then writes the answer back over stdin as a JSON line. In headless mode, the service reads pre-supplied answers from `docs/audits/CONFIG.md` (or `--interview-answers `), or marks every question `unknown` and proceeds. + +## Files to create + +### TypeScript service layer + +| File | Purpose | +|------|---------| +| `src/services/maturity/rubric.ts` | Hardcoded 12-criterion rubric (id, title, category, weight, score levels, repo checks, diagnostic commands, why-it-matters). Exports `RUBRIC_VERSION` for cache busting. | +| `src/services/maturity/interview.ts` | The 7 Phase-1 questions (verbatim), suggested option sets, criterion mapping. | +| `src/services/maturity/preflight.ts` | Tier detection (gh / GitHub MCP / git-only). Returns `EvidenceTier`. | +| `src/services/maturity/evidence-collectors.ts` | Per-item deterministic detectors. Each item has a collector that runs the diagnostic commands from `criteria.md` and emits `EvidenceFact` records. | +| `src/services/maturity/adjacent-repos.ts` | Multi-repo scope detection (parse `.github/workflows/`, `infra/`, submodules, doc references). | +| `src/services/maturity/maturity.service.ts` | Orchestrator. Composes preflight → adjacent repos → evidence collection → interview round-trip → AI scoring → write output. Mirrors `report.service.ts` shape. | +| `src/services/maturity/ai-scorer.ts` | AI integration (Responses API + strict json_schema). Builds prompt from rubric + collected evidence + interview answers. | +| `src/services/maturity/audit-writer.ts` | Renders the audit markdown using the exact output template; writes JSON sibling. Updates `docs/audits/CONFIG.md` if the run is inside a repo. | +| `src/services/maturity/maturity-prompts.ts` | The AI prompt builder + `MATURITY_ASSESSMENT_SCHEMA` (json_schema for strict mode). 
| +| `src/services/maturity/scoring.ts` | Pure scoring math: weighted sum, band classification, `n/a` handling. | +| `src/services/maturity/types.ts` | `AssessCommandInput`, `AssessResult`, `EvidenceFact`, `ItemScore`, `EvidenceTier`, `InterviewAnswer`, etc. (Note: per CLAUDE.md, *port interfaces* go in `src/core/types.ts`; concrete value types specific to this feature live here.) | +| `scripts/run-assess.ts` | Headless service runner — sibling to `scripts/run-report.ts`. Reads `AssessCommandInput` from stdin, emits JSON-lines events. | +| `tests/unit/services/maturity/*.spec.ts` | Per-module unit tests (`bun:test`, `.spec.ts`). | +| `tests/integration/maturity-end-to-end.spec.ts` | Headless run against a fixture repo, assert output structure. | +| `tests/contract/cli.assess.spec.ts` | Verify `teamhero assess` registers and forwards args to the TUI binary. | + +### Port interfaces (added to existing file) + +Add to `src/core/types.ts`: +- `MaturityProvider` — interface for an evidence collector (one per criterion). +- `InterviewTransport` — interface for asking questions (TUI-backed in normal runs, file-backed for headless). +- `AuditStore` — interface for reading/writing `docs/audits/CONFIG.md`. + +### Go TUI layer + +| File | Purpose | +|------|---------| +| `tui/assess.go` | New subcommand entrypoint: `runAssessInteractive()`, `runAssessHeadless()`, `printAssessUsage()`. | +| `tui/assess_wizard.go` | Wizard for the `assess` flow: scope picker (org / local repo / both), scope target inputs, options (date window, output path), and the interview round-trip handler. | +| `tui/assess_config.go` | `AssessConfig` struct + load/save (separate from `ReportConfig` to avoid coupling the two flows; saved at `~/.config/teamhero/assess-config.json`). | +| `tui/assess_runner.go` | Service-runner glue: marshals `AssessConfig`, spawns `scripts/run-assess.ts` (or `teamhero-service --mode=assess`), handles bidirectional JSON-lines (questions over stdin). 
| +| `tui/assess_progress.go` | Progress display for the assess run — reuses existing `progressModel` shape but with assess-specific step list. | +| `tui/assess_preview.go` | Tabbed preview of the audit output (Audit / Evidence / JSON Data tabs). Reuses Glamour like `preview.go`. | +| `tui/assess_test.go` | Subcommand routing, wizard transitions, event handling. | +| `tui/assess_runner_test.go` | Bidirectional protocol tests with a fake subprocess. | + +### Files to modify + +| File | Change | +|------|--------| +| `src/cli/index.ts` | Register `assess` subcommand (delegates to TUI binary, mirrors the `report` block at lines 146–166). Add `assess` to the `subcommands` arrays at lines 157 and 223. | +| `tui/main.go` | Add `"assess"` to the subcommand-detection block at lines 134–138. Add `case "assess":` to the help-routing switch at lines 148–158 and the dispatch switch at lines 181–195. | +| `tui/flags.go` | Add `--scope-mode` (org/local/both), `--path` (local repo path), `--target-org`, `--target-repos`, `--rubric-version` (read-only flag for diagnostics), `--interview-answers `, `--evidence-tier `, `--audit-output `. Headless flags only — interactive flow uses the wizard. | +| `tui/protocol.go` | Add `InterviewQuestionEvent` (service → TUI) and `InterviewAnswerEvent` (TUI → service). Extend `GenericEvent` with the new fields (`questionId`, `questionText`, `options`, `allowFreeText`). | +| `tui/runner.go` | Generalize `RunServiceRunner` to support bidirectional stdin (currently stdin is one-shot config JSON, then closed). Add `RunAssessServiceRunner` that keeps stdin open for answer events; or expose a shared helper. | +| `claude-plugin/skills/agent-maturity-assessment/SKILL.md` | New skill that documents how to invoke `teamhero assess` (parallel to `generate-report` and `maintenance` skills). Include both binary mode and a fallback that calls the bundled Anthropic skill if the binary isn't installed. 
| +| `docs/ARCHITECTURE.md` | Add a "Maturity Assessment" section documenting the new flow. | +| `.env.schema` | No changes needed — reuses existing `GITHUB_PERSONAL_ACCESS_TOKEN`, `OPENAI_API_KEY`, `ASANA_API_TOKEN`. | +| `justfile` | Add `just assess ` recipe for convenience. | +| `README.md` | Add a short "Run a maturity assessment" section. | + +## Existing utilities to reuse (do not recreate) + +- `src/lib/env.ts::getEnv()` — credential lookup. Per CLAUDE.md, never use `process.env` directly. +- `src/lib/octokit.ts::loadOctokitFromEnv()` — GitHub client (Tier 1 evidence). +- `src/services/asana.service.ts` — Asana data for Q5 dev-loop signals. +- `src/lib/paths.ts::cacheDir()`, `configDir()` — XDG-compliant paths. +- `src/lib/date-utils.ts` — date boundary handling (audits don't strictly need a date window but the cache key benefits). +- `src/adapters/cache/` — existing `FileSystemCacheStore` pattern. New namespace: `~/.cache/teamhero/data-cache/maturity-assessment/`. Cache key includes `RUBRIC_VERSION`, scope (org+repos+path), evidence tier, and interview-answers-hash so changes invalidate. +- `src/lib/json-lines-progress.ts` — JSON-lines emit helpers; extend with `interview-question` event. +- `tui/progress.go` — `progressModel` Bubble Tea integration; `assess_progress.go` reuses the pattern. +- `tui/preview.go` — Glamour markdown preview; `assess_preview.go` mirrors it. +- `tui/forms.go` — `huh` form helpers (`boolSelect`, `validateDate`, `splitCSV`). +- `src/services/ai.service.ts` — OpenAI Responses API call pattern; the new `ai-scorer.ts` follows the same shape (cache the call, log to `ai-batches.log`, use `text.format.json_schema` with `strict: true`). +- `src/lib/renderer-registry.ts` — pattern reference only (we don't register the audit as a report renderer; the audit writer is standalone). +- `tests/helpers/mocked.ts` — test utility for `mock.module()` setups. + +## Detailed flow + +### 1. CLI invocation +`teamhero assess --target-org acme --path . 
--until 2026-05-03 --headless` → `src/cli/index.ts` spawns `tui/teamhero-tui assess [flags]`. + +### 2. Go TUI dispatch +`main.go` detects `"assess"` subcommand, parses flags via `flags.go`. If `isHeadless()` → `runAssessHeadless()`; else → `runAssessInteractive()` which runs the wizard, then calls `runAssessHeadless()` with the wizard's config. + +### 3. Wizard (interactive only) +`assess_wizard.go` collects: +1. **Scope mode** — Org / local repo / both. (`huh.Select`) +2. **Org name + repo list** if org-mode. Reuses the existing scope-discovery flow (`tui/discover.go`) — same shape as the report wizard. +3. **Local repo path** if local-mode. Defaults to `cwd`. Validate that it's a git repo. +4. **Audit output path** — defaults to `./teamhero-maturity-{scope}-{YYYY-MM-DD}.md`. +5. **Confirmation screen** showing planned scope + adjacent repos detected. +The wizard does **not** ask interview questions yet — those happen during the run, after preflight, so the user sees them in context with progress feedback. + +### 4. Subprocess launch & preflight +`assess_runner.go` marshals `AssessConfig` as JSON, spawns `scripts/run-assess.ts` (or compiled service), pipes stdin/stdout. The service: +1. Loads `getEnv()` credentials. +2. Runs `preflight.ts::detectTier()` — checks `gh auth status`, MCP availability (env-var hint: `TEAMHERO_GITHUB_MCP=1`), or falls back to git-only. +3. Emits `progress` event: `{ step: "preflight", status: "complete", message: "Tier 1 (gh)" }`. +4. Reads `docs/audits/CONFIG.md` if running against a local repo to seed prior interview answers. + +### 5. Adjacent repo detection +`adjacent-repos.ts` runs the four detection greps from `preflight.md` against the local checkout (or shallow-clones the primary repo if org-mode without `--path`). Surfaces a list to the TUI as a `progress` event with the repo names for transparency. + +### 6. Interview round-trip +For each Phase-1 question that doesn't have a fresh answer in CONFIG.md: +1. 
Service emits `interview-question` event with the question text + suggested options. +2. Go TUI pauses the progress display, renders a `huh.Select` (with free-text "Other" option), captures the answer. +3. Go TUI writes `{"type":"interview-answer","questionId":"q1","value":"…"}` to the subprocess stdin. +4. Service receives, validates, persists in memory, advances to next question. +5. Headless mode: service reads `--interview-answers` JSON file, falls back to `unknown` per the rules in `interview.md`. + +### 7. Evidence collection +`evidence-collectors.ts` runs each criterion's deterministic detector (12 collectors). Each emits a `progress` event with the item id and a structured `EvidenceFact[]` payload. Tier-3 (git-only) collectors cap items #2, #3, #9, #11 at 0.5 per the preflight rules. + +Asana-backed Q5 signals: if `ASANA_API_TOKEN` is present and the user said "tracked in Asana" in Q5, the collector queries the existing `AsanaService` for AI-related task labels/projects. + +### 8. AI scoring +`ai-scorer.ts` builds a single Responses-API call: rubric (full criterion text + score levels), evidence per item, interview answers, scope description. Uses `text.format.json_schema` with `strict: true`. Schema: `{ items: ItemScore[12], topFixes: Fix[3], strengths: string[], oneLineTake: string, notesForReaudit: string[] }` where `ItemScore = { id, score: 0|0.5|1|"n/a", whyThisScore: string }` and the prompt enforces `whyThisScore ≤ 25 words, single sentence`. + +A second sanity-check pass runs `scoring.ts::computeWeightedScore()` on the AI's per-item scores to compute the weighted % and band — we don't trust the AI for arithmetic. + +### 9. Audit output +`audit-writer.ts` renders the markdown using the exact template from `output-template.md` (table per category, summary, maturity-scale row marker, top fixes, strengths, adjacent repos consulted, notes for re-audit). 
Writes: +- `{output}.md` +- `{output}.json` (full data: scope, tier, rubric version, item scores, evidence, prompts hash for reproducibility) +- Updates `docs/audits/CONFIG.md` with the confirmed/updated interview answers (only when run inside a git repo). + +### 10. Caching +Cache key: `sha256(rubricVersion + scope + evidenceTier + interviewAnswersHash + sinceUntil)`. Stored at `~/.cache/teamhero/data-cache/maturity-assessment/{cache-key}.json`. `--flush-cache maturity` invalidates. The wizard offers "use cached audit" if a fresh one (≤ 7 days) exists. + +### 11. TUI preview +After the run, `assess_preview.go` opens a tabbed Glamour preview: **Audit** (rendered markdown), **Evidence** (per-item raw evidence JSON), **JSON Data** (full report). The user can press `q` to quit or `e` to open in `$EDITOR`. + +## Verification + +1. **Unit tests** — `just test tests/unit/services/maturity/` should pass; each scoring/collector module has its own `.spec.ts` with golden-file fixtures for representative repos (a high-maturity sample, a low-maturity sample, a tier-3 sample). +2. **Go unit tests** — `cd tui && go test ./...` should pass; `assess_test.go` covers subcommand routing, wizard state machine, interview round-trip protocol against a fake subprocess. +3. **Integration test** — `tests/integration/maturity-end-to-end.spec.ts` runs `scripts/run-assess.ts` with a stubbed AI client (returns a fixed schema-valid response), against a fixture repo, asserts the output markdown matches a golden file (modulo dates). +4. **Live smoke test** — Run against this repo: + ```bash + just build-all + teamhero assess --headless --path . --interview-answers tests/fixtures/maturity/teamhero-answers.json --no-confirm + ``` + Expect a `teamhero-maturity-teamhero-cli-2026-05-03.md` file in cwd, weighted score in the "Healthy" band, item #7 scoring 1.0 (this repo has CLAUDE.md / AGENTS.md). +5. **Live interactive smoke** — `teamhero assess` (no flags), walk through the wizard, see the 7 questions appear one at a time with `huh` UI. 
+6. **Coverage** — TS coverage thresholds (85% lines/funcs/stmts, 80% branches per CLAUDE.md) hold for the new `src/services/maturity/` directory; Go coverage stays ≥ 85% for `tui/`. +7. **Lint + security** — `bun run lint` clean, `npx varlock scan` clean. +8. **Docs** — `docs/maturity-skill-ref/` is a build-time reference only. Either remove it from git once the rubric is encoded into `rubric.ts`, or keep it and add a `docs/maturity-skill-ref/README.md` stating that `rubric.ts` is the canonical version and that the original SKILL.md/criteria.md/etc. are retained for human readers. + +## Sequencing (suggested implementation order) + +1. **TS scaffolding** — `rubric.ts`, `types.ts`, `scoring.ts` (pure, easy to test, small). +2. **Audit writer** — `audit-writer.ts` driven by hand-built fixture data; iterate until output matches the template byte-for-byte. +3. **Evidence collectors** — start with the easy 4 (items 1, 7 — repo-file presence; items 5, 6 — file globs). Add gh-based collectors (2, 3, 9, 11) once tier-1 plumbing is in. +4. **Preflight + adjacent repos** — small, isolated. +5. **AI scorer + prompts** — Responses API integration with strict JSON schema; cache + log. +6. **Service runner** — `scripts/run-assess.ts` glues 1–5 together with JSON-lines emit. +7. **Go TUI subcommand** — `assess.go`, `assess_config.go`, headless mode first. +8. **Bidirectional protocol** — `protocol.go` event types, `runner.go` stdin extension, integration test. +9. **Wizard + preview** — `assess_wizard.go`, `assess_preview.go` last (smallest user-facing surface). +10. **Skill + docs + justfile + README**. + +## Open questions / risks + +- **Bidirectional stdin** is the biggest unknown — the existing `RunServiceRunner` closes stdin after sending config. Option A: keep stdin open for the lifetime of the subprocess (preferred). Option B: use a named pipe / Unix socket (more code). I'll prototype A first; if Bun's stdin handling causes issues we can fall back to B. 
+- **Anthropic skill conflict** — there's a global `anthropic-skills:agent-maturity-assessment` skill that does the same thing in pure-Claude mode (no binary). The new `claude-plugin/skills/agent-maturity-assessment/SKILL.md` should mention both paths so users in Co-Work / Claude Code without the Team Hero binary still get a working assessment via the Anthropic skill. +- **AI determinism** — even with strict schema, score values can drift between runs. The cache hash includes inputs, so re-running with same inputs returns identical results. For "movement matters more than absolute level" (per the skill), this is acceptable. +- **Local-repo path semantics** — when `--path .` is used inside this very repo, does the assessment count `docs/maturity-skill-ref/` as evidence of item #7 (repo-local agent context)? Probably yes — the rubric says "skill files checked into the repo" qualifies. We'll let the AI judge based on the evidence. diff --git a/docs/maturity-skill-ref/SKILL.md b/docs/maturity-skill-ref/SKILL.md new file mode 100644 index 0000000..e4e8e3d --- /dev/null +++ b/docs/maturity-skill-ref/SKILL.md @@ -0,0 +1,105 @@ +--- +name: agent-maturity-assessment +description: Run the Agent Maturity Assessment — a 12-criterion diagnostic for engineering organization readiness in the AI-agentic coding era. Items score 0/0.5/1 across four weighted categories (engineering basics 1.0×, knowledge & context 1.5×, AI governance & quality 1.25×, hiring 1.0×), producing a weighted percentage and a raw /12. Use whenever the user wants to audit, diagnose, or score an engineering organization, team, repo, or recently acquired company for AI readiness. 
Trigger on phrases like "agent maturity", "agent readiness", "AI maturity", "engineering org health", "engineering maturity", "audit the team", "score this repo", "diagnose dev experience", "is this team ready for AI", "is this team modern", "how healthy is this org", or any onboarding-era assessment — even when the user doesn't say "skill". Produces a scored audit with item-level evidence, category subtotals, weighted overall score, top fixes, and strengths to preserve. +--- + +# Agent Maturity Assessment + +A diagnostic for engineering organization health in the AI-agentic coding era. The question this assessment answers: **is this org capable of shipping safely with humans and agents working in parallel, on a codebase that doesn't degrade with every iteration?** + +This skill owns the criteria, the scoring rubric, and the audit output format. It runs against either a whole organization or a specific scope (team, product line, repo). + +## When to use + +- **One-shot audit**: assess an organization's current state during onboarding, or a specific team / repo / acquired company. +- **Recurring**: re-run quarterly against the same org to track movement, or against new sub-teams as they form or get acquired. +- **Spot-check**: a single repo or service can be scored against just the items that apply (note which items were skipped and why). + +The artifact is the deliverable. Always produce the written audit using the template in `references/output-template.md` — never just give a verbal summary. + +## The 12 criteria at a glance + +|# |Item |Category |Weight| +|--|--------------------------------------------------|--------------------------|------| +|1 |Reproducible dev environments |A. Engineering basics |1.0× | +|2 |Sub-day integration cadence with measured outcomes|A. Engineering basics |1.0× | +|3 |Testability and the agent inner loop |A. Engineering basics |1.0× | +|4 |Observability before features |A. 
Engineering basics |1.0× | +|5 |Design discipline as a first-class practice |B. Knowledge & context |1.5× | +|6 |Codebase composed of deep modules |B. Knowledge & context |1.5× | +|7 |Repo-local agent context |B. Knowledge & context |1.5× | +|8 |Sanctioned, governed AI tooling |C. AI governance & quality|1.25× | +|9 |Human review on every PR |C. AI governance & quality|1.25× | +|10|Evals for AI-touched code paths |C. AI governance & quality|1.25× | +|11|Blast-radius controls for agent actions |C. AI governance & quality|1.25× | +|12|Interviews assess judgment under AI augmentation |D. Hiring |1.0× | + +Each item scores **1.0** (pass), **0.5** (partial), or **0.0** (fail). Be conservative: if it's not visibly true, it's 0.5. If there's no evidence at all, it's 0. + +**For full score levels, repo checks, and diagnostic commands per item, read `references/criteria.md`.** + +Category B is weighted highest because it compounds — a team that gets B right tends to fix everything else. + +## How to run an audit + +1. **Decide scope.** Whole org, one product line, one repo, or one team. Score the appropriate level — don't average across heterogeneous teams. A 14-person backend team and a 3-person ML team should be scored separately. +2. **Environment preflight.** Read `references/preflight.md`. Probe for `gh` CLI / GitHub MCP / git access and select an evidence-fidelity tier before running any diagnostics. **Always announce the tier you're running at** so the audit is reproducible. +3. **Phase 1 — Org-level interview.** Read `references/interview.md` first. Read `docs/audits/CONFIG.md` for stored answers, present them for confirmation or refresh, ask fresh for any missing. Do this before evidence gathering so the answers can inform scoring on items 2, 5, 8, 10, 11, 12. **Critical:** ask one question at a time and wait for the answer before asking the next — even in auto / autonomous modes. 
Use the structured question UI (e.g., `AskUserQuestion`) when available with the option sets in `references/interview.md`. Dumping all 7 questions in one message and proceeding without answers produces a hollow audit; treat each question as a hard checkpoint. +4. **Map adjacent repos.** Read `references/preflight.md` (multi-repo section). CI templates, Terraform modules, QA suites, runbooks, and shared agent context often live in sibling repos. Capture the list before scoring; merge in any out-of-band repos surfaced by Phase 1 question 7. +5. **Gather evidence per item.** Don't take anyone's word for it. For each item, do at least one of: read the repo (and its adjacents), run the diagnostic commands listed in `references/criteria.md` at the highest fidelity tier available, ask a non-leadership IC the diagnostic question, or check the relevant dashboard/settings page. Combine repo evidence with Phase 1 answers using the mapping table in `references/interview.md`. +6. **Score conservatively.** When in doubt, 0.5. Revise up next quarter if evidence appears. If a Phase 1 answer was "I don't know", score that item `n/a` — never `0`. +7. **Write the audit** using the template in `references/output-template.md`. The artifact is the deliverable. Each "Why this score" cell is one sentence, ≤ 25 words. +8. **Update CONFIG.md** with confirmed/updated Phase 1 answers and today's date (see `references/interview.md` for format). +9. **Decide on distribution.** First audit at a new role is usually best kept internal until the calibration has been validated. Re-run in 90 days. + +## Scoring + +**Raw score**: sum of all 12 item scores. Max 12. 
+ +**Weighted score** (recommended primary metric): + +``` +A_total = sum(items 1–4) × 1.00 // max 4.00 +B_total = sum(items 5–7) × 1.50 // max 4.50 +C_total = sum(items 8–11) × 1.25 // max 5.00 +D_total = sum(item 12) × 1.00 // max 1.00 + ────────── +weighted = A + B + C + D +max = 14.50 +score% = (weighted / 14.50) × 100 +``` + +If any item is scored `n/a`, drop it from both numerator and max for that audit and note it in the Summary. + +**Bands**: + +|Band |Score %|Interpretation | +|-----------------------|-------|---------------------------------------------------------------------------------------| +|Excellent |90%+ |Genuinely rare. Confirm with a second pass — first audits often score too generously. | +|Healthy |75–89% |Targeted fixes will compound. | +|Functional but slow |60–74% |Real risk of being out-shipped by AI-native competitors. Where most orgs actually live.| +|Significant dysfunction|40–59% |Treat as a turnaround. | +|Triage |<40% |Stop new feature work until basics are in. | + +The bar: **<11/12 raw and <80% weighted means there's leverage to capture.** + +## Operating principles + +- **Score conservatively.** Better to score 0.5 and revise up than to over-score on day one and have to explain why everything got "worse". +- **Evidence beats assertions.** A team that says they have ADRs but the last one was committed two years ago scores 0.5, not 1.0. +- **Unknown ≠ failing.** If a criterion can't be answered from the repo and the human indicates the answer is unknown or out of scope, score it `n/a`, drop it from numerator and max, and note what would resolve it. Do not default to 0 for absence of context. +- **Don't average heterogeneous teams.** Score them separately and report side-by-side. +- **Use it as a conversation tool, not a club.** The point is to find leverage, not to grade people. +- **Re-score quarterly.** Movement matters more than absolute level. 
+- **Calibrate against itself, not against other companies.** The first audit is the baseline; trends are the signal. + +## Adapting the assessment + +As organizations mature and the AI tooling landscape shifts, expect items to be added, dropped, or re-weighted. Track changes to the assessment itself (not just individual audits) in an `audits/CHANGELOG.md` so historical scores remain interpretable. + +## Reference files + +- `references/preflight.md` — Environment preflight, evidence tiers, multi-repo scope handling, host-side probe script. +- `references/criteria.md` — Full text of all 12 criteria: score levels, repo checks, diagnostic commands, why each matters. +- `references/interview.md` — Phase 1 questions, internal Q→criterion mapping, CONFIG.md storage format. +- `references/output-template.md` — Audit output template, rules for filling it out, worked example of a "Why this score" cell. diff --git a/docs/maturity-skill-ref/references/criteria.md b/docs/maturity-skill-ref/references/criteria.md new file mode 100644 index 0000000..c6da4fb --- /dev/null +++ b/docs/maturity-skill-ref/references/criteria.md @@ -0,0 +1,203 @@ +# Criteria reference + +Full text of all 12 criteria for the Agent Maturity Assessment: score levels, repo checks, diagnostic commands, and rationale per item. Read this when gathering evidence (step 5 of *How to run an audit* in `SKILL.md`). + +Each item scores **1.0** (pass), **0.5** (partial), or **0.0** (fail). Be conservative: if it’s not visibly true, it’s 0.5. If there’s no evidence at all, it’s 0. If a criterion can’t be assessed from the repo and the user indicated unknown in Phase 1, score it `n/a` (see *Unknown ≠ failing* in `SKILL.md`). + +## Category A — Engineering basics (weight 1.0×) + +Non-negotiable foundations. Failure here multiplies risk on everything else. + +### 1. Reproducible dev environments + +- 1.0 — Clone-to-green-build in <30 min via devcontainer, Nix, or a single setup script. Same path works for an agent. 
+- 0.5 — README exists but bootstrap takes >2 hours or has known broken steps. +- 0.0 — “Ask Bob, he knows the trick.” + +**Repo check:** `.devcontainer/`, `flake.nix`, `setup.sh`, or equivalent. Run it from a clean machine. + +**Diagnostic commands:** + +- `ls .devcontainer/ flake.nix setup.sh scripts/bootstrap* 2>/dev/null` — bootstrap surface +- `time bash SETUP_SCRIPT` on a clean machine to verify the <30 min claim +- `gh repo view OWNER/REPO 2>/dev/null` for any external bootstrap repo identified during scope mapping + +**Why it matters:** Onboarding latency is the first multiplier on team velocity, and agents need bootstrappable environments too. If a human can’t get green in 30 minutes, an agent definitely can’t. + +### 2. Sub-day integration cadence with measured outcomes + +- 1.0 — Code integrates to mainline at least daily. PRs are small and merge sub-day. All four DORA metrics (deployment frequency, lead time, change-fail rate, MTTR) are tracked and visible. Branching model can be trunk-based, GitHub flow, or short-lived Git flow — what matters is the absence of long-lived branches and the presence of measured integration discipline. +- 0.5 — Some metrics tracked, but cadence is weekly, PRs sit for days, or feature branches routinely outlive a sprint. +- 0.0 — Long-lived feature branches as the norm, release trains measured in months, no metrics. + +**Repo check:** age distribution of merged PRs over the last 90 days; presence of any DORA dashboard. 
+ +**Diagnostic commands:** + +- `gh pr list --state merged --limit 200 --search "merged:>$(date -d '90 days ago' +%Y-%m-%d)" --json mergedAt,createdAt,additions,deletions,reviews,author` — cadence + lead time + PR size + review counts in one call +- `gh api "repos/{owner}/{repo}/branches?per_page=100" --paginate --jq '.[] | {name, last_commit_sha: .commit.sha}'` then resolve commit dates → branch staleness distribution +- `gh run list --workflow=deploy*.yml --limit 100 --json conclusion,createdAt,name --branch ` — deployment frequency proxy and change-fail rate (failed conclusions / total) +- For monorepos with deploys in adjacent infra/CD repos: rerun the `gh run list` against `/` + +**Combine with Phase 1 Q3** (DORA visibility): repo evidence covers cadence; the interview answer covers whether the four metrics are *actually visible to the team*. + +**Why it matters:** Integration cadence is the leading indicator of engineering performance. With agents in the loop the case is stronger — agents work fastest when changes validate against current main immediately, and long-lived branches accumulate integration debt humans have to resolve later. + +### 3. Testability and the agent inner loop + +- 1.0 — The application is *built* to be tested: real seams (DI, ports/adapters, deep modules with clean interfaces) so behaviors can be verified at module boundaries without spinning up the world. Unit tests are sub-second; the full suite runs in minutes; flaky tests are treated as bugs and fixed within a sprint. A single command runs the suite headlessly with machine-parseable output. TDD-style inner loops — write the test, make it pass, refactor — are the *default* mode of working with AI. +- 0.5 — Tests exist and mostly run, but the application has known untestable areas, the suite is slow enough to break flow, flaky tests get re-run rather than fixed, or TDD with agents is occasional rather than default. 
+- 0.0 — Manual QA, flaky-and-ignored test suite, or no seams in the application — agents can technically run `npm test` but the signal is garbage. + +**Repo check:** run the suite, time it, check failure rate over the last 50 CI runs; sample a recent feature PR and look at whether tests were written before or after the implementation. + +**Diagnostic commands:** + +- `time ` (e.g. `time pnpm test`, `time dotnet test`) — full suite duration +- `find . -name "*.test.*" -o -name "*.spec.*" -o -name "*Tests.cs" 2>/dev/null | wc -l` — test file count as a sanity floor +- `gh run list --workflow=ci.yml --limit 50 --json conclusion --jq '[.[] | .conclusion] | group_by(.) | map({status: .[0], count: length})'` — flake/fail rate +- `grep -rE "\\|\\|\\s*true|continue-on-error:\\s*true" .github/workflows/ 2>/dev/null` — CI swallowing failures (any hit = item probably 0.0 regardless of test count) +- For QA in adjacent repo (e.g. `/qa-e2e`): `gh repo view /` and inspect its CI run history the same way + +**Why it matters:** Humans can reason around bad tests (“yeah, that test is garbage, but I know the code works”). Agents can’t — they follow the signal. The test suite is the rate limit on agent throughput; agents without fast, trustworthy feedback outrun their headlights and produce thrash. + +### 4. Observability before features + +- 1.0 — Structured logs, distributed traces, error budgets defined, on-call with runbooks. New features ship instrumented. +- 0.5 — Logs and metrics exist but tracing is partial; runbooks stale. +- 0.0 — “We grep CloudWatch when something breaks.” + +**Repo check:** OTel libraries in deps, dashboards exist, error budget docs, recency of last runbook update. 
+ +**Diagnostic commands:** + +- `grep -rEh "OpenTelemetry|opentelemetry|Microsoft\\.ApplicationInsights|datadog|prometheus|grafana|loki|tempo|sentry|honeycomb|newrelic|splunk" --include="*.csproj" --include="package.json" --include="go.mod" --include="requirements*.txt" --include="Cargo.toml" --include="pom.xml" --include="build.gradle*" 2>/dev/null` — instrumentation / agent libs (Grafana itself is viz; this catches the Grafana Cloud agent, faro SDK, Loki/Tempo clients that feed it) +- `find . \( -path "*/grafana/*.json" -o -path "*/dashboards/*.json" -o -name "*.libsonnet" -o -path "*/prometheus/*.yml" -o -path "*/alerts/*.yml" \) -not -path "*/node_modules/*" 2>/dev/null` — committed Grafana dashboards, Jsonnet, Prometheus alert rules +- `find . -ipath "*runbook*" -o -ipath "*incident*" -o -ipath "*sli*" -o -ipath "*slo*" 2>/dev/null` — runbook / SLO presence +- `git log --since="180 days ago" --oneline -- docs/runbooks/ docs/ops/ 2>/dev/null | wc -l` — recency of operational docs +- For dashboards/alerts in an adjacent repo (e.g. `/observability`, `/grafana-dashboards`): rerun the dashboard-file `find` there — score across both + +**Why it matters:** You can’t fix what you can’t see. AI accelerates ship rate, which accelerates incident rate — observability is the safety net that makes acceleration survivable. + +## Category B — Knowledge & context (weight 1.5×) + +This is what’s gotten *more* important with LLMs, not less. Agents perform at the level of context the org provides them, and codebase shape determines whether agents can navigate it at all. Weighted highest because this category compounds — a team that gets B right tends to fix everything else. + +### 5. Design discipline as a first-class practice + +- 1.0 — ADRs are current and dated. ARCHITECTURE.md exists per active repo. A **ubiquitous language glossary** is checked in, referenced in agent context, and the team enforces its terms in code, docs, and conversation. 
Design happens *before* code generation: agents are pointed at planning skills (e.g., “interview-me-until-shared-understanding” patterns) that force a shared design concept before any code is written. ADR/glossary commits are visible in the last 90 days — design is an ongoing investment, not a one-time write. +- 0.5 — Some design artifacts exist but are stale; ubiquitous language is implicit (people just know the terms); planning happens informally before some agent work but not consistently. +- 0.0 — Tribal knowledge. Architecture lives in one staff engineer’s head. Agents are turned loose without shared design concept and produce confidently wrong code. + +**Repo check:** `docs/adr/`, `ARCHITECTURE.md`, glossary or ubiquitous-language file; check git log on those paths for recency; sample an agent-driven PR for evidence of upfront design vs. straight-to-code. + +**Diagnostic commands:** + +- `find . -ipath "*adr*" -name "*.md" 2>/dev/null | head; find . -iname "ARCHITECTURE.md" -o -iname "GLOSSARY.md" -o -iname "*ubiquitous*" 2>/dev/null` — design surface +- `git log --since="90 days ago" --oneline -- docs/adr/ ARCHITECTURE.md 2>/dev/null | wc -l` — ongoing investment vs. one-time write +- For ADRs in a central docs repo: `gh api "repos///contents/adr" --jq '.[].name'` + +**Combine with Phase 1 Q4** (design before code): files prove artifacts exist; the interview answer proves design happens *before* code generation in practice. + +**Why it matters:** Specs-to-code without design discipline produces software entropy — each iteration makes the codebase worse. Investing in design daily is what keeps tactical AI execution aligned with strategic intent. The ubiquitous language is the bridge between domain experts, engineers, and agents — without it, every translation step introduces drift. + +### 6. 
Codebase composed of deep modules + +- 1.0 — The codebase is structured as **deep modules**: few large modules, each with substantial functionality hidden behind a simple, stable interface. Public interfaces are small and intentional; implementations can be sizeable but encapsulated. When agents add code, they add it inside an existing deep module’s boundary or create a new module with a clear interface — they don’t sprinkle helpers across the codebase. +- 0.5 — Some areas well-modularized; others are shallow / sprinkly. Agents tend to add code in surface-level helpers rather than respecting boundaries. A handful of god-classes exist but are known and bounded. +- 0.0 — Sprawling shallow modules with leaky interfaces; 4000-line god files alongside 30-line helper files with no clear pattern. Agents can’t navigate the module map and produce code that crosses arbitrary boundaries. + +**Repo check:** file size distribution, public API surface per module, sample two random modules and see whether you can summarize each one’s purpose in a sentence; drop one into an LLM and ask it to explain. + +**Why it matters:** AI excels at filling in implementation when given a clean interface; it produces sprawl when given no constraints. Deep modules give agents the right *shape* of problem to solve. Shallow codebases compound entropy with every agent-driven change. + +### 7. Repo-local agent context + +- 1.0 — `CLAUDE.md` / `AGENTS.md` / skill files checked into the repo. Team-level prompt and skill libraries are versioned. Agents joining the team get the same onboarding humans get. Agent context references the ubiquitous language and the module map (items 5 + 6). +- 0.5 — Some individuals have personal CLAUDE.md files; nothing shared at the repo level. +- 0.0 — No agent context anywhere; people copy-paste instructions into chat each time. + +**Repo check:** `CLAUDE.md`, `AGENTS.md`, `.claude/`, `.cursor/rules/`, `.skills/`, or equivalent. 
Read one — does it teach the agent something the engineer wouldn’t have to be told? + +**Diagnostic commands:** + +- `find . -maxdepth 4 \( -iname "CLAUDE.md" -o -iname "AGENTS.md" -o -name ".claude" -o -name ".cursor" -o -name ".skills" -o -name "memory-bank" \) -not -path "./node_modules/*" -not -path "./.git/*" 2>/dev/null` — agent-context surface +- For each found file/dir: `wc -l` and `git log -1 --format="%ar" -- ` to gauge depth and recency +- For shared agent context in adjacent repo (e.g. `/claude-skills`, `/.github`): `gh repo view /` and check whether this repo references it + +**Why it matters:** Agents perform at the level of context the repo provides them. Ad-hoc personal prompts mean each engineer’s agent operates at a different standard; checked-in context means everyone (and every agent) gets the same baseline. + +## Category C — AI governance & quality (weight 1.25×) + +The new control plane. + +### 8. Sanctioned, governed AI tooling + +- 1.0 — Approved model list, ZDR posture documented, secrets scanning on agent outputs, clear policy on what can / can’t be sent to third parties, paid seats budgeted. +- 0.5 — Tooling is paid for but governance is loose; or governance is tight but everyone uses personal accounts anyway. +- 0.0 — Shadow AI. People paste prod data into free-tier chatbots. + +**Diagnostic:** primary signal is the user interview answer (Phase 1 Q1). Cross-check against any policy docs in `/.github` or an internal handbook if reachable. If the user said “I don’t know”, score `n/a`. + +**Why it matters:** Shadow AI is shadow IT with worse confidentiality and IP risk. Governance now is cheaper than recovering from a leak later. + +### 9. Human review on every PR regardless of authorship + +- 1.0 — AI-generated code is reviewed by a human who understands it well enough to defend it in a postmortem. “The agent wrote it” is not a shield. +- 0.5 — Reviews happen but are cursory; AI-authored PRs get rubber-stamped. 
+- 0.0 — Auto-merge on agent PRs, or no review process at all. + +**Repo check:** PR review settings, review depth on a sample of recent AI-tagged PRs. + +**Diagnostic commands:** + +- `find . -name "CODEOWNERS" 2>/dev/null` — review enforcement file +- `gh api "repos/{owner}/{repo}/branches/DEFAULT_BRANCH/protection" 2>/dev/null` — branch protection rules (auth scope permitting) +- `gh pr list --state merged --limit 50 --json number,reviews,author,additions,deletions --jq '[.[] | {pr: .number, author: .author.login, reviewers: [.reviews[].author.login] | unique, lines: (.additions + .deletions)}]'` — review depth and non-author reviewer presence per PR +- For org-level review policy in `ORG/.github`: `gh api "repos/ORG/.github/contents/" --jq '.[].name'` + +**Why it matters:** AI-authored code that no human can defend is technical debt with no owner. Review discipline is what keeps the org accountable for what it ships. + +### 10. Evals for AI-touched code paths + +- 1.0 — If LLMs are in the product → offline eval suite + prod telemetry. If LLMs are in the dev loop → adoption, throughput, and defect rate measured honestly (not just “everyone loves it”). +- 0.5 — Vibes-based confidence; some metrics but no rigor. +- 0.0 — No evals, no measurement, no idea if the AI helps or hurts. + +**Repo check:** `evals/`, `benchmarks/`, internal AI tooling dashboards. + +**Combine with Phase 1 Q5** (eval coverage): repo evidence covers product-side evals; the interview answer covers dev-loop measurement, which rarely lives in the repo. If the user said “I don’t know” *and* no `evals/` or `benchmarks/` directory exists, score `n/a`. + +**Why it matters:** Without evals, you can’t tell whether AI is helping or hurting — you’re managing on vibes. Evals are also the only way to catch silent regressions in AI-driven product features. + +### 11. Blast-radius controls for agent actions + +- 1.0 — Scoped credentials per agent, dry-run modes, audit logs of every agent-triggered write, documented rollback paths. 
The “agent shipped a migration to prod at 2am” scenario has been red-teamed. +- 0.5 — Some controls exist but are inconsistent; audit logs partial. +- 0.0 — Agents have prod write access via human-equivalent creds; no audit trail. + +**Diagnostic question:** “what’s the dumbest possible agent action that could break prod, and would we know within 5 minutes?” + +**Diagnostic commands:** + +- `grep -rEh "azure/login@|aws-actions/configure-aws-credentials@|google-github-actions/auth@" .github/workflows/ 2>/dev/null` — OIDC adoption (presence of `with: client-id:` rather than `secrets.AWS_ACCESS_KEY_ID` is the green flag) +- `gh api "repos/{owner}/{repo}/environments" --jq '.environments[] | {name: .name, has_protection: (.protection_rules | length > 0)}' 2>/dev/null` — env-scoped deploys with reviewers +- `find infra/ terraform/ -name "*.tf" 2>/dev/null | xargs grep -lE "service_account|workload_identity|managed_identity|user_assigned_identity" 2>/dev/null` — scoped per-workload identities +- `grep -rEh "azurerm_role_assignment|google_project_iam|aws_iam_role" infra/ terraform/ 2>/dev/null | wc -l` — IAM blast-radius posture +- For Terraform/IAM in adjacent infra repo (e.g. `/infra`): clone shallow and rerun the same greps there + +**Combine with Phase 1 Q6** (red-team posture): files prove technical posture; the interview answer proves the worst-case scenario has been thought through. + +**Why it matters:** Autonomous agents will eventually do something stupid. The question is whether the blast radius is bounded by design or by luck. + +## Category D — Hiring (weight 1.0×) + +### 12. Interviews assess judgment under AI augmentation + +- 1.0 — Candidates use AI in interviews and are evaluated on critique, decomposition, recognizing wrong answers, and shipping correct work. The bar is “great judgment with AI”, not “no AI allowed”. +- 0.5 — AI is allowed but interviewers don’t know how to assess its use; or it’s banned for “purity” reasons. 
+- 0.0 — Old-style whiteboard-only interviews; or no real technical bar at all. + +**Diagnostic:** primary signal is the user interview answer (Phase 1 Q2). If a rubric is reachable in an internal repo, cross-check. If the user said “I don’t know”, score `n/a`. + +**Why it matters:** Hiring is a forward-looking bet. The skill that matters in the AI-agentic era isn’t “can write code without AI” — it’s “can use AI well.” Interviews that don’t measure that bet on the wrong skill. \ No newline at end of file diff --git a/docs/maturity-skill-ref/references/interview.md b/docs/maturity-skill-ref/references/interview.md new file mode 100644 index 0000000..436d385 --- /dev/null +++ b/docs/maturity-skill-ref/references/interview.md @@ -0,0 +1,108 @@ +# Phase 1: Org-level interview + +Several criteria can't be answered from the codebase alone — they're behavioral, organizational, or policy facts. Phase 1 collects those answers from a human before scoring begins, and persists them to `docs/audits/CONFIG.md` so re-audits can confirm-or-refresh rather than re-interview from scratch. + +Read this when running step 3 of *How to run an audit* in `SKILL.md`. + +## How to ask the questions — read this carefully + +**Phase 1 is a real interview, not a form-dump.** The signal you get back depends entirely on the human actually engaging with each question. If you paste all seven questions in one message and then proceed, the human will skim, give you `n/a` to most, and the audit will be hollow. **Don't do that.** + +### The rule + +**Ask one question. Stop. Wait for the answer. Only then move to the next question.** This applies even in auto / autonomous modes — the interview is the rare place where blocking on a human is the *correct* behavior, because there is no other source for these answers. Treat each question as a hard checkpoint. + +If the user has not yet replied to question N, you may not ask question N+1 and you may not begin evidence gathering. 
The only exception is if the user explicitly says "skip the rest" or "just score what you can without me" — in which case mark every remaining question `unknown` and proceed. + +### Use the structured question UI when it's available + +If you have access to a tool that presents the user with a question + a small set of pre-written answer options (in Claude Desktop / Claude Code this is the `AskUserQuestion` tool — the user sees buttons or a list they can click; in other harnesses it may have a different name), **use it for every Phase 1 question**. It dramatically increases response rates and gives you cleaner answers to persist into CONFIG.md. + +For each question: + +1. Frame the question itself (verbatim from the list below — don't paraphrase, the wording is calibrated). +2. Provide 3-4 answer options that map cleanly to the score levels for the corresponding criterion. Always include an "I don't know / not sure" option — that maps to `n/a`, never `0`. +3. Allow a free-text override so the user can give nuance the options miss. + +If no structured-question tool is available in this harness, fall back to plain chat — but still **one question per message, and wait for the reply before sending the next one**. + +### Suggested option sets + +These are starting points — adapt the wording to the org if you have context, but keep the spread of "good / partial / bad / unknown" intact. + +|Q#|Suggested options | +|--|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|1 |• Company-paid managed seats + documented data-handling policy
• Company-paid seats but governance is loose / no written policy
• Mostly personal accounts or free tier; no policy
• I don't know | +|2 |• AI allowed in interviews, interviewers trained to assess judgment with AI
• AI allowed but assessment is informal / uncalibrated
• AI banned, or interviews don't really test technical judgment
• I don't know | +|3 |• All four DORA metrics tracked on a dashboard the team actually uses
• Some DORA metrics tracked but not actively watched
• Not really tracked / vibes-based
• I don't know | +|4 |• Consistent upfront design step (ADR / spec / shared-understanding) before agent code
• Some engineers do it, others prompt straight into code
• No design step — agents are pointed at problems and turned loose
• I don't know | +|5 |• LLMs in product with offline evals + prod telemetry
• LLMs in dev loop with tracked metrics — any deliberate tracking counts (Asana, spreadsheet, sprint retro numbers, GitHub label analysis, etc.)
• LLMs used but purely gut-feel — no numbers anyone could point to
• No LLMs in product or dev loop
• I don't know | +|6 |• Worst-case agent scenarios have been red-teamed; rollback paths documented
• Some controls in place but no explicit red-teaming
• No red-teaming; agents share human-equivalent prod creds
• I don't know | +|7 |• Yes — list the repos
• No, scope is just the primary repo(s) you've found
• I don't know | + +## Behavior on each run + +1. **Read `docs/audits/CONFIG.md`** for an `## Org-level answers` section. +2. **If the section exists**, present each stored answer to the user verbatim, with the `last_updated` date, and ask: *"Still accurate? (yes / updated answer / I don't know)"*. For confirmation-or-refresh you may batch the stored answers into a single review message — that's a different mode from a fresh interview, because the user is *editing* known state rather than producing it cold. +3. **For any question without a stored answer** (or where the user said the stored answer is no longer accurate), conduct the fresh interview using the **one-question-at-a-time** rule above. +4. **For any question with no stored answer and no fresh answer either** (user says "I don't know"), record the answer as `unknown` in CONFIG.md and score the mapped criterion as `n/a` for this run. +5. **After scoring, write back** the confirmed/updated answers to `docs/audits/CONFIG.md` under `## Org-level answers`, with `last_updated: `. If CONFIG.md doesn't exist, create a minimal version with just this section and add a line to *Notes for re-audit* recommending the user run `setup-agent-maturity-assessment` for full setup. + +## Questions to ask (verbatim, in order, one at a time) + +Before the first question, send a short framing message: *"I'm going to ask 7 quick questions one at a time — they cover the parts of the audit that aren't visible in the repo. 'I don't know' or 'n/a' is a valid answer to any of them and will mark that criterion as not assessed, not failed."* + +1. What AI tooling do engineers actually use day-to-day (Claude, Copilot, Cursor, etc.)? Is it company-paid with managed accounts, or are people using personal accounts or free tiers? Is there a documented policy on what data can be sent to third-party AI providers? +2. 
Do technical interviews allow candidates to use AI, and are interviewers trained to evaluate *how well* they use it (critique, decomposition, catching wrong outputs)? Or is AI either banned or effectively unassessed? +3. Are all four DORA metrics (deployment frequency, lead time, change failure rate, MTTR) actively tracked and visible to the team — e.g., a dashboard engineers actually look at? Or are some tracked in theory but not used? +4. When engineers hand work to AI agents, is there a consistent upfront design step (ADR, shared-understanding session, spec) before code generation? Or is it ad hoc — some engineers do it, others prompt straight into code? +5. Are LLMs in the product (user-facing features), in the dev loop only, or both? If in the product: is there an offline eval suite plus production telemetry? If dev-loop only: is AI impact tracked deliberately — even a spreadsheet, Asana board, or sprint retro metric counts — or is it purely gut-feel with no numbers anyone could point to? +6. Has anyone explicitly red-teamed a worst-case agent scenario in prod (bad migration, runaway infra change, secret exfiltration)? Are rollback paths for agent-triggered writes documented? +7. Are there adjacent repos I should treat as in-scope that automated detection might miss — e.g., an internal handbook, security/IT policy repo, org-wide `.github` repo, shared skill library? + +## Internal mapping (for scoring — do not show to the user) + +|Q#|Criterion |How to combine with repo evidence | +|--|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| +|1 |C8 — Sanctioned AI tooling |Primary signal. Cross-check `/.github` policies if reachable. | +|2 |D12 — Judgment under AI augmentation |Primary signal. Cross-check rubric repo if reachable. | +|3 |A2 — Sub-day integration cadence |Combine with `gh pr list` / `gh run list` evidence. 
Repo evidence covers cadence; interview covers metric *visibility*. | +|4 |B5 — Design discipline |Combine with ADR / glossary file evidence. Files prove artifacts exist; interview proves design happens *before* code. | +|5 |C10 — Evals for AI-touched code paths|Repo evidence covers product-side evals (`evals/`, `benchmarks/`); interview covers dev-loop measurement, which rarely lives in the repo. | +|6 |C11 — Blast-radius controls |Combine with OIDC / IAM / branch-protection grep evidence. Files prove technical posture; interview proves the scenario has been thought through.| +|7 |Scope expansion |Merge into the adjacent-repo detection list before evidence gathering. Not a scored criterion. | + +If the user answers "I don't know" to any question, score the mapped criterion as `n/a`, exclude it from numerator and max, and add a line to *Notes for re-audit* in the audit output describing exactly what info would resolve it. + +## CONFIG.md storage format + +Append to or create `docs/audits/CONFIG.md`: + +```markdown +## Org-level answers + +last_updated: 2026-05-02 + +### AI tooling (Q1) + + +### Hiring (Q2) + + +### DORA visibility (Q3) + + +### Design before code (Q4) + + +### Eval coverage (Q5) + + +### Blast-radius red-teaming (Q6) + + +### Out-of-band adjacent repos (Q7) + +``` + +Use `unknown` as the answer text when the user said "I don't know". Do not delete previous answers — update in place so the file's git history shows movement over time. diff --git a/docs/maturity-skill-ref/references/output-template.md b/docs/maturity-skill-ref/references/output-template.md new file mode 100644 index 0000000..6f44d2a --- /dev/null +++ b/docs/maturity-skill-ref/references/output-template.md @@ -0,0 +1,102 @@ +# Audit output template + +Read this when running step 7 of *How to run an audit* in `SKILL.md`. Always produce this exact structure. 
The per-criterion tables ARE the report — they should be readable in one pass, especially when comparing audits across multiple repos. + +## Rules for filling out the score tables + +- Fill in every row. Use `n/a` with a one-line reason if an item genuinely doesn’t apply to the scope or the user marked the corresponding Phase 1 answer as unknown (then exclude that item from both numerator and max in the score math). +- The *Why this score* column is **one sentence, ≤ 25 words**. State the single most decisive piece of evidence — the thing that pushed the score up or down. No bullet lists, no multi-clause sentences stitched with semicolons, no “but also” hedging. +- If you have more to say, save it for *Top 3 fixes*, *Strengths to preserve*, or *Notes for re-audit*. The table is for the verdict, not the working. +- Score in the column as `0`, `0.5`, `1`, or `n/a` — nothing else. + +## Template + +```markdown +# Agent Maturity Assessment — + +## Summary +- Raw score: X / 12 +- Weighted score: XX.X% +- Band: **** () +- Evidence tier: **<1: gh / 2: GitHub MCP / 3: git-only>** (see references/preflight.md) +- One-line take: + +### Maturity scale (where this audit lands) + +| Band | % range | This audit | +|------|---------|:----------:| +| Excellent | 90%+ | | +| Healthy | 75–89% | | +| Functional but slow | 60–74% | | +| Significant dysfunction | 40–59% | | +| Triage | <40% | | + +Mark the row this audit falls in with `◉` in the right column; leave the others blank. This makes relative position visible at a glance and survives copy-paste to Slack / a doc / a slide. + +## Scores + +### A. Engineering basics (weight 1.0×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 1 | Reproducible dev environments | 0/0.5/1 | | +| 2 | Sub-day integration cadence with measured outcomes | 0/0.5/1 | | +| 3 | Testability and agent inner loop | 0/0.5/1 | | +| 4 | Observability before features | 0/0.5/1 | | + +Subtotal: X.X × 1.00 = X.X / 4.00 + +### B. 
Knowledge & context (weight 1.5×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 5 | Design discipline as a practice | 0/0.5/1 | | +| 6 | Codebase composed of deep modules | 0/0.5/1 | | +| 7 | Repo-local agent context | 0/0.5/1 | | + +Subtotal: X.X × 1.50 = X.X / 4.50 + +### C. AI governance & quality (weight 1.25×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 8 | Sanctioned, governed AI tooling | 0/0.5/1 | | +| 9 | Human review on every PR | 0/0.5/1 | | +| 10 | Evals for AI-touched code paths | 0/0.5/1 | | +| 11 | Blast-radius controls for agents | 0/0.5/1 | | + +Subtotal: X.X × 1.25 = X.X / 5.00 + +### D. Hiring (weight 1.0×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 12 | Judgment under AI augmentation | 0/0.5/1 | | + +Subtotal: X.X × 1.00 = X.X / 1.00 + +## Top 3 fixes (highest leverage) +1. **** — why this one, what good looks like, suggested owner. +2. **** — … +3. **** — … + +## Strengths to preserve +- +- + +## Adjacent repos consulted +- `/` — +- `/` — … + +(If none: write "None — all evidence within scope repo.") + +## Notes for re-audit +- +- +``` + +## Worked example of a “Why this score” cell + +Do not include this in actual audits — it’s a calibration example for getting the cell length right. + +|Quality |Cell content | +|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|Too long |`pnpm -r test resolves to nothing — no package implements test. ci.yml line 80: dotnet test || true with comment 'no real tests yet'. Zero test files anywhere. 
Architecture is testable in principle but the inner loop runs nothing.`| +|Too vague |`No tests exist.` | +|Right size|`CI runs dotnet test || true, no test files exist anywhere, and the architecture's seams sit unused.` | \ No newline at end of file diff --git a/docs/maturity-skill-ref/references/preflight.md b/docs/maturity-skill-ref/references/preflight.md new file mode 100644 index 0000000..74a02b8 --- /dev/null +++ b/docs/maturity-skill-ref/references/preflight.md @@ -0,0 +1,107 @@ +# Environment preflight & multi-repo scope + +Read this when running steps 2 (preflight) and 4 (adjacent repo mapping) of *How to run an audit* in `SKILL.md`. + +## Environment preflight + +**First, read `docs/audits/CONFIG.md` if it exists.** That file is scaffolded by the `setup-agent-maturity-assessment` skill and declares the GitHub auth method, the canonical org/repo/branch, the pre-approved list of adjacent repos in scope, and the audit cadence. When it’s present, use its declared values as the source of truth — skip the runtime probes below for the parts CONFIG.md already answers, and treat the runtime probes as drift-detection only. + +If CONFIG.md is **missing** or its declared auth method fails the probe (e.g. CONFIG says “gh” but `gh auth status` errors), fall back to the full preflight below and surface the gap in *Notes for re-audit* so the user can re-run `setup-agent-maturity-assessment` later. + +The diagnostic commands assume `gh` CLI is in `$PATH` and authenticated. In a sandboxed runtime (e.g. Cowork) this is often not true even if `gh` is installed on the host. 
Run this preflight before scoring and select the tier: + +```bash +# Tier 1 — gh CLI authenticated → highest fidelity (full GitHub API access) +command -v gh >/dev/null 2>&1 && gh auth status >/dev/null 2>&1 && echo "tier=1 gh" + +# Tier 2 — GitHub MCP server connected → equivalent fidelity via MCP tools +# (Detect via host capabilities; in Claude Code, look for tools named like +# list_pull_requests, get_workflow_runs, get_branch_protection.) + +# Tier 3 — git + filesystem only → reduced fidelity +git -C . rev-parse --is-inside-work-tree >/dev/null 2>&1 && echo "tier=3 git-only" +``` + +### Tier behavior + +|Tier |Available |Use for | +|------------------------|--------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| +|1. `gh` authenticated |All `gh pr list`, `gh api`, `gh run list` commands|Default. Highest-fidelity audits. | +|2. GitHub MCP |Equivalent MCP-routed tools |Use when running in a sandbox where `gh` isn’t on the host but a GitHub MCP is connected. | +|3. git + filesystem only|`git log`, `find`, `grep` |Fallback. Items 2, 3, 9, 11 score against approximations (merge commits as PR proxies, no branch-protection visibility, no review-depth metrics).| + +**At Tier 3, the audit MUST:** + +- State “Tier 3 (git-only) audit — limited GitHub-side evidence” in the Summary’s *One-line take*. +- Add an entry to *Notes for re-audit* listing which items were scored against fallback evidence and what to re-verify when running at Tier 1. +- Never auto-promote a Tier 3 score to 1.0 on items 2, 3, 9, or 11 — the missing GitHub-side data could pull them down. Cap those at 0.5 unless filesystem evidence alone is sufficient. + +**To upgrade Tier 3 → Tier 2 (Tier 1-equivalent fidelity) in Cowork (or any sandbox):** add a GitHub MCP server.
Cowork’s curated MCP registry doesn’t currently bundle one, so add it as a custom MCP via Settings → MCP Servers, pointing at GitHub’s official `github/github-mcp-server` (remote-hostable) or Anthropic’s reference implementation. Auth flows through your GitHub OAuth/PAT scoped to the orgs you want to audit — no creds touch the sandbox. + +### Optional — host-side probe script + +When the sandbox is stuck at Tier 3 but the user has `gh` on their host, ask them to run this and paste the output back. The audit can incorporate the results without any creds entering the sandbox. + +```bash +#!/usr/bin/env bash +# audit-gh-probe.sh — run on host, paste output to Claude +set -euo pipefail +REPO="${1:?usage: audit-gh-probe.sh }" +SINCE="$(date -d '90 days ago' +%Y-%m-%d 2>/dev/null || date -v-90d +%Y-%m-%d)" + +echo "### gh-pr-list (cadence + lead time + review depth) ###" +gh pr list --repo "$REPO" --state merged --limit 200 \ + --search "merged:>$SINCE" \ + --json number,mergedAt,createdAt,additions,deletions,reviews,author + +echo "### gh-branch-protection ###" +gh api "repos/$REPO/branches/$(gh repo view "$REPO" --json defaultBranchRef --jq .defaultBranchRef.name)/protection" 2>&1 || true + +echo "### gh-environments ###" +gh api "repos/$REPO/environments" --jq '.environments[] | {name, has_protection: (.protection_rules | length > 0)}' 2>&1 || true + +echo "### gh-deploy-runs ###" +gh run list --repo "$REPO" --workflow=deploy --limit 100 \ + --json conclusion,createdAt,name 2>&1 || true + +echo "### gh-ci-runs (flake/fail rate) ###" +gh run list --repo "$REPO" --workflow=ci.yml --limit 50 \ + --json conclusion 2>&1 || true +``` + +## Handling multi-repo scope + +A real engineering org doesn’t fit in one repo. CI workflow templates, Terraform/OpenTofu modules, QA / E2E suites, runbooks and dashboards, and shared agent-context skill libraries frequently live in adjacent repos. Auditing only the primary repo under-scores items that depend on those external sources. 
+ +**If `docs/audits/CONFIG.md` exists, use its `## Adjacent repos` table as the seed list** — those repos are already approved to be in scope. Re-run the detection commands below only as **drift detection** to catch new adjacent repos that have been added since the last setup. Surface any new findings in the audit’s *Adjacent repos consulted* section and recommend a re-run of `setup-agent-maturity-assessment` if the list has grown. + +If CONFIG.md is missing, run the full detection from scratch. + +### Detection — run these from the primary repo before scoring + +```bash +# 1. External GitHub Actions referenced from this repo's workflows +grep -rhE "uses:\s*[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" .github/workflows/ 2>/dev/null \ + | grep -oE "[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(@[a-zA-Z0-9_.-]+)?" | sort -u + +# 2. Terraform / OpenTofu modules sourced from external Git +grep -rhE "source\s*=\s*\".*\"" infra/ terraform/ 2>/dev/null \ + | grep -E "git::|github\.com/" | sort -u + +# 3. Submodules +git submodule status 2>/dev/null + +# 4. Generic cross-repo references in docs and scripts +grep -rEh "github\.com/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" \ + docs/ scripts/ .github/ README.md 2>/dev/null \ + | grep -oE "github\.com/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" | sort -u +``` + +### For each adjacent repo discovered + +- Score the relevant criterion *across both repos*. Examples: if reusable workflows live in `/ci-templates`, item #2 (cadence) and item #9 (review) evidence comes from both. If Terraform modules live in `/infra-modules`, item #11 (blast-radius) needs both. +- Use `gh repo view /` and targeted `gh api`/`gh search` calls to inspect — don’t clone unless necessary. +- If access is blocked (private repo, no permission), score against what’s visible and flag in *Notes for re-audit*. +- List every adjacent repo consulted in the audit’s *Adjacent repos consulted* section so a re-auditor can reproduce. 
+ +**Org-level criteria (#8 governance, #12 hiring) are inherently outside any one repo.** Look for them in `/.github` policy repo, internal handbook, IT/security docs. If you can’t reach those, mark `n/a` with the reason. Phase 1 question 7 is intended to surface these out-of-band sources from the human before evidence gathering. \ No newline at end of file diff --git a/justfile b/justfile index 1f90096..c0a4542 100644 --- a/justfile +++ b/justfile @@ -37,6 +37,10 @@ lint: report: bun run report +# Run the Agent Maturity Assessment (alias: just assess ) +assess *ARGS: + ./tui/teamhero-tui assess {{ARGS}} + # Uninstall CLI wrapper uninstall: sh scripts/uninstall.sh diff --git a/scripts/run-assess.ts b/scripts/run-assess.ts new file mode 100644 index 0000000..a0488a2 --- /dev/null +++ b/scripts/run-assess.ts @@ -0,0 +1,153 @@ +#!/usr/bin/env bun +import { join } from "node:path"; +import { config as dotenvConfig } from "dotenv"; +import { configDir } from "../src/lib/paths.js"; + +dotenvConfig({ path: join(configDir(), ".env"), override: true }); + +import { consola, createConsola } from "consola"; +import { MaturityAIScorer } from "../src/services/maturity/ai-scorer.js"; +import { + FileSystemAuditStore, + readAnswersJson, +} from "../src/services/maturity/audit-store.js"; +import { MaturityService } from "../src/services/maturity/maturity.service.js"; +import { + StdinInterviewTransport, + StdinLineReader, +} from "../src/services/maturity/stdin-interview.js"; +import type { + AssessCommandInput, + InterviewAnswer, +} from "../src/services/maturity/types.js"; + +/** + * Headless maturity-assessment service runner. + * + * Protocol: + * stdin ← First JSON line: AssessCommandInput config. + * Subsequent lines: interview-answer events (when interactiveInterview=true). 
+ * stdout → JSON-lines events: + * - {"type":"progress","step":"...","status":"...","message":"..."} + * - {"type":"interview-frame","message":"..."} + * - {"type":"interview-question","questionId":"q1",...} + * - {"type":"result","outputPath":"...","jsonOutputPath":"...","data":{...}} + * - {"type":"error","message":"..."} + * stderr → consola log output (passed through). + * exit 0 = success, exit 1 = error + */ + +type JsonLineEmitter = (event: Record) => void; + +const emit: JsonLineEmitter = (event) => { + process.stdout.write(`${JSON.stringify(event)}\n`); +}; + +function emitProgress( + step: string, + status: "active" | "complete" | "failed", + message: string, +): void { + emit({ type: "progress", step, status, message }); +} + +// readConfigLine + interview answers share a single stdin reader so the +// stdin pipe doesn't get half-consumed by an async iterator and then closed. + +async function loadInterviewAnswersFromFile( + path: string, +): Promise { + try { + return await readAnswersJson(path); + } catch (err) { + consola.warn( + `Failed to read interview answers from ${path}: ${(err as Error).message}`, + ); + return []; + } +} + +async function main(): Promise { + const logger = createConsola({ defaults: { tag: "maturity" } }); + + const reader = new StdinLineReader(); + const configLine = await reader.nextLine(); + if (!configLine) { + emit({ type: "error", message: "No config received on stdin" }); + process.exit(1); + } + + let input: AssessCommandInput; + try { + input = JSON.parse(configLine) as AssessCommandInput; + } catch (err) { + emit({ + type: "error", + message: `Failed to parse config JSON: ${(err as Error).message}`, + }); + process.exit(1); + } + + emitProgress("startup", "active", "Maturity assessment starting…"); + + // Resolve interview transport + let interview; + let preloaded: InterviewAnswer[] = []; + if (input.interactiveInterview) { + interview = new StdinInterviewTransport(reader, emit); + } else if 
(input.interviewAnswersPath) { + preloaded = await loadInterviewAnswersFromFile(input.interviewAnswersPath); + } + + // Audit store: only when scope has a localPath + const auditStore = input.scope.localPath + ? new FileSystemAuditStore(input.scope.localPath) + : undefined; + + const scorer = new MaturityAIScorer({ dryRun: input.dryRun ?? false }); + const service = new MaturityService({ + logger, + scorer, + ...(interview ? { interview } : {}), + ...(auditStore ? { auditStore } : {}), + onProgress: (step, message) => emitProgress(step, "active", message), + }); + + // If a pre-supplied answer file was given (and not interactive), seed by + // wrapping auditStore.readPriorAnswers: file answers are merged over any CONFIG.md answers (file wins per + // question). No effect without an audit store (no scope.localPath). Done here to keep MaturityService reusable. + if (preloaded.length > 0 && auditStore) { + const original = auditStore.readPriorAnswers.bind(auditStore); + auditStore.readPriorAnswers = async () => { + const fromFile = preloaded; + const fromConfig = await original(); + const merged = new Map(fromConfig.map((a) => [a.questionId, a] as const)); + for (const a of fromFile) merged.set(a.questionId, a); + return [...merged.values()]; + }; + } + + try { + const result = await service.run(input); + emitProgress("complete", "complete", "Audit complete."); + emit({ + type: "result", + outputPath: result.outputPath, + ...(result.jsonOutputPath + ? { jsonOutputPath: result.jsonOutputPath } + : {}), + data: result.artifact as unknown as Record, + }); + process.exit(0); + } catch (err) { + emit({ type: "error", message: (err as Error).message }); + consola.error(err); + process.exit(1); + } +} + +main().catch((err) => { + emit({ type: "error", message: (err as Error).message }); + consola.error(err); + process.exit(1); +}); diff --git a/src/cli/index.ts b/src/cli/index.ts index 453cf13..2758fe5 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -154,7 +154,7 @@ export function createCli( reportArgIndex >= 0 ?
process.argv.slice(reportArgIndex + 1) : []; // Reject subcommands that are top-level — don't allow `teamhero report doctor`. - const subcommands = ["doctor", "setup"]; + const subcommands = ["doctor", "setup", "assess"]; if (argsToPass.length > 0 && subcommands.includes(argsToPass[0])) { deps.logger.error( `Unknown argument: ${argsToPass[0]}. Did you mean \`teamhero ${argsToPass[0]}\`?`, @@ -165,6 +165,20 @@ export function createCli( await spawnTui(deps, argsToPass); }); + program + .command("assess [args...]") + .description( + "Run the Agent Maturity Assessment (12-criterion AI-readiness audit)", + ) + .helpOption(false) + .allowUnknownOption() + .allowExcessArguments() + .action(async function (this: Command) { + const idx = process.argv.indexOf("assess"); + const argsToPass = idx >= 0 ? process.argv.slice(idx) : ["assess"]; + await spawnTui(deps, argsToPass); + }); + program .command("setup") .description("Configure credentials and preferences") @@ -220,7 +234,7 @@ export async function run( // If a subcommand is followed by --help, pass through to the Go binary // instead of letting Commander handle it (which prints the top-level help). const args = argv.slice(2); - const subcommands = ["report", "doctor", "setup"]; + const subcommands = ["report", "doctor", "setup", "assess"]; if ( args.length >= 1 && subcommands.includes(args[0]) && diff --git a/src/core/types.ts b/src/core/types.ts index a2a8174..e59d063 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -545,3 +545,44 @@ export interface PeriodDeltas { * visibleWins, technicalWins, discrepancyAnalysis, roadmapSynthesis. 
*/ export type SystemPrompts = Record; + +// --------------------------------------------------------------------------- +// Maturity Assessment ports +// --------------------------------------------------------------------------- + +import type { + AdjacentRepo, + EvidenceFact, + EvidenceTier, + InterviewAnswer, + InterviewQuestion, + ScopeDescriptor, +} from "../services/maturity/types.js"; + +export interface MaturityProvider { + /** Stable id matching the rubric item this provider scores. */ + readonly itemId: number; + /** + * Run the deterministic detector against the given scope and tier. + * Returns zero or more EvidenceFact records — never throws on missing files. + */ + collect(input: { + scope: ScopeDescriptor; + tier: EvidenceTier; + adjacentRepos: AdjacentRepo[]; + }): Promise; +} + +export interface InterviewTransport { + /** Show the framing message before the first question. */ + frame(message: string): Promise; + /** Ask one question and wait for the answer. Implementations must block. */ + ask(question: InterviewQuestion): Promise; +} + +export interface AuditStore { + /** Read prior interview answers from docs/audits/CONFIG.md (or equivalent). */ + readPriorAnswers(): Promise; + /** Persist confirmed/updated interview answers and the today date. 
*/ + writeAnswers(answers: InterviewAnswer[], today: string): Promise; +} diff --git a/src/services/maturity/adjacent-repos.ts b/src/services/maturity/adjacent-repos.ts new file mode 100644 index 0000000..b771ae6 --- /dev/null +++ b/src/services/maturity/adjacent-repos.ts @@ -0,0 +1,110 @@ +import { join } from "node:path"; +import { findFiles, readIfExists } from "./fs-utils.js"; +import type { AdjacentRepo, ScopeDescriptor } from "./types.js"; + +const OWNER_REPO = /([a-zA-Z0-9_.-]+)\/([a-zA-Z0-9_.-]+)/; + +const STDLIB_OWNERS = new Set([ + "actions", + "docker", + "github", + "oven-sh", + "hashicorp", // skip the modules' own owners (we only want intra-org neighbours) +]); + +/** + * Detect adjacent repos referenced from the local repo. Mirrors the four + * detection commands in references/preflight.md (multi-repo section): + * + * 1. External GitHub Actions referenced in workflows (`uses: owner/repo@vX`) + * 2. Terraform modules sourced from GitHub (bare `github.com/...` or `git::` URLs) + * 3. Submodules + * 4. GitHub links in README.md + * + * Repos are de-duplicated case-insensitively on `owner/name`; the first reason + * recorded for a repo wins. + * + * @param scope Audit scope; only `scope.localPath` is read. + * @returns Referenced repos, or an empty array when the scope has no local path. + */ +export async function detectAdjacentRepos( + scope: ScopeDescriptor, +): Promise<AdjacentRepo[]> { + if (!scope.localPath) return []; + const root = scope.localPath; + const found = new Map<string, AdjacentRepo>(); + + // 1. Workflow references (local `./path` actions are rejected inside addRepo) + const workflowFiles = await findFiles(root, { + pathContains: [".github/workflows"], + nameRegex: /\.ya?ml$/i, + maxDepth: 4, + limit: 100, + }); + for (const wf of workflowFiles) { + const content = await readIfExists(join(root, wf)); + if (!content) continue; + for (const line of content.split(/\r?\n/)) { + const usesMatch = /\buses:\s*([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)/.exec( + line, + ); + if (usesMatch) { + const m = OWNER_REPO.exec(usesMatch[1]); + if (m && !STDLIB_OWNERS.has(m[1].toLowerCase())) { + addRepo(found, m[1], m[2], `Workflow uses: ${m[0]}`); + } + } + } + } + + // 2. 
Terraform module sources (bare github.com/... plus git:: HTTPS/SSH URLs) + const tfFiles = await findFiles(root, { + pathContains: ["infra/", "terraform/"], + nameRegex: /\.tf$/, + maxDepth: 6, + limit: 100, + }); + for (const tf of tfFiles) { + const content = await readIfExists(join(root, tf)); + if (!content) continue; + // The optional `git::` prefix and URL scheme must both precede the host; + // otherwise `git::github.com/o/r` would be read as owner "github.com". + const matches = content.matchAll( + /source\s*=\s*"(?:git::)?(?:https?:\/\/|ssh:\/\/git@|git@)?github\.com[:/]([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)/g, + ); + for (const match of matches) { + const m = OWNER_REPO.exec(match[1]); + if (m) addRepo(found, m[1], m[2], "Terraform module source"); + } + } + + // 3. Submodules + const gitmodules = await readIfExists(join(root, ".gitmodules")); + if (gitmodules) { + const matches = gitmodules.matchAll( + /url\s*=\s*(?:.*github\.com[:/])?([a-zA-Z0-9_.-]+\/[a-zA-Z0-9_.-]+)/g, + ); + for (const m of matches) { + const owner = OWNER_REPO.exec(m[1]); + if (owner) addRepo(found, owner[1], owner[2], "Git submodule"); + } + } + + // 4. Generic cross-repo refs in README / docs + const readme = await readIfExists(join(root, "README.md")); + if (readme) { + const matches = readme.matchAll( + /github\.com\/([a-zA-Z0-9_.-]+)\/([a-zA-Z0-9_.-]+)/g, + ); + for (const m of matches) { + if (!STDLIB_OWNERS.has(m[1].toLowerCase())) { + addRepo(found, m[1], m[2], "Referenced in README.md"); + } + } + } + + return [...found.values()]; +} + +/** + * Record one `owner/name` reference unless it is malformed or already known. + * + * The capture classes used by the callers allow dots, so this is the single + * place where captures are normalised: owners that do not start with an + * alphanumeric (e.g. "." / ".." from relative paths) are rejected, and a + * trailing sentence dot or clone-URL `.git` suffix is removed from the name. + * + * @param map Accumulator keyed by lower-cased `owner/name`. + * @param owner Captured owner segment. + * @param name Captured repository segment, possibly with `.git` or a trailing dot. + * @param reason Human-readable note on where the reference was found. + */ +function addRepo( + map: Map<string, AdjacentRepo>, + owner: string, + name: string, + reason: string, +): void { + if (!/^[a-zA-Z0-9]/.test(owner)) return; + const repoName = name.replace(/\.+$/, "").replace(/\.git$/i, ""); + if (!repoName) return; + const key = `${owner}/${repoName}`.toLowerCase(); + if (map.has(key)) return; + map.set(key, { owner, name: repoName, reason }); +} diff --git a/src/services/maturity/ai-scorer.ts b/src/services/maturity/ai-scorer.ts new file mode 100644 index 0000000..07b1d6c --- /dev/null +++ b/src/services/maturity/ai-scorer.ts @@ -0,0 +1,212 @@ +import { type ConsolaInstance, consola } from "consola"; +import OpenAI from "openai"; +import { getEnv } from "../../lib/env.js"; +import { + buildMaturityPrompt, + MATURITY_ASSESSMENT_SCHEMA, + type MaturityScoringContext, +} from "./maturity-prompts.js"; +import { 
RUBRIC_ITEMS } from "./rubric.js"; +import type { ItemScore, ItemScoreValue, TopFix } from "./types.js"; + +export interface MaturityAIResult { + oneLineTake: string; + items: ItemScore[]; + topFixes: TopFix[]; + strengths: string[]; + notesForReaudit: string[]; +} + +export interface MaturityAIScorerOptions { + apiKey?: string; + model?: string; + baseUrl?: string; + logger?: ConsolaInstance; + /** Set true in tests / dry-run to skip the network call. */ + dryRun?: boolean; +} + +const TIER3_CAPPED = new Set([2, 3, 9, 11]); + +/** + * Convert the score string from the AI's JSON into an ItemScoreValue. + * Only the exact strings "0", "0.5", "1" and "n/a" are accepted. + * + * @throws Error when `raw` is any other string. + */ +function parseScore(raw: string): ItemScoreValue { + if (raw === "0") return 0; + if (raw === "1") return 1; + if (raw === "0.5") return 0.5; + if (raw === "n/a") return "n/a"; + throw new Error(`Invalid score string: ${raw}`); +} + +/** + * Enforce tier-3 caps: if an item is in TIER3_CAPPED and the AI awarded 1.0 + * on a git-only audit, downgrade to 0.5 and append a note. We do this + * post-hoc so the AI's reasoning is preserved but the rubric is honored. + */ +function applyTier3Caps( + items: ItemScore[], + tier: MaturityScoringContext["tier"], +): { items: ItemScore[]; notes: string[] } { + if (tier !== "git-only") return { items, notes: [] }; + const notes: string[] = []; + const capped = items.map((s) => { + if (TIER3_CAPPED.has(s.itemId) && s.score === 1) { + notes.push( + `Item ${s.itemId} was capped at 0.5 by the tier-3 rule (no GitHub-side evidence available).`, + ); + return { + ...s, + score: 0.5 as ItemScoreValue, + whyThisScore: `${s.whyThisScore} [Tier-3 cap applied.]`, + }; + } + return s; + }); + return { items: capped, notes }; +} + +/** + * Normalise the AI's item list to exactly one row per rubric item. + * Scores whose id is not in RUBRIC_ITEMS and repeated ids (first one wins) + * are dropped; rubric items the AI omitted get an "n/a" placeholder so the + * audit always renders all rows. 
+ * + * @param items Scores parsed from the AI response. + * @returns `items` holding one entry per rubric item sorted by id, and + * `missing` listing the rubric ids that had to be back-filled. + */ +function ensureAllItems(items: ItemScore[]): { + items: ItemScore[]; + missing: number[]; +} { + const rubricIds = new Set(RUBRIC_ITEMS.map((item) => item.id)); + const byId = new Map<number, ItemScore>(); + for (const score of items) { + // Keep the first score per rubric id; ignore ids the rubric does not define. + if (rubricIds.has(score.itemId) && !byId.has(score.itemId)) { + byId.set(score.itemId, score); + } + } + const missing: number[] = []; + for (const item of RUBRIC_ITEMS) { + if (!byId.has(item.id)) { + missing.push(item.id); + byId.set(item.id, { + itemId: item.id, + score: "n/a", + whyThisScore: "Missing from AI response — rescore in next audit.", + }); + } + } + const filled = [...byId.values()].sort((a, b) => a.itemId - b.itemId); + return { items: filled, missing }; +} + +export class MaturityAIScorer { + private readonly apiKey?: string; + private readonly model: string; + private readonly baseUrl?: string; + private readonly logger: ConsolaInstance; + private readonly dryRun: boolean; + + constructor(options: MaturityAIScorerOptions = {}) { + this.apiKey = options.apiKey ?? getEnv("OPENAI_API_KEY") ?? undefined; + this.model = + options.model ?? + getEnv("MATURITY_AI_MODEL") ?? + getEnv("AI_MODEL") ?? + "gpt-5-mini"; + this.baseUrl = options.baseUrl ?? getEnv("OPENAI_BASE_URL") ?? undefined; + this.logger = options.logger ?? consola.withTag("maturity-ai"); + this.dryRun = options.dryRun ?? false; + } + + async score(context: MaturityScoringContext): Promise<MaturityAIResult> { + const prompt = buildMaturityPrompt(context); + + if (this.dryRun) { + return this.dryRunResult(); + } + + if (!this.apiKey) { + throw new Error( + "OPENAI_API_KEY required for maturity assessment AI scoring (or pass --dry-run for a placeholder).", + ); + } + + const client = new OpenAI({ + apiKey: this.apiKey, + ...(this.baseUrl ? 
{ baseURL: this.baseUrl } : {}), + }); + + this.logger.debug( + `Maturity AI scoring (model=${this.model}, prompt=${prompt.length} chars)`, + ); + + const response = await client.responses.create({ + model: this.model, + input: prompt, + text: { format: MATURITY_ASSESSMENT_SCHEMA }, + } as Parameters[0]); + + const outputText = (response as unknown as Record) + .output_text as string | undefined; + + if (!outputText) { + throw new Error("Empty AI response for maturity assessment"); + } + + const parsed = JSON.parse(outputText) as { + oneLineTake: string; + items: Array<{ itemId: number; score: string; whyThisScore: string }>; + topFixes: Array<{ + itemId: number; + whyThisOne: string; + whatGoodLooksLike: string; + owner: string; + }>; + strengths: string[]; + notesForReaudit: string[]; + }; + + const itemScores: ItemScore[] = parsed.items.map((i) => ({ + itemId: i.itemId, + score: parseScore(i.score), + whyThisScore: i.whyThisScore, + })); + + const { items: capped, notes: capNotes } = applyTier3Caps( + itemScores, + context.tier, + ); + const { items: filled, missing } = ensureAllItems(capped); + + const notes = [...parsed.notesForReaudit, ...capNotes]; + if (missing.length > 0) { + notes.push(`AI response was missing item(s): ${missing.join(", ")}.`); + } + + const topFixes: TopFix[] = parsed.topFixes.map((f) => { + const fix: TopFix = { + itemId: f.itemId, + whyThisOne: f.whyThisOne, + whatGoodLooksLike: f.whatGoodLooksLike, + }; + if (f.owner && f.owner.toLowerCase() !== "unassigned") { + fix.owner = f.owner; + } + return fix; + }); + + return { + oneLineTake: parsed.oneLineTake, + items: filled, + topFixes, + strengths: parsed.strengths, + notesForReaudit: notes, + }; + } + + private dryRunResult(): MaturityAIResult { + const items: ItemScore[] = RUBRIC_ITEMS.map((item) => ({ + itemId: item.id, + score: 0.5 as ItemScoreValue, + whyThisScore: + "Dry-run placeholder — rerun without --dry-run for real scoring.", + })); + return { + oneLineTake: "Dry-run audit — 
no AI scoring performed.", + items, + topFixes: [], + strengths: [], + notesForReaudit: ["Dry-run mode was active — rerun without --dry-run."], + }; + } +} diff --git a/src/services/maturity/audit-store.ts b/src/services/maturity/audit-store.ts new file mode 100644 index 0000000..6a7f969 --- /dev/null +++ b/src/services/maturity/audit-store.ts @@ -0,0 +1,134 @@ +import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import type { AuditStore } from "../../core/types.js"; +import { getQuestion, INTERVIEW_QUESTIONS } from "./interview.js"; +import type { InterviewAnswer, InterviewQuestionId } from "./types.js"; + +/** + * Reads / writes docs/audits/CONFIG.md inside a repo. Format documented in + * references/interview.md (## Org-level answers). + */ +export class FileSystemAuditStore implements AuditStore { + constructor(private readonly repoPath: string) {} + + private configPath(): string { + return join(this.repoPath, "docs", "audits", "CONFIG.md"); + } + + async readPriorAnswers(): Promise { + try { + const text = await readFile(this.configPath(), "utf8"); + return parseConfigMd(text); + } catch (err) { + if ((err as NodeJS.ErrnoException).code === "ENOENT") return []; + throw err; + } + } + + async writeAnswers(answers: InterviewAnswer[], today: string): Promise { + const text = renderConfigMd(answers, today); + await mkdir(dirname(this.configPath()), { recursive: true }); + await writeFile(this.configPath(), text, "utf8"); + } +} + +/** + * Parse the `## Org-level answers` section of CONFIG.md. Heading mapping + * comes from interview.md verbatim. 
+ */ +export function parseConfigMd(text: string): InterviewAnswer[] { + const answers: InterviewAnswer[] = []; + const lines = text.split(/\r?\n/); + let inSection = false; + let currentQuestion: InterviewQuestionId | null = null; + let buffer: string[] = []; + + const flush = () => { + if (currentQuestion) { + const value = buffer.join("\n").trim(); + if (value.length > 0) { + answers.push({ + questionId: currentQuestion, + value, + isOption: false, + }); + } + currentQuestion = null; + buffer = []; + } + }; + + for (const line of lines) { + if (/^##\s+Org-level answers/i.test(line)) { + inSection = true; + continue; + } + if (inSection && /^##\s+/.test(line)) { + flush(); + break; + } + if (!inSection) continue; + + const headingMatch = /^###\s+(.+)$/.exec(line); + if (headingMatch) { + flush(); + currentQuestion = matchQuestionByHeading(headingMatch[1].trim()); + continue; + } + if (currentQuestion) { + buffer.push(line); + } + } + flush(); + return answers; +} + +function matchQuestionByHeading(heading: string): InterviewQuestionId | null { + const q = INTERVIEW_QUESTIONS.find( + (q) => q.configHeading.toLowerCase() === heading.toLowerCase(), + ); + return q?.id ?? null; +} + +export function renderConfigMd( + answers: InterviewAnswer[], + today: string, +): string { + const lines: string[] = []; + lines.push("## Org-level answers"); + lines.push(""); + lines.push(`last_updated: ${today}`); + lines.push(""); + for (const q of INTERVIEW_QUESTIONS) { + lines.push(`### ${q.configHeading}`); + const answer = answers.find((a) => a.questionId === q.id); + lines.push(answer?.value?.trim() || "unknown"); + lines.push(""); + } + return lines.join("\n"); +} + +/** + * Read pre-supplied interview answers from a JSON file (used by --interview-answers + * in headless mode). Format: { "q1": "...", "q2": "...", ... }. 
+ * Keys that are not known question ids are ignored. String, number and boolean + * values are accepted (non-strings are stringified); anything else is skipped. + */ +export async function readAnswersJson( + path: string, +): Promise<InterviewAnswer[]> { + const text = await readFile(path, "utf8"); + const parsed: unknown = JSON.parse(text); + const answers: InterviewAnswer[] = []; + // A top-level null, array or scalar cannot carry question ids. + if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) { + return answers; + } + for (const [qid, raw] of Object.entries( + parsed as Record<string, unknown>, + )) { + try { + getQuestion(qid as InterviewQuestionId); + } catch { + continue; + } + // `value` is used as a string later (renderConfigMd calls `.trim()` on it). + if ( + typeof raw !== "string" && + typeof raw !== "number" && + typeof raw !== "boolean" + ) { + continue; + } + answers.push({ + questionId: qid as InterviewQuestionId, + value: String(raw), + isOption: false, + }); + } + return answers; +} diff --git a/src/services/maturity/audit-writer.ts b/src/services/maturity/audit-writer.ts new file mode 100644 index 0000000..1a3792a --- /dev/null +++ b/src/services/maturity/audit-writer.ts @@ -0,0 +1,223 @@ +import { mkdir, writeFile } from "node:fs/promises"; +import { dirname } from "node:path"; +import { getRubricItem, MATURITY_BANDS, RUBRIC_CATEGORIES } from "./rubric.js"; +import type { + AssessmentArtifact, + CategoryId, + ItemScore, + ItemScoreValue, +} from "./types.js"; + +function tierLabel(tier: AssessmentArtifact["tier"]): string { + switch (tier) { + case "gh": + return "1: gh"; + case "github-mcp": + return "2: GitHub MCP"; + case "git-only": + return "3: git-only"; + } +} + +function formatScore(score: ItemScoreValue): string { + if (score === "n/a") return "n/a"; + if (score === 1) return "1"; + if (score === 0) return "0"; + return "0.5"; +} + +function fixed(num: number, digits = 1): string { + return num.toFixed(digits); +} + +function findItemScore(items: ItemScore[], itemId: number): ItemScore { + const score = items.find((s) => s.itemId === itemId); + if (!score) { + throw new Error(`Missing score for item ${itemId}`); + } + return score; +} + +/** + * Make free text safe inside a single Markdown table cell: line breaks are + * collapsed to one space and pipes that are not already backslash-escaped + * get escaped, so the text cannot open extra columns or end the row. + */ +function escapeTableCell(text: string): string { + return text.replace(/\s*\r?\n\s*/g, " ").replace(/(?<!\\)\|/g, "\\|"); +} + +/** + * Render one category section: heading, item table and subtotal line. + * + * @param artifact Assessment whose `items` and `categorySubtotals` are read. + * @param categoryId Category to render. + * @returns Markdown for the section, lines joined with "\n". + * @throws Error when the category, its subtotal or one of its item scores is missing. + */ +function categoryTable( + artifact: AssessmentArtifact, + categoryId: CategoryId, +): string { + const cat = RUBRIC_CATEGORIES.find((c) => c.id === categoryId); + if (!cat) throw new Error(`Unknown category ${categoryId}`); + const subtotal = artifact.categorySubtotals.find((s) => s.id === categoryId); + if (!subtotal) throw new Error(`Missing 
subtotal for ${categoryId}`); + + const lines: string[] = []; + lines.push(`### ${cat.id}. ${cat.title} (weight ${fixed(cat.weight, 2)}×)`); + lines.push("| # | Item | Score | Why this score |"); + lines.push("|---|------|-------|----------------|"); + for (const itemId of cat.itemIds) { + const itemDef = getRubricItem(itemId); + const itemScore = findItemScore(artifact.items, itemId); + // whyThisScore is free text from the scorer; escape it so it stays in its cell. + lines.push( + `| ${itemId} | ${escapeTableCell(itemDef.title)} | ${formatScore(itemScore.score)} | ${escapeTableCell(itemScore.whyThisScore)} |`, + ); + } + lines.push(""); + // "n/a" items contribute nothing to the raw subtotal. + const rawCount = artifact.items + .filter((s) => cat.itemIds.includes(s.itemId) && s.score !== "n/a") + .reduce((sum, s) => sum + (s.score as number), 0); + lines.push( + `Subtotal: ${fixed(rawCount)} × ${fixed(cat.weight, 2)} = ${fixed(rawCount * cat.weight)} / ${fixed(cat.maxWeighted, 2)}`, + ); + + return lines.join("\n"); +} + +export function renderAuditMarkdown(artifact: AssessmentArtifact): string { + const lines: string[] = []; + lines.push( + `# Agent Maturity Assessment — ${artifact.scope.displayName} — ${artifact.auditDate}`, + ); + lines.push(""); + lines.push("## Summary"); + lines.push( + `- Raw score: ${fixed(artifact.rawScore)} / ${artifact.rawScoreMax}`, + ); + lines.push(`- Weighted score: ${fixed(artifact.scorePercent)}%`); + lines.push( + `- Band: **${artifact.band}** (${MATURITY_BANDS.find((b) => b.name === artifact.band)?.rangeLabel ?? "?"})`, + ); + lines.push( + `- Evidence tier: **${tierLabel(artifact.tier)}** (see references/preflight.md)`, + ); + lines.push(`- One-line take: ${artifact.oneLineTake}`); + lines.push(""); + lines.push("### Maturity scale (where this audit lands)"); + lines.push(""); + lines.push("| Band | % range | This audit |"); + lines.push("|------|---------|:----------:|"); + for (const band of MATURITY_BANDS) { + const marker = band.name === artifact.band ? 
"◉" : ""; + lines.push(`| ${band.name} | ${band.rangeLabel} | ${marker} |`); + } + lines.push(""); + lines.push("## Scores"); + lines.push(""); + lines.push(categoryTable(artifact, "A")); + lines.push(""); + lines.push(categoryTable(artifact, "B")); + lines.push(""); + lines.push(categoryTable(artifact, "C")); + lines.push(""); + lines.push(categoryTable(artifact, "D")); + lines.push(""); + lines.push("## Top 3 fixes (highest leverage)"); + if (artifact.topFixes.length === 0) { + lines.push( + "_No fixes identified — assessment is either incomplete or the org is at the ceiling._", + ); + } else { + artifact.topFixes.slice(0, 3).forEach((fix, idx) => { + const item = getRubricItem(fix.itemId); + const owner = fix.owner ? ` (suggested owner: ${fix.owner})` : ""; + lines.push( + `${idx + 1}. **${item.title}** — ${fix.whyThisOne} ${fix.whatGoodLooksLike}${owner}`, + ); + }); + } + lines.push(""); + lines.push("## Strengths to preserve"); + if (artifact.strengths.length === 0) { + lines.push("- _None highlighted in this run._"); + } else { + for (const s of artifact.strengths) { + lines.push(`- ${s}`); + } + } + lines.push(""); + lines.push("## Adjacent repos consulted"); + if (artifact.adjacentRepos.length === 0) { + lines.push("None — all evidence within scope repo."); + } else { + for (const r of artifact.adjacentRepos) { + lines.push(`- \`${r.owner}/${r.name}\` — ${r.reason}`); + } + } + lines.push(""); + lines.push("## Notes for re-audit"); + if (artifact.notesForReaudit.length === 0) { + lines.push("- _No outstanding calibration notes._"); + } else { + for (const note of artifact.notesForReaudit) { + lines.push(`- ${note}`); + } + } + lines.push(""); + lines.push("---"); + lines.push(""); + lines.push( + `Rubric version ${artifact.rubricVersion} · Generated by Team Hero · ${artifact.auditDate}`, + ); + lines.push(""); + + return lines.join("\n"); +} + +export function renderAuditJson(artifact: AssessmentArtifact): string { + return JSON.stringify(artifact, null, 
2); +} + +/** + * Create the parent directory of `filePath` (recursively) when it has one. + * Nothing is created when the parent is ".", "" or "/". + */ +async function ensureDir(filePath: string): Promise<void> { + const dir = dirname(filePath); + if (dir === "." || dir === "" || dir === "/") return; + await mkdir(dir, { recursive: true }); +} + +export interface WriteAuditOptions { + outputPath: string; + jsonOutputPath?: string; + format: "markdown" | "json" | "both"; +} + +/** + * Write the audit as Markdown, JSON or both, creating parent directories. + * + * @param artifact Assessment to render. + * @param options Target paths and format. Without `jsonOutputPath` the JSON + * path is derived from `outputPath` by replacing a trailing `.md` with + * `.json`; in JSON-only mode an `outputPath` already ending in `.json` is + * used unchanged. + * @returns `outputPath` is the Markdown file when one was written, otherwise + * the JSON file; `jsonOutputPath` is set only when format is "both". + */ +export async function writeAudit( + artifact: AssessmentArtifact, + options: WriteAuditOptions, +): Promise<{ outputPath: string; jsonOutputPath?: string }> { + let writtenMarkdown: string | undefined; + let writtenJson: string | undefined; + + if (options.format === "markdown" || options.format === "both") { + const md = renderAuditMarkdown(artifact); + await ensureDir(options.outputPath); + await writeFile(options.outputPath, md, "utf8"); + writtenMarkdown = options.outputPath; + } + + if (options.format === "json" || options.format === "both") { + const basePath = options.outputPath.replace(/\.md$/i, ""); + // JSON-only: a path already ending in .json is the target itself, not a + // stem to extend (avoids `audit.json.json`). Not applied to "both", where + // outputPath is the Markdown file and must not be overwritten. + const jsonPath = + options.jsonOutputPath ?? + (options.format === "json" && /\.json$/i.test(basePath) + ? basePath + : `${basePath}.json`); + const json = renderAuditJson(artifact); + await ensureDir(jsonPath); + await writeFile(jsonPath, json, "utf8"); + writtenJson = jsonPath; + } + + const result: { outputPath: string; jsonOutputPath?: string } = { + outputPath: writtenMarkdown ?? writtenJson ?? options.outputPath, + }; + if (writtenJson && options.format !== "json") { + result.jsonOutputPath = writtenJson; + } else if (options.format === "json" && writtenJson) { + result.outputPath = writtenJson; + } + return result; +} + +/** + * Compute the default markdown output path from a scope + date, mirroring the + * report file convention (`teamhero-report--.md`). 
+ */ +export function defaultOutputPath(displayName: string, date: string): string { + const slug = displayName + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-|-$/g, ""); + return `./teamhero-maturity-${slug}-${date}.md`; +} diff --git a/src/services/maturity/evidence-collectors.ts b/src/services/maturity/evidence-collectors.ts new file mode 100644 index 0000000..0e5f0b1 --- /dev/null +++ b/src/services/maturity/evidence-collectors.ts @@ -0,0 +1,747 @@ +/** + * Deterministic evidence collectors — one per rubric item. + * + * Each implements MaturityProvider and runs against a local repo path. They + * never throw on missing files; they emit zero or more EvidenceFact records + * with a positive/neutral/negative signal. The AI scorer takes these facts + + * interview answers + diagnostic instructions and produces the final score. + * + * GitHub-backed checks (PR cadence, branch protection, CI run history) are + * skipped at git-only tier — the rubric caps items 2/3/9/11 at 0.5 in that + * case (see references/preflight.md and rubric.ts::tier3Cap). + */ + +import { join } from "node:path"; +import type { MaturityProvider } from "../../core/types.js"; +import { anyFile, fileContains, findFiles, readIfExists } from "./fs-utils.js"; +import type { + AdjacentRepo, + EvidenceFact, + EvidenceTier, + ScopeDescriptor, +} from "./types.js"; + +interface CollectInput { + scope: ScopeDescriptor; + tier: EvidenceTier; + adjacentRepos: AdjacentRepo[]; +} + +function localPath(scope: ScopeDescriptor): string | null { + return scope.localPath ?? 
null; +} + +function fact( + itemId: number, + signal: EvidenceFact["signal"], + summary: string, + source: string, + details?: Record, +): EvidenceFact { + const result: EvidenceFact = { itemId, signal, summary, source }; + if (details) result.details = details; + return result; +} + +// --------------------------------------------------------------------------- +// Item 1 — Reproducible dev environments +// --------------------------------------------------------------------------- + +class ReproducibleDevCollector implements MaturityProvider { + readonly itemId = 1; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + const candidates = [ + ".devcontainer", + "flake.nix", + "setup.sh", + "scripts/bootstrap.sh", + "scripts/setup.sh", + "Makefile", + "justfile", + "docker-compose.yml", + ]; + const found: string[] = []; + for (const c of candidates) { + if ( + await anyFile(root, { pathContains: [c.toLowerCase()], maxDepth: 3 }) + ) { + found.push(c); + } + } + if (found.length > 0) { + facts.push( + fact( + 1, + "positive", + `Bootstrap surface found: ${found.join(", ")}`, + "reproducible-dev-collector", + { found }, + ), + ); + } else { + facts.push( + fact( + 1, + "negative", + "No devcontainer / flake.nix / bootstrap script detected at the repo root.", + "reproducible-dev-collector", + ), + ); + } + + const readme = await readIfExists(join(root, "README.md")); + if ( + readme && + /\b(install|setup|bootstrap|getting started)\b/i.test(readme) + ) { + facts.push( + fact( + 1, + "positive", + "README documents an install/setup section.", + "reproducible-dev-collector", + ), + ); + } + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 2 — Sub-day integration cadence (gh-backed; capped at 0.5 on git-only) +// --------------------------------------------------------------------------- + +class 
IntegrationCadenceCollector implements MaturityProvider { + readonly itemId = 2; + + async collect(input: CollectInput): Promise { + const facts: EvidenceFact[] = []; + if (input.tier === "git-only") { + facts.push( + fact( + 2, + "neutral", + "Tier 3 (git-only): cadence approximated from local merge commits — no PR or DORA visibility. Score capped at 0.5.", + "integration-cadence-collector", + ), + ); + return facts; + } + // Note: actual gh API queries happen in the AI scorer's tool-use layer or + // via the existing octokit adapter. Here we record that the tier supports it. + facts.push( + fact( + 2, + "neutral", + `Tier ${input.tier}: PR cadence and CI run history available via GitHub.`, + "integration-cadence-collector", + ), + ); + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 3 — Testability +// --------------------------------------------------------------------------- + +class TestabilityCollector implements MaturityProvider { + readonly itemId = 3; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + + const testFiles = await findFiles(root, { + nameRegex: + /\.(test|spec)\.(ts|js|tsx|jsx|py|go|rs)$|.+_test\.go$|.*Tests\.cs$/, + maxDepth: 5, + limit: 500, + }); + if (testFiles.length === 0) { + facts.push( + fact( + 3, + "negative", + "No test files (*.spec.*, *.test.*, *_test.go, *Tests.cs) found.", + "testability-collector", + ), + ); + } else { + facts.push( + fact( + 3, + "positive", + `${testFiles.length} test file(s) detected.`, + "testability-collector", + { count: testFiles.length, sample: testFiles.slice(0, 5) }, + ), + ); + } + + // Detect CI continue-on-error / always-true tricks (worker-script grep on workflow files) + const workflows = await findFiles(root, { + pathContains: [".github/workflows"], + maxDepth: 5, + }); + for (const wf of workflows) { + if ( + await fileContains( + 
join(root, wf), + /(\|\|\s*true\b|continue-on-error:\s*true)/, + ) + ) { + facts.push( + fact( + 3, + "negative", + `CI workflow swallows test failures: ${wf}`, + "testability-collector", + ), + ); + break; + } + } + + if (input.tier === "git-only") { + facts.push( + fact( + 3, + "neutral", + "Tier 3 (git-only): no CI flake/fail-rate visibility. Score capped at 0.5.", + "testability-collector", + ), + ); + } + + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 4 — Observability +// --------------------------------------------------------------------------- + +class ObservabilityCollector implements MaturityProvider { + readonly itemId = 4; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + + const obsLibPattern = + /OpenTelemetry|opentelemetry|Microsoft\.ApplicationInsights|datadog|@datadog|prom-client|prometheus|grafana|loki|tempo|sentry|honeycomb|newrelic|splunk/i; + const manifestNames = [ + "package.json", + "go.mod", + "Cargo.toml", + "requirements.txt", + "pyproject.toml", + "pom.xml", + "build.gradle", + "build.gradle.kts", + ]; + const found: string[] = []; + for (const m of manifestNames) { + if (await fileContains(join(root, m), obsLibPattern)) { + found.push(m); + } + } + if (found.length > 0) { + facts.push( + fact( + 4, + "positive", + `Observability libraries referenced in: ${found.join(", ")}`, + "observability-collector", + ), + ); + } else { + facts.push( + fact( + 4, + "negative", + "No telemetry libraries (OTel/Datadog/Sentry/Prometheus/etc.) 
detected in dependency manifests.", + "observability-collector", + ), + ); + } + + const runbooks = await findFiles(root, { + pathContains: ["runbook", "incident", "/sli", "/slo", "ops/"], + maxDepth: 5, + limit: 50, + }); + if (runbooks.length > 0) { + facts.push( + fact( + 4, + "positive", + `Runbook/SLI/SLO docs present: ${runbooks.slice(0, 3).join(", ")}`, + "observability-collector", + { count: runbooks.length }, + ), + ); + } + + const dashboards = await findFiles(root, { + pathContains: ["grafana", "dashboards", "alerts"], + nameRegex: /\.(json|yml|yaml|libsonnet|jsonnet)$/, + maxDepth: 5, + }); + if (dashboards.length > 0) { + facts.push( + fact( + 4, + "positive", + `Committed dashboards/alerts: ${dashboards.length} file(s).`, + "observability-collector", + ), + ); + } + + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 5 — Design discipline +// --------------------------------------------------------------------------- + +class DesignDisciplineCollector implements MaturityProvider { + readonly itemId = 5; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + + const adrs = await findFiles(root, { + pathContains: ["adr", "architecture-decision"], + nameRegex: /\.md$/i, + maxDepth: 5, + }); + if (adrs.length > 0) { + facts.push( + fact( + 5, + "positive", + `Architecture decision records found (${adrs.length}).`, + "design-discipline-collector", + { count: adrs.length, sample: adrs.slice(0, 3) }, + ), + ); + } + + const archMd = await readIfExists(join(root, "ARCHITECTURE.md")); + if (archMd && archMd.length > 200) { + facts.push( + fact( + 5, + "positive", + "ARCHITECTURE.md present and non-trivial.", + "design-discipline-collector", + ), + ); + } else if (archMd) { + facts.push( + fact( + 5, + "neutral", + "ARCHITECTURE.md present but very short.", + "design-discipline-collector", + ), + ); + } + 
+ const docsArch = await findFiles(root, { + pathContains: ["docs/architecture", "docs/design"], + nameRegex: /\.md$/i, + maxDepth: 5, + }); + if (docsArch.length > 0) { + facts.push( + fact( + 5, + "positive", + `Design docs under docs/: ${docsArch.length} file(s).`, + "design-discipline-collector", + ), + ); + } + + const glossary = await anyFile(root, { + nameRegex: /^(GLOSSARY|UBIQUITOUS-LANGUAGE|TERMS)\.md$/i, + maxDepth: 4, + }); + if (glossary) { + facts.push( + fact( + 5, + "positive", + "Glossary / ubiquitous-language file checked in.", + "design-discipline-collector", + ), + ); + } + + if (adrs.length === 0 && !archMd && docsArch.length === 0) { + facts.push( + fact( + 5, + "negative", + "No ADRs, ARCHITECTURE.md, or design docs detected.", + "design-discipline-collector", + ), + ); + } + + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 6 — Deep modules +// --------------------------------------------------------------------------- + +class DeepModulesCollector implements MaturityProvider { + readonly itemId = 6; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + const sourceFiles = await findFiles(root, { + nameRegex: /\.(ts|tsx|js|jsx|go|py|rs|cs|java|kt)$/, + maxDepth: 6, + limit: 2000, + }); + facts.push( + fact( + 6, + "neutral", + `Source files indexed: ${sourceFiles.length}. Module-shape judgment requires AI review of file size distribution.`, + "deep-modules-collector", + { count: sourceFiles.length }, + ), + ); + // Surface a god-file warning: any source file with extreme path depth or a + // neighbour heuristic. We emit just the count summary; the AI judges shape. 
+ return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 7 — Repo-local agent context +// --------------------------------------------------------------------------- + +class AgentContextCollector implements MaturityProvider { + readonly itemId = 7; + + async collect(input: CollectInput): Promise { + const root = localPath(input.scope); + if (!root) return []; + const facts: EvidenceFact[] = []; + const contextFiles = [ + "CLAUDE.md", + "AGENTS.md", + ".cursor/rules", + ".cursorrules", + ".github/copilot-instructions.md", + "memory-bank", + ]; + const found: string[] = []; + for (const candidate of contextFiles) { + if ( + await anyFile(root, { + pathContains: [candidate.toLowerCase()], + maxDepth: 4, + }) + ) { + found.push(candidate); + } + } + const skillsDir = await findFiles(root, { + pathContains: [".claude/skills", "claude-plugin/skills", ".skills"], + nameRegex: /SKILL\.md$/i, + maxDepth: 5, + }); + if (skillsDir.length > 0) { + found.push(`${skillsDir.length} repo-local skill(s)`); + } + if (found.length > 0) { + facts.push( + fact( + 7, + "positive", + `Repo-local agent context: ${found.join(", ")}.`, + "agent-context-collector", + ), + ); + } else { + facts.push( + fact( + 7, + "negative", + "No CLAUDE.md / AGENTS.md / .cursor / repo-local skills detected.", + "agent-context-collector", + ), + ); + } + return facts; + } +} + +// --------------------------------------------------------------------------- +// Item 8 — Sanctioned AI tooling (primary signal: interview) +// --------------------------------------------------------------------------- + +class SanctionedAiCollector implements MaturityProvider { + readonly itemId = 8; + + async collect(_input: CollectInput): Promise { + const facts: EvidenceFact[] = []; + facts.push( + fact( + 8, + "neutral", + "Item 8 is scored primarily from interview Q1. 
Any policy doc evidence will be cross-checked.",
+        "sanctioned-ai-collector",
+      ),
+    );
+    return facts;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Item 9 — Human review on every PR (gh-backed; cap at 0.5 on git-only)
+// ---------------------------------------------------------------------------
+
+/**
+ * Collects evidence for rubric item 9: presence of a CODEOWNERS file, plus a
+ * note about the 0.5 cap when running at the git-only tier.
+ */
+class HumanReviewCollector implements MaturityProvider {
+  readonly itemId = 9;
+
+  /**
+   * With a local path, emits a positive fact when a non-empty CODEOWNERS file
+   * is readable at the repo root, `.github/`, or `docs/`, otherwise a neutral
+   * one. The tier note is added whether or not a local path exists.
+   */
+  async collect(input: CollectInput): Promise<EvidenceFact[]> {
+    const root = localPath(input.scope);
+    const facts: EvidenceFact[] = [];
+    if (root) {
+      // `??` only falls through on null (unreadable file), so the first
+      // location that can be read wins even if its content is empty.
+      const codeowners =
+        (await readIfExists(join(root, "CODEOWNERS"))) ??
+        (await readIfExists(join(root, ".github", "CODEOWNERS"))) ??
+        (await readIfExists(join(root, "docs", "CODEOWNERS")));
+      if (codeowners) {
+        facts.push(
+          fact(
+            9,
+            "positive",
+            "CODEOWNERS file present.",
+            "human-review-collector",
+          ),
+        );
+      } else {
+        facts.push(
+          fact(
+            9,
+            "neutral",
+            "No CODEOWNERS file detected.",
+            "human-review-collector",
+          ),
+        );
+      }
+    }
+    if (input.tier === "git-only") {
+      facts.push(
+        fact(
+          9,
+          "neutral",
+          "Tier 3 (git-only): no branch-protection or review-depth visibility.
Score capped at 0.5.",
+          "human-review-collector",
+        ),
+      );
+    }
+    return facts;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Item 10 — Evals for AI-touched paths
+// ---------------------------------------------------------------------------
+
+/**
+ * Collects evidence for rubric item 10 by looking for files under evals/,
+ * benchmarks/, or eval-suite/ directories.
+ */
+class EvalsCollector implements MaturityProvider {
+  readonly itemId = 10;
+
+  /**
+   * Emits a positive fact with the number of matching files (capped at 50),
+   * or a neutral fact when none are found. Returns no facts when
+   * `localPath(input.scope)` is empty.
+   */
+  async collect(input: CollectInput): Promise<EvidenceFact[]> {
+    const root = localPath(input.scope);
+    if (!root) return [];
+    const facts: EvidenceFact[] = [];
+    const evalDirs = await findFiles(root, {
+      pathContains: ["/evals/", "/benchmarks/", "/eval-suite/"],
+      maxDepth: 5,
+      limit: 50,
+    });
+    if (evalDirs.length > 0) {
+      facts.push(
+        fact(
+          10,
+          "positive",
+          `Eval / benchmark surface detected (${evalDirs.length} file(s)).`,
+          "evals-collector",
+        ),
+      );
+    } else {
+      facts.push(
+        fact(
+          10,
+          "neutral",
+          "No evals/ or benchmarks/ directory detected. Item also depends on interview Q5.",
+          "evals-collector",
+        ),
+      );
+    }
+    return facts;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Item 11 — Blast-radius controls
+// ---------------------------------------------------------------------------
+
+/**
+ * Collects evidence for rubric item 11: OIDC-style cloud auth vs. long-lived
+ * cloud secrets in GitHub workflows, Terraform files under infra/ or
+ * terraform/, and a note about the 0.5 cap at the git-only tier.
+ */
+class BlastRadiusCollector implements MaturityProvider {
+  readonly itemId = 11;
+
+  /**
+   * Scans workflow files for the two credential patterns and counts how many
+   * workflows match each. Returns no facts when `localPath(input.scope)` is
+   * empty.
+   */
+  async collect(input: CollectInput): Promise<EvidenceFact[]> {
+    const root = localPath(input.scope);
+    if (!root) return [];
+    const facts: EvidenceFact[] = [];
+
+    const workflowFiles = await findFiles(root, {
+      pathContains: [".github/workflows"],
+      maxDepth: 4,
+    });
+    let oidcCount = 0;
+    let secretsKeyCount = 0;
+    // Cloud login actions for Azure, AWS and GCP.
+    const oidcPattern =
+      /azure\/login@|aws-actions\/configure-aws-credentials@|google-github-actions\/auth@/;
+    // Static cloud credentials referenced from repository secrets.
+    const longLivedSecretsPattern =
+      /secrets\.AWS_ACCESS_KEY_ID|secrets\.AZURE_CLIENT_SECRET|secrets\.GCP_KEY/;
+    for (const wf of workflowFiles) {
+      if (await fileContains(join(root, wf), oidcPattern)) oidcCount++;
+      if (await fileContains(join(root, wf),
longLivedSecretsPattern))
+        secretsKeyCount++;
+    }
+    if (oidcCount > 0) {
+      facts.push(
+        fact(
+          11,
+          "positive",
+          `OIDC-based auth in CI workflows (${oidcCount} workflow(s)).`,
+          "blast-radius-collector",
+        ),
+      );
+    }
+    if (secretsKeyCount > 0) {
+      facts.push(
+        fact(
+          11,
+          "negative",
+          `Long-lived cloud creds via repo secrets in ${secretsKeyCount} workflow(s).`,
+          "blast-radius-collector",
+        ),
+      );
+    }
+
+    const tfFiles = await findFiles(root, {
+      pathContains: ["infra/", "terraform/"],
+      nameRegex: /\.tf$/,
+      maxDepth: 6,
+      limit: 100,
+    });
+    if (tfFiles.length > 0) {
+      facts.push(
+        fact(
+          11,
+          "positive",
+          `Terraform/infrastructure-as-code present (${tfFiles.length} file(s)).`,
+          "blast-radius-collector",
+        ),
+      );
+    }
+
+    if (input.tier === "git-only") {
+      facts.push(
+        fact(
+          11,
+          "neutral",
+          "Tier 3 (git-only): no environment-protection visibility. Score capped at 0.5.",
+          "blast-radius-collector",
+        ),
+      );
+    }
+
+    return facts;
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Item 12 — Judgment under AI augmentation (primary: interview Q2)
+// ---------------------------------------------------------------------------
+
+/**
+ * Placeholder collector for rubric item 12: it inspects nothing and emits a
+ * single neutral fact pointing the scorer at interview Q2.
+ */
+class HiringCollector implements MaturityProvider {
+  readonly itemId = 12;
+
+  /** Always returns exactly one neutral fact; the input is unused. */
+  async collect(_input: CollectInput): Promise<EvidenceFact[]> {
+    const facts: EvidenceFact[] = [];
+    facts.push(
+      fact(
+        12,
+        "neutral",
+        "Item 12 is scored primarily from interview Q2.",
+        "hiring-collector",
+      ),
+    );
+    return facts;
+  }
+}
+
+/** Returns one fresh collector per rubric item, ordered by item id (1–12). */
+export function defaultCollectors(): MaturityProvider[] {
+  return [
+    new ReproducibleDevCollector(),
+    new IntegrationCadenceCollector(),
+    new TestabilityCollector(),
+    new ObservabilityCollector(),
+    new DesignDisciplineCollector(),
+    new DeepModulesCollector(),
+    new AgentContextCollector(),
+    new SanctionedAiCollector(),
+    new HumanReviewCollector(),
+    new EvalsCollector(),
+    new BlastRadiusCollector(),
+    new HiringCollector(),
+  ];
+}
+
+/**
+ * Runs the collectors one after another and concatenates their facts. A
+ * collector that throws does not abort the run: it contributes one neutral
+ * fact describing the failure instead.
+ */
+export async function
runAllCollectors(
+  collectors: MaturityProvider[],
+  input: CollectInput,
+): Promise<EvidenceFact[]> {
+  const facts: EvidenceFact[] = [];
+  for (const c of collectors) {
+    try {
+      const f = await c.collect(input);
+      facts.push(...f);
+    } catch (err) {
+      // Anything can be thrown, not just Error instances; avoid "undefined"
+      // in the summary when the thrown value has no `message`.
+      const reason = err instanceof Error ? err.message : String(err);
+      facts.push({
+        itemId: c.itemId,
+        signal: "neutral",
+        summary: `Collector for item ${c.itemId} threw: ${reason}`,
+        source: "evidence-collectors",
+      });
+    }
+  }
+  return facts;
+}
diff --git a/src/services/maturity/fs-utils.ts b/src/services/maturity/fs-utils.ts
new file mode 100644
index 0000000..098d219
--- /dev/null
+++ b/src/services/maturity/fs-utils.ts
@@ -0,0 +1,122 @@
+import { readdir, readFile, stat } from "node:fs/promises";
+import { join, relative } from "node:path";
+
+/** Directory / file names that the tree walk never descends into or returns. */
+const DEFAULT_IGNORES = new Set([
+  ".git",
+  "node_modules",
+  "dist",
+  "build",
+  "out",
+  ".next",
+  ".turbo",
+  ".cache",
+  "target", // rust
+  "vendor", // go vendored
+  "coverage",
+]);
+
+export interface FindOptions {
+  /** Maximum directory depth (default 4). */
+  maxDepth?: number;
+  /** Only return file names (basename) matching this regex when set. */
+  nameRegex?: RegExp;
+  /** Only return paths whose lowercased relative form contains one of these substrings. */
+  pathContains?: string[];
+  /** Maximum number of matches to return (default 200). */
+  limit?: number;
+}
+
+/**
+ * Walk a directory tree and return matching file paths (relative to root).
+ * Skips DEFAULT_IGNORES entries and symlinks.
+ */
+export async function findFiles(
+  root: string,
+  options: FindOptions = {},
+): Promise<string[]> {
+  const maxDepth = options.maxDepth ?? 4;
+  const limit = options.limit ??
200;
+  const matches: string[] = [];
+
+  // Depth-first walk; unreadable directories are skipped silently and the
+  // walk stops as soon as `limit` matches have been collected.
+  async function walk(dir: string, depth: number): Promise<void> {
+    if (depth > maxDepth || matches.length >= limit) return;
+    let entries;
+    try {
+      entries = await readdir(dir, { withFileTypes: true });
+    } catch {
+      return;
+    }
+    for (const entry of entries) {
+      if (matches.length >= limit) return;
+      if (DEFAULT_IGNORES.has(entry.name)) continue;
+      if (entry.isSymbolicLink()) continue;
+      const abs = join(dir, entry.name);
+      const rel = relative(root, abs);
+      // Normalise Windows separators so needles can always use "/".
+      const lowerRel = rel.toLowerCase().replace(/\\/g, "/");
+      if (entry.isDirectory()) {
+        await walk(abs, depth + 1);
+        continue;
+      }
+      if (options.nameRegex) {
+        // A /g or /y regex keeps `lastIndex` between calls; reset it so one
+        // file's match cannot make the next file's test start mid-string.
+        options.nameRegex.lastIndex = 0;
+        if (!options.nameRegex.test(entry.name)) continue;
+      }
+      if (options.pathContains) {
+        // Relative paths have no leading slash, so a directory-anchored needle
+        // such as "/evals/" would never match an entry directly under root.
+        // Prefixing one fixes that without affecting needles that do not
+        // start with "/".
+        const haystack = `/${lowerRel}`;
+        const hit = options.pathContains.some((needle) =>
+          haystack.includes(needle.toLowerCase()),
+        );
+        if (!hit) continue;
+      }
+      matches.push(rel);
+    }
+  }
+
+  // A missing root or a root that is not a directory yields no matches.
+  try {
+    const s = await stat(root);
+    if (!s.isDirectory()) return [];
+  } catch {
+    return [];
+  }
+
+  await walk(root, 0);
+  return matches;
+}
+
+/** Convenience: does any file matching options exist? */
+export async function anyFile(
+  root: string,
+  options: FindOptions = {},
+): Promise<boolean> {
+  const found = await findFiles(root, { ...options, limit: 1 });
+  return found.length > 0;
+}
+
+/** Read a file or return null if it doesn't exist / can't be read. */
+export async function readIfExists(path: string): Promise<string | null> {
+  try {
+    return await readFile(path, "utf8");
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Check whether the file content matches a regex. Returns true if the file
+ * exists AND contains a match.
+ */
+export async function fileContains(
+  path: string,
+  pattern: RegExp,
+): Promise<boolean> {
+  const content = await readIfExists(path);
+  if (content === null) return false;
+  // Callers reuse one pattern across many files; reset the cursor so a /g or
+  // /y regex gives the same answer on every call.
+  pattern.lastIndex = 0;
+  return pattern.test(content);
+}
+
+/** Return the first path in `paths` whose content matches `pattern`, or null if none does.
*/
+export async function firstFileContaining(
+  paths: string[],
+  pattern: RegExp,
+): Promise<string | null> {
+  // Files are checked sequentially, so the result follows the order of `paths`.
+  for (const p of paths) {
+    if (await fileContains(p, pattern)) return p;
+  }
+  return null;
+}
diff --git a/src/services/maturity/interview.ts b/src/services/maturity/interview.ts
new file mode 100644
index 0000000..5a9b4fa
--- /dev/null
+++ b/src/services/maturity/interview.ts
@@ -0,0 +1,122 @@
+import type { InterviewQuestion, InterviewQuestionId } from "./types.js";
+
+/**
+ * Phase-1 interview questions — verbatim from references/interview.md.
+ *
+ * The wording is calibrated; do not paraphrase. Each question is asked one at
+ * a time and waits for an answer before proceeding (see SKILL.md).
+ */
+export const INTERVIEW_QUESTIONS: ReadonlyArray<InterviewQuestion> = [
+  {
+    id: "q1",
+    prompt:
+      "What AI tooling do engineers actually use day-to-day (Claude, Copilot, Cursor, etc.)? Is it company-paid with managed accounts, or are people using personal accounts or free tiers? Is there a documented policy on what data can be sent to third-party AI providers?",
+    options: [
+      "Company-paid managed seats + documented data-handling policy",
+      "Company-paid seats but governance is loose / no written policy",
+      "Mostly personal accounts or free tier; no policy",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "AI tooling (Q1)",
+  },
+  {
+    id: "q2",
+    prompt:
+      "Do technical interviews allow candidates to use AI, and are interviewers trained to evaluate how well they use it (critique, decomposition, catching wrong outputs)?
Or is AI either banned or effectively unassessed?",
+    options: [
+      "AI allowed in interviews, interviewers trained to assess judgment with AI",
+      "AI allowed but assessment is informal / uncalibrated",
+      "AI banned, or interviews don't really test technical judgment",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "Hiring (Q2)",
+  },
+  // q3 — DORA metric visibility.
+  {
+    id: "q3",
+    prompt:
+      "Are all four DORA metrics (deployment frequency, lead time, change failure rate, MTTR) actively tracked and visible to the team — e.g., a dashboard engineers actually look at? Or are some tracked in theory but not used?",
+    options: [
+      "All four DORA metrics tracked on a dashboard the team actually uses",
+      "Some DORA metrics tracked but not actively watched",
+      "Not really tracked / vibes-based",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "DORA visibility (Q3)",
+  },
+  // q4 — whether a design step precedes agent-generated code.
+  {
+    id: "q4",
+    prompt:
+      "When engineers hand work to AI agents, is there a consistent upfront design step (ADR, shared-understanding session, spec) before code generation? Or is it ad hoc — some engineers do it, others prompt straight into code?",
+    options: [
+      "Consistent upfront design step (ADR / spec / shared-understanding) before agent code",
+      "Some engineers do it, others prompt straight into code",
+      "No design step — agents are pointed at problems and turned loose",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "Design before code (Q4)",
+  },
+  // q5 — where LLMs are used and whether their impact is measured. This is
+  // the only question with five options instead of four.
+  {
+    id: "q5",
+    prompt:
+      "Are LLMs in the product (user-facing features), in the dev loop only, or both? If in the product: is there an offline eval suite plus production telemetry?
If dev-loop only: is AI impact tracked deliberately — even a spreadsheet, Asana board, or sprint retro metric counts — or is it purely gut-feel with no numbers anyone could point to?",
+    options: [
+      "LLMs in product with offline evals + prod telemetry",
+      "LLMs in dev loop with tracked metrics (Asana, spreadsheet, retro numbers, etc.)",
+      "LLMs used but purely gut-feel — no numbers anyone could point to",
+      "No LLMs in product or dev loop",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "Eval coverage (Q5)",
+  },
+  // q6 — red-teaming of worst-case agent scenarios and rollback paths.
+  {
+    id: "q6",
+    prompt:
+      "Has anyone explicitly red-teamed a worst-case agent scenario in prod (bad migration, runaway infra change, secret exfiltration)? Are rollback paths for agent-triggered writes documented?",
+    options: [
+      "Worst-case agent scenarios have been red-teamed; rollback paths documented",
+      "Some controls in place but no explicit red-teaming",
+      "No red-teaming; agents share human-equivalent prod creds",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "Blast-radius red-teaming (Q6)",
+  },
+  // q7 — adjacent repos that automated detection might miss.
+  {
+    id: "q7",
+    prompt:
+      "Are there adjacent repos I should treat as in-scope that automated detection might miss — e.g., an internal handbook, security/IT policy repo, org-wide `.github` repo, shared skill library?",
+    options: [
+      "Yes — list the repos",
+      "No, scope is just the primary repo(s) you've found",
+      "I don't know",
+    ],
+    allowFreeText: true,
+    configHeading: "Out-of-band adjacent repos (Q7)",
+  },
+] as const;
+
+// Introductory message for the interview; it tells the user that
+// "I don't know" / "n/a" are acceptable answers.
+export const FRAMING_MESSAGE =
+  'I\'m going to ask 7 quick questions one at a time — they cover the parts of the audit that aren\'t visible in the repo.
"I don\'t know" or "n/a" is a valid answer to any of them and will mark that criterion as not assessed, not failed.';
+
+/** Normalised (trimmed, lowercase) answers that mean "not assessed". */
+const UNKNOWN_TOKENS = new Set(
+  ["i don't know", "i dont know", "unknown", "n/a", "na", "skip"].map((s) =>
+    s.toLowerCase().trim(),
+  ),
+);
+
+/**
+ * Returns true when `value` is one of the "I don't know" style answers.
+ * Matching ignores surrounding whitespace and case, and treats typographic
+ * apostrophes (‘ ’) like the ASCII one so pasted or auto-corrected free text
+ * such as "I don’t know" is recognised too.
+ */
+export function isUnknownAnswer(value: string): boolean {
+  const normalized = value
+    .trim()
+    .toLowerCase()
+    .replace(/[\u2018\u2019]/g, "'");
+  return UNKNOWN_TOKENS.has(normalized);
+}
+
+/**
+ * Looks up an interview question by id.
+ *
+ * @throws Error when no question in INTERVIEW_QUESTIONS has that id.
+ */
+export function getQuestion(id: InterviewQuestionId): InterviewQuestion {
+  const q = INTERVIEW_QUESTIONS.find((candidate) => candidate.id === id);
+  if (!q) {
+    throw new Error(`Unknown interview question: ${id}`);
+  }
+  return q;
+}
diff --git a/src/services/maturity/maturity-prompts.ts b/src/services/maturity/maturity-prompts.ts
new file mode 100644
index 0000000..9b1fbb7
--- /dev/null
+++ b/src/services/maturity/maturity-prompts.ts
@@ -0,0 +1,176 @@
+import { RUBRIC_CATEGORIES, RUBRIC_ITEMS } from "./rubric.js";
+import type {
+  AdjacentRepo,
+  EvidenceFact,
+  EvidenceTier,
+  InterviewAnswer,
+  ScopeDescriptor,
+} from "./types.js";
+
+/**
+ * Strict JSON schema for the scorer's response: a one-line take, per-item
+ * scores (as the strings "0", "0.5", "1" or "n/a"), top fixes, strengths, and
+ * notes for the next audit.
+ */
+export const MATURITY_ASSESSMENT_SCHEMA = {
+  type: "json_schema" as const,
+  name: "agent_maturity_assessment",
+  strict: true,
+  schema: {
+    type: "object" as const,
+    properties: {
+      oneLineTake: { type: "string" as const },
+      items: {
+        type: "array" as const,
+        items: {
+          type: "object" as const,
+          properties: {
+            itemId: { type: "integer" as const },
+            score: {
+              type: "string" as const,
+              enum: ["0", "0.5", "1", "n/a"] as const,
+            },
+            whyThisScore: { type: "string" as const },
+          },
+          required: ["itemId", "score", "whyThisScore"] as const,
+          additionalProperties: false as const,
+        },
+      },
+      topFixes: {
+        type: "array" as const,
+        items: {
+          type: "object" as const,
+          properties: {
+            itemId: { type: "integer" as const },
+            whyThisOne: { type: "string" as const },
+            whatGoodLooksLike: { type: "string" as const },
+            owner: { type: "string" as const },
+          },
+          required: [
+            "itemId",
+            "whyThisOne",
+            "whatGoodLooksLike",
+            "owner",
+          ] as const,
+
additionalProperties: false as const,
+        },
+      },
+      strengths: {
+        type: "array" as const,
+        items: { type: "string" as const },
+      },
+      notesForReaudit: {
+        type: "array" as const,
+        items: { type: "string" as const },
+      },
+    },
+    required: [
+      "oneLineTake",
+      "items",
+      "topFixes",
+      "strengths",
+      "notesForReaudit",
+    ] as const,
+    additionalProperties: false as const,
+  },
+} as const;
+
+/** Everything the prompt builder needs to describe one audit to the model. */
+export interface MaturityScoringContext {
+  scope: ScopeDescriptor;
+  tier: EvidenceTier;
+  adjacentRepos: AdjacentRepo[];
+  evidence: EvidenceFact[];
+  interviewAnswers: InterviewAnswer[];
+}
+
+/**
+ * Renders the rubric as markdown: one heading per category (with its weight)
+ * and, per item, the three score levels, the interview link and tier-3 cap
+ * when present, and why the item matters.
+ */
+function rubricBlock(): string {
+  const lines: string[] = [];
+  for (const cat of RUBRIC_CATEGORIES) {
+    lines.push(
+      `### Category ${cat.id} — ${cat.title} (weight ${cat.weight.toFixed(2)}×)`,
+    );
+    for (const itemId of cat.itemIds) {
+      const item = RUBRIC_ITEMS.find((i) => i.id === itemId);
+      // An id listed in a category but missing from RUBRIC_ITEMS is skipped.
+      if (!item) continue;
+      lines.push(`#### Item ${item.id} — ${item.title}`);
+      lines.push(`- 1.0 — ${item.scoreLevels.one}`);
+      lines.push(`- 0.5 — ${item.scoreLevels.half}`);
+      lines.push(`- 0.0 — ${item.scoreLevels.zero}`);
+      if (item.interviewLink) {
+        lines.push(
+          `- Interview link: ${item.interviewLink.questionId} (${item.interviewLink.mode})`,
+        );
+      }
+      if (item.tier3Cap) {
+        lines.push(`- Tier-3 cap: 0.5 (insufficient GitHub-side evidence)`);
+      }
+      lines.push(`- Why it matters: ${item.whyItMatters}`);
+      lines.push("");
+    }
+  }
+  return lines.join("\n");
+}
+
+/**
+ * Renders the collected facts as markdown grouped by rubric item, in rubric
+ * order. Items with no facts get an explicit placeholder line.
+ */
+function evidenceBlock(evidence: EvidenceFact[]): string {
+  // Typed explicitly so the grouped facts are not inferred as `any`.
+  const byItem = new Map<number, EvidenceFact[]>();
+  for (const f of evidence) {
+    const list = byItem.get(f.itemId) ?? [];
+    list.push(f);
+    byItem.set(f.itemId, list);
+  }
+  const lines: string[] = [];
+  for (const item of RUBRIC_ITEMS) {
+    const facts = byItem.get(item.id) ??
[];
+    lines.push(`#### Item ${item.id} — ${item.title}`);
+    if (facts.length === 0) {
+      lines.push("- (no deterministic evidence collected)");
+    } else {
+      for (const f of facts) {
+        lines.push(`- [${f.signal}] ${f.summary}`);
+      }
+    }
+    lines.push("");
+  }
+  return lines.join("\n");
+}
+
+/**
+ * Renders interview answers as a bullet list of `questionId: value`, or a
+ * fixed placeholder when there are none.
+ */
+function interviewBlock(answers: InterviewAnswer[]): string {
+  if (answers.length === 0) return "_No interview answers supplied._";
+  return answers.map((a) => `- ${a.questionId}: ${a.value}`).join("\n");
+}
+
+/**
+ * Builds the full scoring prompt: scope and tier, the scoring rules, the
+ * rubric text, the deterministic evidence, and the interview answers, joined
+ * with newlines.
+ */
+export function buildMaturityPrompt(context: MaturityScoringContext): string {
+  const scopeLine = `${context.scope.mode} | ${context.scope.displayName}`;
+  const adjacentLine =
+    context.adjacentRepos.length === 0
+      ? "(none detected)"
+      : context.adjacentRepos
+          .map((r) => `${r.owner}/${r.name} (${r.reason})`)
+          .join("; ");
+
+  return [
+    "You are auditing an engineering organization for AI-agentic-coding readiness using the Agent Maturity Assessment rubric.",
+    "",
+    "# Scope",
+    `- ${scopeLine}`,
+    `- Evidence tier: ${context.tier}`,
+    `- Adjacent repos consulted: ${adjacentLine}`,
+    "",
+    "# Rules",
+    "- Score each of the 12 items as exactly one of: 0, 0.5, 1, n/a.",
+    "- Be conservative: if it's not visibly true, score 0.5. If there's no evidence at all, 0.",
+    "- Use n/a ONLY if the corresponding interview answer is 'unknown' / 'I don't know' or the item genuinely doesn't apply to this scope. Never default to 0 because of missing context.",
+    "- For tier-3 (git-only) audits, you MAY NOT award 1.0 to items 2, 3, 9, or 11 — cap them at 0.5.",
+    "- 'whyThisScore' MUST be a single sentence of 25 words or fewer. State the single most decisive piece of evidence. No semicolons, no 'but also' hedging.",
+    "- Pick the 3 highest-leverage fixes (preferentially from items scoring < 1.0). Each fix needs an owner suggestion (engineering-manager, platform-team, security, leadership, etc.).
If you can't pick 3, return fewer; the schema requires the 'owner' field even if it's 'unassigned'.",
+    "- Strengths: 1–3 short bullets the team is already doing right that should not get broken during change.",
+    "- One-line take: a single sentence summarizing the audit at a glance.",
+    "- Notes for re-audit: anything scored n/a, calibration warnings, or specific data to recheck next quarter.",
+    "",
+    "# Rubric (full text)",
+    rubricBlock(),
+    "",
+    "# Deterministic evidence (collected from filesystem / GitHub)",
+    evidenceBlock(context.evidence),
+    "",
+    "# Interview answers (Phase 1)",
+    interviewBlock(context.interviewAnswers),
+    "",
+    "Return JSON matching the agent_maturity_assessment schema.",
+  ].join("\n");
+}
diff --git a/src/services/maturity/maturity.service.ts b/src/services/maturity/maturity.service.ts
new file mode 100644
index 0000000..8a43e19
--- /dev/null
+++ b/src/services/maturity/maturity.service.ts
@@ -0,0 +1,206 @@
+import { type ConsolaInstance, consola } from "consola";
+import type {
+  AuditStore,
+  InterviewTransport,
+  MaturityProvider,
+} from "../../core/types.js";
+import { detectAdjacentRepos } from "./adjacent-repos.js";
+import { type MaturityAIResult, MaturityAIScorer } from "./ai-scorer.js";
+import { defaultOutputPath, writeAudit } from "./audit-writer.js";
+import { defaultCollectors, runAllCollectors } from "./evidence-collectors.js";
+import {
+  FRAMING_MESSAGE,
+  INTERVIEW_QUESTIONS,
+  isUnknownAnswer,
+} from "./interview.js";
+import { detectTier } from "./preflight.js";
+import { RUBRIC_VERSION } from "./rubric.js";
+import {
+  categorySubtotals,
+  computeOverallScore,
+  findMissingItems,
+} from "./scoring.js";
+import type {
+  AssessCommandInput,
+  AssessmentArtifact,
+  AssessResult,
+  EvidenceFact,
+  EvidenceTier,
+  InterviewAnswer,
+} from "./types.js";
+
+// Optional collaborators for MaturityService; every field may be omitted.
+export interface MaturityServiceDeps {
+  collectors?: MaturityProvider[];
+  scorer?: MaturityAIScorer;
+  interview?: InterviewTransport;
+  auditStore?: AuditStore;
+  logger?: ConsolaInstance;
+  /** ProgressReporter — fired at each pipeline step for the TUI / headless emit. */
+  onProgress?: (step: string, message: string) => void;
+}
+
+/**
+ * Orchestrates one assessment run: tier detection, adjacent-repo detection,
+ * interview, evidence collection, AI scoring, and writing the audit.
+ */
+export class MaturityService {
+  private readonly collectors: MaturityProvider[];
+  private readonly scorer: MaturityAIScorer;
+  private readonly interview?: InterviewTransport;
+  private readonly auditStore?: AuditStore;
+  private readonly logger: ConsolaInstance;
+  private readonly onProgress: (step: string, message: string) => void;
+
+  /**
+   * Missing deps fall back to the default collectors, a fresh
+   * MaturityAIScorer, a "maturity"-tagged consola logger, and a no-op
+   * progress callback. The interview transport and audit store stay unset
+   * when not supplied.
+   */
+  constructor(deps: MaturityServiceDeps = {}) {
+    this.collectors = deps.collectors ?? defaultCollectors();
+    this.scorer = deps.scorer ?? new MaturityAIScorer();
+    // Assigned conditionally so the optional fields are never set to
+    // an explicit `undefined`.
+    if (deps.interview) this.interview = deps.interview;
+    if (deps.auditStore) this.auditStore = deps.auditStore;
+    this.logger = deps.logger ?? consola.withTag("maturity");
+    this.onProgress = deps.onProgress ?? (() => {});
+  }
+
+  /**
+   * Runs the whole pipeline for `input` and resolves with the written output
+   * path(s) and the assessment artifact.
+   */
+  async run(input: AssessCommandInput): Promise<AssessResult> {
+    // Audit date as YYYY-MM-DD, taken from the UTC ISO timestamp.
+    const today = new Date().toISOString().slice(0, 10);
+
+    this.onProgress("preflight", "Detecting evidence tier…");
+    const tier: EvidenceTier = await detectTier(
+      input.scope.localPath ??
process.cwd(),
+      input.evidenceTier,
+    );
+    this.onProgress("preflight", `Tier resolved: ${tier}`);
+
+    this.onProgress("adjacent-repos", "Detecting adjacent repos…");
+    const adjacentRepos = await detectAdjacentRepos(input.scope);
+    if (adjacentRepos.length > 0) {
+      this.onProgress(
+        "adjacent-repos",
+        `Found ${adjacentRepos.length} adjacent repo(s): ${adjacentRepos.map((r) => `${r.owner}/${r.name}`).join(", ")}`,
+      );
+    }
+
+    // The interview runs before the collectors.
+    this.onProgress("interview", "Gathering Phase-1 interview answers…");
+    const interviewAnswers = await this.collectInterviewAnswers(input);
+
+    this.onProgress("evidence", "Running deterministic evidence collectors…");
+    const evidence: EvidenceFact[] = await runAllCollectors(this.collectors, {
+      scope: input.scope,
+      tier,
+      adjacentRepos,
+    });
+    this.onProgress(
+      "evidence",
+      `Collected ${evidence.length} evidence fact(s) across ${this.collectors.length} items.`,
+    );
+
+    this.onProgress("scoring", "Running AI scorer…");
+    const ai: MaturityAIResult = await this.scorer.score({
+      scope: input.scope,
+      tier,
+      adjacentRepos,
+      evidence,
+      interviewAnswers,
+    });
+
+    // Missing items are only logged; the run continues with what the scorer
+    // returned.
+    const missing = findMissingItems(ai.items);
+    if (missing.length > 0) {
+      this.logger.warn(`Items missing from AI response: ${missing.join(", ")}`);
+    }
+
+    const overall = computeOverallScore(ai.items);
+    const subtotals = categorySubtotals(ai.items);
+
+    // Everything that gets written out: inputs, AI output, and derived scores.
+    const artifact: AssessmentArtifact = {
+      scope: input.scope,
+      tier,
+      rubricVersion: RUBRIC_VERSION,
+      auditDate: today,
+      items: ai.items,
+      topFixes: ai.topFixes,
+      strengths: ai.strengths,
+      oneLineTake: ai.oneLineTake,
+      adjacentRepos,
+      notesForReaudit: ai.notesForReaudit,
+      interviewAnswers,
+      rawScore: overall.rawScore,
+      rawScoreMax: overall.rawScoreMax,
+      weightedScore: overall.weightedScore,
+      weightedScoreMax: overall.weightedScoreMax,
+      scorePercent: overall.scorePercent,
+      band: overall.band.name,
+      categorySubtotals: subtotals.map((s) => ({
+        id: s.id,
+        raw: s.rawSum,
+        weighted: s.weighted,
+        max:
s.maxWeighted,
+      })),
+    };
+
+    this.onProgress(
+      "writing",
+      `Writing audit (${overall.scorePercent.toFixed(1)}% — ${overall.band.name})…`,
+    );
+    const outputPath =
+      input.outputPath ?? defaultOutputPath(input.scope.displayName, today);
+    const written = await writeAudit(artifact, {
+      outputPath,
+      format: input.outputFormat ?? "both",
+    });
+
+    if (this.auditStore) {
+      // Persisting answers is best-effort: the audit is already written, so a
+      // failure here is logged rather than failing the run.
+      try {
+        await this.auditStore.writeAnswers(interviewAnswers, today);
+        this.onProgress(
+          "audit-store",
+          "Updated docs/audits/CONFIG.md with interview answers.",
+        );
+      } catch (err) {
+        // Thrown values are not guaranteed to be Error instances.
+        const reason = err instanceof Error ? err.message : String(err);
+        this.logger.warn(`Failed to update CONFIG.md: ${reason}`);
+      }
+    }
+
+    const result: AssessResult = {
+      outputPath: written.outputPath,
+      artifact,
+    };
+    if (written.jsonOutputPath) result.jsonOutputPath = written.jsonOutputPath;
+    return result;
+  }
+
+  /**
+   * Produces one answer per interview question. Without an interview
+   * transport, stored answers are reused and gaps become "unknown". With a
+   * transport, every question is asked and "I don't know" style replies are
+   * normalised to "unknown".
+   */
+  private async collectInterviewAnswers(
+    _input: AssessCommandInput,
+  ): Promise<InterviewAnswer[]> {
+    const stored = (await this.auditStore?.readPriorAnswers()) ?? [];
+    const byId = new Map(stored.map((a) => [a.questionId, a] as const));
+
+    // In headless / non-interactive mode we just return the stored answers
+    // (or "unknown" if missing).
+    if (!this.interview) {
+      return INTERVIEW_QUESTIONS.map((q) => {
+        const prior = byId.get(q.id);
+        if (prior) return prior;
+        return { questionId: q.id, value: "unknown", isOption: false };
+      });
+    }
+
+    // NOTE(review): stored answers are not consulted on the interactive path;
+    // confirm that re-asking every question is intended.
+    await this.interview.frame(FRAMING_MESSAGE);
+    const answers: InterviewAnswer[] = [];
+    for (const q of INTERVIEW_QUESTIONS) {
+      const answer = await this.interview.ask(q);
+      // Normalize "I don't know" answers
+      if (isUnknownAnswer(answer.value)) {
+        answers.push({ ...answer, value: "unknown" });
+      } else {
+        answers.push(answer);
+      }
+    }
+    return answers;
+  }
+}
+
+/** Convenience: quick non-interactive run with default deps.
*/
+export async function runHeadlessAssessment(
+  input: AssessCommandInput,
+  overrides?: MaturityServiceDeps,
+): Promise<AssessResult> {
+  const service = new MaturityService(overrides);
+  return service.run(input);
+}
diff --git a/src/services/maturity/preflight.ts b/src/services/maturity/preflight.ts
new file mode 100644
index 0000000..db77a2f
--- /dev/null
+++ b/src/services/maturity/preflight.ts
@@ -0,0 +1,58 @@
+import { spawn } from "node:child_process";
+import { stat } from "node:fs/promises";
+import { join } from "node:path";
+import { getEnv } from "../../lib/env.js";
+import type { EvidenceTier } from "./types.js";
+
+/**
+ * Detects which evidence-fidelity tier we can operate at.
+ *
+ * Order:
+ *   1. `gh` CLI in PATH and authenticated → "gh"
+ *   2. Hint env var TEAMHERO_GITHUB_MCP=1 (set by the Go TUI when an MCP is wired) → "github-mcp"
+ *   3. Anything else → "git-only"
+ *
+ * An explicit `override` other than "auto" short-circuits detection.
+ */
+export async function detectTier(
+  cwd: string,
+  override?: EvidenceTier | "auto",
+): Promise<EvidenceTier> {
+  if (override && override !== "auto") return override;
+
+  if (await ghIsAuthenticated()) return "gh";
+
+  if (getEnv("TEAMHERO_GITHUB_MCP") === "1") return "github-mcp";
+
+  // NOTE(review): both outcomes of this probe resolve to "git-only", so the
+  // check does not currently affect the result.
+  if (await isGitRepo(cwd)) return "git-only";
+
+  return "git-only";
+}
+
+/** True when `cwd/.git` exists as a directory or as a file. */
+async function isGitRepo(cwd: string): Promise<boolean> {
+  try {
+    const s = await stat(join(cwd, ".git"));
+    return s.isDirectory() || s.isFile(); // worktrees use a file
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Runs `gh auth status` with all stdio ignored and resolves true only when it
+ * exits with code 0. A spawn failure (e.g. `gh` not installed) resolves false.
+ */
+async function ghIsAuthenticated(): Promise<boolean> {
+  return new Promise((resolve) => {
+    const child = spawn("gh", ["auth", "status"], {
+      stdio: ["ignore", "ignore", "ignore"],
+    });
+    child.on("error", () => resolve(false));
+    child.on("close", (code) => resolve(code === 0));
+  });
+}
+
+/** Human-readable label for an evidence tier. */
+export function describeTier(tier: EvidenceTier): string {
+  switch (tier) {
+    case "gh":
+      return "Tier 1 — gh CLI authenticated (highest fidelity)";
+    case "github-mcp":
+      return "Tier 2 — GitHub MCP connected";
+    case "git-only":
+      return "Tier 3 — git + filesystem
only (limited GitHub-side evidence)";
+  }
+}
diff --git a/src/services/maturity/rubric.ts b/src/services/maturity/rubric.ts
new file mode 100644
index 0000000..ff75641
--- /dev/null
+++ b/src/services/maturity/rubric.ts
@@ -0,0 +1,331 @@
+import type { MaturityBand, RubricCategory, RubricItem } from "./types.js";
+
+export { RUBRIC_VERSION } from "./types.js";
+
+/**
+ * The four weighted rubric categories. For each, `maxRaw` equals the number of
+ * items and `maxWeighted` equals `maxRaw × weight`.
+ */
+export const RUBRIC_CATEGORIES: ReadonlyArray<RubricCategory> = [
+  {
+    id: "A",
+    title: "Engineering basics",
+    weight: 1.0,
+    maxRaw: 4,
+    maxWeighted: 4.0,
+    itemIds: [1, 2, 3, 4],
+  },
+  {
+    id: "B",
+    title: "Knowledge & context",
+    weight: 1.5,
+    maxRaw: 3,
+    maxWeighted: 4.5,
+    itemIds: [5, 6, 7],
+  },
+  {
+    id: "C",
+    title: "AI governance & quality",
+    weight: 1.25,
+    maxRaw: 4,
+    maxWeighted: 5.0,
+    itemIds: [8, 9, 10, 11],
+  },
+  {
+    id: "D",
+    title: "Hiring",
+    weight: 1.0,
+    maxRaw: 1,
+    maxWeighted: 1.0,
+    itemIds: [12],
+  },
+] as const;
+
+/** Sum of the categories' `maxRaw` values (4 + 3 + 4 + 1). */
+export const MAX_RAW_SCORE = 12;
+/** Sum of the categories' `maxWeighted` values (4.0 + 4.5 + 5.0 + 1.0). */
+export const MAX_WEIGHTED_SCORE = 14.5;
+
+/**
+ * Maturity bands by weighted percentage, highest first.
+ *
+ * NOTE(review): upper bounds end in .9999, which leaves tiny gaps before the
+ * next band's `min` — confirm the band lookup does not rely on `max`.
+ */
+export const MATURITY_BANDS: ReadonlyArray<MaturityBand> = [
+  {
+    name: "Excellent",
+    min: 90,
+    max: Infinity,
+    rangeLabel: "90%+",
+    interpretation:
+      "Genuinely rare. Confirm with a second pass — first audits often score too generously.",
+  },
+  {
+    name: "Healthy",
+    min: 75,
+    max: 89.9999,
+    rangeLabel: "75–89%",
+    interpretation: "Targeted fixes will compound.",
+  },
+  {
+    name: "Functional but slow",
+    min: 60,
+    max: 74.9999,
+    rangeLabel: "60–74%",
+    interpretation:
+      "Real risk of being out-shipped by AI-native competitors.
Where most orgs actually live.", + }, + { + name: "Significant dysfunction", + min: 40, + max: 59.9999, + rangeLabel: "40–59%", + interpretation: "Treat as a turnaround.", + }, + { + name: "Triage", + min: -Infinity, + max: 39.9999, + rangeLabel: "<40%", + interpretation: "Stop new feature work until basics are in.", + }, +] as const; + +export const RUBRIC_ITEMS: ReadonlyArray = [ + { + id: 1, + slug: "reproducible-dev-environments", + title: "Reproducible dev environments", + categoryId: "A", + scoreLevels: { + one: "Clone-to-green-build in <30 min via devcontainer, Nix, or a single setup script. Same path works for an agent.", + half: "README exists but bootstrap takes >2 hours or has known broken steps.", + zero: "“Ask Bob, he knows the trick.”", + }, + repoCheck: + "`.devcontainer/`, `flake.nix`, `setup.sh`, or equivalent. Run it from a clean machine.", + diagnosticCommands: [ + "ls .devcontainer/ flake.nix setup.sh scripts/bootstrap* 2>/dev/null", + "time bash on a clean machine to verify the <30 min claim", + ], + whyItMatters: + "Onboarding latency is the first multiplier on team velocity, and agents need bootstrappable environments too. If a human can't get green in 30 minutes, an agent definitely can't.", + }, + { + id: 2, + slug: "sub-day-integration-cadence", + title: "Sub-day integration cadence with measured outcomes", + categoryId: "A", + scoreLevels: { + one: "Code integrates to mainline at least daily. PRs are small and merge sub-day. 
All four DORA metrics tracked and visible.", + half: "Some metrics tracked, but cadence is weekly, PRs sit for days, or feature branches routinely outlive a sprint.", + zero: "Long-lived feature branches as the norm, release trains measured in months, no metrics.", + }, + repoCheck: + "Age distribution of merged PRs over the last 90 days; presence of any DORA dashboard.", + diagnosticCommands: [ + "gh pr list --state merged --limit 200 --search 'merged:>$(date -d \"90 days ago\" +%Y-%m-%d)' --json mergedAt,createdAt,additions,deletions,reviews,author", + "gh run list --workflow=deploy*.yml --limit 100 --json conclusion,createdAt,name --branch ", + ], + whyItMatters: + "Integration cadence is the leading indicator of engineering performance. Agents work fastest when changes validate against current main immediately; long-lived branches accumulate integration debt humans have to resolve later.", + interviewLink: { questionId: "q3", mode: "combine" }, + tier3Cap: true, + }, + { + id: 3, + slug: "testability-and-agent-inner-loop", + title: "Testability and the agent inner loop", + categoryId: "A", + scoreLevels: { + one: "App is built to be tested (DI, ports/adapters, deep modules). Unit tests sub-second; full suite in minutes; flakes treated as bugs. TDD with agents is the default.", + half: "Tests exist and mostly run, but known untestable areas, slow suite, flakes re-run rather than fixed, or TDD is occasional.", + zero: "Manual QA, flaky-and-ignored test suite, or no seams in the application.", + }, + repoCheck: + "Run the suite, time it, check failure rate over the last 50 CI runs; sample a recent feature PR and look at whether tests were written before or after the implementation.", + diagnosticCommands: [ + "time (e.g. time pnpm test, time dotnet test)", + "find . -name '*.test.*' -o -name '*.spec.*' -o -name '*Tests.cs' 2>/dev/null | wc -l", + "gh run list --workflow=ci.yml --limit 50 --json conclusion --jq '[.[] | .conclusion] | group_by(.) 
| map({status: .[0], count: length})'", + "grep -rE '\\\\|\\\\|\\\\s*true|continue-on-error:\\\\s*true' .github/workflows/ 2>/dev/null", + ], + whyItMatters: + "Humans can reason around bad tests; agents can't — they follow the signal. The test suite is the rate limit on agent throughput.", + tier3Cap: true, + }, + { + id: 4, + slug: "observability-before-features", + title: "Observability before features", + categoryId: "A", + scoreLevels: { + one: "Structured logs, distributed traces, error budgets defined, on-call with runbooks. New features ship instrumented.", + half: "Logs and metrics exist but tracing is partial; runbooks stale.", + zero: "“We grep CloudWatch when something breaks.”", + }, + repoCheck: + "OTel libraries in deps, dashboards exist, error budget docs, recency of last runbook update.", + diagnosticCommands: [ + "grep -rEh 'OpenTelemetry|opentelemetry|Microsoft\\\\.ApplicationInsights|datadog|prometheus|grafana|loki|tempo|sentry|honeycomb|newrelic|splunk' --include='*.csproj' --include='package.json' --include='go.mod' --include='requirements*.txt' --include='Cargo.toml' 2>/dev/null", + "find . -ipath '*runbook*' -o -ipath '*incident*' -o -ipath '*sli*' -o -ipath '*slo*' 2>/dev/null", + ], + whyItMatters: + "You can't fix what you can't see. AI accelerates ship rate, which accelerates incident rate — observability is the safety net that makes acceleration survivable.", + }, + { + id: 5, + slug: "design-discipline", + title: "Design discipline as a first-class practice", + categoryId: "B", + scoreLevels: { + one: "ADRs current and dated. ARCHITECTURE.md exists per active repo. Ubiquitous-language glossary checked in and referenced in agent context. Design happens before code generation.", + half: "Some design artifacts exist but are stale; ubiquitous language is implicit; planning is informal and inconsistent.", + zero: "Tribal knowledge. Architecture lives in one staff engineer's head. 
Agents are turned loose without shared design concept.", + }, + repoCheck: + "`docs/adr/`, `ARCHITECTURE.md`, glossary or ubiquitous-language file; check git log on those paths for recency; sample an agent-driven PR for evidence of upfront design vs. straight-to-code.", + diagnosticCommands: [ + "find . -ipath '*adr*' -name '*.md' 2>/dev/null | head", + "find . -iname 'ARCHITECTURE.md' -o -iname 'GLOSSARY.md' -o -iname '*ubiquitous*' 2>/dev/null", + "git log --since='90 days ago' --oneline -- docs/adr/ ARCHITECTURE.md 2>/dev/null | wc -l", + ], + whyItMatters: + "Specs-to-code without design discipline produces software entropy. Investing in design daily keeps tactical AI execution aligned with strategic intent. The ubiquitous language is the bridge between domain experts, engineers, and agents.", + interviewLink: { questionId: "q4", mode: "combine" }, + }, + { + id: 6, + slug: "deep-modules", + title: "Codebase composed of deep modules", + categoryId: "B", + scoreLevels: { + one: "Codebase is structured as deep modules: few large modules, each with substantial functionality hidden behind a simple, stable interface.", + half: "Some areas well-modularized; others are shallow / sprinkly. A handful of god-classes exist but are known and bounded.", + zero: "Sprawling shallow modules with leaky interfaces; 4000-line god files alongside 30-line helper files with no clear pattern.", + }, + repoCheck: + "File size distribution, public API surface per module, sample two random modules and see whether you can summarize each one's purpose in a sentence.", + diagnosticCommands: [ + "find . -type f \\( -name '*.ts' -o -name '*.go' -o -name '*.py' -o -name '*.rs' -o -name '*.cs' \\) -not -path '*/node_modules/*' -not -path '*/.git/*' -exec wc -l {} + | sort -nr | head -30", + ], + whyItMatters: + "AI excels at filling in implementation when given a clean interface; it produces sprawl when given no constraints. 
Deep modules give agents the right shape of problem to solve.", + }, + { + id: 7, + slug: "repo-local-agent-context", + title: "Repo-local agent context", + categoryId: "B", + scoreLevels: { + one: "`CLAUDE.md` / `AGENTS.md` / skill files checked into the repo. Team-level prompt and skill libraries are versioned. Agents joining the team get the same onboarding humans get.", + half: "Some individuals have personal CLAUDE.md files; nothing shared at the repo level.", + zero: "No agent context anywhere; people copy-paste instructions into chat each time.", + }, + repoCheck: + "`CLAUDE.md`, `AGENTS.md`, `.claude/`, `.cursor/rules/`, `.skills/`, or equivalent. Read one — does it teach the agent something the engineer wouldn't have to be told?", + diagnosticCommands: [ + "find . -maxdepth 4 \\( -iname 'CLAUDE.md' -o -iname 'AGENTS.md' -o -name '.claude' -o -name '.cursor' -o -name '.skills' -o -name 'memory-bank' \\) -not -path './node_modules/*' -not -path './.git/*' 2>/dev/null", + ], + whyItMatters: + "Agents perform at the level of context the repo provides them. Ad-hoc personal prompts mean each engineer's agent operates at a different standard; checked-in context means everyone (and every agent) gets the same baseline.", + }, + { + id: 8, + slug: "sanctioned-ai-tooling", + title: "Sanctioned, governed AI tooling", + categoryId: "C", + scoreLevels: { + one: "Approved model list, ZDR posture documented, secrets scanning on agent outputs, clear policy on what can / can't be sent to third parties, paid seats budgeted.", + half: "Tooling is paid for but governance is loose; or governance is tight but everyone uses personal accounts anyway.", + zero: "Shadow AI. People paste prod data into free-tier chatbots.", + }, + diagnosticCommands: [ + "Cross-check against any policy docs in /.github or an internal handbook if reachable.", + ], + whyItMatters: + "Shadow AI is shadow IT with worse confidentiality and IP risk. 
Governance now is cheaper than recovering from a leak later.", + interviewLink: { questionId: "q1", mode: "primary" }, + }, + { + id: 9, + slug: "human-review-on-every-pr", + title: "Human review on every PR", + categoryId: "C", + scoreLevels: { + one: "AI-generated code is reviewed by a human who understands it well enough to defend it in a postmortem. “The agent wrote it” is not a shield.", + half: "Reviews happen but are cursory; AI-authored PRs get rubber-stamped.", + zero: "Auto-merge on agent PRs, or no review process at all.", + }, + repoCheck: + "PR review settings, review depth on a sample of recent AI-tagged PRs.", + diagnosticCommands: [ + "find . -name 'CODEOWNERS' 2>/dev/null", + "gh api 'repos/{owner}/{repo}/branches//protection' 2>/dev/null", + "gh pr list --state merged --limit 50 --json reviews,author,additions,deletions", + ], + whyItMatters: + "AI-authored code that no human can defend is technical debt with no owner. Review discipline is what keeps the org accountable for what it ships.", + tier3Cap: true, + }, + { + id: 10, + slug: "evals-for-ai-touched-paths", + title: "Evals for AI-touched code paths", + categoryId: "C", + scoreLevels: { + one: "If LLMs are in the product → offline eval suite + prod telemetry. If LLMs are in the dev loop → adoption, throughput, and defect rate measured honestly.", + half: "Vibes-based confidence; some metrics but no rigor.", + zero: "No evals, no measurement, no idea if the AI helps or hurts.", + }, + repoCheck: "`evals/`, `benchmarks/`, internal AI tooling dashboards.", + diagnosticCommands: [ + "find . -type d \\( -name 'evals' -o -name 'benchmarks' \\) 2>/dev/null", + ], + whyItMatters: + "Without evals, you can't tell whether AI is helping or hurting. 
Evals are also the only way to catch silent regressions in AI-driven product features.", + interviewLink: { questionId: "q5", mode: "combine" }, + }, + { + id: 11, + slug: "blast-radius-controls", + title: "Blast-radius controls for agent actions", + categoryId: "C", + scoreLevels: { + one: "Scoped credentials per agent, dry-run modes, audit logs of every agent-triggered write, documented rollback paths. Worst-case scenarios red-teamed.", + half: "Some controls exist but are inconsistent; audit logs partial.", + zero: "Agents have prod write access via human-equivalent creds; no audit trail.", + }, + diagnosticCommands: [ + "grep -rEh 'azure/login@|aws-actions/configure-aws-credentials@|google-github-actions/auth@' .github/workflows/ 2>/dev/null", + "gh api 'repos/{owner}/{repo}/environments' --jq '.environments[] | {name: .name, has_protection: (.protection_rules | length > 0)}' 2>/dev/null", + "find infra/ terraform/ -name '*.tf' 2>/dev/null | xargs grep -lE 'service_account|workload_identity|managed_identity|user_assigned_identity' 2>/dev/null", + ], + whyItMatters: + "Autonomous agents will eventually do something stupid. The question is whether the blast radius is bounded by design or by luck.", + interviewLink: { questionId: "q6", mode: "combine" }, + tier3Cap: true, + }, + { + id: 12, + slug: "judgment-under-ai-augmentation", + title: "Interviews assess judgment under AI augmentation", + categoryId: "D", + scoreLevels: { + one: "Candidates use AI in interviews and are evaluated on critique, decomposition, recognizing wrong answers, and shipping correct work.", + half: "AI is allowed but interviewers don't know how to assess its use; or it's banned for “purity” reasons.", + zero: "Old-style whiteboard-only interviews; or no real technical bar at all.", + }, + diagnosticCommands: [ + "If a rubric is reachable in an internal repo, cross-check.", + ], + whyItMatters: + "Hiring is a forward-looking bet. 
The skill that matters in the AI-agentic era isn't “can write code without AI” — it's “can use AI well.”", + interviewLink: { questionId: "q2", mode: "primary" }, + }, +] as const; + +export function getRubricItem(id: number): RubricItem { + const item = RUBRIC_ITEMS.find((i) => i.id === id); + if (!item) { + throw new Error(`Unknown rubric item: ${id}`); + } + return item; +} + +export function getCategory(id: "A" | "B" | "C" | "D"): RubricCategory { + const cat = RUBRIC_CATEGORIES.find((c) => c.id === id); + if (!cat) { + throw new Error(`Unknown category: ${id}`); + } + return cat; +} diff --git a/src/services/maturity/scoring.ts b/src/services/maturity/scoring.ts new file mode 100644 index 0000000..4d5e8fa --- /dev/null +++ b/src/services/maturity/scoring.ts @@ -0,0 +1,123 @@ +import { + getRubricItem, + MATURITY_BANDS, + MAX_RAW_SCORE, + MAX_WEIGHTED_SCORE, + RUBRIC_CATEGORIES, +} from "./rubric.js"; +import type { + CategoryId, + ItemScore, + MaturityBand, + MaturityBandName, +} from "./types.js"; + +/** + * Per-item numeric value, treating "n/a" as null. 
/**
 * Convert an item score to a number, mapping the "n/a" marker to `null` so
 * callers can exclude unassessed items from both numerator and denominator.
 */
function scoreNumeric(score: ItemScore["score"]): number | null {
  if (score === "n/a") return null;
  return score;
}

/** Per-category roll-up produced by {@link categorySubtotals}. */
export interface CategorySubtotal {
  id: CategoryId;
  rawSum: number; // sum of 0/0.5/1 values, ignoring n/a
  weighted: number; // rawSum × weight
  maxRaw: number; // adjusted for n/a
  maxWeighted: number; // adjusted for n/a
}

/**
 * Roll item scores up into one subtotal per rubric category.
 *
 * Items scored "n/a" are left out of both the sum and the maximum, so a
 * category's ceiling shrinks with the number of items actually assessed.
 *
 * @param items Item scores; each `itemId` must be a known rubric item.
 * @returns One subtotal per entry of `RUBRIC_CATEGORIES`, in that order.
 * @throws Error (from `getRubricItem`) when an `itemId` is not in the rubric.
 */
export function categorySubtotals(items: ItemScore[]): CategorySubtotal[] {
  return RUBRIC_CATEGORIES.map((cat) => {
    const inCat = items.filter((s) => {
      const item = getRubricItem(s.itemId);
      return item.categoryId === cat.id;
    });

    let rawSum = 0;
    let countAssessed = 0;
    for (const s of inCat) {
      const numeric = scoreNumeric(s.score);
      if (numeric === null) continue;
      rawSum += numeric;
      countAssessed += 1;
    }

    const weighted = rawSum * cat.weight;
    const maxRaw = countAssessed; // each assessed item contributes max 1.0 to raw
    const maxWeighted = countAssessed * cat.weight;

    return {
      id: cat.id,
      rawSum,
      weighted,
      maxRaw,
      maxWeighted,
    };
  });
}

/** Aggregate result of {@link computeOverallScore}. */
export interface OverallScore {
  rawScore: number;
  rawScoreMax: number;
  weightedScore: number;
  weightedScoreMax: number;
  scorePercent: number;
  band: MaturityBand;
}

/**
 * Sum the category subtotals into raw and weighted totals, derive the
 * weighted percentage, and classify it into a maturity band.
 *
 * When nothing was assessed (weighted maximum is 0) the percentage is 0
 * rather than NaN.
 */
export function computeOverallScore(items: ItemScore[]): OverallScore {
  const subtotals = categorySubtotals(items);

  const rawScore = subtotals.reduce((sum, s) => sum + s.rawSum, 0);
  const rawScoreMax = subtotals.reduce((sum, s) => sum + s.maxRaw, 0);
  const weightedScore = subtotals.reduce((sum, s) => sum + s.weighted, 0);
  const weightedScoreMax = subtotals.reduce((sum, s) => sum + s.maxWeighted, 0);

  const scorePercent =
    weightedScoreMax > 0 ? (weightedScore / weightedScoreMax) * 100 : 0;
  const band = classifyBand(scorePercent);

  return {
    rawScore,
    rawScoreMax,
    weightedScore,
    weightedScoreMax,
    scorePercent,
    band,
  };
}

/**
 * Map a percentage onto its maturity band.
 *
 * Classification uses each band's lower bound only: the result is the band
 * with the greatest `min` that does not exceed `scorePercent`. The declared
 * `max` values leave small holes between bands (e.g. 59.9999 → 60), so a
 * closed-interval test would drop a value such as 59.99995 — or anything
 * above the top band's `max` — into the fallback. Values inside a declared
 * range classify exactly as they would with the interval test.
 *
 * @param scorePercent Weighted score as a percentage.
 * @returns The matching band; the last entry of `MATURITY_BANDS` when no
 *   lower bound is satisfied (only possible for NaN).
 */
export function classifyBand(scorePercent: number): MaturityBand {
  let best: MaturityBand | undefined;
  for (const band of MATURITY_BANDS) {
    if (scorePercent < band.min) continue;
    if (best === undefined || band.min > best.min) {
      best = band;
    }
  }
  // NaN fails every comparison above; keep the original last-band fallback.
  return best ?? MATURITY_BANDS[MATURITY_BANDS.length - 1];
}

/**
 * Look up a maturity band by its display name.
 *
 * @throws Error when `name` is not one of the entries in `MATURITY_BANDS`.
 */
export function bandByName(name: MaturityBandName): MaturityBand {
  const band = MATURITY_BANDS.find((b) => b.name === name);
  if (!band) {
    throw new Error(`Unknown maturity band: ${name}`);
  }
  return band;
}

/** Returns the rubric's maximum raw and maximum weighted scores, for diagnostics. */
export function maxScores(): { raw: number; weighted: number } {
  return { raw: MAX_RAW_SCORE, weighted: MAX_WEIGHTED_SCORE };
}

/**
 * Report which of the rubric item IDs 1–12 have no entry in `items`.
 *
 * Only absence is checked: duplicate entries and IDs outside 1–12 are
 * ignored, not reported.
 *
 * @returns Missing item IDs in ascending order (empty array when all 12 are present).
 */
export function findMissingItems(items: ItemScore[]): number[] {
  const expected = new Set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]);
  for (const s of items) expected.delete(s.itemId);
  return [...expected].sort((a, b) => a - b);
}
/**
 * Shared line reader for a stdin-like stream. Used by both the initial
 * config-line read and the interview-answer round-trip so they don't
 * compete for ownership of stdin.
 *
 * Buffers incoming text, splits on `\n` (a trailing `\r` is stripped), and
 * resolves callers in FIFO order. When the stream ends, any unterminated
 * final line is delivered as a regular line; after that, all queued and
 * future callers receive `""`.
 *
 * Two-queue design: incoming lines that arrive *before* a caller is waiting
 * sit in `pending`; waiters that arrive before lines do sit in `waiters`.
 * `pump()` matches them up FIFO-style.
 */
export class StdinLineReader {
  private buffered = "";
  private pending: string[] = [];
  private waiters: Array<(line: string) => void> = [];
  private closed = false;

  constructor(private readonly stream: NodeJS.ReadableStream = process.stdin) {
    this.stream.setEncoding?.("utf8");
    this.stream.on("data", (chunk: string | Buffer) => {
      const text = typeof chunk === "string" ? chunk : chunk.toString("utf8");
      this.buffered += text;
      this.parseLines();
      this.pump();
    });
    // Clean EOF: whatever is left in the buffer is a complete final line
    // that simply lacked a trailing newline, so deliver it.
    this.stream.on("end", () => this.close(true));
    // `error` events on stdin (e.g. premature pipe closure) — treat as EOF,
    // but discard the partial tail since it may be truncated.
    this.stream.on("error", () => this.close(false));
  }

  /**
   * Mark the reader closed and release everyone still waiting.
   *
   * @param flushPartial When true, an unterminated trailing line is queued
   *   as a final line before waiters are released.
   */
  private close(flushPartial: boolean): void {
    this.closed = true;
    this.parseLines();
    if (flushPartial && this.buffered.length > 0) {
      this.pending.push(this.buffered.replace(/\r$/, ""));
    }
    this.buffered = "";
    this.pump();
    // Remaining waiters get "" (EOF).
    for (const waiter of this.waiters.splice(0)) {
      waiter("");
    }
  }

  /** Pull complete lines off the text buffer into `pending`. */
  private parseLines(): void {
    while (true) {
      const idx = this.buffered.indexOf("\n");
      if (idx < 0) break;
      const line = this.buffered.slice(0, idx).replace(/\r$/, "");
      this.buffered = this.buffered.slice(idx + 1);
      this.pending.push(line);
    }
  }

  /** Match any pending lines against any waiting callers, FIFO. */
  private pump(): void {
    while (this.pending.length > 0 && this.waiters.length > 0) {
      const line = this.pending.shift() as string;
      const w = this.waiters.shift();
      if (w) w(line);
    }
  }

  /**
   * Resolve with the next line: immediately if one is already queued,
   * with `""` if the stream has closed and nothing is queued, otherwise
   * once a line arrives or the stream closes.
   */
  nextLine(): Promise<string> {
    return new Promise((resolve) => {
      if (this.pending.length > 0) {
        const line = this.pending.shift() as string;
        resolve(line);
        return;
      }
      if (this.closed) {
        resolve("");
        return;
      }
      this.waiters.push(resolve);
    });
  }
}

/**
 * Interview transport that emits `interview-question` JSON-line events on stdout
 * and reads `interview-answer` events from a shared StdinLineReader. Used by
 * scripts/run-assess.ts when the Go TUI is acting as the interactive frontend.
 *
 * The TUI (or another harness) must reply to each question with a JSON line:
 *   {"type":"interview-answer","questionId":"q1","value":"...","isOption":true}
 */
export class StdinInterviewTransport implements InterviewTransport {
  constructor(
    private readonly reader: StdinLineReader,
    private readonly emit: Emit,
  ) {}

  /** Emit an `interview-frame` event carrying `message`. */
  async frame(message: string): Promise<void> {
    this.emit({ type: "interview-frame", message });
  }

  /**
   * Emit one question and wait for the matching answer line.
   *
   * Lines that are not valid JSON, are not JSON objects, are not
   * `interview-answer` events, or answer a different question are skipped.
   * An empty line from the reader (its EOF signal) yields an "unknown"
   * free-text answer so the caller never blocks forever.
   */
  async ask(question: InterviewQuestion): Promise<InterviewAnswer> {
    this.emit({
      type: "interview-question",
      questionId: question.id,
      questionText: question.prompt,
      options: question.options,
      allowFreeText: question.allowFreeText,
      configHeading: question.configHeading,
    });

    while (true) {
      const line = await this.reader.nextLine();
      if (!line) {
        return { questionId: question.id, value: "unknown", isOption: false };
      }
      let decoded: unknown;
      try {
        decoded = JSON.parse(line);
      } catch {
        continue;
      }
      // JSON.parse accepts `null`, numbers, strings, …; only objects can be
      // events, and dereferencing `null` would throw.
      if (typeof decoded !== "object" || decoded === null) continue;
      const parsed = decoded as Record<string, unknown>;
      if (parsed.type !== "interview-answer") continue;
      if (parsed.questionId !== question.id) continue;
      const value = typeof parsed.value === "string" ? parsed.value : "unknown";
      const isOption =
        typeof parsed.isOption === "boolean" ? parsed.isOption : false;
      return { questionId: question.id, value, isOption };
    }
  }
}
parsed.isOption : false; + return { questionId: question.id, value, isOption }; + } + } +} diff --git a/src/services/maturity/types.ts b/src/services/maturity/types.ts new file mode 100644 index 0000000..b82f016 --- /dev/null +++ b/src/services/maturity/types.ts @@ -0,0 +1,193 @@ +/** + * Value types specific to the Agent Maturity Assessment feature. + * + * Port interfaces live in src/core/types.ts (see MaturityProvider, InterviewTransport, + * AuditStore there). This file holds concrete data shapes only. + */ + +/** Bumped whenever the rubric content or scoring math changes. Cache key includes this. */ +export const RUBRIC_VERSION = "1.0.0"; + +export type EvidenceTier = "gh" | "github-mcp" | "git-only"; + +export type CategoryId = "A" | "B" | "C" | "D"; + +export type ItemScoreValue = 0 | 0.5 | 1 | "n/a"; + +export interface RubricItem { + /** Item number 1–12, used in tables and IDs. */ + id: number; + /** Stable string id (e.g. "reproducible-dev-environments"). */ + slug: string; + /** Short title used in score tables. */ + title: string; + /** Category identifier (A/B/C/D). */ + categoryId: CategoryId; + /** Score-level definitions: what 1.0 / 0.5 / 0.0 look like. */ + scoreLevels: { one: string; half: string; zero: string }; + /** Repo-check guidance — single sentence describing where to look. */ + repoCheck?: string; + /** Diagnostic shell commands referenced in criteria.md. Markdown lines. */ + diagnosticCommands: string[]; + /** Why this item matters — used as supporting context for the AI prompt. */ + whyItMatters: string; + /** + * Phase-1 interview question id this item depends on (if any). + * "primary" = item is scored mainly from interview; "combine" = combined with repo evidence. + */ + interviewLink?: { + questionId: InterviewQuestionId; + mode: "primary" | "combine"; + }; + /** + * If true, tier-3 (git-only) audits cap this item at 0.5 — GitHub-side data is required + * to confidently award 1.0 (per references/preflight.md). 
+ */ + tier3Cap?: boolean; +} + +export interface RubricCategory { + id: CategoryId; + title: string; + weight: number; + maxRaw: number; + maxWeighted: number; + itemIds: number[]; +} + +export type InterviewQuestionId = + | "q1" + | "q2" + | "q3" + | "q4" + | "q5" + | "q6" + | "q7"; + +export interface InterviewQuestion { + id: InterviewQuestionId; + prompt: string; + options: string[]; + /** When true, allow the user to enter free-text (in addition to choosing an option). */ + allowFreeText: boolean; + /** CONFIG.md heading the answer is stored under. */ + configHeading: string; +} + +export interface InterviewAnswer { + questionId: InterviewQuestionId; + /** Verbatim answer text. "unknown" → maps to n/a for the linked criterion. */ + value: string; + /** True if the user chose an option; false if they used free-text. */ + isOption: boolean; +} + +/** A single piece of evidence for an item, gathered by a deterministic detector. */ +export interface EvidenceFact { + /** The item this evidence supports. */ + itemId: number; + /** Signal strength: positive (counts toward 1.0), neutral, or negative. */ + signal: "positive" | "neutral" | "negative"; + /** Human-readable summary used in the AI prompt and JSON output. */ + summary: string; + /** Optional structured details for debugging / re-audit. */ + details?: Record; + /** The collector that produced this fact (for traceability). */ + source: string; +} + +export interface ScopeDescriptor { + mode: "org" | "local-repo" | "both"; + org?: string; + repos?: string[]; + localPath?: string; + /** Human-friendly name used in the audit title and filename. */ + displayName: string; +} + +export interface AdjacentRepo { + owner: string; + name: string; + /** Why we think this repo is adjacent (e.g. "uses: in workflow", "tf module"). 
*/ + reason: string; +} + +export interface ItemScore { + itemId: number; + score: ItemScoreValue; + whyThisScore: string; +} + +export interface TopFix { + itemId: number; + owner?: string; + whatGoodLooksLike: string; + whyThisOne: string; +} + +export interface AssessmentArtifact { + scope: ScopeDescriptor; + tier: EvidenceTier; + rubricVersion: string; + auditDate: string; + items: ItemScore[]; + topFixes: TopFix[]; + strengths: string[]; + oneLineTake: string; + adjacentRepos: AdjacentRepo[]; + notesForReaudit: string[]; + interviewAnswers: InterviewAnswer[]; + rawScore: number; + rawScoreMax: number; + weightedScore: number; + weightedScoreMax: number; + scorePercent: number; + band: MaturityBandName; + categorySubtotals: Array<{ + id: CategoryId; + raw: number; + weighted: number; + max: number; + }>; +} + +export type MaturityBandName = + | "Excellent" + | "Healthy" + | "Functional but slow" + | "Significant dysfunction" + | "Triage"; + +export interface MaturityBand { + name: MaturityBandName; + min: number; + max: number; + rangeLabel: string; + interpretation: string; +} + +/** Top-level command input — what scripts/run-assess.ts reads from stdin. */ +export interface AssessCommandInput { + scope: ScopeDescriptor; + /** Override tier detection. Default: "auto". */ + evidenceTier?: EvidenceTier | "auto"; + /** Path to a JSON file with pre-supplied interview answers (headless mode). */ + interviewAnswersPath?: string; + /** Override audit output path. Default: ./teamhero-maturity--.md */ + outputPath?: string; + /** Output format. Default: "both". */ + outputFormat?: "markdown" | "json" | "both"; + /** Flush cached assessment(s) before running. */ + flushCache?: boolean; + /** Skip the AI scorer (useful for tests / debugging). */ + dryRun?: boolean; + mode?: "interactive" | "headless"; + /** When true, allow stdin to receive interview-answer events from the TUI. 
*/ + interactiveInterview?: boolean; +} + +export interface AssessResult { + outputPath: string; + jsonOutputPath?: string; + artifact: AssessmentArtifact; +} diff --git a/tests/fixtures/maturity/teamhero-answers.json b/tests/fixtures/maturity/teamhero-answers.json new file mode 100644 index 0000000..f60bb43 --- /dev/null +++ b/tests/fixtures/maturity/teamhero-answers.json @@ -0,0 +1,9 @@ +{ + "q1": "Company-paid Claude Code seats with documented OpenAI/Anthropic data-handling policy.", + "q2": "AI is allowed in interviews; interviewers evaluate critique and decomposition under AI augmentation.", + "q3": "DORA metrics tracked but not yet on a shared dashboard the team checks daily.", + "q4": "Consistent ADR-style design step before agent code; planning docs live in docs/.", + "q5": "LLMs are used in dev-loop only; impact tracked via sprint retro notes.", + "q6": "unknown", + "q7": "No" +} diff --git a/tests/integration/maturity-end-to-end.spec.ts b/tests/integration/maturity-end-to-end.spec.ts new file mode 100644 index 0000000..33e4732 --- /dev/null +++ b/tests/integration/maturity-end-to-end.spec.ts @@ -0,0 +1,45 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { MaturityAIScorer } from "../../src/services/maturity/ai-scorer.js"; +import { runHeadlessAssessment } from "../../src/services/maturity/maturity.service.js"; + +describe("maturity assessment end-to-end (dry-run)", () => { + it("produces a markdown audit file in dry-run mode against this repo", async () => { + const tmpDir = mkdtempSync(join(tmpdir(), "tm-maturity-")); + const outputPath = join(tmpDir, "audit.md"); + try { + const scorer = new MaturityAIScorer({ dryRun: true }); + const result = await runHeadlessAssessment( + { + scope: { + mode: "local-repo", + localPath: process.cwd(), + displayName: "self-test", + }, + evidenceTier: "git-only", // keep 
deterministic for the test + outputPath, + outputFormat: "both", + dryRun: true, + }, + { scorer }, + ); + expect(result.outputPath).toBe(outputPath); + expect(result.jsonOutputPath).toBeDefined(); + + const md = readFileSync(outputPath, "utf8"); + expect(md).toContain("# Agent Maturity Assessment"); + expect(md).toContain("## Scores"); + expect(md).toContain("### A. Engineering basics"); + expect(md).toContain("### D. Hiring"); + + const json = JSON.parse(readFileSync(result.jsonOutputPath!, "utf8")); + expect(json.items).toHaveLength(12); + expect(json.rubricVersion).toBe("1.0.0"); + expect(json.tier).toBe("git-only"); + } finally { + rmSync(tmpDir, { recursive: true, force: true }); + } + }, 60_000); +}); diff --git a/tests/unit/services/maturity/adjacent-repos.spec.ts b/tests/unit/services/maturity/adjacent-repos.spec.ts new file mode 100644 index 0000000..5904acd --- /dev/null +++ b/tests/unit/services/maturity/adjacent-repos.spec.ts @@ -0,0 +1,25 @@ +import { describe, expect, it } from "bun:test"; +import { detectAdjacentRepos } from "../../../../src/services/maturity/adjacent-repos.js"; +import type { ScopeDescriptor } from "../../../../src/services/maturity/types.js"; + +describe("detectAdjacentRepos against this repo", () => { + it("returns an array (may include detected workflow refs)", async () => { + const scope: ScopeDescriptor = { + mode: "local-repo", + localPath: process.cwd(), + displayName: "self", + }; + const result = await detectAdjacentRepos(scope); + expect(Array.isArray(result)).toBe(true); + }); + + it("returns [] for an org-only scope without local path", async () => { + const scope: ScopeDescriptor = { + mode: "org", + org: "acme", + displayName: "acme", + }; + const result = await detectAdjacentRepos(scope); + expect(result).toEqual([]); + }); +}); diff --git a/tests/unit/services/maturity/audit-store.spec.ts b/tests/unit/services/maturity/audit-store.spec.ts new file mode 100644 index 0000000..0f4adda --- /dev/null +++ 
b/tests/unit/services/maturity/audit-store.spec.ts @@ -0,0 +1,80 @@ +import { describe, expect, it } from "bun:test"; +import { + parseConfigMd, + renderConfigMd, +} from "../../../../src/services/maturity/audit-store.js"; +import type { InterviewAnswer } from "../../../../src/services/maturity/types.js"; + +describe("renderConfigMd", () => { + it("renders all 7 question headings in order", () => { + const md = renderConfigMd([], "2026-05-03"); + const order = [ + "AI tooling (Q1)", + "Hiring (Q2)", + "DORA visibility (Q3)", + "Design before code (Q4)", + "Eval coverage (Q5)", + "Blast-radius red-teaming (Q6)", + "Out-of-band adjacent repos (Q7)", + ]; + const positions = order.map((heading) => md.indexOf(heading)); + expect(positions.every((p) => p >= 0)).toBe(true); + for (let i = 1; i < positions.length; i++) { + expect(positions[i]).toBeGreaterThan(positions[i - 1]); + } + }); + + it("includes the last_updated date", () => { + expect(renderConfigMd([], "2026-05-03")).toContain( + "last_updated: 2026-05-03", + ); + }); + + it("substitutes 'unknown' for missing answers", () => { + const md = renderConfigMd([], "2026-05-03"); + // Each section should have "unknown" beneath its heading + expect((md.match(/unknown/g) ?? 
[]).length).toBeGreaterThanOrEqual(7); + }); + + it("uses provided answer values when present", () => { + const answers: InterviewAnswer[] = [ + { + questionId: "q1", + value: "Company-paid Claude seats with policy", + isOption: true, + }, + ]; + const md = renderConfigMd(answers, "2026-05-03"); + expect(md).toContain("Company-paid Claude seats with policy"); + }); +}); + +describe("parseConfigMd", () => { + it("round-trips: render → parse returns the same answers", () => { + const original: InterviewAnswer[] = [ + { questionId: "q1", value: "Paid Claude", isOption: true }, + { questionId: "q3", value: "DORA via Grafana", isOption: false }, + ]; + const md = renderConfigMd(original, "2026-05-03"); + const parsed = parseConfigMd(md); + const q1 = parsed.find((a) => a.questionId === "q1"); + const q3 = parsed.find((a) => a.questionId === "q3"); + expect(q1?.value).toBe("Paid Claude"); + expect(q3?.value).toBe("DORA via Grafana"); + }); + + it("returns empty for empty input", () => { + expect(parseConfigMd("")).toEqual([]); + }); + + it("ignores content outside the Org-level answers section", () => { + const md = `# Header\n\n## Other section\n\nfoo\n\n## Org-level answers\n\n### AI tooling (Q1)\nbar\n\n## After\n\nbaz\n`; + const parsed = parseConfigMd(md); + expect(parsed).toHaveLength(1); + expect(parsed[0]).toEqual({ + questionId: "q1", + value: "bar", + isOption: false, + }); + }); +}); diff --git a/tests/unit/services/maturity/audit-writer.spec.ts b/tests/unit/services/maturity/audit-writer.spec.ts new file mode 100644 index 0000000..d14fdc6 --- /dev/null +++ b/tests/unit/services/maturity/audit-writer.spec.ts @@ -0,0 +1,185 @@ +import { describe, expect, it } from "bun:test"; +import { + defaultOutputPath, + renderAuditJson, + renderAuditMarkdown, +} from "../../../../src/services/maturity/audit-writer.js"; +import { computeOverallScore } from "../../../../src/services/maturity/scoring.js"; +import type { + AssessmentArtifact, + ItemScore, +} from 
"../../../../src/services/maturity/types.js"; + +function buildArtifact(items: ItemScore[]): AssessmentArtifact { + const overall = computeOverallScore(items); + const _subtotals = items.reduce< + Record + >((acc, _) => acc, {}); + const cats = ["A", "B", "C", "D"] as const; + return { + scope: { mode: "local-repo", localPath: ".", displayName: "test-org" }, + tier: "gh", + rubricVersion: "1.0.0", + auditDate: "2026-05-03", + items, + topFixes: [ + { + itemId: 4, + whatGoodLooksLike: "Stand up structured logs and a runbook directory.", + whyThisOne: + "Observability is the lowest-risk leverage in this snapshot.", + }, + ], + strengths: ["CLAUDE.md / AGENTS.md exist and are kept current."], + oneLineTake: "Strong foundations, observability lagging.", + adjacentRepos: [], + notesForReaudit: ["Re-check item 4 once dashboards land."], + interviewAnswers: [], + rawScore: overall.rawScore, + rawScoreMax: overall.rawScoreMax, + weightedScore: overall.weightedScore, + weightedScoreMax: overall.weightedScoreMax, + scorePercent: overall.scorePercent, + band: overall.band.name, + categorySubtotals: cats.map((id) => ({ + id, + raw: 0, + weighted: 0, + max: 0, + })), + }; +} + +describe("renderAuditMarkdown", () => { + it("renders the title with scope and date", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toContain("# Agent Maturity Assessment — test-org — 2026-05-03"); + }); + + it("includes summary fields", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toMatch(/Raw score: 12.0 \/ 12/); + expect(md).toContain("Weighted score: 100.0%"); + expect(md).toContain("Band: **Excellent**"); + expect(md).toContain("Evidence tier: **1: gh**"); + }); + + it("marks the active band with 
◉", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 0.5 as const, + whyThisScore: "partial", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + // 6/12 raw with weights = 7.25 / 14.5 = 50% → Significant dysfunction + // The active-band marker only appears in the maturity-scale table rows, + // which start with "| |" — disambiguate from the Summary line. + const tableRow = md + .split("\n") + .find((l) => l.startsWith("| Significant dysfunction |")); + expect(tableRow).toBeDefined(); + expect(tableRow).toContain("◉"); + // Other band rows must not be marked + const healthyRow = md.split("\n").find((l) => l.startsWith("| Healthy |")); + expect(healthyRow).toBeDefined(); + expect(healthyRow).not.toContain("◉"); + }); + + it("renders all 4 category tables", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toContain("### A. Engineering basics"); + expect(md).toContain("### B. Knowledge & context"); + expect(md).toContain("### C. AI governance & quality"); + expect(md).toContain("### D. 
Hiring"); + }); + + it("includes top fixes section", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toContain("## Top 3 fixes"); + expect(md).toContain("Observability before features"); + }); + + it("falls back to a no-fixes message when topFixes is empty", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const artifact = buildArtifact(items); + artifact.topFixes = []; + const md = renderAuditMarkdown(artifact); + expect(md).toContain("No fixes identified"); + }); + + it("falls back to 'None' for empty adjacent repos", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toContain("None — all evidence within scope repo."); + }); + + it("shows n/a in the score column", () => { + const items: ItemScore[] = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + items[7] = { itemId: 8, score: "n/a", whyThisScore: "out of scope" }; + const md = renderAuditMarkdown(buildArtifact(items)); + expect(md).toMatch( + /\| 8 \| Sanctioned, governed AI tooling \| n\/a \| out of scope \|/, + ); + }); +}); + +describe("renderAuditJson", () => { + it("produces valid JSON with the same artifact structure", () => { + const items = Array.from({ length: 12 }, (_, i) => ({ + itemId: i + 1, + score: 1 as const, + whyThisScore: "ok", + })); + const json = renderAuditJson(buildArtifact(items)); + const parsed = JSON.parse(json); + expect(parsed.rubricVersion).toBe("1.0.0"); + expect(parsed.items).toHaveLength(12); + expect(parsed.band).toBe("Excellent"); + }); +}); + +describe("defaultOutputPath", () => { + it("slugifies the display name and includes date", () 
=> { + expect(defaultOutputPath("Acme Corp / Backend Team", "2026-05-03")).toBe( + "./teamhero-maturity-acme-corp-backend-team-2026-05-03.md", + ); + }); + + it("collapses leading/trailing dashes", () => { + expect(defaultOutputPath(" --foo-- ", "2026-05-03")).toBe( + "./teamhero-maturity-foo-2026-05-03.md", + ); + }); +}); diff --git a/tests/unit/services/maturity/evidence-collectors.spec.ts b/tests/unit/services/maturity/evidence-collectors.spec.ts new file mode 100644 index 0000000..10d4fc1 --- /dev/null +++ b/tests/unit/services/maturity/evidence-collectors.spec.ts @@ -0,0 +1,86 @@ +import { describe, expect, it } from "bun:test"; +import { + defaultCollectors, + runAllCollectors, +} from "../../../../src/services/maturity/evidence-collectors.js"; +import type { ScopeDescriptor } from "../../../../src/services/maturity/types.js"; + +const SELF_SCOPE: ScopeDescriptor = { + mode: "local-repo", + localPath: process.cwd(), + displayName: "self", +}; + +describe("evidence collectors against this repo", () => { + it("returns 12 collectors in id order", () => { + const cs = defaultCollectors(); + expect(cs).toHaveLength(12); + expect(cs.map((c) => c.itemId)).toEqual([ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + ]); + }); + + it("item 1 finds the bootstrap surface (justfile / scripts)", async () => { + const facts = await runAllCollectors(defaultCollectors().slice(0, 1), { + scope: SELF_SCOPE, + tier: "gh", + adjacentRepos: [], + }); + const positive = facts.filter( + (f) => f.itemId === 1 && f.signal === "positive", + ); + expect(positive.length).toBeGreaterThan(0); + }); + + it("item 7 finds CLAUDE.md / AGENTS.md in this repo", async () => { + const cs = defaultCollectors().filter((c) => c.itemId === 7); + const facts = await runAllCollectors(cs, { + scope: SELF_SCOPE, + tier: "gh", + adjacentRepos: [], + }); + const positive = facts.find((f) => f.signal === "positive"); + expect(positive).toBeDefined(); + expect(positive?.summary).toMatch(/CLAUDE|AGENTS/i); + }); + + 
it("item 3 finds test files in this repo", async () => { + const cs = defaultCollectors().filter((c) => c.itemId === 3); + const facts = await runAllCollectors(cs, { + scope: SELF_SCOPE, + tier: "gh", + adjacentRepos: [], + }); + const positive = facts.find((f) => f.signal === "positive"); + expect(positive).toBeDefined(); + expect(positive?.summary).toMatch(/test file/i); + }); + + it("git-only tier annotates capped items 2/3/9/11", async () => { + const cs = defaultCollectors().filter((c) => + [2, 3, 9, 11].includes(c.itemId), + ); + const facts = await runAllCollectors(cs, { + scope: SELF_SCOPE, + tier: "git-only", + adjacentRepos: [], + }); + for (const id of [2, 3, 9, 11]) { + const cap = facts.find( + (f) => f.itemId === id && /capped/i.test(f.summary), + ); + expect(cap).toBeDefined(); + } + }); + + it("no-localPath scope still produces interview-only facts for items 8 and 12", async () => { + const cs = defaultCollectors().filter((c) => [8, 12].includes(c.itemId)); + const facts = await runAllCollectors(cs, { + scope: { mode: "org", org: "acme", displayName: "acme" }, + tier: "gh", + adjacentRepos: [], + }); + expect(facts.find((f) => f.itemId === 8)).toBeDefined(); + expect(facts.find((f) => f.itemId === 12)).toBeDefined(); + }); +}); diff --git a/tests/unit/services/maturity/interview.spec.ts b/tests/unit/services/maturity/interview.spec.ts new file mode 100644 index 0000000..ba7c0f8 --- /dev/null +++ b/tests/unit/services/maturity/interview.spec.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from "bun:test"; +import { + FRAMING_MESSAGE, + getQuestion, + INTERVIEW_QUESTIONS, + isUnknownAnswer, +} from "../../../../src/services/maturity/interview.js"; + +describe("interview questions", () => { + it("has exactly 7 questions", () => { + expect(INTERVIEW_QUESTIONS).toHaveLength(7); + }); + + it("questions are in id order q1..q7", () => { + const ids = INTERVIEW_QUESTIONS.map((q) => q.id); + expect(ids).toEqual(["q1", "q2", "q3", "q4", "q5", "q6", 
"q7"]); + }); + + it("every question includes an 'I don't know' option", () => { + for (const q of INTERVIEW_QUESTIONS) { + expect(q.options.some((o) => /don't know/i.test(o))).toBe(true); + } + }); + + it("every question allows free-text", () => { + for (const q of INTERVIEW_QUESTIONS) { + expect(q.allowFreeText).toBe(true); + } + }); + + it("every question has a non-empty CONFIG.md heading", () => { + for (const q of INTERVIEW_QUESTIONS) { + expect(q.configHeading.length).toBeGreaterThan(0); + } + }); + + it("framing message mentions 7 questions", () => { + expect(FRAMING_MESSAGE).toMatch(/7/); + }); + + it("getQuestion throws for unknown id", () => { + // @ts-expect-error + expect(() => getQuestion("q99")).toThrow(); + }); +}); + +describe("isUnknownAnswer", () => { + it("matches 'I don't know'", () => { + expect(isUnknownAnswer("I don't know")).toBe(true); + }); + it("matches 'unknown'", () => { + expect(isUnknownAnswer("unknown")).toBe(true); + }); + it("matches 'n/a'", () => { + expect(isUnknownAnswer("n/a")).toBe(true); + }); + it("does not match a real answer", () => { + expect(isUnknownAnswer("We use Claude paid seats with policy")).toBe(false); + }); + it("ignores whitespace and casing", () => { + expect(isUnknownAnswer(" Unknown ")).toBe(true); + }); +}); diff --git a/tests/unit/services/maturity/maturity-prompts.spec.ts b/tests/unit/services/maturity/maturity-prompts.spec.ts new file mode 100644 index 0000000..07c7857 --- /dev/null +++ b/tests/unit/services/maturity/maturity-prompts.spec.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from "bun:test"; +import { + buildMaturityPrompt, + MATURITY_ASSESSMENT_SCHEMA, +} from "../../../../src/services/maturity/maturity-prompts.js"; + +describe("buildMaturityPrompt", () => { + it("includes the rubric, evidence, and interview blocks", () => { + const prompt = buildMaturityPrompt({ + scope: { + mode: "local-repo", + localPath: ".", + displayName: "test", + }, + tier: "gh", + adjacentRepos: [ + { owner: 
"acme", name: "ci-templates", reason: "workflow uses" }, + ], + evidence: [ + { + itemId: 1, + signal: "positive", + summary: "justfile present", + source: "test", + }, + ], + interviewAnswers: [ + { + questionId: "q1", + value: "Company-paid Claude with policy", + isOption: true, + }, + ], + }); + + expect(prompt).toContain("Agent Maturity Assessment"); + expect(prompt).toContain("Item 1 — Reproducible dev environments"); + expect(prompt).toContain( + "Item 12 — Interviews assess judgment under AI augmentation", + ); + expect(prompt).toContain("acme/ci-templates"); + expect(prompt).toContain("justfile present"); + expect(prompt).toContain("Company-paid Claude with policy"); + expect(prompt).toContain("agent_maturity_assessment schema"); + }); + + it("handles missing evidence and answers gracefully", () => { + const prompt = buildMaturityPrompt({ + scope: { mode: "org", org: "acme", displayName: "acme" }, + tier: "git-only", + adjacentRepos: [], + evidence: [], + interviewAnswers: [], + }); + expect(prompt).toContain("(none detected)"); + expect(prompt).toContain("(no deterministic evidence collected)"); + expect(prompt).toContain("_No interview answers supplied._"); + }); + + it("mentions tier-3 cap rule", () => { + const prompt = buildMaturityPrompt({ + scope: { mode: "org", org: "acme", displayName: "acme" }, + tier: "git-only", + adjacentRepos: [], + evidence: [], + interviewAnswers: [], + }); + expect(prompt).toMatch(/cap them at 0\.5/); + }); +}); + +describe("MATURITY_ASSESSMENT_SCHEMA", () => { + it("uses strict mode and the canonical name", () => { + expect(MATURITY_ASSESSMENT_SCHEMA.strict).toBe(true); + expect(MATURITY_ASSESSMENT_SCHEMA.name).toBe("agent_maturity_assessment"); + }); +}); diff --git a/tests/unit/services/maturity/rubric.spec.ts b/tests/unit/services/maturity/rubric.spec.ts new file mode 100644 index 0000000..b9dcbd8 --- /dev/null +++ b/tests/unit/services/maturity/rubric.spec.ts @@ -0,0 +1,83 @@ +import { describe, expect, it } from 
"bun:test"; +import { + getCategory, + getRubricItem, + MATURITY_BANDS, + MAX_RAW_SCORE, + MAX_WEIGHTED_SCORE, + RUBRIC_CATEGORIES, + RUBRIC_ITEMS, +} from "../../../../src/services/maturity/rubric.js"; +import { RUBRIC_VERSION } from "../../../../src/services/maturity/types.js"; + +describe("rubric", () => { + it("has exactly 12 items", () => { + expect(RUBRIC_ITEMS).toHaveLength(12); + }); + + it("item ids are 1..12 with no gaps", () => { + const ids = RUBRIC_ITEMS.map((i) => i.id).sort((a, b) => a - b); + expect(ids).toEqual([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + }); + + it("category weights sum to the documented max weighted score", () => { + const total = RUBRIC_CATEGORIES.reduce((sum, c) => sum + c.maxWeighted, 0); + expect(total).toBeCloseTo(MAX_WEIGHTED_SCORE, 5); + }); + + it("category item lists partition the 12 items", () => { + const all = RUBRIC_CATEGORIES.flatMap((c) => c.itemIds).sort( + (a, b) => a - b, + ); + expect(all).toEqual([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); + expect(all).toHaveLength(MAX_RAW_SCORE); + }); + + it("each item references a valid category", () => { + for (const item of RUBRIC_ITEMS) { + expect(["A", "B", "C", "D"]).toContain(item.categoryId); + const cat = getCategory(item.categoryId); + expect(cat.itemIds).toContain(item.id); + } + }); + + it("category weights match the spec (A=1.0, B=1.5, C=1.25, D=1.0)", () => { + expect(getCategory("A").weight).toBe(1.0); + expect(getCategory("B").weight).toBe(1.5); + expect(getCategory("C").weight).toBe(1.25); + expect(getCategory("D").weight).toBe(1.0); + }); + + it("each item has non-empty score levels and whyItMatters", () => { + for (const item of RUBRIC_ITEMS) { + expect(item.scoreLevels.one.length).toBeGreaterThan(10); + expect(item.scoreLevels.half.length).toBeGreaterThan(10); + expect(item.scoreLevels.zero.length).toBeGreaterThan(5); + expect(item.whyItMatters.length).toBeGreaterThan(20); + expect(item.title.length).toBeGreaterThan(3); + } + }); + + it("tier3Cap items are 
exactly 2, 3, 9, 11 (per preflight.md)", () => { + const capped = RUBRIC_ITEMS.filter((i) => i.tier3Cap).map((i) => i.id); + expect(capped.sort((a, b) => a - b)).toEqual([2, 3, 9, 11]); + }); + + it("getRubricItem throws for unknown id", () => { + expect(() => getRubricItem(99)).toThrow(); + }); + + it("maturity bands cover the full 0..100 range", () => { + for (let pct = 0; pct <= 100; pct += 5) { + const matched = MATURITY_BANDS.filter( + (b) => pct >= b.min && pct <= b.max, + ); + expect(matched).toHaveLength(1); + } + }); + + it("rubric version is a non-empty string", () => { + expect(RUBRIC_VERSION).toBeTruthy(); + expect(typeof RUBRIC_VERSION).toBe("string"); + }); +}); diff --git a/tests/unit/services/maturity/scoring.spec.ts b/tests/unit/services/maturity/scoring.spec.ts new file mode 100644 index 0000000..a1897a9 --- /dev/null +++ b/tests/unit/services/maturity/scoring.spec.ts @@ -0,0 +1,138 @@ +import { describe, expect, it } from "bun:test"; +import { + bandByName, + categorySubtotals, + classifyBand, + computeOverallScore, + findMissingItems, +} from "../../../../src/services/maturity/scoring.js"; +import type { ItemScore } from "../../../../src/services/maturity/types.js"; + +function fillItems(score: 0 | 0.5 | 1): ItemScore[] { + return Array.from({ length: 12 }, (_, idx) => ({ + itemId: idx + 1, + score, + whyThisScore: "test", + })); +} + +describe("computeOverallScore", () => { + it("perfect score = 12 raw, 14.5 weighted, 100%", () => { + const result = computeOverallScore(fillItems(1)); + expect(result.rawScore).toBe(12); + expect(result.weightedScore).toBeCloseTo(14.5, 5); + expect(result.scorePercent).toBe(100); + expect(result.band.name).toBe("Excellent"); + }); + + it("zero score = 0 raw, 0 weighted, 0%", () => { + const result = computeOverallScore(fillItems(0)); + expect(result.rawScore).toBe(0); + expect(result.weightedScore).toBe(0); + expect(result.scorePercent).toBe(0); + expect(result.band.name).toBe("Triage"); + }); + + it("all 0.5 = 6 
raw, 7.25 weighted, 50%", () => { + const result = computeOverallScore(fillItems(0.5)); + expect(result.rawScore).toBe(6); + expect(result.weightedScore).toBeCloseTo(7.25, 5); + expect(result.scorePercent).toBeCloseTo(50, 5); + expect(result.band.name).toBe("Significant dysfunction"); + }); + + it("n/a items are excluded from numerator AND max", () => { + const items: ItemScore[] = fillItems(1); + // Mark item 12 (D, 1.0× weight) as n/a + items[11] = { itemId: 12, score: "n/a", whyThisScore: "no info" }; + const result = computeOverallScore(items); + // 11 items × 1.0 raw, max raw 11 + expect(result.rawScore).toBe(11); + expect(result.rawScoreMax).toBe(11); + // max weighted should drop by 1.0 (item 12's category D weight) + expect(result.weightedScoreMax).toBeCloseTo(13.5, 5); + expect(result.scorePercent).toBeCloseTo(100, 5); + }); + + it("n/a in category B (1.5×) drops max weighted by 1.5", () => { + const items: ItemScore[] = fillItems(1); + items[6] = { itemId: 7, score: "n/a", whyThisScore: "out of scope" }; + const result = computeOverallScore(items); + expect(result.weightedScoreMax).toBeCloseTo(14.5 - 1.5, 5); + }); + + it("category B is weighted at 1.5×", () => { + const items: ItemScore[] = fillItems(0); + // Set item 5 (category B) to 1.0 — should produce 1.5 weighted + items[4] = { itemId: 5, score: 1, whyThisScore: "" }; + const result = computeOverallScore(items); + expect(result.weightedScore).toBeCloseTo(1.5, 5); + }); + + it("category C is weighted at 1.25×", () => { + const items: ItemScore[] = fillItems(0); + items[7] = { itemId: 8, score: 1, whyThisScore: "" }; + const result = computeOverallScore(items); + expect(result.weightedScore).toBeCloseTo(1.25, 5); + }); +}); + +describe("classifyBand", () => { + it("90% → Excellent", () => { + expect(classifyBand(95).name).toBe("Excellent"); + }); + it("80% → Healthy", () => { + expect(classifyBand(80).name).toBe("Healthy"); + }); + it("70% → Functional but slow", () => { + 
expect(classifyBand(70).name).toBe("Functional but slow"); + }); + it("50% → Significant dysfunction", () => { + expect(classifyBand(50).name).toBe("Significant dysfunction"); + }); + it("30% → Triage", () => { + expect(classifyBand(30).name).toBe("Triage"); + }); + it("boundary 75% → Healthy", () => { + expect(classifyBand(75).name).toBe("Healthy"); + }); + it("boundary 89.99% → Healthy", () => { + expect(classifyBand(89.99).name).toBe("Healthy"); + }); +}); + +describe("bandByName", () => { + it("returns Healthy band by name", () => { + expect(bandByName("Healthy").min).toBe(75); + }); + it("throws for unknown band", () => { + // @ts-expect-error + expect(() => bandByName("Bogus")).toThrow(); + }); +}); + +describe("categorySubtotals", () => { + it("each category gets its weight applied", () => { + const items = fillItems(1); + const subs = categorySubtotals(items); + const a = subs.find((s) => s.id === "A")!; + const b = subs.find((s) => s.id === "B")!; + const c = subs.find((s) => s.id === "C")!; + const d = subs.find((s) => s.id === "D")!; + expect(a.weighted).toBeCloseTo(4.0, 5); + expect(b.weighted).toBeCloseTo(4.5, 5); + expect(c.weighted).toBeCloseTo(5.0, 5); + expect(d.weighted).toBeCloseTo(1.0, 5); + }); +}); + +describe("findMissingItems", () => { + it("returns empty when all 12 present", () => { + expect(findMissingItems(fillItems(1))).toEqual([]); + }); + + it("returns missing item ids", () => { + const items = fillItems(1).filter((i) => i.itemId !== 7 && i.itemId !== 11); + expect(findMissingItems(items)).toEqual([7, 11]); + }); +}); diff --git a/tests/unit/services/maturity/stdin-interview.spec.ts b/tests/unit/services/maturity/stdin-interview.spec.ts new file mode 100644 index 0000000..40e4884 --- /dev/null +++ b/tests/unit/services/maturity/stdin-interview.spec.ts @@ -0,0 +1,154 @@ +import { describe, expect, it } from "bun:test"; +import { Readable } from "node:stream"; +import { + StdinInterviewTransport, + StdinLineReader, +} from 
"../../../../src/services/maturity/stdin-interview.js"; +import type { InterviewQuestion } from "../../../../src/services/maturity/types.js"; + +function makeStream(lines: string[]): NodeJS.ReadableStream { + return Readable.from(lines.map((l) => `${l}\n`)); +} + +describe("StdinLineReader", () => { + it("returns lines that arrive before any caller waits (no dropped lines)", async () => { + const stream = makeStream(["alpha", "beta", "gamma"]); + const reader = new StdinLineReader(stream); + // Give the stream a tick to deliver data + end before reading. + await new Promise((r) => setTimeout(r, 10)); + expect(await reader.nextLine()).toBe("alpha"); + expect(await reader.nextLine()).toBe("beta"); + expect(await reader.nextLine()).toBe("gamma"); + expect(await reader.nextLine()).toBe(""); // EOF + }); + + it("returns lines that arrive after the caller is already waiting", async () => { + const stream = new Readable({ read() {} }); + const reader = new StdinLineReader(stream); + const firstP = reader.nextLine(); + stream.push("hello\n"); + expect(await firstP).toBe("hello"); + const secondP = reader.nextLine(); + stream.push("world\n"); + stream.push(null); + expect(await secondP).toBe("world"); + }); + + it("handles a single chunk that contains multiple lines and EOF", async () => { + const stream = new Readable({ read() {} }); + const reader = new StdinLineReader(stream); + stream.push("one\ntwo\nthree\n"); + stream.push(null); + await new Promise((r) => setTimeout(r, 10)); + expect(await reader.nextLine()).toBe("one"); + expect(await reader.nextLine()).toBe("two"); + expect(await reader.nextLine()).toBe("three"); + expect(await reader.nextLine()).toBe(""); + }); + + it("handles partial-line chunks across multiple data events", async () => { + const stream = new Readable({ read() {} }); + const reader = new StdinLineReader(stream); + stream.push("hel"); + stream.push("lo\nwo"); + stream.push("rld\n"); + stream.push(null); + await new Promise((r) => setTimeout(r, 
10)); + expect(await reader.nextLine()).toBe("hello"); + expect(await reader.nextLine()).toBe("world"); + expect(await reader.nextLine()).toBe(""); + }); +}); + +describe("StdinInterviewTransport", () => { + const sampleQuestion: InterviewQuestion = { + id: "q1", + prompt: "test prompt", + options: ["a", "b", "I don't know"], + allowFreeText: true, + configHeading: "Test (Q1)", + }; + + it("emits a question event and returns the matching answer", async () => { + const stream = makeStream([ + '{"type":"interview-answer","questionId":"q1","value":"hello","isOption":false}', + ]); + const reader = new StdinLineReader(stream); + const emitted: Array> = []; + const transport = new StdinInterviewTransport(reader, (e) => + emitted.push(e), + ); + await new Promise((r) => setTimeout(r, 10)); + + const answer = await transport.ask(sampleQuestion); + expect(answer).toEqual({ + questionId: "q1", + value: "hello", + isOption: false, + }); + expect(emitted).toHaveLength(1); + expect(emitted[0].type).toBe("interview-question"); + expect(emitted[0].questionId).toBe("q1"); + }); + + it("ignores answers that don't match the current question id", async () => { + const stream = makeStream([ + '{"type":"interview-answer","questionId":"q5","value":"wrong","isOption":true}', + '{"type":"interview-answer","questionId":"q1","value":"right","isOption":true}', + ]); + const reader = new StdinLineReader(stream); + const transport = new StdinInterviewTransport(reader, () => {}); + await new Promise((r) => setTimeout(r, 10)); + const answer = await transport.ask(sampleQuestion); + expect(answer.value).toBe("right"); + }); + + it("returns 'unknown' when the stream closes without answering", async () => { + const stream = makeStream([]); + const reader = new StdinLineReader(stream); + const transport = new StdinInterviewTransport(reader, () => {}); + await new Promise((r) => setTimeout(r, 10)); + const answer = await transport.ask(sampleQuestion); + expect(answer.value).toBe("unknown"); + }); + 
+ it("processes 7 questions in sequence with all answers buffered upfront", async () => { + const lines = []; + for (let i = 1; i <= 7; i++) { + lines.push( + JSON.stringify({ + type: "interview-answer", + questionId: `q${i}`, + value: `answer-${i}`, + isOption: true, + }), + ); + } + const stream = makeStream(lines); + const reader = new StdinLineReader(stream); + const transport = new StdinInterviewTransport(reader, () => {}); + await new Promise((r) => setTimeout(r, 10)); + + const collected: string[] = []; + for (let i = 1; i <= 7; i++) { + const q: InterviewQuestion = { + id: `q${i}` as InterviewQuestion["id"], + prompt: `question ${i}`, + options: ["x", "y"], + allowFreeText: true, + configHeading: `Q${i}`, + }; + const answer = await transport.ask(q); + collected.push(answer.value); + } + expect(collected).toEqual([ + "answer-1", + "answer-2", + "answer-3", + "answer-4", + "answer-5", + "answer-6", + "answer-7", + ]); + }); +}); diff --git a/tui/assess.go b/tui/assess.go new file mode 100644 index 0000000..7479083 --- /dev/null +++ b/tui/assess.go @@ -0,0 +1,146 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + + "github.com/charmbracelet/lipgloss" +) + +func printAssessUsage() { + fmt.Fprintf(os.Stderr, `Usage: teamhero assess [flags] + +Run the Agent Maturity Assessment — a 12-criterion diagnostic that scores an +engineering organization for AI-agentic-coding readiness. Produces a weighted +percentage, a raw /12 score, item-level evidence, top-3 fixes, strengths, and +a maturity band (Excellent / Healthy / Functional but slow / Significant +dysfunction / Triage). + +Saved configuration: + Previous interactive runs save settings to ~/.config/teamhero/assess-config.json + (or $XDG_CONFIG_HOME/teamhero/assess-config.json). Headless mode loads this + automatically when present. 
+ +Scope flags: + --scope-mode org | local-repo | both + --target-org GitHub org name (org or both modes) + --target-repos Comma-separated repo names (optional, narrows scope) + --path Local repo path (local-repo or both modes) + --display-name Override the audit's scope display name + +Run flags: + --headless Run non-interactively (auto-detected in CI / piped stdin) + --evidence-tier auto | gh | github-mcp | git-only (default: auto) + --interview-answers JSON file with pre-supplied Phase-1 answers + Format: {"q1":"...","q2":"...",...} + --audit-output Output file path (default: timestamped in cwd) + --audit-output-format markdown | json | both (default: both) + --dry-run Skip the AI scorer and emit a placeholder audit + --flush-assess-cache Flush cached assessment(s) before running + --show-assess-config Print saved configuration as JSON and exit + +Examples: + teamhero assess Interactive wizard + interview + teamhero assess --headless --path . Audit the current repo, no interview + teamhero assess --headless --target-org acme --interview-answers answers.json + Headless org-level audit with pre-supplied answers + teamhero assess --dry-run --path . Smoke test without an OpenAI call + +Exit codes: + 0 Success + 1 Configuration error + 2 Service / scoring error +`) +} + +// runAssess is the entry point for the "assess" subcommand. It dispatches to +// either the headless run loop or the interactive wizard based on environment. 
+func runAssess() error { + if *flagAssessShowConfig { + cfg, err := LoadAssessConfig() + if err != nil || cfg == nil { + fmt.Fprintln(os.Stderr, "No saved assess configuration found at "+assessConfigPath()) + os.Exit(1) + } + data, _ := json.MarshalIndent(cfg, "", " ") + fmt.Println(string(data)) + return nil + } + + cfg := loadOrInitAssessConfig() + applyAssessFlagsTo(&cfg, flagWasSet) + fillAssessDefaults(&cfg) + + if isHeadless() { + if !hasMinimalAssessConfig(&cfg) { + fmt.Fprintln(os.Stderr, "assess: scope is required (set --path, --target-org, or run interactively)") + os.Exit(1) + } + return runAssessHeadless(cfg) + } + + return runAssessInteractive(&cfg) +} + +func loadOrInitAssessConfig() AssessConfig { + saved, _ := LoadAssessConfig() + if saved != nil { + return *saved + } + return DefaultAssessConfig() +} + +// runAssessHeadless drives the assess service runner without any TTY UI. +// Interview answers must come from --interview-answers or a CONFIG.md file. +func runAssessHeadless(cfg AssessConfig) error { + cfg.Mode = "headless" + cfg.InteractiveInterview = false + + res, err := RunAssessServiceRunner(cfg) + if err != nil { + return err + } + defer res.Close() + + for evt := range res.Events { + switch evt.Type { + case "progress": + fmt.Fprintf(os.Stderr, "[%s] %s\n", evt.Step, evt.Message) + case "interview-frame": + fmt.Fprintln(os.Stderr, evt.Message) + case "interview-question": + fmt.Fprintf( + os.Stderr, + "⚠ assess: interview question %q received in headless mode — answer with --interview-answers or run interactively\n", + evt.QuestionID, + ) + // Headless mode never sends an answer; the service times out the + // stream and falls back to "unknown" for the question. 
+ case "result": + styled := lipgloss.NewStyle().Bold(true) + fmt.Println(styled.Render("Audit complete:")) + fmt.Println(" " + evt.OutputPath) + if evt.JsonOutputPath != "" { + fmt.Println(" " + evt.JsonOutputPath) + } + case "error": + fmt.Fprintf(os.Stderr, "✖ %s\n", evt.Message) + return fmt.Errorf("assess: %s", evt.Message) + } + } + + for err := range res.Errors { + if err != nil { + if res.Stderr != nil && res.Stderr.Len() > 0 { + fmt.Fprintln(os.Stderr, res.Stderr.String()) + } + return err + } + } + + if err := SaveAssessConfig(&cfg); err != nil { + fmt.Fprintf(os.Stderr, "Note: failed to save assess config: %v\n", err) + } + return nil +} diff --git a/tui/assess_config.go b/tui/assess_config.go new file mode 100644 index 0000000..dd6a935 --- /dev/null +++ b/tui/assess_config.go @@ -0,0 +1,78 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" +) + +// AssessConfig mirrors src/services/maturity/types.ts::AssessCommandInput. +// It is sent as the first JSON line on stdin to the run-assess.ts service runner. +type AssessConfig struct { + Scope AssessScope `json:"scope"` + EvidenceTier string `json:"evidenceTier,omitempty"` + InterviewAnswersPath string `json:"interviewAnswersPath,omitempty"` + OutputPath string `json:"outputPath,omitempty"` + OutputFormat string `json:"outputFormat,omitempty"` + FlushCache bool `json:"flushCache,omitempty"` + DryRun bool `json:"dryRun,omitempty"` + Mode string `json:"mode,omitempty"` + InteractiveInterview bool `json:"interactiveInterview,omitempty"` +} + +// AssessScope mirrors ScopeDescriptor. +type AssessScope struct { + Mode string `json:"mode"` + Org string `json:"org,omitempty"` + Repos []string `json:"repos,omitempty"` + LocalPath string `json:"localPath,omitempty"` + DisplayName string `json:"displayName"` +} + +// assessConfigPath returns ~/.config/teamhero/assess-config.json (XDG-compliant). 
+func assessConfigPath() string { + return filepath.Join(configDir(), "assess-config.json") +} + +// LoadAssessConfig reads the saved assess configuration. Returns nil with no +// error if the file does not exist. +func LoadAssessConfig() (*AssessConfig, error) { + data, err := os.ReadFile(assessConfigPath()) + if err != nil { + if os.IsNotExist(err) { + return nil, nil + } + return nil, err + } + var cfg AssessConfig + if err := json.Unmarshal(data, &cfg); err != nil { + return nil, err + } + return &cfg, nil +} + +// SaveAssessConfig persists the assess configuration to disk. +func SaveAssessConfig(cfg *AssessConfig) error { + if err := os.MkdirAll(filepath.Dir(assessConfigPath()), 0o755); err != nil { + return err + } + data, err := json.MarshalIndent(cfg, "", " ") + if err != nil { + return err + } + return os.WriteFile(assessConfigPath(), data, 0o600) +} + +// DefaultAssessConfig returns a sensible starting config for a new user. +func DefaultAssessConfig() AssessConfig { + cwd, _ := os.Getwd() + return AssessConfig{ + Scope: AssessScope{ + Mode: "local-repo", + LocalPath: cwd, + DisplayName: filepath.Base(cwd), + }, + EvidenceTier: "auto", + OutputFormat: "both", + } +} diff --git a/tui/assess_config_test.go b/tui/assess_config_test.go new file mode 100644 index 0000000..50ebc9c --- /dev/null +++ b/tui/assess_config_test.go @@ -0,0 +1,137 @@ +package main + +import ( + "encoding/json" + "os" + "path/filepath" + "testing" +) + +func TestDefaultAssessConfig(t *testing.T) { + cfg := DefaultAssessConfig() + if cfg.Scope.Mode != "local-repo" { + t.Errorf("Mode = %q, want local-repo", cfg.Scope.Mode) + } + if cfg.OutputFormat != "both" { + t.Errorf("OutputFormat = %q, want both", cfg.OutputFormat) + } + if cfg.EvidenceTier != "auto" { + t.Errorf("EvidenceTier = %q, want auto", cfg.EvidenceTier) + } + if cfg.Scope.LocalPath == "" { + t.Error("LocalPath should be set to cwd") + } +} + +func TestSaveAndLoadAssessConfig(t *testing.T) { + dir := t.TempDir() + 
t.Setenv("XDG_CONFIG_HOME", dir) + + cfg := AssessConfig{ + Scope: AssessScope{ + Mode: "org", + Org: "acme", + DisplayName: "acme", + }, + EvidenceTier: "gh", + OutputFormat: "markdown", + DryRun: true, + } + if err := SaveAssessConfig(&cfg); err != nil { + t.Fatalf("SaveAssessConfig: %v", err) + } + loaded, err := LoadAssessConfig() + if err != nil { + t.Fatalf("LoadAssessConfig: %v", err) + } + if loaded == nil { + t.Fatal("loaded is nil") + } + if loaded.Scope.Org != "acme" || loaded.EvidenceTier != "gh" || !loaded.DryRun { + t.Errorf("round-trip mismatch: %+v", loaded) + } + + // Verify file is JSON-parseable on disk + data, err := os.ReadFile(filepath.Join(dir, "teamhero", "assess-config.json")) + if err != nil { + t.Fatalf("read disk file: %v", err) + } + var probe AssessConfig + if err := json.Unmarshal(data, &probe); err != nil { + t.Errorf("on-disk JSON invalid: %v", err) + } +} + +func TestLoadAssessConfig_MissingFile(t *testing.T) { + dir := t.TempDir() + t.Setenv("XDG_CONFIG_HOME", dir) + cfg, err := LoadAssessConfig() + if err != nil { + t.Fatalf("expected nil error for missing file, got: %v", err) + } + if cfg != nil { + t.Errorf("expected nil config for missing file, got: %+v", cfg) + } +} + +func TestFillAssessDefaults(t *testing.T) { + t.Run("empty -> local-repo", func(t *testing.T) { + cfg := AssessConfig{} + fillAssessDefaults(&cfg) + if cfg.Scope.Mode != "local-repo" { + t.Errorf("mode = %q, want local-repo", cfg.Scope.Mode) + } + if cfg.OutputFormat != "both" { + t.Errorf("format = %q, want both", cfg.OutputFormat) + } + }) + t.Run("org-only sets mode=org", func(t *testing.T) { + cfg := AssessConfig{Scope: AssessScope{Org: "acme"}} + fillAssessDefaults(&cfg) + if cfg.Scope.Mode != "org" { + t.Errorf("mode = %q, want org", cfg.Scope.Mode) + } + if cfg.Scope.DisplayName != "acme" { + t.Errorf("displayName = %q, want acme", cfg.Scope.DisplayName) + } + }) + t.Run("path-only sets mode=local-repo with basename", func(t *testing.T) { + cfg := 
AssessConfig{Scope: AssessScope{LocalPath: "/foo/bar/baz"}} + fillAssessDefaults(&cfg) + if cfg.Scope.Mode != "local-repo" { + t.Errorf("mode = %q, want local-repo", cfg.Scope.Mode) + } + if cfg.Scope.DisplayName != "baz" { + t.Errorf("displayName = %q, want baz", cfg.Scope.DisplayName) + } + }) + t.Run("org+path sets mode=both", func(t *testing.T) { + cfg := AssessConfig{Scope: AssessScope{Org: "acme", LocalPath: "/foo"}} + fillAssessDefaults(&cfg) + if cfg.Scope.Mode != "both" { + t.Errorf("mode = %q, want both", cfg.Scope.Mode) + } + }) +} + +func TestHasMinimalAssessConfig(t *testing.T) { + if hasMinimalAssessConfig(nil) { + t.Error("nil should be invalid") + } + if hasMinimalAssessConfig(&AssessConfig{}) { + t.Error("empty should be invalid") + } + if hasMinimalAssessConfig(&AssessConfig{Scope: AssessScope{Mode: "org"}}) { + t.Error("org without name should be invalid") + } + if !hasMinimalAssessConfig(&AssessConfig{ + Scope: AssessScope{Mode: "org", Org: "acme", DisplayName: "acme"}, + }) { + t.Error("org+name should be valid") + } + if !hasMinimalAssessConfig(&AssessConfig{ + Scope: AssessScope{Mode: "local-repo", LocalPath: "/foo", DisplayName: "foo"}, + }) { + t.Error("local-repo+path should be valid") + } +} diff --git a/tui/assess_flags.go b/tui/assess_flags.go new file mode 100644 index 0000000..5e48c2d --- /dev/null +++ b/tui/assess_flags.go @@ -0,0 +1,120 @@ +package main + +import ( + "flag" + "path/filepath" + "strings" +) + +// Assess-specific flag set, parsed independently from the report flag set so +// the two subcommands don't interfere. 
+var ( + flagAssessScopeMode = flag.String("scope-mode", "", "Assess scope mode: org | local-repo | both") + flagAssessOrg = flag.String("target-org", "", "GitHub org to assess (when scope-mode=org or both)") + flagAssessRepos = flag.String("target-repos", "", "Comma-separated repo names to assess (optional)") + flagAssessPath = flag.String("path", "", "Local repo path to assess (when scope-mode=local-repo or both)") + flagAssessDisplayName = flag.String("display-name", "", "Override the audit's scope display name") + flagAssessTier = flag.String("evidence-tier", "", "Override evidence-tier detection: auto | gh | github-mcp | git-only") + flagAssessAnswers = flag.String("interview-answers", "", "Path to a JSON file with pre-supplied interview answers (headless)") + flagAssessOutput = flag.String("audit-output", "", "Output file path (default: timestamped in current directory)") + flagAssessOutputFormat = flag.String("audit-output-format", "", "Output format: markdown | json | both (default: both)") + flagAssessDryRun = flag.Bool("dry-run", false, "Skip the AI scorer (writes a placeholder audit)") + flagAssessFlushCache = flag.Bool("flush-assess-cache", false, "Flush the maturity-assessment cache before running") + flagAssessShowConfig = flag.Bool("show-assess-config", false, "Print saved assess configuration as JSON and exit") +) + +// applyAssessFlagsTo merges explicitly-set CLI flags into cfg. 
+func applyAssessFlagsTo(cfg *AssessConfig, wasSet func(string) bool) { + if wasSet("scope-mode") { + cfg.Scope.Mode = strings.TrimSpace(*flagAssessScopeMode) + } + if wasSet("target-org") { + cfg.Scope.Org = strings.TrimSpace(*flagAssessOrg) + } + if wasSet("target-repos") { + cfg.Scope.Repos = splitCSV(*flagAssessRepos) + } + if wasSet("path") { + cfg.Scope.LocalPath = strings.TrimSpace(*flagAssessPath) + } + if wasSet("display-name") { + cfg.Scope.DisplayName = strings.TrimSpace(*flagAssessDisplayName) + } + if wasSet("evidence-tier") { + cfg.EvidenceTier = strings.TrimSpace(*flagAssessTier) + } + if wasSet("interview-answers") { + cfg.InterviewAnswersPath = strings.TrimSpace(*flagAssessAnswers) + } + if wasSet("audit-output") { + cfg.OutputPath = strings.TrimSpace(*flagAssessOutput) + } + if wasSet("audit-output-format") { + cfg.OutputFormat = strings.TrimSpace(*flagAssessOutputFormat) + } + if wasSet("dry-run") { + cfg.DryRun = *flagAssessDryRun + } + if wasSet("flush-assess-cache") { + cfg.FlushCache = *flagAssessFlushCache + } +} + +// fillAssessDefaults populates required fields if they're missing. Mirrors +// DefaultAssessConfig but applied to an already-loaded config. 
+func fillAssessDefaults(cfg *AssessConfig) { + if cfg.Scope.Mode == "" { + if cfg.Scope.LocalPath != "" && cfg.Scope.Org == "" { + cfg.Scope.Mode = "local-repo" + } else if cfg.Scope.Org != "" && cfg.Scope.LocalPath == "" { + cfg.Scope.Mode = "org" + } else if cfg.Scope.Org != "" && cfg.Scope.LocalPath != "" { + cfg.Scope.Mode = "both" + } else { + cfg.Scope.Mode = "local-repo" + } + } + if cfg.Scope.DisplayName == "" { + switch cfg.Scope.Mode { + case "org": + cfg.Scope.DisplayName = cfg.Scope.Org + case "local-repo": + if cfg.Scope.LocalPath != "" { + cfg.Scope.DisplayName = filepath.Base(cfg.Scope.LocalPath) + } + case "both": + if cfg.Scope.Org != "" { + cfg.Scope.DisplayName = cfg.Scope.Org + } else if cfg.Scope.LocalPath != "" { + cfg.Scope.DisplayName = filepath.Base(cfg.Scope.LocalPath) + } + } + } + if cfg.OutputFormat == "" { + cfg.OutputFormat = "both" + } + if cfg.EvidenceTier == "" { + cfg.EvidenceTier = "auto" + } +} + +// hasMinimalAssessConfig returns true if enough config is present to run +// headless without further interactive input. +func hasMinimalAssessConfig(cfg *AssessConfig) bool { + if cfg == nil { + return false + } + switch cfg.Scope.Mode { + case "org", "both": + if strings.TrimSpace(cfg.Scope.Org) == "" { + return false + } + case "local-repo": + if strings.TrimSpace(cfg.Scope.LocalPath) == "" { + return false + } + default: + return false + } + return strings.TrimSpace(cfg.Scope.DisplayName) != "" +} diff --git a/tui/assess_preview.go b/tui/assess_preview.go new file mode 100644 index 0000000..eeeac4f --- /dev/null +++ b/tui/assess_preview.go @@ -0,0 +1,366 @@ +package main + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + + "github.com/charmbracelet/bubbles/spinner" + "github.com/charmbracelet/bubbles/viewport" + tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/glamour" + "github.com/charmbracelet/lipgloss" +) + +// Tab indices for the assess preview. 
Mirrors the report preview's +// (tabReport / tabDiscrepancy / tabJSON) — the assess flow has a different +// middle tab (Evidence) since maturity audits don't produce discrepancies. +const ( + assessTabAudit = 0 + assessTabEvidence = 1 + assessTabJSON = 2 + assessTabCount = 3 +) + +var assessTabLabels = [assessTabCount]string{"Audit", "Evidence", "JSON Data"} + +type assessRenderedMsg struct { + rendered [assessTabCount]string +} + +type assessPreviewModel struct { + path string + jsonPath string + markdown string + jsonData string + renderErr string + + activeTab int + viewports [assessTabCount]viewport.Model + + width int + height int + + rendering bool + spinner spinner.Model +} + +func newAssessPreviewModel(path, jsonPath, jsonData string) assessPreviewModel { + absPath, _ := filepath.Abs(path) + + md, err := os.ReadFile(absPath) + renderErr := "" + content := "" + if err != nil { + renderErr = fmt.Sprintf("Could not read audit file: %v", err) + } else { + content = string(md) + } + + w := termWidth() + vpWidth := max(20, w-2) + var vps [assessTabCount]viewport.Model + for i := range vps { + vps[i] = viewport.New(vpWidth, 8) + } + + s := spinner.New() + s.Spinner = spinner.Dot + s.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + + return assessPreviewModel{ + path: absPath, + jsonPath: jsonPath, + markdown: content, + jsonData: jsonData, + renderErr: renderErr, + activeTab: assessTabAudit, + viewports: vps, + width: w, + height: 24, + rendering: true, + spinner: s, + } +} + +func (m assessPreviewModel) Init() tea.Cmd { + return tea.Batch(m.spinner.Tick, tea.WindowSize(), m.renderContentCmd()) +} + +func (m assessPreviewModel) renderContentCmd() tea.Cmd { + markdown := m.markdown + renderErr := m.renderErr + jsonData := m.jsonData + width := max(20, m.width-2) + + return func() tea.Msg { + var rendered [assessTabCount]string + + // Audit tab — full Glamour render of the markdown audit. 
+ if renderErr != "" { + rendered[assessTabAudit] = lipgloss.NewStyle(). + Foreground(lipgloss.Color("9")). + Render(renderErr) + } else { + wrap := max(20, width-2) + r, err := glamour.NewTermRenderer( + glamourStyleOption(), + glamour.WithWordWrap(wrap), + ) + rendered[assessTabAudit] = markdown + if err == nil { + if out, gErr := r.Render(markdown); gErr == nil { + rendered[assessTabAudit] = out + } + } + } + + // Evidence tab — extract evidence facts from the JSON if available. + evidenceMd := buildAssessEvidenceMarkdown(jsonData) + wrap := max(20, width-2) + r, err := glamour.NewTermRenderer( + glamourStyleOption(), + glamour.WithWordWrap(wrap), + ) + rendered[assessTabEvidence] = evidenceMd + if err == nil { + if out, gErr := r.Render(evidenceMd); gErr == nil { + rendered[assessTabEvidence] = out + } + } + + // JSON tab — pretty-print + colorize. + if jsonData == "" { + dim := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + rendered[assessTabJSON] = dim.Render("No JSON data available.") + } else { + pretty := jsonData + var raw json.RawMessage + if jErr := json.Unmarshal([]byte(jsonData), &raw); jErr == nil { + if indented, iErr := json.MarshalIndent(raw, "", " "); iErr == nil { + pretty = string(indented) + } + } + rendered[assessTabJSON] = renderJSONContent(pretty) + } + + return assessRenderedMsg{rendered: rendered} + } +} + +func (m assessPreviewModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg := msg.(type) { + case assessRenderedMsg: + m.rendering = false + for i, content := range msg.rendered { + m.viewports[i].SetContent(content) + m.viewports[i].GotoTop() + } + return m, nil + + case spinner.TickMsg: + if m.rendering { + var cmd tea.Cmd + m.spinner, cmd = m.spinner.Update(msg) + return m, cmd + } + return m, nil + + case tea.WindowSizeMsg: + m.width = msg.Width + m.height = msg.Height + m.reflow() + if !m.rendering { + m.rendering = true + return m, tea.Batch(m.spinner.Tick, m.renderContentCmd()) + } + return m, nil + + case 
tea.KeyMsg: + switch msg.String() { + case "ctrl+c", "q", "esc", "enter": + return m, tea.Quit + case "tab", "right", "l": + m.activeTab = (m.activeTab + 1) % assessTabCount + return m, nil + case "shift+tab", "left", "h": + m.activeTab = (m.activeTab - 1 + assessTabCount) % assessTabCount + return m, nil + } + if !m.rendering { + var cmd tea.Cmd + m.viewports[m.activeTab], cmd = m.viewports[m.activeTab].Update(msg) + return m, cmd + } + return m, nil + } + + var cmd tea.Cmd + m.viewports[m.activeTab], cmd = m.viewports[m.activeTab].Update(msg) + return m, cmd +} + +func (m assessPreviewModel) View() string { + m.reflow() + + header := renderShellHeader(m.width) + contentWidth := max(20, m.width-2) + + titleStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("10")) + labelStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + pathStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + helpStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + + infoLines := []string{ + titleStyle.Render("Audit Ready"), + labelStyle.Render("Markdown: ") + pathStyle.Render("file://"+m.path), + } + if m.jsonPath != "" { + infoLines = append( + infoLines, + labelStyle.Render("JSON: ")+pathStyle.Render("file://"+m.jsonPath), + ) + } + infoLines = append( + infoLines, + helpStyle.Render("Tab/Arrow to switch tabs, scroll to read, Enter/q to exit."), + ) + info := lipgloss.JoinVertical(lipgloss.Left, infoLines...) + + infoFrame := lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). + Padding(0, 1). + Width(contentWidth). 
+ Render(info) + + tabBar := m.renderTabBar() + + var tabContent string + if m.rendering { + dim := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + tabContent = "\n " + m.spinner.View() + dim.Render(" Rendering audit…") + } else { + tabContent = m.viewports[m.activeTab].View() + } + + previewBody := lipgloss.JoinVertical(lipgloss.Left, tabBar, tabContent) + previewFrame := lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). + Padding(0, 1). + Width(contentWidth). + Height(m.previewFrameHeight()). + Render(previewBody) + + return lipgloss.JoinVertical(lipgloss.Left, header, "", infoFrame, "", previewFrame) +} + +func (m *assessPreviewModel) renderTabBar() string { + activeStyle := lipgloss.NewStyle(). + Bold(true). + Foreground(lipgloss.Color("212")). + Border(lipgloss.NormalBorder(), false, false, true, false). + BorderForeground(lipgloss.Color("212")) + inactiveStyle := lipgloss.NewStyle(). + Foreground(lipgloss.Color("241")). + Border(lipgloss.NormalBorder(), false, false, true, false). + BorderForeground(lipgloss.Color("240")) + + var tabs []string + for i, label := range assessTabLabels { + display := " " + label + " " + if i == assessTabJSON && m.jsonData != "" { + display += "✔ " + } + if i == m.activeTab { + tabs = append(tabs, activeStyle.Render(display)) + } else { + tabs = append(tabs, inactiveStyle.Render(display)) + } + } + return lipgloss.JoinHorizontal(lipgloss.Top, tabs...) 
// buildAssessEvidenceMarkdown turns the audit JSON into the markdown shown on
// the Evidence tab.
//
// It emits one "### Item <itemId> — score <score>" section per object in the
// top-level "items" array (followed by "whyThisScore" when that is a
// non-empty string), then a bulleted "Notes for re-audit" list built from the
// string entries of "notesForReaudit". Non-object items and non-string notes
// are skipped.
//
// Placeholders are returned when jsonData is empty, when it fails to parse as
// a JSON object, or when there are no items (in which case notes are not
// rendered either).
func buildAssessEvidenceMarkdown(jsonData string) string {
	if jsonData == "" {
		return "## Evidence\n\n_No JSON data available — re-run with `--audit-output-format both`._\n"
	}

	var audit map[string]any
	if parseErr := json.Unmarshal([]byte(jsonData), &audit); parseErr != nil {
		return fmt.Sprintf("## Evidence\n\n_Failed to parse audit JSON: %v_\n", parseErr)
	}

	doc := []byte("## Per-item evidence\n\n")

	entries, _ := audit["items"].([]any)
	if len(entries) == 0 {
		return string(append(doc, "_No items in audit JSON._\n"...))
	}
	for _, entry := range entries {
		fields, isObject := entry.(map[string]any)
		if !isObject {
			continue
		}
		doc = fmt.Appendf(doc, "### Item %v — score %v\n\n", fields["itemId"], fields["score"])
		if reason, _ := fields["whyThisScore"].(string); reason != "" {
			doc = append(doc, reason...)
			doc = append(doc, "\n\n"...)
		}
	}

	if notes, _ := audit["notesForReaudit"].([]any); len(notes) > 0 {
		doc = append(doc, "\n## Notes for re-audit\n\n"...)
		for _, note := range notes {
			text, isString := note.(string)
			if !isString {
				continue
			}
			doc = append(doc, "- "...)
			doc = append(doc, text...)
			doc = append(doc, '\n')
		}
	}

	return string(doc)
}
out) + } + if !strings.Contains(out, "Re-check item 4 next quarter.") { + t.Errorf("missing note text: %s", out) + } +} + +func TestNewAssessPreviewModel_ReadsMarkdown(t *testing.T) { + dir := t.TempDir() + mdPath := filepath.Join(dir, "audit.md") + if err := os.WriteFile(mdPath, []byte("# Audit\n\nbody"), 0o600); err != nil { + t.Fatalf("write fixture: %v", err) + } + m := newAssessPreviewModel(mdPath, "", "") + if m.renderErr != "" { + t.Errorf("unexpected renderErr: %s", m.renderErr) + } + if !strings.Contains(m.markdown, "# Audit") { + t.Errorf("markdown not loaded: %s", m.markdown) + } + if m.activeTab != assessTabAudit { + t.Errorf("activeTab = %d, want %d", m.activeTab, assessTabAudit) + } +} + +func TestNewAssessPreviewModel_MissingFile(t *testing.T) { + m := newAssessPreviewModel("/no/such/file.md", "", "") + if !strings.Contains(m.renderErr, "Could not read audit file") { + t.Errorf("expected read error, got: %s", m.renderErr) + } +} + +func TestAssessPreview_TabBarHasThreeTabs(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", `{"items":[]}`) + bar := m.renderTabBar() + for _, label := range assessTabLabels { + if !strings.Contains(bar, label) { + t.Errorf("tab bar missing label %q: %s", label, bar) + } + } +} + +func TestAssessPreview_TabBarShowsJSONCheckmark(t *testing.T) { + withData := newAssessPreviewModel("/no/such.md", "", `{"items":[]}`) + bar1 := withData.renderTabBar() + if !strings.Contains(bar1, "✔") { + t.Errorf("expected ✔ next to JSON tab when data present: %s", bar1) + } + + withoutData := newAssessPreviewModel("/no/such.md", "", "") + bar2 := withoutData.renderTabBar() + if strings.Contains(bar2, "✔") { + t.Errorf("did not expect ✔ when no JSON data: %s", bar2) + } +} diff --git a/tui/assess_progress.go b/tui/assess_progress.go new file mode 100644 index 0000000..b1999e9 --- /dev/null +++ b/tui/assess_progress.go @@ -0,0 +1,581 @@ +package main + +import ( + "fmt" + "os" + "strings" + "time" + + 
"github.com/charmbracelet/bubbles/progress" + "github.com/charmbracelet/bubbles/spinner" + "github.com/charmbracelet/bubbles/viewport" + tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/lipgloss" +) + +// assessStepState mirrors stepState but tracks the assess pipeline. +type assessStepState struct { + text string + status string // "active", "complete", "failed" + message string + startedAt time.Time + finishedAt time.Time +} + +// assessProgressModel is the Bubble Tea model for the maturity-assessment progress display. +// Mirrors progressModel (report) so visual design matches: two-pane layout, step list with +// ✔/✖/spinner icons, monotonic progress bar, right-side configuration summary. +type assessProgressModel struct { + steps []assessStepState + expectedSteps []string // canonical pipeline order, used to compute progress + show all steps from start + spinner spinner.Model + progressBar progress.Model + shellViewport viewport.Model + viewport viewport.Model + cfg *AssessConfig + title string + resultPath string + jsonPath string + jsonData string + errorMsg string + pendingQuestion *GenericEvent // set when an interview-question event arrives mid-flow + answersSent int + totalQuestions int + done bool + width int + height int + peakRatio float64 + cancelled bool +} + +// Messages used by the assess progress program. +type assessStepMsg GenericEvent +type assessDoneMsg struct{} +type assessAskMsg struct{ evt GenericEvent } +type assessAnswerSentMsg struct{ ok bool } +type assessFatalMsg struct{ err error } + +// canonicalAssessSteps drives the right-pane progress denominator and the +// always-visible step list. 
+var canonicalAssessSteps = []string{ + "startup", + "preflight", + "adjacent-repos", + "interview", + "evidence", + "scoring", + "writing", + "audit-store", + "complete", +} + +func newAssessProgressModel(title string, cfg *AssessConfig, totalQuestions int) assessProgressModel { + s := spinner.New() + s.Spinner = spinner.Dot + s.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + + p := progress.New( + progress.WithDefaultGradient(), + progress.WithoutPercentage(), + ) + + w := termWidth() + p.Width = max(10, w-16) + + vp := viewport.New(max(20, w-6), 8) + shell := viewport.New(max(20, w), 24) + + return assessProgressModel{ + spinner: s, + progressBar: p, + shellViewport: shell, + viewport: vp, + cfg: cfg, + title: title, + expectedSteps: canonicalAssessSteps, + totalQuestions: totalQuestions, + width: w, + height: 24, + } +} + +func (m assessProgressModel) Init() tea.Cmd { + return tea.Batch(m.spinner.Tick, tea.WindowSize()) +} + +func (m assessProgressModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg := msg.(type) { + case tea.WindowSizeMsg: + m.width = msg.Width + m.height = msg.Height + m.reflow() + m.syncViewportContent() + return m, nil + + case tea.KeyMsg: + switch msg.String() { + case "ctrl+c", "q": + if m.pendingQuestion == nil { + m.cancelled = true + m.done = true + return m, tea.Quit + } + } + var cmd tea.Cmd + m.viewport, cmd = m.viewport.Update(msg) + return m, cmd + + case spinner.TickMsg: + var cmd tea.Cmd + m.spinner, cmd = m.spinner.Update(msg) + m.syncViewportContent() + return m, cmd + + case progress.FrameMsg: + var cmd tea.Cmd + model, nextCmd := m.progressBar.Update(msg) + if pm, ok := model.(progress.Model); ok { + m.progressBar = pm + } + cmd = nextCmd + return m, cmd + + case assessStepMsg: + return m.handleStep(GenericEvent(msg)) + + case assessDoneMsg: + m.done = true + return m, tea.Quit + + case assessFatalMsg: + m.errorMsg = msg.err.Error() + m.done = true + return m, tea.Quit + } + + return m, nil +} + 
+func (m assessProgressModel) handleStep(evt GenericEvent) (tea.Model, tea.Cmd) { + switch evt.Type { + case "progress": + idx := m.findStep(evt.Step) + switch evt.Status { + case "active": + if idx < 0 { + m.steps = append(m.steps, assessStepState{ + text: evt.Step, + status: "active", + message: evt.Message, + startedAt: time.Now(), + }) + } else { + m.steps[idx].status = "active" + if evt.Message != "" { + m.steps[idx].message = evt.Message + } + } + case "complete": + if idx >= 0 { + m.steps[idx].status = "complete" + m.steps[idx].finishedAt = time.Now() + if evt.Message != "" { + m.steps[idx].message = evt.Message + } + } else { + m.steps = append(m.steps, assessStepState{ + text: evt.Step, + status: "complete", + message: evt.Message, + startedAt: time.Now(), + finishedAt: time.Now(), + }) + } + case "failed": + if idx >= 0 { + m.steps[idx].status = "failed" + m.steps[idx].finishedAt = time.Now() + if evt.Message != "" { + m.steps[idx].message = evt.Message + } + } + } + m.recalcPeakRatio() + m.syncViewportContent() + return m, m.progressBar.SetPercent(m.peakRatio) + + case "interview-frame": + // Surface the framing message inline as an active step note. 
+ m.upsertActive("interview", evt.Message) + m.syncViewportContent() + return m, nil + + case "interview-question": + m.pendingQuestion = &evt + m.upsertActive( + "interview", + fmt.Sprintf("Question %d of %d — awaiting answer (%s)…", m.answersSent+1, m.totalQuestions, evt.QuestionID), + ) + m.syncViewportContent() + return m, nil + + case "result": + m.resultPath = evt.OutputPath + m.jsonPath = evt.JsonOutputPath + if len(evt.Data) > 0 { + m.jsonData = string(evt.Data) + } + m.done = true + return m, tea.Quit + + case "report-data": + if len(evt.Data) > 0 { + m.jsonData = string(evt.Data) + } + return m, nil + + case "error": + m.errorMsg = evt.Message + m.done = true + return m, tea.Quit + } + + return m, nil +} + +func (m *assessProgressModel) upsertActive(stepName, message string) { + idx := m.findStep(stepName) + if idx < 0 { + m.steps = append(m.steps, assessStepState{ + text: stepName, + status: "active", + message: message, + startedAt: time.Now(), + }) + return + } + if m.steps[idx].status == "active" { + m.steps[idx].message = message + } +} + +func (m assessProgressModel) findStep(step string) int { + for i, s := range m.steps { + if s.text == step { + return i + } + } + return -1 +} + +// recalcPeakRatio computes a monotonically increasing ratio over the +// canonical step list. Steps not yet seen contribute 0; active steps +// contribute 0.5; complete/failed contribute 1. 
+func (m *assessProgressModel) recalcPeakRatio() { + denom := float64(len(m.expectedSteps)) + if denom == 0 { + return + } + var completed float64 + for _, name := range m.expectedSteps { + idx := m.findStep(name) + if idx < 0 { + continue + } + switch m.steps[idx].status { + case "complete", "failed": + completed += 1.0 + case "active": + completed += 0.5 + } + } + ratio := completed / denom + if ratio > 1.0 { + ratio = 1.0 + } + if ratio > m.peakRatio { + m.peakRatio = ratio + } +} + +func (m assessProgressModel) View() string { + if m.done && m.resultPath == "" && m.errorMsg == "" { + return "" + } + + m.reflow() + m.syncViewportContent() + + title := renderShellHeader(m.width) + leftPanel := m.renderProgressPanel() + rightPanel := m.renderConfigPanel() + + left := lipgloss.NewStyle().Width(m.leftPanelWidth()).Render(leftPanel) + right := lipgloss.NewStyle().Width(m.rightPanelWidth()).Render(rightPanel) + body := lipgloss.JoinHorizontal(lipgloss.Top, left, " ", right) + + shell := lipgloss.JoinVertical(lipgloss.Left, title, "", body) + m.shellViewport.SetContent(shell) + return m.shellViewport.View() +} + +func (m *assessProgressModel) leftPanelWidth() int { + w := m.width + if w <= 0 { + w = 80 + } + lw := w * 3 / 5 + if lw < 32 { + lw = 32 + } + return lw +} + +func (m *assessProgressModel) rightPanelWidth() int { + w := m.width + if w <= 0 { + w = 80 + } + rw := w - m.leftPanelWidth() - 2 + if rw < 24 { + rw = 24 + } + return rw +} + +func (m *assessProgressModel) contentWidth() int { + frame := lipgloss.NewStyle().Border(lipgloss.RoundedBorder()).Padding(0, 1) + return max(20, m.leftPanelWidth()-frame.GetHorizontalFrameSize()) +} + +func (m *assessProgressModel) viewportHeight() int { + h := m.height + if h <= 0 { + h = 24 + } + available := h - 12 + if available < 4 { + available = 4 + } + return min(14, available) +} + +func (m *assessProgressModel) reflow() { + m.progressBar.Width = max(10, m.contentWidth()-6) + m.viewport.Width = m.contentWidth() + 
m.viewport.Height = m.viewportHeight() + if m.width <= 0 { + m.width = 80 + } + if m.height <= 0 { + m.height = 24 + } + m.shellViewport.Width = m.width + m.shellViewport.Height = m.height +} + +func (m *assessProgressModel) syncViewportContent() { + doneIcon := lipgloss.NewStyle().Foreground(lipgloss.Color("10")).Render("✔") + errIcon := lipgloss.NewStyle().Foreground(lipgloss.Color("9")).Render("✖") + dim := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + pendingDim := lipgloss.NewStyle().Foreground(lipgloss.Color("239")) + + lines := make([]string, 0, len(m.expectedSteps)*2) + now := time.Now() + + for _, name := range m.expectedSteps { + idx := m.findStep(name) + if idx < 0 { + lines = append(lines, m.fitLine(pendingDim.Render("○ "+humanizeStep(name)))) + continue + } + s := m.steps[idx] + switch s.status { + case "complete": + label := s.message + if label == "" { + label = humanizeStep(s.text) + } + line := doneIcon + " " + label + if elapsed := assessStepElapsed(s, now); elapsed != "" { + line += " — " + elapsed + } + lines = append(lines, m.fitLine(dim.Render(line))) + case "failed": + label := s.message + if label == "" { + label = humanizeStep(s.text) + } + line := errIcon + " " + label + lines = append(lines, m.fitLine(line)) + case "active": + line := m.spinner.View() + " " + humanizeStep(s.text) + if elapsed := assessStepElapsed(s, now); elapsed != "" { + line += " — " + elapsed + } + lines = append(lines, m.fitLine(line)) + if s.message != "" && s.message != s.text { + lines = append(lines, m.fitLine(dim.Render(" "+s.message))) + } + } + } + + m.viewport.SetContent(strings.Join(lines, "\n")) + m.viewport.GotoBottom() +} + +func (m *assessProgressModel) fitLine(line string) string { + maxWidth := m.viewport.Width + if maxWidth <= 0 { + maxWidth = 20 + } + if lipgloss.Width(line) <= maxWidth { + return line + } + runes := []rune(line) + for len(runes) > 0 && lipgloss.Width(string(runes)) > maxWidth-1 { + runes = runes[:len(runes)-1] + } + return 
string(runes) + "…" +} + +func (m assessProgressModel) renderProgressPanel() string { + contentWidth := m.contentWidth() + title := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")).Render(m.title) + + pctStr := fmt.Sprintf("%3d%%", int(m.peakRatio*100)) + bar := m.progressBar.View() + barPadding := max(0, contentWidth-lipgloss.Width(bar)-1-len(pctStr)) + progressLine := fmt.Sprintf("%s %s%s", bar, pctStr, strings.Repeat(" ", barPadding)) + + frame := lipgloss.NewStyle(). + Border(lipgloss.HiddenBorder()). + Padding(0, 1). + Width(contentWidth) + + body := lipgloss.JoinVertical( + lipgloss.Left, + title, + "", + progressLine, + renderDividerLine(m.viewport.Width), + m.viewport.View(), + ) + return frame.Render(body) +} + +func (m assessProgressModel) renderConfigPanel() string { + if m.cfg == nil { + return lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). + Padding(0, 1). + Width(max(20, m.rightPanelWidth()-4)). + Render("Configuration unavailable") + } + return renderAssessSummary(m.cfg, m.rightPanelWidth()) +} + +// AssessProgressResult is the outcome of a maturity-assessment progress run. +type AssessProgressResult struct { + ResultPath string + JsonPath string + JsonData string + ErrorMsg string + Cancelled bool +} + +// RunAssessProgressDisplay drives the Bubble Tea progress program for the assess flow. +// It owns event-channel reading, dispatches interview prompts to a callback, and +// returns the final result + path data (or an error). +// +// askInterview is invoked synchronously when an "interview-question" event arrives; +// it must return the answer + isOption flag (or an error to abort). The Tea program +// exits the alt-screen for the duration of the prompt so huh can take over the TTY, +// then resumes. 
+func RunAssessProgressDisplay( + title string, + cfg *AssessConfig, + res *AssessRunResult, + askInterview func(evt GenericEvent) (string, bool, error), +) AssessProgressResult { + m := newAssessProgressModel(title, cfg, 7) + + p := tea.NewProgram(m, tea.WithOutput(os.Stderr), tea.WithAltScreen()) + + go func() { + for evt := range res.Events { + if evt.Type == "interview-question" { + // Pause the alt-screen, run the prompt synchronously, then forward the answer. + p.ReleaseTerminal() + value, isOption, err := askInterview(evt) + if err != nil { + p.Send(assessFatalMsg{err: err}) + p.RestoreTerminal() + continue + } + _ = SendInterviewAnswer(res, evt.QuestionID, value, isOption) + p.RestoreTerminal() + p.Send(assessStepMsg(evt)) + continue + } + p.Send(assessStepMsg(evt)) + } + p.Send(assessDoneMsg{}) + }() + + finalModel, err := teaProgramRun(p) + if err != nil { + return AssessProgressResult{ErrorMsg: fmt.Sprintf("TUI error: %v", err)} + } + final, ok := finalModel.(assessProgressModel) + if !ok { + return AssessProgressResult{ErrorMsg: "internal: unexpected final model"} + } + return AssessProgressResult{ + ResultPath: final.resultPath, + JsonPath: final.jsonPath, + JsonData: final.jsonData, + ErrorMsg: final.errorMsg, + Cancelled: final.cancelled, + } +} + +func assessStepElapsed(s assessStepState, now time.Time) string { + if s.startedAt.IsZero() { + return "" + } + if !s.finishedAt.IsZero() { + return formatElapsed(s.finishedAt.Sub(s.startedAt)) + } + if dur := now.Sub(s.startedAt); dur >= 3*time.Second { + return formatElapsed(dur) + } + return "" +} + +// humanizeStep maps the lower-kebab step name to a label that fits the +// existing report's tone (capitalized verb-phrases). 
+func humanizeStep(step string) string { + switch step { + case "startup": + return "Initializing assessment" + case "preflight": + return "Detecting evidence tier" + case "adjacent-repos": + return "Mapping adjacent repositories" + case "interview": + return "Phase-1 interview" + case "evidence": + return "Collecting evidence" + case "scoring": + return "AI scoring" + case "writing": + return "Writing audit" + case "audit-store": + return "Updating CONFIG.md" + case "complete": + return "Audit complete" + } + return step +} diff --git a/tui/assess_progress_test.go b/tui/assess_progress_test.go new file mode 100644 index 0000000..d36b5ca --- /dev/null +++ b/tui/assess_progress_test.go @@ -0,0 +1,205 @@ +package main + +import ( + "strings" + "testing" + + tea "github.com/charmbracelet/bubbletea" +) + +func newProgressForTest() assessProgressModel { + cfg := DefaultAssessConfig() + m := newAssessProgressModel("Test", &cfg, 7) + m.width = 100 + m.height = 30 + m.reflow() + return m +} + +func TestNewAssessProgressModel(t *testing.T) { + m := newProgressForTest() + if m.title != "Test" { + t.Errorf("title = %q, want %q", m.title, "Test") + } + if len(m.expectedSteps) != len(canonicalAssessSteps) { + t.Errorf("expectedSteps = %d, want %d", len(m.expectedSteps), len(canonicalAssessSteps)) + } + if m.totalQuestions != 7 { + t.Errorf("totalQuestions = %d, want 7", m.totalQuestions) + } + if m.cfg == nil { + t.Error("cfg is nil") + } +} + +func TestAssessProgress_HandleStepActiveAddsStep(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "active", Message: "starting"}) + model := updated.(assessProgressModel) + if len(model.steps) != 1 { + t.Fatalf("steps count = %d, want 1", len(model.steps)) + } + if model.steps[0].text != "preflight" || model.steps[0].status != "active" { + t.Errorf("step = %+v", model.steps[0]) + } +} + +func TestAssessProgress_HandleStepCompletePromotesStep(t *testing.T) { 
+ m := newProgressForTest() + m1, _ := m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "active", Message: "..."}) + m2, _ := m1.(assessProgressModel).handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "complete", Message: "Tier resolved: gh"}) + final := m2.(assessProgressModel) + if final.steps[0].status != "complete" { + t.Errorf("status = %q, want complete", final.steps[0].status) + } + if final.steps[0].message != "Tier resolved: gh" { + t.Errorf("message = %q", final.steps[0].message) + } + if final.steps[0].finishedAt.IsZero() { + t.Error("finishedAt not set") + } +} + +func TestAssessProgress_PeakRatioMonotonic(t *testing.T) { + m := newProgressForTest() + steps := []string{"startup", "preflight", "adjacent-repos", "interview"} + for _, s := range steps { + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: s, Status: "complete"}) + m = updated.(assessProgressModel) + } + if m.peakRatio <= 0 { + t.Errorf("peakRatio = %f, want >0", m.peakRatio) + } + prev := m.peakRatio + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "active"}) + final := updated.(assessProgressModel) + if final.peakRatio < prev { + t.Errorf("peakRatio regressed: prev=%f now=%f", prev, final.peakRatio) + } +} + +func TestAssessProgress_ResultEventStoresPaths(t *testing.T) { + m := newProgressForTest() + updated, cmd := m.handleStep(GenericEvent{ + Type: "result", + OutputPath: "./audit.md", + JsonOutputPath: "./audit.json", + Data: []byte(`{"items":[]}`), + }) + final := updated.(assessProgressModel) + if final.resultPath != "./audit.md" { + t.Errorf("resultPath = %q", final.resultPath) + } + if final.jsonPath != "./audit.json" { + t.Errorf("jsonPath = %q", final.jsonPath) + } + if !final.done { + t.Error("done not set on result") + } + if cmd == nil { + t.Error("expected a tea.Quit cmd on result") + } +} + +func TestAssessProgress_ErrorEventCapturesMessage(t *testing.T) { + m := newProgressForTest() 
+ updated, _ := m.handleStep(GenericEvent{Type: "error", Message: "boom"}) + final := updated.(assessProgressModel) + if final.errorMsg != "boom" { + t.Errorf("errorMsg = %q", final.errorMsg) + } + if !final.done { + t.Error("done not set on error") + } +} + +func TestAssessProgress_InterviewQuestionUpdatesActiveStep(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "interview", Status: "active", Message: "Gathering Phase-1…"}) + m = updated.(assessProgressModel) + updated, _ = m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q1", + QuestionText: "what?", + }) + final := updated.(assessProgressModel) + if final.pendingQuestion == nil { + t.Fatal("pendingQuestion should be set") + } + idx := final.findStep("interview") + if idx < 0 { + t.Fatal("interview step not found") + } + if !strings.Contains(final.steps[idx].message, "awaiting answer") { + t.Errorf("message = %q, want awaiting-answer note", final.steps[idx].message) + } +} + +func TestAssessProgress_HumanizeStep(t *testing.T) { + cases := map[string]string{ + "startup": "Initializing assessment", + "preflight": "Detecting evidence tier", + "adjacent-repos": "Mapping adjacent repositories", + "interview": "Phase-1 interview", + "evidence": "Collecting evidence", + "scoring": "AI scoring", + "writing": "Writing audit", + "audit-store": "Updating CONFIG.md", + "complete": "Audit complete", + "unknown-step": "unknown-step", + } + for input, want := range cases { + got := humanizeStep(input) + if got != want { + t.Errorf("humanizeStep(%q) = %q, want %q", input, got, want) + } + } +} + +func TestAssessProgress_ViewRendersTitleAndPanels(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "active"}) + m = updated.(assessProgressModel) + view := m.View() + if !strings.Contains(view, "Test") { + t.Error("view missing title") + } + if !strings.Contains(view, 
"Assessment Setup") { + t.Error("view missing right-pane summary header") + } +} + +func TestAssessProgress_KeyCtrlCSetsCancelled(t *testing.T) { + m := newProgressForTest() + updated, cmd := m.Update(tea.KeyMsg{Type: tea.KeyCtrlC}) + final := updated.(assessProgressModel) + if !final.cancelled { + t.Error("cancelled should be true after ctrl+c") + } + if !final.done { + t.Error("done should be true after ctrl+c") + } + if cmd == nil { + t.Error("expected tea.Quit cmd") + } +} + +func TestAssessProgress_PendingQuestionBlocksCancel(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{Type: "interview-question", QuestionID: "q1", QuestionText: "?"}) + m = updated.(assessProgressModel) + updated2, _ := m.Update(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune{'q'}}) + final := updated2.(assessProgressModel) + if final.cancelled { + t.Error("cancelled should remain false while a question is pending") + } +} + +func TestAssessProgress_ViewportSizesScaleWithWindow(t *testing.T) { + m := newProgressForTest() + updated, _ := m.Update(tea.WindowSizeMsg{Width: 200, Height: 60}) + final := updated.(assessProgressModel) + if final.contentWidth() < 50 { + t.Errorf("contentWidth too small at width=200: got %d", final.contentWidth()) + } +} diff --git a/tui/assess_protocol.go b/tui/assess_protocol.go new file mode 100644 index 0000000..087e062 --- /dev/null +++ b/tui/assess_protocol.go @@ -0,0 +1,34 @@ +package main + +// Assess-flow JSON-lines protocol additions, parallel to protocol.go. +// The shared envelope is GenericEvent (defined there); these types describe +// events specific to the maturity assessment. + +// InterviewFrameEvent precedes the first question in an interactive run. +type InterviewFrameEvent struct { + Type string `json:"type"` + Message string `json:"message"` +} + +// InterviewQuestionEvent — service → TUI. The TUI must reply with an +// InterviewAnswerEvent before the service can proceed. 
+type InterviewQuestionEvent struct { + Type string `json:"type"` + QuestionID string `json:"questionId"` + QuestionText string `json:"questionText"` + Options []string `json:"options"` + AllowFreeText bool `json:"allowFreeText"` + ConfigHeading string `json:"configHeading"` +} + +// InterviewAnswerEvent — TUI → service over the subprocess stdin. +type InterviewAnswerEvent struct { + Type string `json:"type"` + QuestionID string `json:"questionId"` + Value string `json:"value"` + IsOption bool `json:"isOption"` +} + +// AssessResultData carries the rendered audit JSON. We use json.RawMessage on +// GenericEvent.Data; concrete Go-side parsing is not needed — the TUI just +// surfaces the raw markdown to the preview. diff --git a/tui/assess_runner.go b/tui/assess_runner.go new file mode 100644 index 0000000..eeffa50 --- /dev/null +++ b/tui/assess_runner.go @@ -0,0 +1,166 @@ +package main + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" +) + +// assessScriptPath returns the path to scripts/run-assess.ts. Mirrors +// resolveScriptPath but for the assess service runner. +func assessScriptPath() string { + exePath, err := os.Executable() + if err == nil { + dir := filepath.Dir(exePath) + candidate := filepath.Join(dir, "..", "scripts", "run-assess.ts") + if _, err := os.Stat(candidate); err == nil { + return candidate + } + } + candidates := []string{ + "scripts/run-assess.ts", + "./scripts/run-assess.ts", + } + for _, c := range candidates { + if _, err := os.Stat(c); err == nil { + return c + } + } + home, _ := os.UserHomeDir() + if home != "" { + fallback := filepath.Join(home, "teamhero.cli", "scripts", "run-assess.ts") + if _, err := os.Stat(fallback); err == nil { + return fallback + } + } + return "scripts/run-assess.ts" +} + +// AssessRunResult bundles the channels for stream consumption + a stdin +// writer the TUI uses to send interview-answer events back. 
+type AssessRunResult struct { + Events <-chan GenericEvent + Errors <-chan error + Stderr *bytes.Buffer + StdinW io.WriteCloser + closeFns []func() +} + +// Close cleans up the stdin writer if not already closed. +func (r *AssessRunResult) Close() { + for _, fn := range r.closeFns { + fn() + } +} + +// RunAssessServiceRunner spawns the TS service runner for the maturity +// assessment. The first stdin write is the AssessConfig JSON; the stream is +// kept open so the TUI can send subsequent interview-answer JSON lines. +func RunAssessServiceRunner(input AssessConfig) (*AssessRunResult, error) { + configJSON, err := json.Marshal(input) + if err != nil { + return nil, fmt.Errorf("failed to marshal assess config: %w", err) + } + + var cmd *exec.Cmd + if serviceBin := resolveServiceBinary(); serviceBin != "" { + // In future the bundled service could route to assess via env var. + // For now, prefer the bun script if available. + bunPath := resolveBunBinary() + scriptPath := assessScriptPath() + if _, statErr := os.Stat(scriptPath); statErr == nil { + cmd = exec.Command(bunPath, "run", scriptPath) + } else { + cmd = exec.Command(serviceBin, "--mode=assess") + } + } else { + bunPath := resolveBunBinary() + scriptPath := assessScriptPath() + cmd = exec.Command(bunPath, "run", scriptPath) + } + + stderrBuf := &bytes.Buffer{} + cmd.Stderr = stderrBuf + + stdinPipe, err := cmd.StdinPipe() + if err != nil { + return nil, fmt.Errorf("failed to create stdin pipe: %w", err) + } + stdoutPipe, err := cmd.StdoutPipe() + if err != nil { + return nil, fmt.Errorf("failed to create stdout pipe: %w", err) + } + + if err := cmd.Start(); err != nil { + return nil, fmt.Errorf("failed to start assess runner: %w", err) + } + + // Send the config as the first JSON line; keep stdin open afterward. 
+ if _, err := stdinPipe.Write(append(configJSON, '\n')); err != nil { + stdinPipe.Close() + return nil, fmt.Errorf("failed to write config: %w", err) + } + + eventCh := make(chan GenericEvent, 64) + errCh := make(chan error, 1) + + go func() { + defer close(eventCh) + defer close(errCh) + + scanner := bufio.NewScanner(stdoutPipe) + scanner.Buffer(make([]byte, 0, 256*1024), 4*1024*1024) // 4MB max line for full audit JSON + + for scanner.Scan() { + line := strings.TrimSpace(scanner.Text()) + if line == "" { + continue + } + var evt GenericEvent + if err := json.Unmarshal([]byte(line), &evt); err != nil { + continue + } + eventCh <- evt + } + + if err := cmd.Wait(); err != nil { + if exitErr, ok := err.(*exec.ExitError); ok { + errCh <- fmt.Errorf("assess runner exited with code %d", exitErr.ExitCode()) + } else { + errCh <- fmt.Errorf("assess runner error: %w", err) + } + } + }() + + return &AssessRunResult{ + Events: eventCh, + Errors: errCh, + Stderr: stderrBuf, + StdinW: stdinPipe, + closeFns: []func(){func() { _ = stdinPipe.Close() }}, + }, nil +} + +// SendInterviewAnswer writes a JSON-line answer event to the runner's stdin. 
+func SendInterviewAnswer(r *AssessRunResult, questionID, value string, isOption bool) error { + evt := InterviewAnswerEvent{ + Type: "interview-answer", + QuestionID: questionID, + Value: value, + IsOption: isOption, + } + data, err := json.Marshal(evt) + if err != nil { + return err + } + if _, err := r.StdinW.Write(append(data, '\n')); err != nil { + return err + } + return nil +} diff --git a/tui/assess_runner_test.go b/tui/assess_runner_test.go new file mode 100644 index 0000000..b05b954 --- /dev/null +++ b/tui/assess_runner_test.go @@ -0,0 +1,56 @@ +package main + +import ( + "bytes" + "strings" + "testing" +) + +func TestSendInterviewAnswer_WritesJSONLine(t *testing.T) { + buf := &bytes.Buffer{} + res := &AssessRunResult{ + StdinW: writerCloser{buf}, + } + if err := SendInterviewAnswer(res, "q1", "test value", true); err != nil { + t.Fatalf("SendInterviewAnswer: %v", err) + } + out := buf.String() + if !strings.HasSuffix(out, "\n") { + t.Error("output should end with newline") + } + if !strings.Contains(out, `"questionId":"q1"`) { + t.Errorf("missing questionId in output: %s", out) + } + if !strings.Contains(out, `"value":"test value"`) { + t.Errorf("missing value in output: %s", out) + } + if !strings.Contains(out, `"isOption":true`) { + t.Errorf("missing isOption in output: %s", out) + } + if !strings.Contains(out, `"type":"interview-answer"`) { + t.Errorf("missing type in output: %s", out) + } +} + +// writerCloser adapts a bytes.Buffer to io.WriteCloser for tests. 
+type writerCloser struct{ *bytes.Buffer } + +func (writerCloser) Close() error { return nil } + +func TestAssessScriptPath_FallbackString(t *testing.T) { + got := assessScriptPath() + if got == "" { + t.Error("assessScriptPath returned empty string") + } +} + +func TestAssessRunResult_CloseRunsCloseFns(t *testing.T) { + called := false + res := &AssessRunResult{ + closeFns: []func(){func() { called = true }}, + } + res.Close() + if !called { + t.Error("Close should invoke closeFns") + } +} diff --git a/tui/assess_summary.go b/tui/assess_summary.go new file mode 100644 index 0000000..c416bd0 --- /dev/null +++ b/tui/assess_summary.go @@ -0,0 +1,172 @@ +package main + +import ( + "strings" + + "github.com/charmbracelet/lipgloss" +) + +// renderAssessSummary produces the right-pane configuration summary that mirrors +// the visual style of summary.go::renderSummary used by the report flow. +// +// Each field shows a value when it has been resolved, "—" (dim) otherwise. +// The "Assessment Setup" header includes an AI badge on the right when an +// AI model has been selected (matches the report's "Report Setup" header). +func renderAssessSummary(cfg *AssessConfig, width int) string { + if width < 20 { + width = 20 + } + if cfg == nil { + return lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). + Padding(0, 1). + Width(width). + Render("No configuration") + } + + headerStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")) + labelStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + valueStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("15")) + dimStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("239")) + + boxStyle := lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). 
+ Padding(0, 1) + + innerWidth := width - boxStyle.GetHorizontalBorderSize() + contentWidth := innerWidth - boxStyle.GetHorizontalFrameSize() + + type entry struct { + label string + value string + } + + entries := []entry{ + {"Scope", fmtAssessScopeMode(cfg)}, + {"Target", fmtAssessTarget(cfg)}, + {"Display name", strings.TrimSpace(cfg.Scope.DisplayName)}, + {"Evidence tier", fmtAssessTier(cfg.EvidenceTier)}, + {"Output format", fmtAssessOutputFormat(cfg.OutputFormat)}, + {"Output path", strings.TrimSpace(cfg.OutputPath)}, + {"Interview answers", fmtAssessAnswersFile(cfg.InterviewAnswersPath)}, + {"Mode", fmtAssessRunMode(cfg)}, + } + + var lines []string + + header := headerStyle.Render("Assessment Setup") + rightBadge := "" + if cfg.DryRun { + rightBadge = lipgloss.NewStyle(). + Background(lipgloss.Color("63")). + Foreground(lipgloss.Color("15")). + Bold(true). + Render(" dry-run ") + } + if rightBadge != "" { + gap := contentWidth - lipgloss.Width(header) - lipgloss.Width(rightBadge) + if gap < 2 { + lines = append(lines, header) + } else { + lines = append(lines, header+strings.Repeat(" ", gap)+rightBadge) + } + } else { + lines = append(lines, header) + } + lines = append(lines, "") + + for _, e := range entries { + val := dimStyle.Render("—") + if v := strings.TrimSpace(e.value); v != "" { + val = valueStyle.Render(v) + } + lines = append(lines, labelStyle.Render(e.label+": ")+val) + } + + content := strings.Join(lines, "\n") + return boxStyle.Width(innerWidth).Render(content) +} + +func fmtAssessScopeMode(cfg *AssessConfig) string { + switch cfg.Scope.Mode { + case "org": + return "GitHub org" + case "local-repo": + return "Local repository" + case "both": + return "Org + local checkout" + } + return "" +} + +func fmtAssessTarget(cfg *AssessConfig) string { + switch cfg.Scope.Mode { + case "org": + if cfg.Scope.Org == "" { + return "" + } + if len(cfg.Scope.Repos) > 0 { + return cfg.Scope.Org + " (" + formatCompact(cfg.Scope.Repos) + ")" + } + return 
cfg.Scope.Org + case "local-repo": + return cfg.Scope.LocalPath + case "both": + parts := []string{} + if cfg.Scope.Org != "" { + parts = append(parts, cfg.Scope.Org) + } + if cfg.Scope.LocalPath != "" { + parts = append(parts, cfg.Scope.LocalPath) + } + return strings.Join(parts, " · ") + } + return "" +} + +func fmtAssessTier(tier string) string { + switch tier { + case "", "auto": + return "auto-detect" + case "gh": + return "1 — gh CLI" + case "github-mcp": + return "2 — GitHub MCP" + case "git-only": + return "3 — git-only" + } + return tier +} + +func fmtAssessOutputFormat(format string) string { + switch format { + case "": + return "both" + case "both": + return "both (md + json)" + case "markdown": + return "markdown" + case "json": + return "json" + } + return format +} + +func fmtAssessAnswersFile(path string) string { + if path == "" { + return "interactive" + } + return path +} + +func fmtAssessRunMode(cfg *AssessConfig) string { + if cfg.Mode != "" { + return cfg.Mode + } + if cfg.InteractiveInterview { + return "interactive" + } + return "headless" +} diff --git a/tui/assess_summary_test.go b/tui/assess_summary_test.go new file mode 100644 index 0000000..8237794 --- /dev/null +++ b/tui/assess_summary_test.go @@ -0,0 +1,137 @@ +package main + +import ( + "strings" + "testing" +) + +func TestRenderAssessSummary_NilConfig(t *testing.T) { + out := renderAssessSummary(nil, 60) + if !strings.Contains(out, "No configuration") { + t.Errorf("expected 'No configuration', got: %s", out) + } +} + +func TestRenderAssessSummary_LocalRepoMode(t *testing.T) { + cfg := &AssessConfig{ + Scope: AssessScope{ + Mode: "local-repo", + LocalPath: "/tmp/foo", + DisplayName: "foo", + }, + EvidenceTier: "auto", + OutputFormat: "both", + } + out := renderAssessSummary(cfg, 60) + if !strings.Contains(out, "Local repository") { + t.Errorf("expected 'Local repository' label, got: %s", out) + } + if !strings.Contains(out, "/tmp/foo") { + t.Errorf("expected target path, got: %s", 
out) + } + if !strings.Contains(out, "auto-detect") { + t.Errorf("expected tier auto-detect, got: %s", out) + } +} + +func TestRenderAssessSummary_OrgMode(t *testing.T) { + cfg := &AssessConfig{ + Scope: AssessScope{ + Mode: "org", + Org: "acme", + DisplayName: "acme", + }, + EvidenceTier: "gh", + OutputFormat: "markdown", + } + out := renderAssessSummary(cfg, 60) + if !strings.Contains(out, "GitHub org") { + t.Errorf("expected 'GitHub org', got: %s", out) + } + if !strings.Contains(out, "acme") { + t.Errorf("expected target 'acme', got: %s", out) + } + if !strings.Contains(out, "gh CLI") { + t.Errorf("expected tier 'gh CLI', got: %s", out) + } +} + +func TestRenderAssessSummary_BothMode(t *testing.T) { + cfg := &AssessConfig{ + Scope: AssessScope{ + Mode: "both", + Org: "acme", + LocalPath: "/tmp/bar", + DisplayName: "acme", + }, + } + out := renderAssessSummary(cfg, 60) + if !strings.Contains(out, "acme") || !strings.Contains(out, "/tmp/bar") { + t.Errorf("expected both 'acme' and '/tmp/bar': %s", out) + } +} + +func TestRenderAssessSummary_DryRunBadge(t *testing.T) { + cfg := &AssessConfig{ + Scope: AssessScope{Mode: "local-repo", LocalPath: ".", DisplayName: "."}, + EvidenceTier: "auto", + DryRun: true, + } + out := renderAssessSummary(cfg, 60) + if !strings.Contains(out, "dry-run") { + t.Errorf("expected 'dry-run' badge, got: %s", out) + } +} + +func TestFmtAssessTier(t *testing.T) { + cases := map[string]string{ + "": "auto-detect", + "auto": "auto-detect", + "gh": "1 — gh CLI", + "github-mcp": "2 — GitHub MCP", + "git-only": "3 — git-only", + "weird": "weird", + } + for input, want := range cases { + got := fmtAssessTier(input) + if got != want { + t.Errorf("fmtAssessTier(%q) = %q, want %q", input, got, want) + } + } +} + +func TestFmtAssessOutputFormat(t *testing.T) { + cases := map[string]string{ + "": "both", + "both": "both (md + json)", + "markdown": "markdown", + "json": "json", + } + for input, want := range cases { + got := 
fmtAssessOutputFormat(input) + if got != want { + t.Errorf("fmtAssessOutputFormat(%q) = %q, want %q", input, got, want) + } + } +} + +func TestFmtAssessAnswersFile(t *testing.T) { + if fmtAssessAnswersFile("") != "interactive" { + t.Error("empty path should render as interactive") + } + if fmtAssessAnswersFile("/tmp/answers.json") != "/tmp/answers.json" { + t.Error("non-empty path should pass through") + } +} + +func TestFmtAssessRunMode(t *testing.T) { + if got := fmtAssessRunMode(&AssessConfig{Mode: "headless"}); got != "headless" { + t.Errorf("Mode=headless -> %q", got) + } + if got := fmtAssessRunMode(&AssessConfig{InteractiveInterview: true}); got != "interactive" { + t.Errorf("InteractiveInterview=true -> %q", got) + } + if got := fmtAssessRunMode(&AssessConfig{}); got != "headless" { + t.Errorf("default -> %q (want headless)", got) + } +} diff --git a/tui/assess_wizard.go b/tui/assess_wizard.go new file mode 100644 index 0000000..f602774 --- /dev/null +++ b/tui/assess_wizard.go @@ -0,0 +1,252 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/charmbracelet/huh" +) + +// runAssessInteractive walks the user through scope selection, then runs the +// service runner under a Bubble Tea progress display that mirrors the report +// flow's two-pane layout. After the audit is written, a tabbed preview +// (Audit / Evidence / JSON Data) opens — same shape as RunReportPreviewFull. +func runAssessInteractive(cfg *AssessConfig) error { + if err := assessScopeWizard(cfg); err != nil { + return err + } + + cfg.Mode = "interactive" + cfg.InteractiveInterview = true + + res, err := RunAssessServiceRunner(*cfg) + if err != nil { + return err + } + defer res.Close() + + result := RunAssessProgressDisplay("Agent Maturity Assessment", cfg, res, promptInterviewQuestion) + + // Drain any errors emitted by the runner goroutine. 
+ for err := range res.Errors { + if err != nil { + if res.Stderr != nil && res.Stderr.Len() > 0 { + fmt.Fprintln(os.Stderr, res.Stderr.String()) + } + return err + } + } + + if result.Cancelled { + fmt.Fprintln(os.Stderr, "\nAssessment cancelled.") + return nil + } + if result.ErrorMsg != "" { + RenderError(result.ErrorMsg) + return fmt.Errorf("assess: %s", result.ErrorMsg) + } + + if err := SaveAssessConfig(cfg); err != nil { + fmt.Fprintf(os.Stderr, "Note: failed to save assess config: %v\n", err) + } + + if result.ResultPath == "" { + return nil + } + + // Show the tabbed preview using the same Glamour rendering pipeline the + // report flow uses. Errors from the preview are non-fatal — the audit + // files are already on disk. + if err := RunAssessPreview(result.ResultPath, result.JsonPath, result.JsonData); err != nil { + fmt.Fprintf(os.Stderr, "Note: preview unavailable (%v). Audit at: %s\n", err, result.ResultPath) + } + return nil +} + +// assessScopeWizard collects the minimum config the service runner needs. +func assessScopeWizard(cfg *AssessConfig) error { + cwd, _ := os.Getwd() + + scopeMode := cfg.Scope.Mode + if scopeMode == "" { + scopeMode = "local-repo" + } + + scopeForm := huh.NewForm( + huh.NewGroup( + huh.NewSelect[string](). + Title("What's the scope of this audit?"). + Description("Choose what you're assessing."). + Options( + huh.NewOption("This local repo", "local-repo"), + huh.NewOption("A GitHub organization", "org"), + huh.NewOption("Both (org + a local checkout)", "both"), + ). + Value(&scopeMode), + ), + ) + if err := scopeForm.Run(); err != nil { + return err + } + cfg.Scope.Mode = scopeMode + + switch scopeMode { + case "local-repo": + path := cfg.Scope.LocalPath + if path == "" { + path = cwd + } + pathForm := huh.NewForm( + huh.NewGroup( + huh.NewInput(). + Title("Local repo path"). + Description("Path to the repo you want to audit."). + Value(&path). 
+ Validate(validateLocalPath), + ), + ) + if err := pathForm.Run(); err != nil { + return err + } + cfg.Scope.LocalPath = strings.TrimSpace(path) + cfg.Scope.Org = "" + cfg.Scope.Repos = nil + if cfg.Scope.DisplayName == "" { + cfg.Scope.DisplayName = filepath.Base(cfg.Scope.LocalPath) + } + case "org": + org := cfg.Scope.Org + orgForm := huh.NewForm( + huh.NewGroup( + huh.NewInput(). + Title("GitHub organization"). + Description("e.g. acme-co (no slashes)."). + Value(&org). + Validate(func(s string) error { + if strings.TrimSpace(s) == "" { + return fmt.Errorf("org name is required") + } + return nil + }), + ), + ) + if err := orgForm.Run(); err != nil { + return err + } + cfg.Scope.Org = strings.TrimSpace(org) + cfg.Scope.LocalPath = "" + if cfg.Scope.DisplayName == "" { + cfg.Scope.DisplayName = cfg.Scope.Org + } + case "both": + path := cfg.Scope.LocalPath + if path == "" { + path = cwd + } + org := cfg.Scope.Org + bothForm := huh.NewForm( + huh.NewGroup( + huh.NewInput(). + Title("GitHub organization"). + Value(&org). + Validate(func(s string) error { + if strings.TrimSpace(s) == "" { + return fmt.Errorf("org name is required") + } + return nil + }), + huh.NewInput(). + Title("Local repo path"). + Value(&path). 
+ Validate(validateLocalPath), + ), + ) + if err := bothForm.Run(); err != nil { + return err + } + cfg.Scope.Org = strings.TrimSpace(org) + cfg.Scope.LocalPath = strings.TrimSpace(path) + if cfg.Scope.DisplayName == "" { + cfg.Scope.DisplayName = cfg.Scope.Org + } + } + + if cfg.OutputFormat == "" { + cfg.OutputFormat = "both" + } + if cfg.EvidenceTier == "" { + cfg.EvidenceTier = "auto" + } + return nil +} + +func validateLocalPath(s string) error { + trimmed := strings.TrimSpace(s) + if trimmed == "" { + return fmt.Errorf("path is required") + } + info, err := os.Stat(trimmed) + if err != nil { + return fmt.Errorf("path does not exist: %s", trimmed) + } + if !info.IsDir() { + return fmt.Errorf("path is not a directory: %s", trimmed) + } + return nil +} + +// promptInterviewQuestion shows a single Phase-1 question via huh and returns +// the captured answer. Choosing "Other" pops a free-text follow-up. +func promptInterviewQuestion(evt GenericEvent) (string, bool, error) { + const freeTextSentinel = "__free_text__" + options := evt.Options + if len(options) == 0 { + options = []string{"I don't know"} + } + choice := "" + + huhOptions := make([]huh.Option[string], 0, len(options)+1) + for _, opt := range options { + huhOptions = append(huhOptions, huh.NewOption(opt, opt)) + } + if evt.AllowFreeText { + huhOptions = append(huhOptions, huh.NewOption("Other (type your own)", freeTextSentinel)) + } + + form := huh.NewForm( + huh.NewGroup( + huh.NewSelect[string](). + Title(evt.QuestionText). + Description(fmt.Sprintf("[%s]", evt.QuestionID)). + Options(huhOptions...). + Value(&choice), + ), + ) + if err := form.Run(); err != nil { + return "unknown", false, err + } + + if choice == freeTextSentinel { + freeText := "" + ftForm := huh.NewForm( + huh.NewGroup( + huh.NewText(). + Title("Your answer"). + Description("Free text — leave blank for 'unknown'."). 
+ Value(&freeText), + ), + ) + if err := ftForm.Run(); err != nil { + return "unknown", false, err + } + freeText = strings.TrimSpace(freeText) + if freeText == "" { + return "unknown", false, nil + } + return freeText, false, nil + } + + return choice, true, nil +} diff --git a/tui/main.go b/tui/main.go index 0e4c8ab..7199f00 100644 --- a/tui/main.go +++ b/tui/main.go @@ -19,6 +19,7 @@ func printUsage() { Commands: report Generate a developer contribution report (default) + assess Run the Agent Maturity Assessment (12-criterion AI-readiness audit) setup Configure credentials and preferences doctor Validate installation health @@ -132,7 +133,7 @@ func main() { // Detect subcommand first so --help can be routed to the right usage. subcommand := "" for _, arg := range os.Args[1:] { - if arg == "report" || arg == "doctor" || arg == "setup" { + if arg == "report" || arg == "doctor" || arg == "setup" || arg == "assess" { subcommand = arg break } @@ -152,6 +153,8 @@ func main() { printSetupUsage() case "report": printReportUsage() + case "assess": + printAssessUsage() default: printUsage() } @@ -193,6 +196,16 @@ func main() { exitCode := runDoctor() os.Exit(exitCode) return + case "assess": + if err := runAssess(); err != nil { + if err == huh.ErrUserAborted { + fmt.Fprintln(os.Stderr, "\nAssessment cancelled.") + os.Exit(0) + } + RenderError(err.Error()) + os.Exit(2) + } + return } // Check for --version flag diff --git a/tui/protocol.go b/tui/protocol.go index 41f5458..29bf645 100644 --- a/tui/protocol.go +++ b/tui/protocol.go @@ -79,4 +79,12 @@ type GenericEvent struct { Items []DiscrepancyItem `json:"items,omitempty"` AllItems []DiscrepancyItem `json:"allItems,omitempty"` DiscrepancyThreshold int `json:"discrepancyThreshold,omitempty"` + + // Maturity-assessment interview event fields + // (type == "interview-frame" / "interview-question") + QuestionID string `json:"questionId,omitempty"` + QuestionText string `json:"questionText,omitempty"` + Options []string 
`json:"options,omitempty"` + AllowFreeText bool `json:"allowFreeText,omitempty"` + ConfigHeading string `json:"configHeading,omitempty"` } From fbabfa0d9d164f08f9776e757c4dadc82303173c Mon Sep 17 00:00:00 2001 From: Asa Baylus Date: Sun, 3 May 2026 17:40:38 -0400 Subject: [PATCH 2/6] fix(assess): wrap wizard in shell layout matching report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first thing the user saw running `teamhero assess` was a bare huh select prompt — no banner, no two-pane layout. The wizard now hosts its huh forms inside a Bubble Tea program that renders the same shell-header + left-form / right-summary / nav-hints layout the report wizard uses. - New `assessWizardModel` Bubble Tea model with state machine: awScopeMode → awLocalPath / awOrg / awBoth → awConfirm → awDone - View() composes renderShellHeader + form panel + renderAssessSummary + "esc back • ctrl+c quit" hints, identical structure to wizard.go::View - Esc navigates back through history; ctrl+c aborts; final confirm step matches the report's wsConfirmRun flow - 14 new unit tests cover state transitions, scope-specific advance paths, history navigation, abort handling, View() rendering, and the parseRepoCSV / requireNonEmpty helpers Co-Authored-By: Claude Opus 4.7 (1M context) --- tui/assess_wizard.go | 491 +++++++++++++++++++++++++++++--------- tui/assess_wizard_test.go | 217 +++++++++++++++++ 2 files changed, 596 insertions(+), 112 deletions(-) create mode 100644 tui/assess_wizard_test.go diff --git a/tui/assess_wizard.go b/tui/assess_wizard.go index f602774..51e2ecb 100644 --- a/tui/assess_wizard.go +++ b/tui/assess_wizard.go @@ -6,182 +6,424 @@ import ( "path/filepath" "strings" + tea "github.com/charmbracelet/bubbletea" "github.com/charmbracelet/huh" + "github.com/charmbracelet/lipgloss" ) -// runAssessInteractive walks the user through scope selection, then runs the -// service runner under a Bubble Tea progress display that mirrors the 
report -// flow's two-pane layout. After the audit is written, a tabbed preview -// (Audit / Evidence / JSON Data) opens — same shape as RunReportPreviewFull. +// runAssessInteractive is the entry point for `teamhero assess` (no flags). It: +// 1. Runs the framed scope wizard (matches the report's two-pane layout). +// 2. Spawns the service runner and drives the Bubble Tea progress display. +// 3. Round-trips interview questions through huh prompts (one at a time). +// 4. Opens the tabbed Glamour preview when the audit is written. func runAssessInteractive(cfg *AssessConfig) error { - if err := assessScopeWizard(cfg); err != nil { + res, err := runAssessScopeWizard(cfg) + if err != nil { return err } + if res.Aborted { + fmt.Fprintln(os.Stderr, "\nAssessment cancelled.") + return nil + } + if res.Config != nil { + *cfg = *res.Config + } cfg.Mode = "interactive" cfg.InteractiveInterview = true - res, err := RunAssessServiceRunner(*cfg) + runner, err := RunAssessServiceRunner(*cfg) if err != nil { return err } - defer res.Close() + defer runner.Close() - result := RunAssessProgressDisplay("Agent Maturity Assessment", cfg, res, promptInterviewQuestion) + progress := RunAssessProgressDisplay("Agent Maturity Assessment", cfg, runner, promptInterviewQuestion) - // Drain any errors emitted by the runner goroutine. 
- for err := range res.Errors { - if err != nil { - if res.Stderr != nil && res.Stderr.Len() > 0 { - fmt.Fprintln(os.Stderr, res.Stderr.String()) + for runErr := range runner.Errors { + if runErr != nil { + if runner.Stderr != nil && runner.Stderr.Len() > 0 { + fmt.Fprintln(os.Stderr, runner.Stderr.String()) } - return err + return runErr } } - if result.Cancelled { + if progress.Cancelled { fmt.Fprintln(os.Stderr, "\nAssessment cancelled.") return nil } - if result.ErrorMsg != "" { - RenderError(result.ErrorMsg) - return fmt.Errorf("assess: %s", result.ErrorMsg) + if progress.ErrorMsg != "" { + RenderError(progress.ErrorMsg) + return fmt.Errorf("assess: %s", progress.ErrorMsg) } if err := SaveAssessConfig(cfg); err != nil { fmt.Fprintf(os.Stderr, "Note: failed to save assess config: %v\n", err) } - if result.ResultPath == "" { + if progress.ResultPath == "" { return nil } - // Show the tabbed preview using the same Glamour rendering pipeline the - // report flow uses. Errors from the preview are non-fatal — the audit - // files are already on disk. - if err := RunAssessPreview(result.ResultPath, result.JsonPath, result.JsonData); err != nil { - fmt.Fprintf(os.Stderr, "Note: preview unavailable (%v). Audit at: %s\n", err, result.ResultPath) + if err := RunAssessPreview(progress.ResultPath, progress.JsonPath, progress.JsonData); err != nil { + fmt.Fprintf(os.Stderr, "Note: preview unavailable (%v). Audit at: %s\n", err, progress.ResultPath) } return nil } -// assessScopeWizard collects the minimum config the service runner needs. -func assessScopeWizard(cfg *AssessConfig) error { +// --------------------------------------------------------------------------- +// Bubble Tea wizard — scope selection in the same shell layout as report. 
+// --------------------------------------------------------------------------- + +type assessWizardState int + +const ( + awScopeMode assessWizardState = iota + awLocalPath + awOrg + awBoth + awConfirm + awDone +) + +type assessWizardModel struct { + state assessWizardState + cfg AssessConfig + form *huh.Form + width int + height int + confirmed bool + aborted bool + history []assessWizardState + + // Form bindings + scopeMode string + localPath string + orgName string + repoCSV string + displayName string + confirmRun bool +} + +// AssessWizardResult is returned after the framed wizard completes. +type AssessWizardResult struct { + Config *AssessConfig + Confirmed bool + Aborted bool +} + +// runAssessScopeWizard runs the scope-selection wizard inside a Bubble Tea +// program. The View() renders the same shell-header + two-pane layout as +// the report wizard, with the right pane showing renderAssessSummary(). +func runAssessScopeWizard(cfg *AssessConfig) (*AssessWizardResult, error) { cwd, _ := os.Getwd() - scopeMode := cfg.Scope.Mode - if scopeMode == "" { - scopeMode = "local-repo" + m := assessWizardModel{ + cfg: *cfg, + width: termWidth(), + state: awScopeMode, + scopeMode: defaultScopeMode(cfg, cwd), + localPath: defaultLocalPath(cfg, cwd), + orgName: strings.TrimSpace(cfg.Scope.Org), + repoCSV: strings.Join(cfg.Scope.Repos, ","), + displayName: strings.TrimSpace(cfg.Scope.DisplayName), + confirmRun: true, } + m.form = m.buildForm() - scopeForm := huh.NewForm( - huh.NewGroup( - huh.NewSelect[string](). - Title("What's the scope of this audit?"). - Description("Choose what you're assessing."). - Options( - huh.NewOption("This local repo", "local-repo"), - huh.NewOption("A GitHub organization", "org"), - huh.NewOption("Both (org + a local checkout)", "both"), - ). 
- Value(&scopeMode), - ), - ) - if err := scopeForm.Run(); err != nil { - return err + p := tea.NewProgram(&m, tea.WithOutput(os.Stderr), tea.WithAltScreen()) + finalModel, err := teaProgramRun(p) + if err != nil { + return nil, err + } + final := finalModel.(*assessWizardModel) + return &AssessWizardResult{ + Config: &final.cfg, + Confirmed: final.confirmed, + Aborted: final.aborted, + }, nil +} + +func defaultScopeMode(cfg *AssessConfig, cwd string) string { + if cfg.Scope.Mode != "" { + return cfg.Scope.Mode + } + _ = cwd + return "local-repo" +} + +func defaultLocalPath(cfg *AssessConfig, cwd string) string { + if cfg.Scope.LocalPath != "" { + return cfg.Scope.LocalPath + } + return cwd +} + +func (m *assessWizardModel) Init() tea.Cmd { + if m.form != nil { + return m.form.Init() + } + return nil +} + +func (m *assessWizardModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg := msg.(type) { + case tea.WindowSizeMsg: + m.width = msg.Width + m.height = msg.Height + if m.form != nil { + m.form = m.form.WithWidth(m.formWidth()) + } + return m, nil + + case tea.KeyMsg: + if msg.String() == "ctrl+c" { + m.aborted = true + return m, tea.Quit + } + if msg.String() == "esc" { + return m.goBack() + } + } + + if m.form != nil { + form, cmd := m.form.Update(msg) + if f, ok := form.(*huh.Form); ok { + m.form = f + } + switch m.form.State { + case huh.StateCompleted: + return m.advance() + case huh.StateAborted: + return m.goBack() + } + return m, cmd } - cfg.Scope.Mode = scopeMode + return m, nil +} + +func (m *assessWizardModel) advance() (tea.Model, tea.Cmd) { + prev := m.state + switch m.state { + case awScopeMode: + m.cfg.Scope.Mode = m.scopeMode + switch m.scopeMode { + case "local-repo": + m.state = awLocalPath + case "org": + m.state = awOrg + case "both": + m.state = awBoth + } - switch scopeMode { - case "local-repo": - path := cfg.Scope.LocalPath - if path == "" { - path = cwd + case awLocalPath: + m.cfg.Scope.LocalPath = strings.TrimSpace(m.localPath) + 
m.cfg.Scope.Org = "" + m.cfg.Scope.Repos = nil + if strings.TrimSpace(m.displayName) == "" { + m.cfg.Scope.DisplayName = filepath.Base(m.cfg.Scope.LocalPath) + } else { + m.cfg.Scope.DisplayName = strings.TrimSpace(m.displayName) } - pathForm := huh.NewForm( + m.state = awConfirm + + case awOrg: + m.cfg.Scope.Org = strings.TrimSpace(m.orgName) + m.cfg.Scope.Repos = parseRepoCSV(m.repoCSV) + m.cfg.Scope.LocalPath = "" + if strings.TrimSpace(m.displayName) == "" { + m.cfg.Scope.DisplayName = m.cfg.Scope.Org + } else { + m.cfg.Scope.DisplayName = strings.TrimSpace(m.displayName) + } + m.state = awConfirm + + case awBoth: + m.cfg.Scope.Org = strings.TrimSpace(m.orgName) + m.cfg.Scope.LocalPath = strings.TrimSpace(m.localPath) + m.cfg.Scope.Repos = parseRepoCSV(m.repoCSV) + if strings.TrimSpace(m.displayName) == "" { + if m.cfg.Scope.Org != "" { + m.cfg.Scope.DisplayName = m.cfg.Scope.Org + } else { + m.cfg.Scope.DisplayName = filepath.Base(m.cfg.Scope.LocalPath) + } + } else { + m.cfg.Scope.DisplayName = strings.TrimSpace(m.displayName) + } + m.state = awConfirm + + case awConfirm: + if !m.confirmRun { + m.aborted = true + m.state = awDone + return m, tea.Quit + } + fillAssessDefaults(&m.cfg) + m.confirmed = true + m.state = awDone + return m, tea.Quit + } + + m.history = append(m.history, prev) + m.form = m.buildForm() + if m.form == nil { + return m, nil + } + return m, m.form.Init() +} + +func (m *assessWizardModel) goBack() (tea.Model, tea.Cmd) { + if len(m.history) == 0 { + m.aborted = true + return m, tea.Quit + } + m.state = m.history[len(m.history)-1] + m.history = m.history[:len(m.history)-1] + m.form = m.buildForm() + if m.form == nil { + return m, nil + } + return m, m.form.Init() +} + +func (m *assessWizardModel) buildForm() *huh.Form { + switch m.state { + case awScopeMode: + return huh.NewForm( + huh.NewGroup( + huh.NewSelect[string](). + Title("What's the scope of this audit?"). + Description("Choose what you're assessing."). 
+ Options( + huh.NewOption("This local repo", "local-repo"), + huh.NewOption("A GitHub organization", "org"), + huh.NewOption("Both (org + a local checkout)", "both"), + ). + Value(&m.scopeMode), + ), + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) + + case awLocalPath: + return huh.NewForm( huh.NewGroup( huh.NewInput(). Title("Local repo path"). Description("Path to the repo you want to audit."). - Value(&path). + Value(&m.localPath). Validate(validateLocalPath), + huh.NewInput(). + Title("Display name (optional)"). + Description("Used in the audit title and filename. Defaults to the directory name."). + Value(&m.displayName), ), - ) - if err := pathForm.Run(); err != nil { - return err - } - cfg.Scope.LocalPath = strings.TrimSpace(path) - cfg.Scope.Org = "" - cfg.Scope.Repos = nil - if cfg.Scope.DisplayName == "" { - cfg.Scope.DisplayName = filepath.Base(cfg.Scope.LocalPath) - } - case "org": - org := cfg.Scope.Org - orgForm := huh.NewForm( + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) + + case awOrg: + return huh.NewForm( huh.NewGroup( huh.NewInput(). Title("GitHub organization"). Description("e.g. acme-co (no slashes)."). - Value(&org). - Validate(func(s string) error { - if strings.TrimSpace(s) == "" { - return fmt.Errorf("org name is required") - } - return nil - }), + Value(&m.orgName). + Validate(requireNonEmpty("organization")), + huh.NewInput(). + Title("Repos to narrow scope (optional)"). + Description("Comma-separated repo names. Leave blank to assess the whole org."). + Value(&m.repoCSV), + huh.NewInput(). + Title("Display name (optional)"). + Description("Used in the audit title. Defaults to the org name."). 
+ Value(&m.displayName), ), - ) - if err := orgForm.Run(); err != nil { - return err - } - cfg.Scope.Org = strings.TrimSpace(org) - cfg.Scope.LocalPath = "" - if cfg.Scope.DisplayName == "" { - cfg.Scope.DisplayName = cfg.Scope.Org - } - case "both": - path := cfg.Scope.LocalPath - if path == "" { - path = cwd - } - org := cfg.Scope.Org - bothForm := huh.NewForm( + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) + + case awBoth: + return huh.NewForm( huh.NewGroup( huh.NewInput(). Title("GitHub organization"). - Value(&org). - Validate(func(s string) error { - if strings.TrimSpace(s) == "" { - return fmt.Errorf("org name is required") - } - return nil - }), + Value(&m.orgName). + Validate(requireNonEmpty("organization")), huh.NewInput(). Title("Local repo path"). - Value(&path). + Description("A representative checkout to gather repo-side evidence from."). + Value(&m.localPath). Validate(validateLocalPath), + huh.NewInput(). + Title("Repos to narrow scope (optional)"). + Description("Comma-separated; leave blank for the whole org."). + Value(&m.repoCSV), + huh.NewInput(). + Title("Display name (optional)"). + Description("Defaults to the org name."). + Value(&m.displayName), ), - ) - if err := bothForm.Run(); err != nil { - return err - } - cfg.Scope.Org = strings.TrimSpace(org) - cfg.Scope.LocalPath = strings.TrimSpace(path) - if cfg.Scope.DisplayName == "" { - cfg.Scope.DisplayName = cfg.Scope.Org - } + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) + + case awConfirm: + return huh.NewForm( + huh.NewGroup( + huh.NewConfirm(). + Title("Run this audit?"). + Description("Press Enter to start, or Escape to go back and edit."). + Affirmative("Run audit"). + Negative("Cancel"). 
+ Value(&m.confirmRun), + ), + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) } + return nil +} - if cfg.OutputFormat == "" { - cfg.OutputFormat = "both" +func (m *assessWizardModel) View() string { + if m.state == awDone { + return "" } - if cfg.EvidenceTier == "" { - cfg.EvidenceTier = "auto" + + w := m.width + if w <= 0 { + w = 80 } - return nil + + title := renderShellHeader(w) + + formWidth := m.formWidth() + summaryWidth := w - formWidth - 2 + + leftPanel := "" + if m.form != nil { + leftPanel = m.form.View() + } + leftFrame := lipgloss.NewStyle().Border(lipgloss.HiddenBorder()).Padding(0, 1) + leftInnerWidth := max(20, formWidth-leftFrame.GetHorizontalFrameSize()) + leftPanel = leftFrame.Width(leftInnerWidth).Render(leftPanel) + + rightPanel := renderAssessSummary(&m.cfg, summaryWidth) + + left := lipgloss.NewStyle().Width(formWidth).Render(leftPanel) + right := lipgloss.NewStyle().Width(summaryWidth).Render(rightPanel) + body := lipgloss.JoinHorizontal(lipgloss.Top, left, " ", right) + + hintStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + hints := hintStyle.Render("esc back • ctrl+c quit") + + return lipgloss.JoinVertical(lipgloss.Left, title, "", body, "", hints) } +func (m *assessWizardModel) formWidth() int { + w := m.width + if w <= 0 { + w = 80 + } + return w * 3 / 5 +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + func validateLocalPath(s string) error { trimmed := strings.TrimSpace(s) if trimmed == "" { @@ -197,8 +439,33 @@ func validateLocalPath(s string) error { return nil } +func requireNonEmpty(field string) func(string) error { + return func(s string) error { + if strings.TrimSpace(s) == "" { + return fmt.Errorf("%s is required", field) + } + return nil + } +} + +func parseRepoCSV(s string) []string { + parts := strings.Split(strings.TrimSpace(s), ",") + out := make([]string, 0, len(parts)) + 
for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + out = append(out, p) + } + } + return out +} + // promptInterviewQuestion shows a single Phase-1 question via huh and returns // the captured answer. Choosing "Other" pops a free-text follow-up. +// +// Used as the askInterview callback by RunAssessProgressDisplay during the +// interactive interview round-trip — the Tea program releases the terminal +// for the duration of the prompt, then resumes. func promptInterviewQuestion(evt GenericEvent) (string, bool, error) { const freeTextSentinel = "__free_text__" options := evt.Options @@ -223,7 +490,7 @@ func promptInterviewQuestion(evt GenericEvent) (string, bool, error) { Options(huhOptions...). Value(&choice), ), - ) + ).WithTheme(huh.ThemeCharm()) if err := form.Run(); err != nil { return "unknown", false, err } @@ -237,7 +504,7 @@ func promptInterviewQuestion(evt GenericEvent) (string, bool, error) { Description("Free text — leave blank for 'unknown'."). Value(&freeText), ), - ) + ).WithTheme(huh.ThemeCharm()) if err := ftForm.Run(); err != nil { return "unknown", false, err } diff --git a/tui/assess_wizard_test.go b/tui/assess_wizard_test.go new file mode 100644 index 0000000..274dab9 --- /dev/null +++ b/tui/assess_wizard_test.go @@ -0,0 +1,217 @@ +package main + +import ( + "strings" + "testing" + + tea "github.com/charmbracelet/bubbletea" +) + +func newAssessWizardForTest(scope string) *assessWizardModel { + cfg := AssessConfig{} + m := &assessWizardModel{ + cfg: cfg, + width: 100, + state: awScopeMode, + scopeMode: scope, + localPath: "/tmp/foo", + orgName: "acme", + confirmRun: true, + } + m.form = m.buildForm() + return m +} + +func TestAssessWizard_AdvanceLocalRepo(t *testing.T) { + m := newAssessWizardForTest("local-repo") + updated, _ := m.advance() + final := updated.(*assessWizardModel) + if final.cfg.Scope.Mode != "local-repo" { + t.Errorf("Mode = %q, want local-repo", final.cfg.Scope.Mode) + } + if final.state != awLocalPath { + 
t.Errorf("state = %d, want awLocalPath (%d)", final.state, awLocalPath) + } +} + +func TestAssessWizard_AdvanceOrg(t *testing.T) { + m := newAssessWizardForTest("org") + updated, _ := m.advance() + final := updated.(*assessWizardModel) + if final.cfg.Scope.Mode != "org" { + t.Errorf("Mode = %q, want org", final.cfg.Scope.Mode) + } + if final.state != awOrg { + t.Errorf("state = %d, want awOrg (%d)", final.state, awOrg) + } +} + +func TestAssessWizard_AdvanceBoth(t *testing.T) { + m := newAssessWizardForTest("both") + updated, _ := m.advance() + final := updated.(*assessWizardModel) + if final.state != awBoth { + t.Errorf("state = %d, want awBoth (%d)", final.state, awBoth) + } +} + +func TestAssessWizard_LocalPathSetsDisplayNameFromBasename(t *testing.T) { + m := newAssessWizardForTest("local-repo") + m.advance() + m.localPath = "/home/foo/some-repo" + m.displayName = "" // empty -> derive from basename + m.state = awLocalPath + updated, _ := m.advance() + final := updated.(*assessWizardModel) + if final.cfg.Scope.DisplayName != "some-repo" { + t.Errorf("DisplayName = %q, want some-repo", final.cfg.Scope.DisplayName) + } + if final.state != awConfirm { + t.Errorf("state = %d, want awConfirm", final.state) + } +} + +func TestAssessWizard_OrgPopulatesScope(t *testing.T) { + m := newAssessWizardForTest("org") + m.advance() + m.orgName = "acme" + m.repoCSV = "frontend, backend , mobile" + m.displayName = "" + m.state = awOrg + updated, _ := m.advance() + final := updated.(*assessWizardModel) + if final.cfg.Scope.Org != "acme" { + t.Errorf("Org = %q, want acme", final.cfg.Scope.Org) + } + if final.cfg.Scope.DisplayName != "acme" { + t.Errorf("DisplayName = %q, want acme", final.cfg.Scope.DisplayName) + } + if got := final.cfg.Scope.Repos; len(got) != 3 || got[0] != "frontend" || got[1] != "backend" || got[2] != "mobile" { + t.Errorf("Repos = %v, want [frontend backend mobile]", got) + } +} + +func TestAssessWizard_ConfirmRunSetsConfirmed(t *testing.T) { + m := 
newAssessWizardForTest("local-repo") + m.state = awConfirm + m.confirmRun = true + updated, cmd := m.advance() + final := updated.(*assessWizardModel) + if !final.confirmed { + t.Error("confirmed should be true after confirm=true") + } + if final.state != awDone { + t.Errorf("state = %d, want awDone", final.state) + } + if cmd == nil { + t.Error("expected tea.Quit cmd") + } +} + +func TestAssessWizard_ConfirmCancelSetsAborted(t *testing.T) { + m := newAssessWizardForTest("local-repo") + m.state = awConfirm + m.confirmRun = false + updated, cmd := m.advance() + final := updated.(*assessWizardModel) + if !final.aborted { + t.Error("aborted should be true on confirm=false") + } + if cmd == nil { + t.Error("expected tea.Quit cmd") + } +} + +func TestAssessWizard_GoBackPopsHistory(t *testing.T) { + m := newAssessWizardForTest("local-repo") + m.advance() // -> awLocalPath + if len(m.history) != 1 { + t.Fatalf("history len = %d, want 1", len(m.history)) + } + updated, _ := m.goBack() + final := updated.(*assessWizardModel) + if final.state != awScopeMode { + t.Errorf("state after goBack = %d, want awScopeMode", final.state) + } + if len(final.history) != 0 { + t.Errorf("history len = %d, want 0", len(final.history)) + } +} + +func TestAssessWizard_GoBackEmptyHistoryAborts(t *testing.T) { + m := newAssessWizardForTest("local-repo") + updated, cmd := m.goBack() + final := updated.(*assessWizardModel) + if !final.aborted { + t.Error("expected aborted=true when goBack with empty history") + } + if cmd == nil { + t.Error("expected tea.Quit cmd") + } +} + +func TestAssessWizard_CtrlCAborts(t *testing.T) { + m := newAssessWizardForTest("local-repo") + updated, cmd := m.Update(tea.KeyMsg{Type: tea.KeyCtrlC}) + final := updated.(*assessWizardModel) + if !final.aborted { + t.Error("ctrl+c should set aborted") + } + if cmd == nil { + t.Error("expected tea.Quit cmd") + } +} + +func TestAssessWizard_ViewRendersHeaderAndPanels(t *testing.T) { + m := newAssessWizardForTest("local-repo") 
+ view := m.View() + if !strings.Contains(view, "Assessment Setup") { + t.Error("view should include the Assessment Setup summary header") + } + if !strings.Contains(view, "esc back") { + t.Error("view should include nav hints (esc back • ctrl+c quit)") + } +} + +func TestAssessWizard_ViewEmptyWhenDone(t *testing.T) { + m := newAssessWizardForTest("local-repo") + m.state = awDone + if m.View() != "" { + t.Error("View() should be empty when state is awDone") + } +} + +func TestParseRepoCSV(t *testing.T) { + cases := map[string][]string{ + "": {}, + "foo": {"foo"}, + "foo,bar,baz": {"foo", "bar", "baz"}, + " foo , bar ,baz ": {"foo", "bar", "baz"}, + ",foo,,bar,": {"foo", "bar"}, + } + for input, want := range cases { + got := parseRepoCSV(input) + if len(got) != len(want) { + t.Errorf("parseRepoCSV(%q) = %v, want %v", input, got, want) + continue + } + for i, v := range want { + if got[i] != v { + t.Errorf("parseRepoCSV(%q)[%d] = %q, want %q", input, i, got[i], v) + } + } + } +} + +func TestRequireNonEmpty(t *testing.T) { + v := requireNonEmpty("name") + if err := v(""); err == nil { + t.Error("empty should error") + } + if err := v(" "); err == nil { + t.Error("whitespace should error") + } + if err := v("acme"); err != nil { + t.Errorf("non-empty should pass: %v", err) + } +} From ccc5fdc2ead25aae5e4871517b3a6859dfe9303c Mon Sep 17 00:00:00 2001 From: Asa Baylus Date: Sun, 3 May 2026 17:50:18 -0400 Subject: [PATCH 3/6] fix(assess): host interview prompts inside the framed progress display MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interview was breaking the layout: each Phase-1 question called huh.Form.Run() in a separate full-screen huh program after releasing the alt-screen. No shell header, no two-pane summary, just a bare prompt — discontinuous with the rest of the assess flow. 
The progress model now hosts the interview form directly, mirroring how the report wizard handles its wsConfirmRun step inside the same Bubble Tea program. When an interview-question event arrives: - Build a huh.Form for the option select (with "Other (type your own)" appended when allowFreeText is true) - Mount it on the model's `interviewForm` field - View() swaps the left pane from the progress panel to the form panel while keeping renderShellHeader on top, renderAssessSummary on the right, and an updated nav-hints footer ("↑↓ navigate • enter submit") - Update() routes key events to the form first; on huh.StateCompleted the model either transitions to a free-text follow-up (when the sentinel was chosen) or invokes the sendAnswer callback that writes the answer JSON line back to the runner's stdin Also reworked RunAssessProgressDisplay's signature: it now takes sendAnswer directly instead of an askInterview prompter, since prompting lives inside the model. Tests added/updated: TestAssessProgress_InterviewQuestionMountsForm, TestAssessProgress_SubmitInterviewAdvances, TestAssessProgress_FreeTextSentinelTransitions, TestAssessProgress_FreeTextEmptyMapsToUnknown, TestAssessProgress_QKeyDuringInterviewIsRoutedToForm. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tui/assess_progress.go | 291 ++++++++++++++++++++++++++++-------- tui/assess_progress_test.go | 127 ++++++++++++++-- tui/assess_wizard.go | 70 ++------- 3 files changed, 360 insertions(+), 128 deletions(-) diff --git a/tui/assess_progress.go b/tui/assess_progress.go index b1999e9..5418dc0 100644 --- a/tui/assess_progress.go +++ b/tui/assess_progress.go @@ -10,6 +10,7 @@ import ( "github.com/charmbracelet/bubbles/spinner" "github.com/charmbracelet/bubbles/viewport" tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/huh" "github.com/charmbracelet/lipgloss" ) @@ -22,37 +23,65 @@ type assessStepState struct { finishedAt time.Time } -// assessProgressModel is the Bubble Tea model for the maturity-assessment progress display. -// Mirrors progressModel (report) so visual design matches: two-pane layout, step list with -// ✔/✖/spinner icons, monotonic progress bar, right-side configuration summary. +// interviewSubState tracks where the embedded interview form is in its flow. +type interviewSubState int + +const ( + interviewIdle interviewSubState = iota // no interview active + interviewSelecting // showing the option select + interviewFreeText // showing the free-text input (after Other) +) + +const interviewFreeTextSentinel = "__free_text__" + +// assessProgressModel is the Bubble Tea model for the maturity-assessment +// progress display. Mirrors progressModel (report) so visual design matches: +// two-pane layout, step list with ✔/✖/spinner icons, monotonic progress bar, +// right-side configuration summary. +// +// The interview round-trip is hosted INSIDE this model — when an +// `interview-question` event arrives, the left pane swaps to a `huh.Form` +// inline (same shell header, same right-pane summary, same nav hints) instead +// of releasing the terminal. 
This keeps the framed layout continuous through +// the whole pipeline, matching how the report wizard handles its confirm +// step inside the same Bubble Tea program. type assessProgressModel struct { - steps []assessStepState - expectedSteps []string // canonical pipeline order, used to compute progress + show all steps from start - spinner spinner.Model - progressBar progress.Model - shellViewport viewport.Model - viewport viewport.Model - cfg *AssessConfig - title string - resultPath string - jsonPath string - jsonData string - errorMsg string - pendingQuestion *GenericEvent // set when an interview-question event arrives mid-flow - answersSent int - totalQuestions int - done bool - width int - height int - peakRatio float64 - cancelled bool + steps []assessStepState + expectedSteps []string // canonical pipeline order, used to compute progress + show all steps from start + spinner spinner.Model + progressBar progress.Model + shellViewport viewport.Model + viewport viewport.Model + cfg *AssessConfig + title string + resultPath string + jsonPath string + jsonData string + errorMsg string + + // Interview state — hosted in-model so the layout doesn't break. + interview interviewSubState + interviewEvent *GenericEvent + interviewForm *huh.Form + interviewChoice string + interviewFreeText string + answersSent int + totalQuestions int + + // sendAnswer is invoked when the embedded form completes. The model + // keeps no knowledge of the runner's stdin pipe — it just calls back. + sendAnswer func(questionID, value string, isOption bool) error + + done bool + width int + height int + peakRatio float64 + cancelled bool } // Messages used by the assess progress program. 
type assessStepMsg GenericEvent type assessDoneMsg struct{} -type assessAskMsg struct{ evt GenericEvent } -type assessAnswerSentMsg struct{ ok bool } type assessFatalMsg struct{ err error } // canonicalAssessSteps drives the right-pane progress denominator and the @@ -69,7 +98,12 @@ var canonicalAssessSteps = []string{ "complete", } -func newAssessProgressModel(title string, cfg *AssessConfig, totalQuestions int) assessProgressModel { +func newAssessProgressModel( + title string, + cfg *AssessConfig, + totalQuestions int, + sendAnswer func(qid, value string, isOption bool) error, +) assessProgressModel { s := spinner.New() s.Spinner = spinner.Dot s.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("14")) @@ -94,6 +128,7 @@ func newAssessProgressModel(title string, cfg *AssessConfig, totalQuestions int) title: title, expectedSteps: canonicalAssessSteps, totalQuestions: totalQuestions, + sendAnswer: sendAnswer, width: w, height: 24, } @@ -110,16 +145,34 @@ func (m assessProgressModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { m.height = msg.Height m.reflow() m.syncViewportContent() + if m.interviewForm != nil { + m.interviewForm = m.interviewForm.WithWidth(m.formWidth()) + } return m, nil case tea.KeyMsg: - switch msg.String() { - case "ctrl+c", "q": - if m.pendingQuestion == nil { + // While an interview form is active, route keys to the form first + // (so users can type, navigate options, etc.). 
+ if m.interviewForm != nil { + form, cmd := m.interviewForm.Update(msg) + if f, ok := form.(*huh.Form); ok { + m.interviewForm = f + } + if m.interviewForm.State == huh.StateCompleted { + return m.advanceInterview() + } + if m.interviewForm.State == huh.StateAborted { m.cancelled = true m.done = true return m, tea.Quit } + return m, cmd + } + switch msg.String() { + case "ctrl+c", "q": + m.cancelled = true + m.done = true + return m, tea.Quit } var cmd tea.Cmd m.viewport, cmd = m.viewport.Update(msg) @@ -153,6 +206,14 @@ func (m assessProgressModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { return m, tea.Quit } + // Forward non-key messages to the active form so its internal cmds run. + if m.interviewForm != nil { + form, cmd := m.interviewForm.Update(msg) + if f, ok := form.(*huh.Form); ok { + m.interviewForm = f + } + return m, cmd + } return m, nil } @@ -205,19 +266,22 @@ func (m assessProgressModel) handleStep(evt GenericEvent) (tea.Model, tea.Cmd) { return m, m.progressBar.SetPercent(m.peakRatio) case "interview-frame": - // Surface the framing message inline as an active step note. m.upsertActive("interview", evt.Message) m.syncViewportContent() return m, nil case "interview-question": - m.pendingQuestion = &evt + m.interviewEvent = &evt + m.interview = interviewSelecting + m.interviewChoice = "" + m.interviewFreeText = "" + m.interviewForm = m.buildInterviewSelectForm(evt) m.upsertActive( "interview", - fmt.Sprintf("Question %d of %d — awaiting answer (%s)…", m.answersSent+1, m.totalQuestions, evt.QuestionID), + fmt.Sprintf("Question %d of %d (%s)", m.answersSent+1, m.totalQuestions, evt.QuestionID), ) m.syncViewportContent() - return m, nil + return m, m.interviewForm.Init() case "result": m.resultPath = evt.OutputPath @@ -243,6 +307,97 @@ func (m assessProgressModel) handleStep(evt GenericEvent) (tea.Model, tea.Cmd) { return m, nil } +// advanceInterview runs after the embedded form completes. 
It either +// transitions to the free-text follow-up or submits the answer. +func (m assessProgressModel) advanceInterview() (tea.Model, tea.Cmd) { + switch m.interview { + case interviewSelecting: + if m.interviewChoice == interviewFreeTextSentinel { + // Switch to the free-text input form. + m.interview = interviewFreeText + m.interviewForm = m.buildInterviewFreeTextForm() + return m, m.interviewForm.Init() + } + return m.submitInterviewAnswer(m.interviewChoice, true) + case interviewFreeText: + value := strings.TrimSpace(m.interviewFreeText) + if value == "" { + value = "unknown" + } + return m.submitInterviewAnswer(value, false) + } + return m, nil +} + +func (m assessProgressModel) submitInterviewAnswer(value string, isOption bool) (tea.Model, tea.Cmd) { + qid := "" + if m.interviewEvent != nil { + qid = m.interviewEvent.QuestionID + } + if m.sendAnswer != nil && qid != "" { + if err := m.sendAnswer(qid, value, isOption); err != nil { + m.errorMsg = fmt.Sprintf("failed to send interview answer: %v", err) + m.done = true + return m, tea.Quit + } + } + m.answersSent++ + m.interview = interviewIdle + m.interviewEvent = nil + m.interviewForm = nil + m.interviewChoice = "" + m.interviewFreeText = "" + m.upsertActive( + "interview", + fmt.Sprintf("Answered %d of %d questions…", m.answersSent, m.totalQuestions), + ) + m.syncViewportContent() + return m, nil +} + +func (m *assessProgressModel) buildInterviewSelectForm(evt GenericEvent) *huh.Form { + options := evt.Options + if len(options) == 0 { + options = []string{"I don't know"} + } + huhOptions := make([]huh.Option[string], 0, len(options)+1) + for _, opt := range options { + huhOptions = append(huhOptions, huh.NewOption(opt, opt)) + } + if evt.AllowFreeText { + huhOptions = append( + huhOptions, + huh.NewOption("Other (type your own)", interviewFreeTextSentinel), + ) + } + + headerStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")) + header := headerStyle.Render( + fmt.Sprintf("Question %d 
of %d — %s", m.answersSent+1, m.totalQuestions, evt.QuestionID), + ) + + return huh.NewForm( + huh.NewGroup( + huh.NewNote().Title(header).Description(evt.QuestionText), + huh.NewSelect[string](). + Title("Pick an answer"). + Options(huhOptions...). + Value(&m.interviewChoice), + ), + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) +} + +func (m *assessProgressModel) buildInterviewFreeTextForm() *huh.Form { + return huh.NewForm( + huh.NewGroup( + huh.NewText(). + Title("Your answer"). + Description("Free text — leave blank for 'unknown'."). + Value(&m.interviewFreeText), + ), + ).WithWidth(m.formWidth()).WithTheme(huh.ThemeCharm()) +} + func (m *assessProgressModel) upsertActive(stepName, message string) { idx := m.findStep(stepName) if idx < 0 { @@ -268,9 +423,6 @@ func (m assessProgressModel) findStep(step string) int { return -1 } -// recalcPeakRatio computes a monotonically increasing ratio over the -// canonical step list. Steps not yet seen contribute 0; active steps -// contribute 0.5; complete/failed contribute 1. 
func (m *assessProgressModel) recalcPeakRatio() { denom := float64(len(m.expectedSteps)) if denom == 0 { @@ -307,18 +459,47 @@ func (m assessProgressModel) View() string { m.syncViewportContent() title := renderShellHeader(m.width) - leftPanel := m.renderProgressPanel() + + leftPanel := m.renderLeftPanel() rightPanel := m.renderConfigPanel() left := lipgloss.NewStyle().Width(m.leftPanelWidth()).Render(leftPanel) right := lipgloss.NewStyle().Width(m.rightPanelWidth()).Render(rightPanel) body := lipgloss.JoinHorizontal(lipgloss.Top, left, " ", right) - shell := lipgloss.JoinVertical(lipgloss.Left, title, "", body) + hintStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + hints := hintStyle.Render(m.hintsText()) + + shell := lipgloss.JoinVertical(lipgloss.Left, title, "", body, "", hints) m.shellViewport.SetContent(shell) return m.shellViewport.View() } +func (m *assessProgressModel) hintsText() string { + if m.interviewForm != nil { + return "↑↓ navigate • enter submit • esc cancel" + } + return "ctrl+c quit" +} + +func (m *assessProgressModel) renderLeftPanel() string { + if m.interviewForm != nil { + return m.renderInterviewPanel() + } + return m.renderProgressPanel() +} + +func (m *assessProgressModel) renderInterviewPanel() string { + contentWidth := m.contentWidth() + frame := lipgloss.NewStyle(). + Border(lipgloss.HiddenBorder()). + Padding(0, 1). 
+ Width(contentWidth) + inner := lipgloss.NewStyle().Width(max(20, contentWidth-2)) + body := inner.Render(m.interviewForm.View()) + return frame.Render(body) +} + func (m *assessProgressModel) leftPanelWidth() int { w := m.width if w <= 0 { @@ -331,6 +512,10 @@ func (m *assessProgressModel) leftPanelWidth() int { return lw } +func (m *assessProgressModel) formWidth() int { + return max(32, m.leftPanelWidth()-4) +} + func (m *assessProgressModel) rightPanelWidth() int { w := m.width if w <= 0 { @@ -485,40 +670,24 @@ type AssessProgressResult struct { Cancelled bool } -// RunAssessProgressDisplay drives the Bubble Tea progress program for the assess flow. -// It owns event-channel reading, dispatches interview prompts to a callback, and -// returns the final result + path data (or an error). +// RunAssessProgressDisplay drives the Bubble Tea progress program for the +// assess flow. Interview prompts are hosted INSIDE this same program — they +// don't release the terminal — so the framed two-pane layout is continuous. // -// askInterview is invoked synchronously when an "interview-question" event arrives; -// it must return the answer + isOption flag (or an error to abort). The Tea program -// exits the alt-screen for the duration of the prompt so huh can take over the TTY, -// then resumes. +// sendAnswer is invoked when each embedded interview form completes. It +// must write the answer JSON line back to the runner's stdin. 
func RunAssessProgressDisplay( title string, cfg *AssessConfig, res *AssessRunResult, - askInterview func(evt GenericEvent) (string, bool, error), + sendAnswer func(qid, value string, isOption bool) error, ) AssessProgressResult { - m := newAssessProgressModel(title, cfg, 7) + m := newAssessProgressModel(title, cfg, 7, sendAnswer) p := tea.NewProgram(m, tea.WithOutput(os.Stderr), tea.WithAltScreen()) go func() { for evt := range res.Events { - if evt.Type == "interview-question" { - // Pause the alt-screen, run the prompt synchronously, then forward the answer. - p.ReleaseTerminal() - value, isOption, err := askInterview(evt) - if err != nil { - p.Send(assessFatalMsg{err: err}) - p.RestoreTerminal() - continue - } - _ = SendInterviewAnswer(res, evt.QuestionID, value, isOption) - p.RestoreTerminal() - p.Send(assessStepMsg(evt)) - continue - } p.Send(assessStepMsg(evt)) } p.Send(assessDoneMsg{}) diff --git a/tui/assess_progress_test.go b/tui/assess_progress_test.go index d36b5ca..a68e912 100644 --- a/tui/assess_progress_test.go +++ b/tui/assess_progress_test.go @@ -9,7 +9,7 @@ import ( func newProgressForTest() assessProgressModel { cfg := DefaultAssessConfig() - m := newAssessProgressModel("Test", &cfg, 7) + m := newAssessProgressModel("Test", &cfg, 7, func(_, _ string, _ bool) error { return nil }) m.width = 100 m.height = 30 m.reflow() @@ -113,7 +113,7 @@ func TestAssessProgress_ErrorEventCapturesMessage(t *testing.T) { } } -func TestAssessProgress_InterviewQuestionUpdatesActiveStep(t *testing.T) { +func TestAssessProgress_InterviewQuestionMountsForm(t *testing.T) { m := newProgressForTest() updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "interview", Status: "active", Message: "Gathering Phase-1…"}) m = updated.(assessProgressModel) @@ -121,17 +121,119 @@ func TestAssessProgress_InterviewQuestionUpdatesActiveStep(t *testing.T) { Type: "interview-question", QuestionID: "q1", QuestionText: "what?", + Options: []string{"a", "b", "I don't know"}, + 
AllowFreeText: true, }) final := updated.(assessProgressModel) - if final.pendingQuestion == nil { - t.Fatal("pendingQuestion should be set") + if final.interviewEvent == nil { + t.Fatal("interviewEvent should be set") + } + if final.interviewForm == nil { + t.Fatal("interviewForm should be mounted") + } + if final.interview != interviewSelecting { + t.Errorf("interview state = %d, want interviewSelecting (%d)", final.interview, interviewSelecting) } idx := final.findStep("interview") if idx < 0 { t.Fatal("interview step not found") } - if !strings.Contains(final.steps[idx].message, "awaiting answer") { - t.Errorf("message = %q, want awaiting-answer note", final.steps[idx].message) + if !strings.Contains(final.steps[idx].message, "Question 1 of 7") { + t.Errorf("message = %q, want 'Question 1 of 7' progress", final.steps[idx].message) + } +} + +func TestAssessProgress_SubmitInterviewAdvances(t *testing.T) { + captured := struct { + qid string + value string + isOption bool + }{} + cfg := DefaultAssessConfig() + m := newAssessProgressModel("Test", &cfg, 7, func(qid, value string, isOption bool) error { + captured.qid = qid + captured.value = value + captured.isOption = isOption + return nil + }) + m.width = 100 + m.height = 30 + m.reflow() + + // Fire the question then simulate the form completing with a chosen option. 
+ updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q1", + QuestionText: "what?", + Options: []string{"yes", "no", "I don't know"}, + }) + m = updated.(assessProgressModel) + m.interviewChoice = "yes" + + updated2, _ := m.advanceInterview() + final := updated2.(assessProgressModel) + if captured.qid != "q1" || captured.value != "yes" || !captured.isOption { + t.Errorf("captured = %+v, want q1/yes/true", captured) + } + if final.interview != interviewIdle { + t.Errorf("interview state should be idle after submit, got %d", final.interview) + } + if final.interviewForm != nil { + t.Error("interviewForm should be nil after submit") + } + if final.answersSent != 1 { + t.Errorf("answersSent = %d, want 1", final.answersSent) + } +} + +func TestAssessProgress_FreeTextSentinelTransitions(t *testing.T) { + cfg := DefaultAssessConfig() + m := newAssessProgressModel("Test", &cfg, 7, func(_, _ string, _ bool) error { return nil }) + m.width = 100 + m.height = 30 + m.reflow() + + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q5", + QuestionText: "?", + Options: []string{"a"}, + AllowFreeText: true, + }) + m = updated.(assessProgressModel) + m.interviewChoice = interviewFreeTextSentinel + + updated2, _ := m.advanceInterview() + final := updated2.(assessProgressModel) + if final.interview != interviewFreeText { + t.Errorf("expected transition to interviewFreeText, got %d", final.interview) + } + if final.interviewForm == nil { + t.Error("free-text form should be mounted") + } +} + +func TestAssessProgress_FreeTextEmptyMapsToUnknown(t *testing.T) { + captured := "" + cfg := DefaultAssessConfig() + m := newAssessProgressModel("Test", &cfg, 7, func(_, value string, _ bool) error { + captured = value + return nil + }) + m.width = 100 + m.height = 30 + m.reflow() + m.interviewEvent = &GenericEvent{QuestionID: "q5"} + m.interview = interviewFreeText + m.interviewFreeText = " " + + updated, _ := 
m.advanceInterview() + final := updated.(assessProgressModel) + if captured != "unknown" { + t.Errorf("empty free-text should map to 'unknown', got %q", captured) + } + if final.interview != interviewIdle { + t.Errorf("should reset to interviewIdle, got %d", final.interview) } } @@ -184,14 +286,21 @@ func TestAssessProgress_KeyCtrlCSetsCancelled(t *testing.T) { } } -func TestAssessProgress_PendingQuestionBlocksCancel(t *testing.T) { +func TestAssessProgress_QKeyDuringInterviewIsRoutedToForm(t *testing.T) { + // While the interview form is mounted, key events go to the form (not + // the progress display's quit handler) so users can type freely. m := newProgressForTest() - updated, _ := m.handleStep(GenericEvent{Type: "interview-question", QuestionID: "q1", QuestionText: "?"}) + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q1", + QuestionText: "?", + Options: []string{"a", "b"}, + }) m = updated.(assessProgressModel) updated2, _ := m.Update(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune{'q'}}) final := updated2.(assessProgressModel) if final.cancelled { - t.Error("cancelled should remain false while a question is pending") + t.Error("cancelled should remain false while interview form is active") } } diff --git a/tui/assess_wizard.go b/tui/assess_wizard.go index 51e2ecb..94d8e73 100644 --- a/tui/assess_wizard.go +++ b/tui/assess_wizard.go @@ -38,7 +38,14 @@ func runAssessInteractive(cfg *AssessConfig) error { } defer runner.Close() - progress := RunAssessProgressDisplay("Agent Maturity Assessment", cfg, runner, promptInterviewQuestion) + progress := RunAssessProgressDisplay( + "Agent Maturity Assessment", + cfg, + runner, + func(qid, value string, isOption bool) error { + return SendInterviewAnswer(runner, qid, value, isOption) + }, + ) for runErr := range runner.Errors { if runErr != nil { @@ -460,60 +467,7 @@ func parseRepoCSV(s string) []string { return out } -// promptInterviewQuestion shows a single Phase-1 question via huh 
and returns -// the captured answer. Choosing "Other" pops a free-text follow-up. -// -// Used as the askInterview callback by RunAssessProgressDisplay during the -// interactive interview round-trip — the Tea program releases the terminal -// for the duration of the prompt, then resumes. -func promptInterviewQuestion(evt GenericEvent) (string, bool, error) { - const freeTextSentinel = "__free_text__" - options := evt.Options - if len(options) == 0 { - options = []string{"I don't know"} - } - choice := "" - - huhOptions := make([]huh.Option[string], 0, len(options)+1) - for _, opt := range options { - huhOptions = append(huhOptions, huh.NewOption(opt, opt)) - } - if evt.AllowFreeText { - huhOptions = append(huhOptions, huh.NewOption("Other (type your own)", freeTextSentinel)) - } - - form := huh.NewForm( - huh.NewGroup( - huh.NewSelect[string](). - Title(evt.QuestionText). - Description(fmt.Sprintf("[%s]", evt.QuestionID)). - Options(huhOptions...). - Value(&choice), - ), - ).WithTheme(huh.ThemeCharm()) - if err := form.Run(); err != nil { - return "unknown", false, err - } - - if choice == freeTextSentinel { - freeText := "" - ftForm := huh.NewForm( - huh.NewGroup( - huh.NewText(). - Title("Your answer"). - Description("Free text — leave blank for 'unknown'."). - Value(&freeText), - ), - ).WithTheme(huh.ThemeCharm()) - if err := ftForm.Run(); err != nil { - return "unknown", false, err - } - freeText = strings.TrimSpace(freeText) - if freeText == "" { - return "unknown", false, nil - } - return freeText, false, nil - } - - return choice, true, nil -} +// Interview prompts are hosted inside the progress model now (see +// assess_progress.go::buildInterviewSelectForm). Keeping that logic out +// of a standalone huh.Form.Run() keeps the framed two-pane layout +// continuous through the entire pipeline. 
From bf0d9f09f0a60a7f68fc9f54253b83d381752cb8 Mon Sep 17 00:00:00 2001 From: Asa Baylus Date: Sun, 10 May 2026 21:06:10 -0400 Subject: [PATCH 4/6] test(assess): boost Go coverage past 85% threshold + ship shareable skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI's test-go job failed at 82.6% < 85% block coverage. Added tui/assess_coverage_test.go covering the uncovered functions in the new assess package: - assess_progress: Init, View with mounted interview form, hintsText, renderInterviewPanel, fitLine truncation paths, WindowSizeMsg reflow, failed-step rendering, report-data event handling, contentWidth / formWidth minimums - assess_preview: Init, reflow, previewFrameHeight floor, Update for rendered/keymsg/window-resize, tab switching forward+reverse with wraparound, q-quit, RunAssessPreview happy-path with stubbed teaProgramRun, View while rendering and after render - assess_wizard: Init, WindowSizeMsg, esc with empty history aborts, validateLocalPath edge cases (empty / nonexistent / file / dir), defaultScopeMode / defaultLocalPath - assess_flags: applyAssessFlagsTo with all flags set vs none set - assess_config: SaveAssessConfig dir-creation branch New total: 85.5%. Also ships share/skills/agent-maturity-assessment/ — a self-contained, harness-agnostic copy of the skill (SKILL.md + references/criteria.md + interview.md + output-template.md + preflight.md) intended for distribution to other Claude harnesses. Mentions the teamhero binary as an optional accelerator but works standalone in pure-Claude mode. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../skills/agent-maturity-assessment/SKILL.md | 129 +++++ .../references/criteria.md | 203 ++++++++ .../references/interview.md | 108 ++++ .../references/output-template.md | 102 ++++ .../references/preflight.md | 107 ++++ tui/assess_coverage_test.go | 473 ++++++++++++++++++ 6 files changed, 1122 insertions(+) create mode 100644 share/skills/agent-maturity-assessment/SKILL.md create mode 100644 share/skills/agent-maturity-assessment/references/criteria.md create mode 100644 share/skills/agent-maturity-assessment/references/interview.md create mode 100644 share/skills/agent-maturity-assessment/references/output-template.md create mode 100644 share/skills/agent-maturity-assessment/references/preflight.md create mode 100644 tui/assess_coverage_test.go diff --git a/share/skills/agent-maturity-assessment/SKILL.md b/share/skills/agent-maturity-assessment/SKILL.md new file mode 100644 index 0000000..6cbc5e5 --- /dev/null +++ b/share/skills/agent-maturity-assessment/SKILL.md @@ -0,0 +1,129 @@ +--- +name: agent-maturity-assessment +description: Run the Agent Maturity Assessment — a 12-criterion diagnostic for engineering organization readiness in the AI-agentic coding era. Items score 0/0.5/1 across four weighted categories (engineering basics 1.0×, knowledge & context 1.5×, AI governance & quality 1.25×, hiring 1.0×), producing a weighted percentage and a raw /12. Use whenever the user wants to audit, diagnose, or score an engineering organization, team, repo, or recently acquired company for AI readiness. Trigger on phrases like "agent maturity", "agent readiness", "AI maturity", "engineering org health", "engineering maturity", "audit the team", "score this repo", "diagnose dev experience", "is this team ready for AI", "is this team modern", "how healthy is this org", or any onboarding-era assessment — even when the user doesn't say "skill". 
Produces a scored audit with item-level evidence, category subtotals, weighted overall score, top fixes, and strengths to preserve. +--- + +# Agent Maturity Assessment + +A diagnostic for engineering organization health in the AI-agentic coding era. The question this assessment answers: **is this org capable of shipping safely with humans and agents working in parallel, on a codebase that doesn't degrade with every iteration?** + +This skill owns the criteria, the scoring rubric, and the audit output format. It runs against either a whole organization or a specific scope (team, product line, repo). + +## When to use + +- **One-shot audit**: assess an organization's current state during onboarding, or a specific team / repo / acquired company. +- **Recurring**: re-run quarterly against the same org to track movement, or against new sub-teams as they form or get acquired. +- **Spot-check**: a single repo or service can be scored against just the items that apply (note which items were skipped and why). + +The artifact is the deliverable. Always produce the written audit using the template in `references/output-template.md` — never just give a verbal summary. + +## Two ways to run + +### Option A — Pure-Claude mode (no binary, harness-agnostic) + +Default. Works in Claude Code, Cowork, Claude Desktop, and any other Claude harness that exposes a structured-question UI. Follow the steps in *How to run an audit* below — read the references on demand, use `AskUserQuestion` (or the equivalent tool) for the Phase-1 interview, write the output yourself. + +### Option B — Team Hero binary accelerator (optional) + +If `teamhero` is installed and the user has `OPENAI_API_KEY` configured, the binary automates the whole pipeline (preflight → adjacent-repo detection → Phase-1 interview → 12 deterministic evidence collectors → AI scoring → audit writer) and produces the markdown + JSON sidecar in one shot. 
+ +```bash +# Detect availability +teamhero --version 2>/dev/null || teamhero-tui --version 2>/dev/null + +# Interactive (TUI walks you through scope + the 7 Phase-1 questions) +teamhero assess + +# Headless audit of the current repo +teamhero assess --headless --path . \ + --interview-answers ./answers.json +``` + +The binary is a strict superset of pure-Claude mode — it uses the exact same rubric, interview questions, output template, and preflight tier system documented here. If the binary isn't available, fall back to Option A. + +## The 12 criteria at a glance + +|# |Item |Category |Weight| +|--|--------------------------------------------------|--------------------------|------| +|1 |Reproducible dev environments |A. Engineering basics |1.0× | +|2 |Sub-day integration cadence with measured outcomes|A. Engineering basics |1.0× | +|3 |Testability and the agent inner loop |A. Engineering basics |1.0× | +|4 |Observability before features |A. Engineering basics |1.0× | +|5 |Design discipline as a first-class practice |B. Knowledge & context |1.5× | +|6 |Codebase composed of deep modules |B. Knowledge & context |1.5× | +|7 |Repo-local agent context |B. Knowledge & context |1.5× | +|8 |Sanctioned, governed AI tooling |C. AI governance & quality|1.25× | +|9 |Human review on every PR |C. AI governance & quality|1.25× | +|10|Evals for AI-touched code paths |C. AI governance & quality|1.25× | +|11|Blast-radius controls for agent actions |C. AI governance & quality|1.25× | +|12|Interviews assess judgment under AI augmentation |D. Hiring |1.0× | + +Each item scores **1.0** (pass), **0.5** (partial), or **0.0** (fail). Be conservative: if it's not visibly true, it's 0.5. If there's no evidence at all, it's 0. + +**For full score levels, repo checks, and diagnostic commands per item, read `references/criteria.md`.** + +Category B is weighted highest because it compounds — a team that gets B right tends to fix everything else. + +## How to run an audit + +1. 
**Decide scope.** Whole org, one product line, one repo, or one team. Score the appropriate level — don't average across heterogeneous teams. A 14-person backend team and a 3-person ML team should be scored separately. +2. **Environment preflight.** Read `references/preflight.md`. Probe for `gh` CLI / GitHub MCP / git access and select an evidence-fidelity tier before running any diagnostics. **Always announce the tier you're running at** so the audit is reproducible. +3. **Phase 1 — Org-level interview.** Read `references/interview.md` first. Read `docs/audits/CONFIG.md` for stored answers, present them for confirmation or refresh, ask fresh for any missing. Do this before evidence gathering so the answers can inform scoring on items 2, 5, 8, 10, 11, 12. **Critical:** ask one question at a time and wait for the answer before asking the next — even in auto / autonomous modes. Use the structured question UI (e.g., `AskUserQuestion`) when available with the option sets in `references/interview.md`. Dumping all 7 questions in one message and proceeding without answers produces a hollow audit; treat each question as a hard checkpoint. +4. **Map adjacent repos.** Read `references/preflight.md` (multi-repo section). CI templates, Terraform modules, QA suites, runbooks, and shared agent context often live in sibling repos. Capture the list before scoring; merge in any out-of-band repos surfaced by Phase 1 question 7. +5. **Gather evidence per item.** Don't take anyone's word for it. For each item, do at least one of: read the repo (and its adjacents), run the diagnostic commands listed in `references/criteria.md` at the highest fidelity tier available, ask a non-leadership IC the diagnostic question, or check the relevant dashboard/settings page. Combine repo evidence with Phase 1 answers using the mapping table in `references/interview.md`. +6. **Score conservatively.** When in doubt, 0.5. Revise up next quarter if evidence appears. 
If a Phase 1 answer was "I don't know", score that item `n/a` — never `0`. +7. **Write the audit** using the template in `references/output-template.md`. The artifact is the deliverable. Each "Why this score" cell is one sentence, ≤ 25 words. +8. **Update CONFIG.md** with confirmed/updated Phase 1 answers and today's date (see `references/interview.md` for format). +9. **Decide on distribution.** First audit at a new role is usually best kept internal until the calibration has been validated. Re-run in 90 days. + +## Scoring + +**Raw score**: sum of all 12 item scores. Max 12. + +**Weighted score** (recommended primary metric): + +``` +A_total = sum(items 1–4) × 1.00 // max 4.00 +B_total = sum(items 5–7) × 1.50 // max 4.50 +C_total = sum(items 8–11) × 1.25 // max 5.00 +D_total = sum(item 12) × 1.00 // max 1.00 + ────────── +weighted = A + B + C + D +max = 14.50 +score% = (weighted / 14.50) × 100 +``` + +If any item is scored `n/a`, drop it from both numerator and max for that audit and note it in the Summary. + +**Bands**: + +|Band |Score %|Interpretation | +|-----------------------|-------|---------------------------------------------------------------------------------------| +|Excellent |90%+ |Genuinely rare. Confirm with a second pass — first audits often score too generously. | +|Healthy |75–89% |Targeted fixes will compound. | +|Functional but slow |60–74% |Real risk of being out-shipped by AI-native competitors. Where most orgs actually live.| +|Significant dysfunction|40–59% |Treat as a turnaround. | +|Triage |<40% |Stop new feature work until basics are in. | + +The bar: **<11/12 raw and <80% weighted means there's leverage to capture.** + +## Operating principles + +- **Score conservatively.** Better to score 0.5 and revise up than to over-score on day one and have to explain why everything got "worse". +- **Evidence beats assertions.** A team that says they have ADRs but the last one was committed two years ago scores 0.5, not 1.0. 
+- **Unknown ≠ failing.** If a criterion can't be answered from the repo and the human indicates the answer is unknown or out of scope, score it `n/a`, drop it from numerator and max, and note what would resolve it. Do not default to 0 for absence of context. +- **Don't average heterogeneous teams.** Score them separately and report side-by-side. +- **Use it as a conversation tool, not a club.** The point is to find leverage, not to grade people. +- **Re-score quarterly.** Movement matters more than absolute level. +- **Calibrate against itself, not against other companies.** The first audit is the baseline; trends are the signal. + +## Adapting the assessment + +As organizations mature and the AI tooling landscape shifts, expect items to be added, dropped, or re-weighted. Track changes to the assessment itself (not just individual audits) in an `audits/CHANGELOG.md` so historical scores remain interpretable. + +## Reference files + +- `references/preflight.md` — Environment preflight, evidence tiers, multi-repo scope handling, host-side probe script. +- `references/criteria.md` — Full text of all 12 criteria: score levels, repo checks, diagnostic commands, why each matters. +- `references/interview.md` — Phase 1 questions, internal Q→criterion mapping, CONFIG.md storage format. +- `references/output-template.md` — Audit output template, rules for filling it out, worked example of a "Why this score" cell. diff --git a/share/skills/agent-maturity-assessment/references/criteria.md b/share/skills/agent-maturity-assessment/references/criteria.md new file mode 100644 index 0000000..c6da4fb --- /dev/null +++ b/share/skills/agent-maturity-assessment/references/criteria.md @@ -0,0 +1,203 @@ +# Criteria reference + +Full text of all 12 criteria for the Agent Maturity Assessment: score levels, repo checks, diagnostic commands, and rationale per item. Read this when gathering evidence (step 5 of *How to run an audit* in `SKILL.md`). 
+ +Each item scores **1.0** (pass), **0.5** (partial), or **0.0** (fail). Be conservative: if it’s not visibly true, it’s 0.5. If there’s no evidence at all, it’s 0. If a criterion can’t be assessed from the repo and the user indicated unknown in Phase 1, score it `n/a` (see *Unknown ≠ failing* in `SKILL.md`). + +## Category A — Engineering basics (weight 1.0×) + +Non-negotiable foundations. Failure here multiplies risk on everything else. + +### 1. Reproducible dev environments + +- 1.0 — Clone-to-green-build in <30 min via devcontainer, Nix, or a single setup script. Same path works for an agent. +- 0.5 — README exists but bootstrap takes >2 hours or has known broken steps. +- 0.0 — “Ask Bob, he knows the trick.” + +**Repo check:** `.devcontainer/`, `flake.nix`, `setup.sh`, or equivalent. Run it from a clean machine. + +**Diagnostic commands:** + +- `ls .devcontainer/ flake.nix setup.sh scripts/bootstrap* 2>/dev/null` — bootstrap surface +- `time bash <setup-script>` on a clean machine to verify the <30 min claim +- `gh repo view <org>/<repo> 2>/dev/null` for any external bootstrap repo identified during scope mapping + +**Why it matters:** Onboarding latency is the first multiplier on team velocity, and agents need bootstrappable environments too. If a human can’t get green in 30 minutes, an agent definitely can’t. + +### 2. Sub-day integration cadence with measured outcomes + +- 1.0 — Code integrates to mainline at least daily. PRs are small and merge sub-day. All four DORA metrics (deployment frequency, lead time, change-fail rate, MTTR) are tracked and visible. Branching model can be trunk-based, GitHub flow, or short-lived Git flow — what matters is the absence of long-lived branches and the presence of measured integration discipline. +- 0.5 — Some metrics tracked, but cadence is weekly, PRs sit for days, or feature branches routinely outlive a sprint. +- 0.0 — Long-lived feature branches as the norm, release trains measured in months, no metrics.
+ +**Repo check:** age distribution of merged PRs over the last 90 days; presence of any DORA dashboard. + +**Diagnostic commands:** + +- `gh pr list --state merged --limit 200 --search "merged:>$(date -d '90 days ago' +%Y-%m-%d)" --json mergedAt,createdAt,additions,deletions,reviews,author` — cadence + lead time + PR size + review counts in one call +- `gh api "repos/{owner}/{repo}/branches?per_page=100" --paginate --jq '.[] | {name, last_commit_sha: .commit.sha}'` then resolve commit dates → branch staleness distribution +- `gh run list --workflow=deploy*.yml --limit 100 --json conclusion,createdAt,name --branch ` — deployment frequency proxy and change-fail rate (failed conclusions / total) +- For monorepos with deploys in adjacent infra/CD repos: rerun the `gh run list` against `/` + +**Combine with Phase 1 Q3** (DORA visibility): repo evidence covers cadence; the interview answer covers whether the four metrics are *actually visible to the team*. + +**Why it matters:** Integration cadence is the leading indicator of engineering performance. With agents in the loop the case is stronger — agents work fastest when changes validate against current main immediately, and long-lived branches accumulate integration debt humans have to resolve later. + +### 3. Testability and the agent inner loop + +- 1.0 — The application is *built* to be tested: real seams (DI, ports/adapters, deep modules with clean interfaces) so behaviors can be verified at module boundaries without spinning up the world. Unit tests are sub-second; the full suite runs in minutes; flaky tests are treated as bugs and fixed within a sprint. A single command runs the suite headlessly with machine-parseable output. TDD-style inner loops — write the test, make it pass, refactor — are the *default* mode of working with AI. 
+- 0.5 — Tests exist and mostly run, but the application has known untestable areas, the suite is slow enough to break flow, flaky tests get re-run rather than fixed, or TDD with agents is occasional rather than default. +- 0.0 — Manual QA, flaky-and-ignored test suite, or no seams in the application — agents can technically run `npm test` but the signal is garbage. + +**Repo check:** run the suite, time it, check failure rate over the last 50 CI runs; sample a recent feature PR and look at whether tests were written before or after the implementation. + +**Diagnostic commands:** + +- `time ` (e.g. `time pnpm test`, `time dotnet test`) — full suite duration +- `find . -name "*.test.*" -o -name "*.spec.*" -o -name "*Tests.cs" 2>/dev/null | wc -l` — test file count as a sanity floor +- `gh run list --workflow=ci.yml --limit 50 --json conclusion --jq '[.[] | .conclusion] | group_by(.) | map({status: .[0], count: length})'` — flake/fail rate +- `grep -rE "\\|\\|\\s*true|continue-on-error:\\s*true" .github/workflows/ 2>/dev/null` — CI swallowing failures (any hit = item probably 0.0 regardless of test count) +- For QA in adjacent repo (e.g. `/qa-e2e`): `gh repo view /` and inspect its CI run history the same way + +**Why it matters:** Humans can reason around bad tests (“yeah, that test is garbage, but I know the code works”). Agents can’t — they follow the signal. The test suite is the rate limit on agent throughput; agents without fast, trustworthy feedback outrun their headlights and produce thrash. + +### 4. Observability before features + +- 1.0 — Structured logs, distributed traces, error budgets defined, on-call with runbooks. New features ship instrumented. +- 0.5 — Logs and metrics exist but tracing is partial; runbooks stale. +- 0.0 — “We grep CloudWatch when something breaks.” + +**Repo check:** OTel libraries in deps, dashboards exist, error budget docs, recency of last runbook update. 
+ +**Diagnostic commands:** + +- `grep -rEh "OpenTelemetry|opentelemetry|Microsoft\\.ApplicationInsights|datadog|prometheus|grafana|loki|tempo|sentry|honeycomb|newrelic|splunk" --include="*.csproj" --include="package.json" --include="go.mod" --include="requirements*.txt" --include="Cargo.toml" --include="pom.xml" --include="build.gradle*" 2>/dev/null` — instrumentation / agent libs (Grafana itself is viz; this catches the Grafana Cloud agent, faro SDK, Loki/Tempo clients that feed it) +- `find . \( -path "*/grafana/*.json" -o -path "*/dashboards/*.json" -o -name "*.libsonnet" -o -path "*/prometheus/*.yml" -o -path "*/alerts/*.yml" \) -not -path "*/node_modules/*" 2>/dev/null` — committed Grafana dashboards, Jsonnet, Prometheus alert rules +- `find . -ipath "*runbook*" -o -ipath "*incident*" -o -ipath "*sli*" -o -ipath "*slo*" 2>/dev/null` — runbook / SLO presence +- `git log --since="180 days ago" --oneline -- docs/runbooks/ docs/ops/ 2>/dev/null | wc -l` — recency of operational docs +- For dashboards/alerts in an adjacent repo (e.g. `/observability`, `/grafana-dashboards`): rerun the dashboard-file `find` there — score across both + +**Why it matters:** You can’t fix what you can’t see. AI accelerates ship rate, which accelerates incident rate — observability is the safety net that makes acceleration survivable. + +## Category B — Knowledge & context (weight 1.5×) + +This is what’s gotten *more* important with LLMs, not less. Agents perform at the level of context the org provides them, and codebase shape determines whether agents can navigate it at all. Weighted highest because this category compounds — a team that gets B right tends to fix everything else. + +### 5. Design discipline as a first-class practice + +- 1.0 — ADRs are current and dated. ARCHITECTURE.md exists per active repo. A **ubiquitous language glossary** is checked in, referenced in agent context, and the team enforces its terms in code, docs, and conversation. 
Design happens *before* code generation: agents are pointed at planning skills (e.g., “interview-me-until-shared-understanding” patterns) that force a shared design concept before any code is written. ADR/glossary commits are visible in the last 90 days — design is an ongoing investment, not a one-time write. +- 0.5 — Some design artifacts exist but are stale; ubiquitous language is implicit (people just know the terms); planning happens informally before some agent work but not consistently. +- 0.0 — Tribal knowledge. Architecture lives in one staff engineer’s head. Agents are turned loose without shared design concept and produce confidently wrong code. + +**Repo check:** `docs/adr/`, `ARCHITECTURE.md`, glossary or ubiquitous-language file; check git log on those paths for recency; sample an agent-driven PR for evidence of upfront design vs. straight-to-code. + +**Diagnostic commands:** + +- `find . -ipath "*adr*" -name "*.md" 2>/dev/null | head; find . -iname "ARCHITECTURE.md" -o -iname "GLOSSARY.md" -o -iname "*ubiquitous*" 2>/dev/null` — design surface +- `git log --since="90 days ago" --oneline -- docs/adr/ ARCHITECTURE.md 2>/dev/null | wc -l` — ongoing investment vs. one-time write +- For ADRs in a central docs repo: `gh api "repos///contents/adr" --jq '.[].name'` + +**Combine with Phase 1 Q4** (design before code): files prove artifacts exist; the interview answer proves design happens *before* code generation in practice. + +**Why it matters:** Specs-to-code without design discipline produces software entropy — each iteration makes the codebase worse. Investing in design daily is what keeps tactical AI execution aligned with strategic intent. The ubiquitous language is the bridge between domain experts, engineers, and agents — without it, every translation step introduces drift. + +### 6. 
Codebase composed of deep modules + +- 1.0 — The codebase is structured as **deep modules**: few large modules, each with substantial functionality hidden behind a simple, stable interface. Public interfaces are small and intentional; implementations can be sizeable but encapsulated. When agents add code, they add it inside an existing deep module’s boundary or create a new module with a clear interface — they don’t sprinkle helpers across the codebase. +- 0.5 — Some areas well-modularized; others are shallow / sprinkly. Agents tend to add code in surface-level helpers rather than respecting boundaries. A handful of god-classes exist but are known and bounded. +- 0.0 — Sprawling shallow modules with leaky interfaces; 4000-line god files alongside 30-line helper files with no clear pattern. Agents can’t navigate the module map and produce code that crosses arbitrary boundaries. + +**Repo check:** file size distribution, public API surface per module, sample two random modules and see whether you can summarize each one’s purpose in a sentence; drop one into an LLM and ask it to explain. + +**Why it matters:** AI excels at filling in implementation when given a clean interface; it produces sprawl when given no constraints. Deep modules give agents the right *shape* of problem to solve. Shallow codebases compound entropy with every agent-driven change. + +### 7. Repo-local agent context + +- 1.0 — `CLAUDE.md` / `AGENTS.md` / skill files checked into the repo. Team-level prompt and skill libraries are versioned. Agents joining the team get the same onboarding humans get. Agent context references the ubiquitous language and the module map (items 5 + 6). +- 0.5 — Some individuals have personal CLAUDE.md files; nothing shared at the repo level. +- 0.0 — No agent context anywhere; people copy-paste instructions into chat each time. + +**Repo check:** `CLAUDE.md`, `AGENTS.md`, `.claude/`, `.cursor/rules/`, `.skills/`, or equivalent. 
Read one — does it teach the agent something the engineer wouldn’t have to be told? + +**Diagnostic commands:** + +- `find . -maxdepth 4 \( -iname "CLAUDE.md" -o -iname "AGENTS.md" -o -name ".claude" -o -name ".cursor" -o -name ".skills" -o -name "memory-bank" \) -not -path "./node_modules/*" -not -path "./.git/*" 2>/dev/null` — agent-context surface +- For each found file/dir: `wc -l` and `git log -1 --format="%ar" -- ` to gauge depth and recency +- For shared agent context in adjacent repo (e.g. `/claude-skills`, `/.github`): `gh repo view /` and check whether this repo references it + +**Why it matters:** Agents perform at the level of context the repo provides them. Ad-hoc personal prompts mean each engineer’s agent operates at a different standard; checked-in context means everyone (and every agent) gets the same baseline. + +## Category C — AI governance & quality (weight 1.25×) + +The new control plane. + +### 8. Sanctioned, governed AI tooling + +- 1.0 — Approved model list, ZDR posture documented, secrets scanning on agent outputs, clear policy on what can / can’t be sent to third parties, paid seats budgeted. +- 0.5 — Tooling is paid for but governance is loose; or governance is tight but everyone uses personal accounts anyway. +- 0.0 — Shadow AI. People paste prod data into free-tier chatbots. + +**Diagnostic:** primary signal is the user interview answer (Phase 1 Q1). Cross-check against any policy docs in `/.github` or an internal handbook if reachable. If the user said “I don’t know”, score `n/a`. + +**Why it matters:** Shadow AI is shadow IT with worse confidentiality and IP risk. Governance now is cheaper than recovering from a leak later. + +### 9. Human review on every PR regardless of authorship + +- 1.0 — AI-generated code is reviewed by a human who understands it well enough to defend it in a postmortem. “The agent wrote it” is not a shield. +- 0.5 — Reviews happen but are cursory; AI-authored PRs get rubber-stamped. 
+- 0.0 — Auto-merge on agent PRs, or no review process at all. + +**Repo check:** PR review settings, review depth on a sample of recent AI-tagged PRs. + +**Diagnostic commands:** + +- `find . -name "CODEOWNERS" 2>/dev/null` — review enforcement file +- `gh api "repos/{owner}/{repo}/branches//protection" 2>/dev/null` — branch protection rules (auth scope permitting) +- `gh pr list --state merged --limit 50 --json reviews,author,additions,deletions --jq '[.[] | {pr: .number, author: .author.login, reviewers: [.reviews[].author.login] | unique, lines: (.additions + .deletions)}]'` — review depth and non-author reviewer presence per PR +- For org-level review policy in `/.github`: `gh api "repos//.github/contents/" --jq '.[].name'` + +**Why it matters:** AI-authored code that no human can defend is technical debt with no owner. Review discipline is what keeps the org accountable for what it ships. + +### 10. Evals for AI-touched code paths + +- 1.0 — If LLMs are in the product → offline eval suite + prod telemetry. If LLMs are in the dev loop → adoption, throughput, and defect rate measured honestly (not just “everyone loves it”). +- 0.5 — Vibes-based confidence; some metrics but no rigor. +- 0.0 — No evals, no measurement, no idea if the AI helps or hurts. + +**Repo check:** `evals/`, `benchmarks/`, internal AI tooling dashboards. + +**Combine with Phase 1 Q5** (eval coverage): repo evidence covers product-side evals; the interview answer covers dev-loop measurement, which rarely lives in the repo. If the user said “I don’t know” *and* no `evals/` or `benchmarks/` directory exists, score `n/a`. + +**Why it matters:** Without evals, you can’t tell whether AI is helping or hurting — you’re managing on vibes. Evals are also the only way to catch silent regressions in AI-driven product features. + +### 11. Blast-radius controls for agent actions + +- 1.0 — Scoped credentials per agent, dry-run modes, audit logs of every agent-triggered write, documented rollback paths. 
The “agent shipped a migration to prod at 2am” scenario has been red-teamed. +- 0.5 — Some controls exist but are inconsistent; audit logs partial. +- 0.0 — Agents have prod write access via human-equivalent creds; no audit trail. + +**Diagnostic question:** “what’s the dumbest possible agent action that could break prod, and would we know within 5 minutes?” + +**Diagnostic commands:** + +- `grep -rEh "azure/login@|aws-actions/configure-aws-credentials@|google-github-actions/auth@" .github/workflows/ 2>/dev/null` — OIDC adoption (presence of `with: client-id:` rather than `secrets.AWS_ACCESS_KEY_ID` is the green flag) +- `gh api "repos/{owner}/{repo}/environments" --jq '.environments[] | {name: .name, has_protection: (.protection_rules | length > 0)}' 2>/dev/null` — env-scoped deploys with reviewers +- `find infra/ terraform/ -name "*.tf" 2>/dev/null | xargs grep -lE "service_account|workload_identity|managed_identity|user_assigned_identity" 2>/dev/null` — scoped per-workload identities +- `grep -rEh "azurerm_role_assignment|google_project_iam|aws_iam_role" infra/ terraform/ 2>/dev/null | wc -l` — IAM blast-radius posture +- For Terraform/IAM in adjacent infra repo (e.g. `/infra`): clone shallow and rerun the same greps there + +**Combine with Phase 1 Q6** (red-team posture): files prove technical posture; the interview answer proves the worst-case scenario has been thought through. + +**Why it matters:** Autonomous agents will eventually do something stupid. The question is whether the blast radius is bounded by design or by luck. + +## Category D — Hiring (weight 1.0×) + +### 12. Interviews assess judgment under AI augmentation + +- 1.0 — Candidates use AI in interviews and are evaluated on critique, decomposition, recognizing wrong answers, and shipping correct work. The bar is “great judgment with AI”, not “no AI allowed”. +- 0.5 — AI is allowed but interviewers don’t know how to assess its use; or it’s banned for “purity” reasons. 
+- 0.0 — Old-style whiteboard-only interviews; or no real technical bar at all. + +**Diagnostic:** primary signal is the user interview answer (Phase 1 Q2). If a rubric is reachable in an internal repo, cross-check. If the user said “I don’t know”, score `n/a`. + +**Why it matters:** Hiring is a forward-looking bet. The skill that matters in the AI-agentic era isn’t “can write code without AI” — it’s “can use AI well.” Interviews that don’t measure that bet on the wrong skill. \ No newline at end of file diff --git a/share/skills/agent-maturity-assessment/references/interview.md b/share/skills/agent-maturity-assessment/references/interview.md new file mode 100644 index 0000000..436d385 --- /dev/null +++ b/share/skills/agent-maturity-assessment/references/interview.md @@ -0,0 +1,108 @@ +# Phase 1: Org-level interview + +Several criteria can't be answered from the codebase alone — they're behavioral, organizational, or policy facts. Phase 1 collects those answers from a human before scoring begins, and persists them to `docs/audits/CONFIG.md` so re-audits can confirm-or-refresh rather than re-interview from scratch. + +Read this when running step 3 of *How to run an audit* in `SKILL.md`. + +## How to ask the questions — read this carefully + +**Phase 1 is a real interview, not a form-dump.** The signal you get back depends entirely on the human actually engaging with each question. If you paste all seven questions in one message and then proceed, the human will skim, give you `n/a` to most, and the audit will be hollow. **Don't do that.** + +### The rule + +**Ask one question. Stop. Wait for the answer. Only then move to the next question.** This applies even in auto / autonomous modes — the interview is the rare place where blocking on a human is the *correct* behavior, because there is no other source for these answers. Treat each question as a hard checkpoint. 
+ +If the user has not yet replied to question N, you may not ask question N+1 and you may not begin evidence gathering. The only exception is if the user explicitly says "skip the rest" or "just score what you can without me" — in which case mark every remaining question `unknown` and proceed. + +### Use the structured question UI when it's available + +If you have access to a tool that presents the user with a question + a small set of pre-written answer options (in Claude Desktop / Claude Code this is the `AskUserQuestion` tool — the user sees buttons or a list they can click; in other harnesses it may have a different name), **use it for every Phase 1 question**. It dramatically increases response rates and gives you cleaner answers to persist into CONFIG.md. + +For each question: + +1. Frame the question itself (verbatim from the list below — don't paraphrase, the wording is calibrated). +2. Provide 3–5 answer options (see the suggested sets below) that map cleanly to the score levels for the corresponding criterion. Always include an "I don't know / not sure" option — that maps to `n/a`, never `0`. +3. Allow a free-text override so the user can give nuance the options miss. + +If no structured-question tool is available in this harness, fall back to plain chat — but still **one question per message, and wait for the reply before sending the next one**. + +### Suggested option sets + +These are starting points — adapt the wording to the org if you have context, but keep the spread of "good / partial / bad / unknown" intact. 
+ +|Q#|Suggested options | +|--|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|1 |• Company-paid managed seats + documented data-handling policy
• Company-paid seats but governance is loose / no written policy
• Mostly personal accounts or free tier; no policy
• I don't know | +|2 |• AI allowed in interviews, interviewers trained to assess judgment with AI
• AI allowed but assessment is informal / uncalibrated
• AI banned, or interviews don't really test technical judgment
• I don't know | +|3 |• All four DORA metrics tracked on a dashboard the team actually uses
• Some DORA metrics tracked but not actively watched
• Not really tracked / vibes-based
• I don't know | +|4 |• Consistent upfront design step (ADR / spec / shared-understanding) before agent code
• Some engineers do it, others prompt straight into code
• No design step — agents are pointed at problems and turned loose
• I don't know | +|5 |• LLMs in product with offline evals + prod telemetry
• LLMs in dev loop with tracked metrics — any deliberate tracking counts (Asana, spreadsheet, sprint retro numbers, GitHub label analysis, etc.)
• LLMs used but purely gut-feel — no numbers anyone could point to
• No LLMs in product or dev loop
• I don't know | +|6 |• Worst-case agent scenarios have been red-teamed; rollback paths documented
• Some controls in place but no explicit red-teaming
• No red-teaming; agents share human-equivalent prod creds
• I don't know | +|7 |• Yes — list the repos
• No, scope is just the primary repo(s) you've found
• I don't know | + +## Behavior on each run + +1. **Read `docs/audits/CONFIG.md`** for an `## Org-level answers` section. +2. **If the section exists**, present each stored answer to the user verbatim, with the `last_updated` date, and ask: *"Still accurate? (yes / updated answer / I don't know)"*. For confirmation-or-refresh you may batch the stored answers into a single review message — that's a different mode from a fresh interview, because the user is *editing* known state rather than producing it cold. +3. **For any question without a stored answer** (or where the user said the stored answer is no longer accurate), conduct the fresh interview using the **one-question-at-a-time** rule above. +4. **For any question with no stored answer and no fresh answer either** (user says "I don't know"), record the answer as `unknown` in CONFIG.md and score the mapped criterion as `n/a` for this run. +5. **After scoring, write back** the confirmed/updated answers to `docs/audits/CONFIG.md` under `## Org-level answers`, with `last_updated: YYYY-MM-DD` set to the date of this run. If CONFIG.md doesn't exist, create a minimal version with just this section and add a line to *Notes for re-audit* recommending the user run `setup-agent-maturity-assessment` for full setup. + +## Questions to ask (verbatim, in order, one at a time) + +Before the first question, send a short framing message: *"I'm going to ask 7 quick questions one at a time — they cover the parts of the audit that aren't visible in the repo. 'I don't know' or 'n/a' is a valid answer to any of them and will mark that criterion as not assessed, not failed."* + +1. What AI tooling do engineers actually use day-to-day (Claude, Copilot, Cursor, etc.)? Is it company-paid with managed accounts, or are people using personal accounts or free tiers? Is there a documented policy on what data can be sent to third-party AI providers? +2. 
Do technical interviews allow candidates to use AI, and are interviewers trained to evaluate *how well* they use it (critique, decomposition, catching wrong outputs)? Or is AI either banned or effectively unassessed? +3. Are all four DORA metrics (deployment frequency, lead time, change failure rate, MTTR) actively tracked and visible to the team — e.g., a dashboard engineers actually look at? Or are some tracked in theory but not used? +4. When engineers hand work to AI agents, is there a consistent upfront design step (ADR, shared-understanding session, spec) before code generation? Or is it ad hoc — some engineers do it, others prompt straight into code? +5. Are LLMs in the product (user-facing features), in the dev loop only, or both? If in the product: is there an offline eval suite plus production telemetry? If dev-loop only: is AI impact tracked deliberately — even a spreadsheet, Asana board, or sprint retro metric counts — or is it purely gut-feel with no numbers anyone could point to? +6. Has anyone explicitly red-teamed a worst-case agent scenario in prod (bad migration, runaway infra change, secret exfiltration)? Are rollback paths for agent-triggered writes documented? +7. Are there adjacent repos I should treat as in-scope that automated detection might miss — e.g., an internal handbook, security/IT policy repo, org-wide `.github` repo, shared skill library? + +## Internal mapping (for scoring — do not show to the user) + +|Q#|Criterion |How to combine with repo evidence | +|--|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| +|1 |C8 — Sanctioned AI tooling |Primary signal. Cross-check `/.github` policies if reachable. | +|2 |D12 — Judgment under AI augmentation |Primary signal. Cross-check rubric repo if reachable. | +|3 |A2 — Sub-day integration cadence |Combine with `gh pr list` / `gh run list` evidence. 
Repo evidence covers cadence; interview covers metric *visibility*. | +|4 |B5 — Design discipline |Combine with ADR / glossary file evidence. Files prove artifacts exist; interview proves design happens *before* code. | +|5 |C10 — Evals for AI-touched code paths|Repo evidence covers product-side evals (`evals/`, `benchmarks/`); interview covers dev-loop measurement, which rarely lives in the repo. | +|6 |C11 — Blast-radius controls |Combine with OIDC / IAM / branch-protection grep evidence. Files prove technical posture; interview proves the scenario has been thought through.| +|7 |Scope expansion |Merge into the adjacent-repo detection list before evidence gathering. Not a scored criterion. | + +If the user answers "I don't know" to any question, score the mapped criterion as `n/a`, exclude it from numerator and max, and add a line to *Notes for re-audit* in the audit output describing exactly what info would resolve it. + +## CONFIG.md storage format + +Append to or create `docs/audits/CONFIG.md`: + +```markdown +## Org-level answers + +last_updated: 2026-05-02 + +### AI tooling (Q1) + + +### Hiring (Q2) + + +### DORA visibility (Q3) + + +### Design before code (Q4) + + +### Eval coverage (Q5) + + +### Blast-radius red-teaming (Q6) + + +### Out-of-band adjacent repos (Q7) + +``` + +Use `unknown` as the answer text when the user said "I don't know". Do not delete previous answers — update in place so the file's git history shows movement over time. diff --git a/share/skills/agent-maturity-assessment/references/output-template.md b/share/skills/agent-maturity-assessment/references/output-template.md new file mode 100644 index 0000000..6f44d2a --- /dev/null +++ b/share/skills/agent-maturity-assessment/references/output-template.md @@ -0,0 +1,102 @@ +# Audit output template + +Read this when running step 7 of *How to run an audit* in `SKILL.md`. Always produce this exact structure. 
The per-criterion tables ARE the report — they should be readable in one pass, especially when comparing audits across multiple repos. + +## Rules for filling out the score tables + +- Fill in every row. Use `n/a` with a one-line reason if an item genuinely doesn’t apply to the scope or the user marked the corresponding Phase 1 answer as unknown (then exclude that item from both numerator and max in the score math). +- The *Why this score* column is **one sentence, ≤ 25 words**. State the single most decisive piece of evidence — the thing that pushed the score up or down. No bullet lists, no multi-clause sentences stitched with semicolons, no “but also” hedging. +- If you have more to say, save it for *Top 3 fixes*, *Strengths to preserve*, or *Notes for re-audit*. The table is for the verdict, not the working. +- Score in the column as `0`, `0.5`, `1`, or `n/a` — nothing else. + +## Template + +```markdown +# Agent Maturity Assessment — + +## Summary +- Raw score: X / 12 +- Weighted score: XX.X% +- Band: **** () +- Evidence tier: **<1: gh / 2: GitHub MCP / 3: git-only>** (see references/preflight.md) +- One-line take: + +### Maturity scale (where this audit lands) + +| Band | % range | This audit | +|------|---------|:----------:| +| Excellent | 90%+ | | +| Healthy | 75–89% | | +| Functional but slow | 60–74% | | +| Significant dysfunction | 40–59% | | +| Triage | <40% | | + +Mark the row this audit falls in with `◉` in the right column; leave the others blank. This makes relative position visible at a glance and survives copy-paste to Slack / a doc / a slide. + +## Scores + +### A. Engineering basics (weight 1.0×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 1 | Reproducible dev environments | 0/0.5/1 | | +| 2 | Sub-day integration cadence with measured outcomes | 0/0.5/1 | | +| 3 | Testability and agent inner loop | 0/0.5/1 | | +| 4 | Observability before features | 0/0.5/1 | | + +Subtotal: X.X × 1.00 = X.X / 4.00 + +### B. 
Knowledge & context (weight 1.5×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 5 | Design discipline as a practice | 0/0.5/1 | | +| 6 | Codebase composed of deep modules | 0/0.5/1 | | +| 7 | Repo-local agent context | 0/0.5/1 | | + +Subtotal: X.X × 1.50 = X.X / 4.50 + +### C. AI governance & quality (weight 1.25×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 8 | Sanctioned, governed AI tooling | 0/0.5/1 | | +| 9 | Human review on every PR | 0/0.5/1 | | +| 10 | Evals for AI-touched code paths | 0/0.5/1 | | +| 11 | Blast-radius controls for agents | 0/0.5/1 | | + +Subtotal: X.X × 1.25 = X.X / 5.00 + +### D. Hiring (weight 1.0×) +| # | Item | Score | Why this score | +|---|------|-------|----------------| +| 12 | Judgment under AI augmentation | 0/0.5/1 | | + +Subtotal: X.X × 1.00 = X.X / 1.00 + +## Top 3 fixes (highest leverage) +1. **** — why this one, what good looks like, suggested owner. +2. **** — … +3. **** — … + +## Strengths to preserve +- +- + +## Adjacent repos consulted +- `/` — +- `/` — … + +(If none: write "None — all evidence within scope repo.") + +## Notes for re-audit +- +- +``` + +## Worked example of a “Why this score” cell + +Do not include this in actual audits — it’s a calibration example for getting the cell length right. + +|Quality |Cell content | +|----------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +|Too long |`pnpm -r test resolves to nothing — no package implements test. ci.yml line 80: dotnet test \|\| true with comment 'no real tests yet'. Zero test files anywhere. 
Architecture is testable in principle but the inner loop runs nothing.`| +|Too vague |`No tests exist.` | +|Right size|`CI runs dotnet test \|\| true, no test files exist anywhere, and the architecture's seams sit unused.` | \ No newline at end of file diff --git a/share/skills/agent-maturity-assessment/references/preflight.md b/share/skills/agent-maturity-assessment/references/preflight.md new file mode 100644 index 0000000..74a02b8 --- /dev/null +++ b/share/skills/agent-maturity-assessment/references/preflight.md @@ -0,0 +1,107 @@ +# Environment preflight & multi-repo scope + +Read this when running steps 2 (preflight) and 4 (adjacent repo mapping) of *How to run an audit* in `SKILL.md`. + +## Environment preflight + +**First, read `docs/audits/CONFIG.md` if it exists.** That file is scaffolded by the `setup-agent-maturity-assessment` skill and declares the GitHub auth method, the canonical org/repo/branch, the pre-approved list of adjacent repos in scope, and the audit cadence. When it’s present, use its declared values as the source of truth — skip the runtime probes below for the parts CONFIG.md already answers, and treat the runtime probes as drift-detection only. + +If CONFIG.md is **missing** or its declared auth method fails the probe (e.g. CONFIG says “gh” but `gh auth status` errors), fall back to the full preflight below and surface the gap in *Notes for re-audit* so the user can re-run `setup-agent-maturity-assessment` later. + +The diagnostic commands assume `gh` CLI is in `$PATH` and authenticated. In a sandboxed runtime (e.g. Cowork) this is often not true even if `gh` is installed on the host. 
Run this preflight before scoring and select the tier: + +```bash +# Tier 1 — gh CLI authenticated → highest fidelity (full GitHub API access) +command -v gh >/dev/null 2>&1 && gh auth status >/dev/null 2>&1 && echo "tier=1 gh" + +# Tier 2 — GitHub MCP server connected → equivalent fidelity via MCP tools +# (Detect via host capabilities; in Claude Code, look for tools named like +# list_pull_requests, get_workflow_runs, get_branch_protection.) + +# Tier 3 — git + filesystem only → reduced fidelity +git -C . rev-parse --is-inside-work-tree >/dev/null 2>&1 && echo "tier=3 git-only" +``` + +### Tier behavior + +|Tier |Available |Use for | +|------------------------|--------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------| +|1. `gh` authenticated |All `gh pr list`, `gh api`, `gh run list` commands|Default. Highest-fidelity audits. | +|2. GitHub MCP |Equivalent MCP-routed tools |Use when running in a sandbox where `gh` isn’t on the host but a GitHub MCP is connected. | +|3. git + filesystem only|`git log`, `find`, `grep` |Fallback. Items 2, 3, 9, 11 score against approximations (merge commits as PR proxies, no branch-protection visibility, no review-depth metrics).| + +**At Tier 3, the audit MUST:** + +- State “Tier 3 (git-only) audit — limited GitHub-side evidence” in the Summary’s *One-line take*. +- Add an entry to *Notes for re-audit* listing which items were scored against fallback evidence and what to re-verify when running at Tier 1. +- Never auto-promote a Tier 3 score to 1.0 on items 2, 3, 9, or 11 — the missing GitHub-side data could pull them down. Cap those at 0.5 unless filesystem evidence alone is sufficient. + +**To upgrade Tier 3 → Tier 2 in Cowork (or any sandbox):** add a GitHub MCP server. 
Cowork’s curated MCP registry doesn’t currently bundle one, so add it as a custom MCP via Settings → MCP Servers, pointing at GitHub’s official `github/github-mcp-server` (remote-hostable) or Anthropic’s reference implementation. Auth flows through your GitHub OAuth/PAT scoped to the orgs you want to audit — no creds touch the sandbox. + +### Optional — host-side probe script + +When the sandbox is stuck at Tier 3 but the user has `gh` on their host, ask them to run this and paste the output back. The audit can incorporate the results without any creds entering the sandbox. + +```bash +#!/usr/bin/env bash +# audit-gh-probe.sh — run on host, paste output to Claude +set -euo pipefail +REPO="${1:?usage: audit-gh-probe.sh <owner/repo>}" +SINCE="$(date -d '90 days ago' +%Y-%m-%d 2>/dev/null || date -v-90d +%Y-%m-%d)" + +echo "### gh-pr-list (cadence + lead time + review depth) ###" +gh pr list --repo "$REPO" --state merged --limit 200 \ + --search "merged:>$SINCE" \ + --json number,mergedAt,createdAt,additions,deletions,reviews,author + +echo "### gh-branch-protection ###" +gh api "repos/$REPO/branches/$(gh repo view "$REPO" --json defaultBranchRef --jq .defaultBranchRef.name)/protection" 2>&1 || true + +echo "### gh-environments ###" +gh api "repos/$REPO/environments" --jq '.environments[] | {name, has_protection: (.protection_rules | length > 0)}' 2>&1 || true + +echo "### gh-deploy-runs ###" +gh run list --repo "$REPO" --workflow=deploy --limit 100 \ + --json conclusion,createdAt,name 2>&1 || true + +echo "### gh-ci-runs (flake/fail rate) ###" +gh run list --repo "$REPO" --workflow=ci.yml --limit 50 \ + --json conclusion 2>&1 || true +``` + +## Handling multi-repo scope + +A real engineering org doesn’t fit in one repo. CI workflow templates, Terraform/OpenTofu modules, QA / E2E suites, runbooks and dashboards, and shared agent-context skill libraries frequently live in adjacent repos. Auditing only the primary repo under-scores items that depend on those external sources. 
+ +**If `docs/audits/CONFIG.md` exists, use its `## Adjacent repos` table as the seed list** — those repos are already approved to be in scope. Re-run the detection commands below only as **drift detection** to catch new adjacent repos that have been added since the last setup. Surface any new findings in the audit’s *Adjacent repos consulted* section and recommend a re-run of `setup-agent-maturity-assessment` if the list has grown. + +If CONFIG.md is missing, run the full detection from scratch. + +### Detection — run these from the primary repo before scoring + +```bash +# 1. External GitHub Actions referenced from this repo's workflows +grep -rhE "uses:\s*[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" .github/workflows/ 2>/dev/null \ + | grep -oE "[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+(@[a-zA-Z0-9_.-]+)?" | sort -u + +# 2. Terraform / OpenTofu modules sourced from external Git +grep -rhE "source\s*=\s*\".*\"" infra/ terraform/ 2>/dev/null \ + | grep -E "git::|github\.com/" | sort -u + +# 3. Submodules +git submodule status 2>/dev/null + +# 4. Generic cross-repo references in docs and scripts +grep -rEh "github\.com/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" \ + docs/ scripts/ .github/ README.md 2>/dev/null \ + | grep -oE "github\.com/[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+" | sort -u +``` + +### For each adjacent repo discovered + +- Score the relevant criterion *across both repos*. Examples: if reusable workflows live in `<org>/ci-templates`, item #2 (cadence) and item #9 (review) evidence comes from both. If Terraform modules live in `<org>/infra-modules`, item #11 (blast-radius) needs both. +- Use `gh repo view <org>/<repo>` and targeted `gh api`/`gh search` calls to inspect — don’t clone unless necessary. +- If access is blocked (private repo, no permission), score against what’s visible and flag in *Notes for re-audit*. +- List every adjacent repo consulted in the audit’s *Adjacent repos consulted* section so a re-auditor can reproduce. 
+ +**Org-level criteria (#8 governance, #12 hiring) are inherently outside any one repo.** Look for them in the `<org>/.github` policy repo, the internal handbook, or IT/security docs. If you can’t reach those, mark `n/a` with the reason. Phase 1 question 7 is intended to surface these out-of-band sources from the human before evidence gathering. \ No newline at end of file diff --git a/tui/assess_coverage_test.go b/tui/assess_coverage_test.go new file mode 100644 index 0000000..d3915bb --- /dev/null +++ b/tui/assess_coverage_test.go @@ -0,0 +1,473 @@ +package main + +import ( + "flag" + "os" + "path/filepath" + "strings" + "testing" + + tea "github.com/charmbracelet/bubbletea" +) + +// --------------------------------------------------------------------------- +// Coverage helpers: bump the lines that the basic unit tests didn't already +// reach. These tests focus on functions that are pure / easily exercised +// without a real TTY — Init() commands, View() with a mounted form, +// reflow / window-size handling, flag application, validators. 
+// --------------------------------------------------------------------------- + +func TestAssessProgress_InitReturnsCmd(t *testing.T) { + m := newProgressForTest() + cmd := m.Init() + if cmd == nil { + t.Error("Init should return a Bubble Tea cmd") + } +} + +func TestAssessProgress_FitLineTruncatesLongInput(t *testing.T) { + m := newProgressForTest() + m.viewport.Width = 12 + long := strings.Repeat("a", 100) + got := m.fitLine(long) + if !strings.HasSuffix(got, "…") { + t.Errorf("fitLine should append ellipsis on truncation, got %q", got) + } +} + +func TestAssessProgress_FitLineHandlesEmptyWidth(t *testing.T) { + m := newProgressForTest() + m.viewport.Width = 0 + got := m.fitLine(strings.Repeat("b", 50)) + if got == "" { + t.Error("fitLine should still produce output when width is 0") + } +} + +func TestAssessProgress_HintsTextSwitchesWithInterview(t *testing.T) { + m := newProgressForTest() + if got := m.hintsText(); !strings.Contains(got, "ctrl+c") { + t.Errorf("default hints should mention ctrl+c, got %q", got) + } + // Mount a fake interview form + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q1", + QuestionText: "?", + Options: []string{"a"}, + }) + m = updated.(assessProgressModel) + if got := m.hintsText(); !strings.Contains(got, "navigate") { + t.Errorf("interview hints should mention navigate, got %q", got) + } +} + +func TestAssessProgress_RenderInterviewPanelNonEmpty(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + QuestionID: "q1", + QuestionText: "?", + Options: []string{"a", "b"}, + }) + m = updated.(assessProgressModel) + if got := m.renderInterviewPanel(); got == "" { + t.Error("renderInterviewPanel should produce non-empty output when form is mounted") + } +} + +func TestAssessProgress_ViewWithInterviewShowsBothPanes(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + 
QuestionID: "q1", + QuestionText: "Hello?", + Options: []string{"yes", "no"}, + }) + m = updated.(assessProgressModel) + view := m.View() + if !strings.Contains(view, "Assessment Setup") { + t.Error("View() during interview should still render the right-pane summary") + } +} + +func TestAssessProgress_WindowSizeReflowsWithMountedForm(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{ + Type: "interview-question", + Options: []string{"a"}, + }) + m = updated.(assessProgressModel) + updated2, _ := m.Update(tea.WindowSizeMsg{Width: 120, Height: 40}) + final := updated2.(assessProgressModel) + if final.width != 120 { + t.Errorf("width = %d, want 120", final.width) + } +} + +func TestAssessProgress_FailedStepRenders(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "active"}) + m = updated.(assessProgressModel) + updated, _ = m.handleStep(GenericEvent{Type: "progress", Step: "preflight", Status: "failed", Message: "gh missing"}) + m = updated.(assessProgressModel) + view := m.View() + if !strings.Contains(view, "gh missing") { + t.Error("View should include the failure message in the step list") + } +} + +func TestAssessProgress_ReportDataEventStoresJSON(t *testing.T) { + m := newProgressForTest() + updated, _ := m.handleStep(GenericEvent{ + Type: "report-data", + Data: []byte(`{"foo":1}`), + }) + final := updated.(assessProgressModel) + if final.jsonData != `{"foo":1}` { + t.Errorf("jsonData = %q", final.jsonData) + } +} + +// --------------------------------------------------------------------------- +// assess_preview +// --------------------------------------------------------------------------- + +func TestAssessPreview_InitReturnsCmd(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + if cmd := m.Init(); cmd == nil { + t.Error("Init should return a Bubble Tea cmd") + } +} + +func TestAssessPreview_ReflowSetsViewportDimensions(t 
*testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + m.width = 120 + m.height = 50 + m.reflow() + if m.viewports[assessTabAudit].Width <= 0 { + t.Error("reflow should set viewport width") + } + if m.viewports[assessTabAudit].Height <= 0 { + t.Error("reflow should set viewport height") + } +} + +func TestAssessPreview_PreviewFrameHeightMinimum(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + m.height = 5 // smaller than the 11-line shell budget + if h := m.previewFrameHeight(); h < 10 { + t.Errorf("previewFrameHeight should floor at 10, got %d", h) + } + m.height = 100 + if h := m.previewFrameHeight(); h <= 10 { + t.Errorf("previewFrameHeight at height=100 should grow, got %d", h) + } +} + +func TestAssessPreview_UpdateProcessesRenderedMsg(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + msg := assessRenderedMsg{rendered: [assessTabCount]string{"audit-content", "evidence-content", "json-content"}} + updated, _ := m.Update(msg) + final := updated.(assessPreviewModel) + if final.rendering { + t.Error("rendering should be false after content rendered") + } +} + +func TestAssessPreview_UpdateTabSwitching(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", `{"items":[]}`) + // First land the rendered msg so the model isn't in rendering state. 
+ updated, _ := m.Update(assessRenderedMsg{}) + m = updated.(assessPreviewModel) + // Tab right + updated, _ = m.Update(tea.KeyMsg{Type: tea.KeyTab}) + m = updated.(assessPreviewModel) + if m.activeTab != assessTabEvidence { + t.Errorf("after Tab, activeTab = %d, want %d", m.activeTab, assessTabEvidence) + } + updated, _ = m.Update(tea.KeyMsg{Type: tea.KeyTab}) + m = updated.(assessPreviewModel) + if m.activeTab != assessTabJSON { + t.Errorf("after second Tab, activeTab = %d, want %d", m.activeTab, assessTabJSON) + } + updated, _ = m.Update(tea.KeyMsg{Type: tea.KeyTab}) + m = updated.(assessPreviewModel) + if m.activeTab != assessTabAudit { + t.Errorf("Tab should wrap around to Audit, got %d", m.activeTab) + } + // Shift+tab wraps backwards + updated, _ = m.Update(tea.KeyMsg{Type: tea.KeyShiftTab}) + m = updated.(assessPreviewModel) + if m.activeTab != assessTabJSON { + t.Errorf("Shift+Tab should wrap to JSON, got %d", m.activeTab) + } +} + +func TestAssessPreview_QKeyQuits(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + _, cmd := m.Update(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune{'q'}}) + if cmd == nil { + t.Error("q should produce a tea.Quit cmd") + } +} + +func TestAssessPreview_WindowResizeReRenders(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + updated, _ := m.Update(assessRenderedMsg{}) // exit rendering state + m = updated.(assessPreviewModel) + updated2, cmd := m.Update(tea.WindowSizeMsg{Width: 130, Height: 50}) + final := updated2.(assessPreviewModel) + if !final.rendering { + t.Error("resize after initial render should kick off a re-render") + } + if cmd == nil { + t.Error("resize should return a cmd to drive the re-render") + } +} + +func TestAssessPreview_ViewWhileRendering(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + m.width = 100 + m.height = 30 + view := m.View() + if !strings.Contains(view, "Rendering audit") { + t.Errorf("View while rendering should show spinner message, got: 
%s", view) + } +} + +func TestAssessPreview_ViewAfterRender(t *testing.T) { + m := newAssessPreviewModel("/no/such.md", "", "") + m.width = 100 + m.height = 30 + updated, _ := m.Update(assessRenderedMsg{rendered: [assessTabCount]string{"audit", "evidence", "json"}}) + m = updated.(assessPreviewModel) + view := m.View() + if !strings.Contains(view, "Audit Ready") { + t.Errorf("View after render should show Audit Ready title, got: %s", view) + } +} + +func TestAssessPreview_RunReturnsErrorFromTeaProgramRun(t *testing.T) { + origTPR := teaProgramRun + t.Cleanup(func() { teaProgramRun = origTPR }) + // Stub teaProgramRun to return immediately + teaProgramRun = func(p *tea.Program) (tea.Model, error) { + return newAssessPreviewModel("/no/such.md", "", ""), nil + } + if err := RunAssessPreview("/no/such.md", "", ""); err != nil { + t.Errorf("RunAssessPreview should succeed with stubbed tea, got: %v", err) + } +} + +// --------------------------------------------------------------------------- +// assess_wizard +// --------------------------------------------------------------------------- + +func TestAssessWizard_InitReturnsCmd(t *testing.T) { + m := newAssessWizardForTest("local-repo") + cmd := m.Init() + if cmd == nil { + t.Error("Init should return a form-init cmd") + } +} + +func TestAssessWizard_WindowSizeUpdatesWidth(t *testing.T) { + m := newAssessWizardForTest("local-repo") + updated, _ := m.Update(tea.WindowSizeMsg{Width: 140, Height: 50}) + final := updated.(*assessWizardModel) + if final.width != 140 { + t.Errorf("width = %d, want 140", final.width) + } +} + +func TestValidateLocalPath(t *testing.T) { + t.Run("empty rejects", func(t *testing.T) { + if err := validateLocalPath(""); err == nil { + t.Error("empty path should error") + } + }) + t.Run("nonexistent rejects", func(t *testing.T) { + if err := validateLocalPath("/no/such/dir/here"); err == nil { + t.Error("nonexistent path should error") + } + }) + t.Run("file rejects", func(t *testing.T) { + tmp := 
t.TempDir() + p := filepath.Join(tmp, "file.txt") + _ = os.WriteFile(p, []byte("hi"), 0o600) + if err := validateLocalPath(p); err == nil { + t.Error("regular file should error (must be a directory)") + } + }) + t.Run("directory accepts", func(t *testing.T) { + tmp := t.TempDir() + if err := validateLocalPath(tmp); err != nil { + t.Errorf("temp dir should pass: %v", err) + } + }) +} + +func TestDefaultScopeMode(t *testing.T) { + cfg := &AssessConfig{} + if got := defaultScopeMode(cfg, "/tmp"); got != "local-repo" { + t.Errorf("default = %q, want local-repo", got) + } + cfg.Scope.Mode = "org" + if got := defaultScopeMode(cfg, "/tmp"); got != "org" { + t.Errorf("preserves existing mode, got %q", got) + } +} + +func TestDefaultLocalPath(t *testing.T) { + cfg := &AssessConfig{} + if got := defaultLocalPath(cfg, "/tmp/cwd"); got != "/tmp/cwd" { + t.Errorf("default = %q, want /tmp/cwd", got) + } + cfg.Scope.LocalPath = "/preset" + if got := defaultLocalPath(cfg, "/tmp/cwd"); got != "/preset" { + t.Errorf("preserves existing path, got %q", got) + } +} + +func TestAssessWizard_GoBackEmptyHistory(t *testing.T) { + m := newAssessWizardForTest("local-repo") + updated, cmd := m.Update(tea.KeyMsg{Type: tea.KeyEsc}) + final := updated.(*assessWizardModel) + if !final.aborted { + t.Error("esc with empty history should abort") + } + if cmd == nil { + t.Error("esc should return a tea.Quit cmd") + } +} + +// --------------------------------------------------------------------------- +// assess_flags +// --------------------------------------------------------------------------- + +func TestApplyAssessFlagsTo_AllFields(t *testing.T) { + // Build a fresh flag.FlagSet so this test doesn't depend on global state. + // We rely on the package-level flagAssess* vars but pass our own wasSet. + cfg := AssessConfig{} + + // Pre-populate the global flag vars (they're already declared at package + // scope as flag.String/Bool results — we can write through the pointers). 
+ *flagAssessScopeMode = "org" + *flagAssessOrg = "acme" + *flagAssessRepos = "frontend,backend" + *flagAssessPath = "/tmp/repo" + *flagAssessDisplayName = "acme-engineering" + *flagAssessTier = "gh" + *flagAssessAnswers = "/tmp/answers.json" + *flagAssessOutput = "/tmp/audit.md" + *flagAssessOutputFormat = "markdown" + *flagAssessDryRun = true + *flagAssessFlushCache = true + + // Reset to defaults at end so other tests aren't affected. + t.Cleanup(func() { + *flagAssessScopeMode = "" + *flagAssessOrg = "" + *flagAssessRepos = "" + *flagAssessPath = "" + *flagAssessDisplayName = "" + *flagAssessTier = "" + *flagAssessAnswers = "" + *flagAssessOutput = "" + *flagAssessOutputFormat = "" + *flagAssessDryRun = false + *flagAssessFlushCache = false + }) + + // Simulate all flags set. + allSet := func(name string) bool { return true } + applyAssessFlagsTo(&cfg, allSet) + + if cfg.Scope.Mode != "org" { + t.Errorf("Mode = %q", cfg.Scope.Mode) + } + if cfg.Scope.Org != "acme" { + t.Errorf("Org = %q", cfg.Scope.Org) + } + if got := cfg.Scope.Repos; len(got) != 2 || got[0] != "frontend" || got[1] != "backend" { + t.Errorf("Repos = %v", got) + } + if cfg.Scope.LocalPath != "/tmp/repo" { + t.Errorf("LocalPath = %q", cfg.Scope.LocalPath) + } + if cfg.Scope.DisplayName != "acme-engineering" { + t.Errorf("DisplayName = %q", cfg.Scope.DisplayName) + } + if cfg.EvidenceTier != "gh" { + t.Errorf("EvidenceTier = %q", cfg.EvidenceTier) + } + if cfg.InterviewAnswersPath != "/tmp/answers.json" { + t.Errorf("InterviewAnswersPath = %q", cfg.InterviewAnswersPath) + } + if cfg.OutputPath != "/tmp/audit.md" { + t.Errorf("OutputPath = %q", cfg.OutputPath) + } + if cfg.OutputFormat != "markdown" { + t.Errorf("OutputFormat = %q", cfg.OutputFormat) + } + if !cfg.DryRun { + t.Error("DryRun should be true") + } + if !cfg.FlushCache { + t.Error("FlushCache should be true") + } +} + +func TestApplyAssessFlagsTo_NoneSet(t *testing.T) { + cfg := AssessConfig{Scope: AssessScope{Mode: "preserved", Org: 
"preserved-org"}} + noneSet := func(name string) bool { return false } + applyAssessFlagsTo(&cfg, noneSet) + if cfg.Scope.Mode != "preserved" || cfg.Scope.Org != "preserved-org" { + t.Errorf("none-set should preserve existing values, got: %+v", cfg.Scope) + } +} + +// --------------------------------------------------------------------------- +// assess_progress.View() with mounted form during reflow / window resize +// --------------------------------------------------------------------------- + +func TestAssessProgress_ContentWidthMinimum(t *testing.T) { + m := newProgressForTest() + m.width = 30 + w := m.contentWidth() + if w < 20 { + t.Errorf("contentWidth should floor at 20, got %d", w) + } +} + +func TestAssessProgress_FormWidthScales(t *testing.T) { + m := newProgressForTest() + m.width = 120 + w := m.formWidth() + if w < 32 { + t.Errorf("formWidth should be at least 32, got %d", w) + } +} + +// --------------------------------------------------------------------------- +// SaveAssessConfig directory creation branch +// --------------------------------------------------------------------------- + +func TestSaveAssessConfig_CreatesDirectoryStructure(t *testing.T) { + dir := t.TempDir() + t.Setenv("XDG_CONFIG_HOME", dir) + cfg := AssessConfig{Scope: AssessScope{Mode: "local-repo", LocalPath: "/", DisplayName: "x"}} + if err := SaveAssessConfig(&cfg); err != nil { + t.Fatalf("SaveAssessConfig: %v", err) + } + if _, err := os.Stat(filepath.Join(dir, "teamhero", "assess-config.json")); err != nil { + t.Errorf("config file missing: %v", err) + } +} + +// Make sure the flag package has been parsed at least once so flag.Visit +// doesn't choke (some Go versions assert state before Visit). 
+func TestMain_FlagPackageState(t *testing.T) { + _ = flag.CommandLine +} From 99dc3a9df5a768748ca0c5da1ac073a86fe86461 Mon Sep 17 00:00:00 2001 From: Asa Baylus Date: Sun, 10 May 2026 21:31:56 -0400 Subject: [PATCH 5/6] docs(assess): full feature reference + shareable-skill install guide MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds two new docs and wires them into the existing index: - docs/MATURITY_ASSESSMENT.md — complete user-facing reference for `teamhero assess`: what gets scored (12 items, 4 weighted categories, bands), the two ways to run (interactive TUI / headless), full Phase-1 interview table with Q→criterion mapping, answers.json shape, evidence tiers, every CLI flag, scoring pipeline, output structure, env vars, re-audit cadence, troubleshooting - share/skills/agent-maturity-assessment/INSTALL.md — install guide for the standalone harness-agnostic skill bundle: covers Claude Code (~/.claude/skills/), Cowork / Workbench, custom SDK harnesses, verifying the install, customizing rubric/triggers/interview, and attribution Cross-links: - README.md — links to docs/MATURITY_ASSESSMENT.md in the assess section and the Learn-more list; adds a "Shareable maturity- assessment skill" subsection that points at the share/skills/ bundle - docs/ARCHITECTURE.md — adds a "MaturityService" key-components entry summarizing the pipeline and linking to the full reference No code changes. CI should still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 13 +- docs/ARCHITECTURE.md | 20 ++ docs/MATURITY_ASSESSMENT.md | 233 ++++++++++++++++++ .../agent-maturity-assessment/INSTALL.md | 115 +++++++++ 4 files changed, 380 insertions(+), 1 deletion(-) create mode 100644 docs/MATURITY_ASSESSMENT.md create mode 100644 share/skills/agent-maturity-assessment/INSTALL.md diff --git a/README.md b/README.md index d6587ca..bd6e1e8 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ teamhero assess --headless --path . 
--dry-run | `--dry-run` | Skip the AI scorer; emit a placeholder audit | | `--show-assess-config` | Print saved configuration as JSON and exit | -Run `teamhero assess --help` for the full list. +Run `teamhero assess --help` for the full list, or read the [**full maturity assessment reference**](docs/MATURITY_ASSESSMENT.md) for everything — every flag, all 7 interview questions, evidence tiers, output format, troubleshooting. ### How the score is built @@ -230,6 +230,7 @@ Run `teamhero assess --help` for the full list. ## Learn more - [Configuration Reference](docs/CONFIG_FORMAT.md) — all settings, credentials, and user identity mapping +- [Maturity Assessment Reference](docs/MATURITY_ASSESSMENT.md) — full `teamhero assess` docs: rubric, interview, tiers, output, troubleshooting - [Architecture Overview](docs/ARCHITECTURE.md) — how the system works under the hood --- @@ -275,6 +276,16 @@ claude plugin install teamhero-scripts@teamhero In Cowork, the plugin uses MCP connectors for GitHub and Asana (OAuth-based, no API tokens to manage). +### Shareable maturity-assessment skill + +The Agent Maturity Assessment is also packaged as a **standalone, harness-agnostic skill** you can drop into any Claude environment (Claude Code, Cowork, Workbench, custom SDK harness) without installing TeamHero itself: + +``` +share/skills/agent-maturity-assessment/ ← copy this folder to ~/.claude/skills/ +``` + +See [`share/skills/agent-maturity-assessment/INSTALL.md`](share/skills/agent-maturity-assessment/INSTALL.md) for installation steps for each harness. The skill works in pure-Claude mode by default and uses the `teamhero` binary as an optional accelerator when it's installed. 
+ ### Further reading - [Distribution & Release Process](docs/DISTRIBUTION.md) diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md index a33c449..40bd960 100644 --- a/docs/ARCHITECTURE.md +++ b/docs/ARCHITECTURE.md @@ -296,6 +296,26 @@ class AsanaService { - Comment collection - Redirect handling for Asana API +### MaturityService (src/services/maturity/maturity.service.ts) + +Orchestrates the Agent Maturity Assessment — a parallel feature to `ReportService` that scores an engineering org against a 12-criterion rubric. + +```typescript +class MaturityService { + run(input: AssessCommandInput): Promise; +} +``` + +**Pipeline:** +1. **Preflight** — auto-detects evidence tier (`gh` CLI / GitHub MCP / git-only) +2. **Adjacent repos** — scans workflow `uses:`, Terraform sources, submodules, README cross-refs +3. **Phase-1 interview** — 7 verbatim questions over a bidirectional JSON-lines stdin protocol (interactive mode) or pre-supplied via JSON file (headless mode) +4. **Evidence collection** — 12 deterministic detectors, one per criterion (`MaturityProvider` port in `src/core/types.ts`) +5. **AI scoring** — OpenAI Responses API with `text.format.json_schema` strict mode; tier-3 caps on items 2/3/9/11 enforced post-hoc +6. **Audit writer** — renders canonical-template markdown + JSON sidecar; round-trips confirmed interview answers to `docs/audits/CONFIG.md` + +The hardcoded rubric lives in `src/services/maturity/rubric.ts` (`RUBRIC_VERSION` participates in the cache key). The TUI integration is in `tui/assess_*.go` and uses the same framed two-pane layout as the report flow. See [Maturity Assessment Reference](MATURITY_ASSESSMENT.md) for full details. 
+ --- ## Configuration diff --git a/docs/MATURITY_ASSESSMENT.md b/docs/MATURITY_ASSESSMENT.md new file mode 100644 index 0000000..e2834cb --- /dev/null +++ b/docs/MATURITY_ASSESSMENT.md @@ -0,0 +1,233 @@ +# Agent Maturity Assessment + +Score an engineering organization against the 12-criterion **Agent Maturity Assessment** — a diagnostic for whether an org is ready to ship safely with humans and agents working in parallel on a codebase that doesn't degrade with every iteration. + +```bash +teamhero assess +``` + +The deliverable is a written audit (`teamhero-maturity--.md`) plus a `.json` sidecar. The audit shows a per-category score table, a weighted percentage, a maturity band, the top-3 fixes, strengths to preserve, and notes for re-audit. + +--- + +## What gets scored + +Twelve items across four weighted categories: + +| # | Item | Category | Weight | +|---|------|----------|:---:| +| 1 | Reproducible dev environments | A. Engineering basics | 1.0× | +| 2 | Sub-day integration cadence with measured outcomes | A. Engineering basics | 1.0× | +| 3 | Testability and the agent inner loop | A. Engineering basics | 1.0× | +| 4 | Observability before features | A. Engineering basics | 1.0× | +| 5 | Design discipline as a first-class practice | B. Knowledge & context | **1.5×** | +| 6 | Codebase composed of deep modules | B. Knowledge & context | **1.5×** | +| 7 | Repo-local agent context | B. Knowledge & context | **1.5×** | +| 8 | Sanctioned, governed AI tooling | C. AI governance & quality | 1.25× | +| 9 | Human review on every PR | C. AI governance & quality | 1.25× | +| 10 | Evals for AI-touched code paths | C. AI governance & quality | 1.25× | +| 11 | Blast-radius controls for agent actions | C. AI governance & quality | 1.25× | +| 12 | Interviews assess judgment under AI augmentation | D. Hiring | 1.0× | + +Each item scores **1.0** (pass), **0.5** (partial), **0.0** (fail), or **n/a** (genuinely doesn't apply / unknowable from context). 
Weighted total max: **14.5**. + +### Maturity bands + +| Band | Range | Interpretation | +|---|---|---| +| **Excellent** | 90%+ | Rare. Confirm with a second pass — first audits often over-score. | +| **Healthy** | 75–89% | Targeted fixes will compound. | +| **Functional but slow** | 60–74% | Real risk of being out-shipped by AI-native competitors. | +| **Significant dysfunction** | 40–59% | Treat as a turnaround. | +| **Triage** | <40% | Stop new feature work until basics are in. | + +The bar to clear: **≥11/12 raw and ≥80% weighted.** + +--- + +## Two ways to run + +### Interactive TUI + +```bash +teamhero assess +``` + +Same framed two-pane layout as `teamhero report`: +1. **Scope wizard** — pick local repo / GitHub org / both, set display name, confirm. +2. **Progress display** — step list with ✔/✖/○ icons, monotonic progress bar, right-pane configuration summary, the same Bubble Tea program throughout. +3. **Phase-1 interview** — the 7 questions appear **one at a time** as a `huh` select in the left pane (the right pane keeps showing the config summary). Each has a small set of pre-written options plus an "Other (type your own)" free-text option. `I don't know` maps to `n/a` for the linked criterion. +4. **Audit preview** — tabbed Glamour-rendered viewer (Audit / Evidence / JSON Data). + +### Headless / scripted + +```bash +# Smoke test (no OpenAI call — placeholder scores) +teamhero assess --headless --path . --dry-run + +# Real audit of the current repo, with interview answers supplied up front +teamhero assess --headless --path . --interview-answers ./answers.json + +# Org-wide audit +teamhero assess --headless --target-org acme --interview-answers ./answers.json + +# Both — assess an org and a representative local checkout in one run +teamhero assess --headless \ + --target-org acme --path . 
\ + --interview-answers ./answers.json +``` + +When `--interview-answers` is omitted in headless mode, the runner reads `docs/audits/CONFIG.md` (if it exists in the repo). Anything still missing is recorded as `unknown` and the linked criterion is scored `n/a`. + +--- + +## Phase-1 interview + +Seven questions cover the parts of the audit that aren't visible in the repo. The wording is **verbatim from the upstream skill** — don't paraphrase. + +| Q# | Question | Linked criterion | +|----|----------|---| +| 1 | What AI tooling do engineers actually use day-to-day? Is it company-paid? Is there a data-handling policy? | #8 Sanctioned AI tooling (primary) | +| 2 | Do interviews allow candidates to use AI and assess judgment under AI? | #12 Hiring (primary) | +| 3 | Are all four DORA metrics tracked and visible to the team? | #2 Cadence (combined) | +| 4 | Is there a consistent upfront design step before agent code generation? | #5 Design discipline (combined) | +| 5 | LLMs in product / dev loop? With evals or just gut feel? | #10 Evals (combined) | +| 6 | Has anyone red-teamed worst-case agent scenarios in prod? | #11 Blast-radius (combined) | +| 7 | Adjacent repos detection might miss (handbook, .github, skills, etc.)? | scope expansion | + +### `answers.json` shape + +```json +{ + "q1": "Company-paid Claude with documented policy", + "q2": "AI allowed; interviewers trained to assess judgment with AI", + "q3": "DORA tracked via Grafana the team checks daily", + "q4": "Consistent ADR step before agent code", + "q5": "LLMs in dev loop; tracked via sprint retro metrics", + "q6": "unknown", + "q7": "No" +} +``` + +Use `"unknown"` (or `"I don't know"`) to mark a question as unanswered. + +--- + +## Evidence tiers + +The runner auto-detects the highest-fidelity evidence path available. 
+ +| Tier | Detection | What's available | +|---|---|---| +| **1 — `gh` CLI** | `gh auth status` succeeds | Full GitHub API: PR cadence, lead time, review depth, branch protection, environment protection rules, deployment runs | +| **2 — GitHub MCP** | `TEAMHERO_GITHUB_MCP=1` env var set | Equivalent fidelity routed through an MCP server | +| **3 — git-only** | Inside a git repo, no `gh` or MCP | Local filesystem + `git log` only. Items #2, #3, #9, #11 are **capped at 0.5** because GitHub-side evidence isn't observable. | + +Override with `--evidence-tier auto|gh|github-mcp|git-only`. + +--- + +## CLI reference + +### Scope flags + +| Flag | Purpose | +|---|---| +| `--scope-mode {org\|local-repo\|both}` | Override scope (auto-inferred from other flags) | +| `--target-org <org>` | GitHub org name (org or both modes) | +| `--target-repos <names>` | Comma-separated repo names — narrows the scope inside the org | +| `--path <dir>` | Local repo path (local-repo or both modes) | +| `--display-name <name>` | Override the audit's scope display name | + +### Run flags + +| Flag | Default | Purpose | +|---|---|---| +| `--headless` | auto | Skip the wizard; auto-detected in CI / piped stdin | +| `--evidence-tier <tier>` | `auto` | Pin the evidence tier | +| `--interview-answers <file>` | (none) | JSON file with pre-supplied Phase-1 answers | +| `--audit-output <path>` | timestamped, cwd | Override output file path | +| `--audit-output-format {markdown\|json\|both}` | `both` | Output format | +| `--dry-run` | false | Skip the AI scorer; emit a placeholder audit | +| `--flush-assess-cache` | false | Flush cached assessment(s) before running | +| `--show-assess-config` | false | Print saved configuration as JSON and exit | + +Run `teamhero assess --help` for the full list. + +--- + +## How scoring works + +1. **Preflight** auto-detects the evidence tier. +2. **Adjacent repo detection** scans workflow `uses:`, Terraform module sources, submodules, and README cross-refs to find sibling repos. 
Surfaced in the audit's *Adjacent repos consulted* section. +3. **Phase-1 interview** captures the 7 org-level answers (interactively, from `--interview-answers`, or from `docs/audits/CONFIG.md` if it exists). Confirmed answers are written back to `CONFIG.md` after every successful run. +4. **Evidence** — 12 deterministic detectors run against the local repo and emit structured facts (positive / neutral / negative signal) per criterion. +5. **AI scoring** — OpenAI Responses API with `text.format.json_schema` strict mode receives the rubric, evidence, and interview answers; returns per-item scores, ≤25-word evidence sentences, top-3 fixes, and strengths. +6. **Tier-3 caps** — on git-only audits, items 2/3/9/11 are post-hoc capped at 0.5 even if the AI awarded 1.0 (because GitHub-side evidence isn't observable). +7. **Audit writer** renders the markdown using the canonical template and a `.json` sidecar with the full artifact (rubric version, evidence facts, category subtotals). +8. **CONFIG.md round-trip** — confirmed interview answers persist to `docs/audits/CONFIG.md` so re-audits can confirm-or-refresh rather than re-interview cold. + +--- + +## Output + +Two files are written to the current directory (or `--audit-output`): + +- `teamhero-maturity-<slug>-<date>.md` — full audit using the canonical template +- `teamhero-maturity-<slug>-<date>.json` — full data (rubric version, item scores, evidence facts, category subtotals, interview answers) + +### Audit structure + +1. **Summary** — raw score, weighted %, band, evidence tier, one-line take +2. **Maturity scale** — band table with ◉ marking the current audit +3. **Scores** — four per-category tables (A/B/C/D) with item, score, and `whyThisScore` (≤25 words each) +4. **Top 3 fixes** — highest-leverage items scoring <1.0, with suggested owners +5. **Strengths to preserve** — what's already working +6. **Adjacent repos consulted** +7.
**Notes for re-audit** — calibration warnings, items scored `n/a`, what would resolve them + +--- + +## Configuration + +Saved settings live at `~/.config/teamhero/assess-config.json` after each interactive run; headless mode reuses them. Inspect with: + +```bash +teamhero assess --show-assess-config +``` + +### Environment variables + +Beyond the core credentials (`OPENAI_API_KEY`, `GITHUB_PERSONAL_ACCESS_TOKEN`), the assess command honors: + +| Variable | Purpose | +|---|---| +| `MATURITY_AI_MODEL` | Override AI model for the scorer (falls back to `AI_MODEL`, default `gpt-5-mini`) | +| `TEAMHERO_GITHUB_MCP=1` | Tells the runner a GitHub MCP server is connected → choose Tier 2 instead of git-only | + +--- + +## Re-audit cadence + +Re-run **quarterly** against the same org to track movement. Movement matters more than absolute level — the first audit is the baseline, trends are the signal. The runner persists Phase-1 answers to `docs/audits/CONFIG.md` so re-audits can confirm-or-refresh instead of re-interviewing cold. + +--- + +## Troubleshooting + +**"OPENAI_API_KEY required for maturity assessment AI scoring"** — set the key in `~/.config/teamhero/.env` (via `teamhero setup`) or pass `--dry-run` for a placeholder audit. + +**Wizard runs but no questions appear in interactive mode** — confirm you're running the latest binary (`just build-all` from the project root, or download a fresh release). Earlier builds released the alt-screen for each question; the current build hosts the form inside the framed layout. + +**Items 2/3/9/11 scored 0.5 even though the team is great at them** — you're on Tier 3 (git-only). Run `gh auth login` first, or run from a sandbox with `TEAMHERO_GITHUB_MCP=1` set, to unlock the full GitHub-side evidence path. + +**Audit shows `unknown` for everything** — `--interview-answers` file path was wrong, or the file's keys don't match `q1`–`q7`. Verify the JSON shape. 
+ +--- + +## See also + +- [`claude-plugin/skills/agent-maturity-assessment/SKILL.md`](../claude-plugin/skills/agent-maturity-assessment/SKILL.md) — Claude Code plugin skill that documents how to invoke `teamhero assess` from Claude +- [`share/skills/agent-maturity-assessment/`](../share/skills/agent-maturity-assessment/) — self-contained shareable skill bundle (works in any Claude harness without the binary) +- [`docs/maturity-skill-ref/`](maturity-skill-ref/) — canonical upstream skill reference (criteria, interview, output-template, preflight) +- `src/services/maturity/rubric.ts` — hardcoded 12-criterion rubric (the canonical source for what the runner scores) diff --git a/share/skills/agent-maturity-assessment/INSTALL.md b/share/skills/agent-maturity-assessment/INSTALL.md new file mode 100644 index 0000000..a7b2c00 --- /dev/null +++ b/share/skills/agent-maturity-assessment/INSTALL.md @@ -0,0 +1,115 @@ +# Installing the Agent Maturity Assessment skill + +This skill is **harness-agnostic** — it works in any Claude environment that supports the Anthropic Skills convention (a directory with a `SKILL.md` + optional `references/`). Drop it in, and Claude can run a 12-criterion maturity audit on demand. + +## What's in the bundle + +``` +agent-maturity-assessment/ +├── SKILL.md ← entry point with frontmatter +├── INSTALL.md ← this file (delete after install) +└── references/ + ├── criteria.md ← 12-criterion rubric (full text) + ├── interview.md ← 7 Phase-1 questions + Q→criterion mapping + ├── output-template.md ← canonical audit template + └── preflight.md ← evidence tiers + multi-repo handling +``` + +The skill works in two modes: +- **Pure-Claude** (default): Claude reads the references on demand, uses `AskUserQuestion` for the Phase-1 interview, writes the audit by hand. Works everywhere. 
+- **Team Hero binary** (optional accelerator): if `teamhero` is installed and `OPENAI_API_KEY` is configured, the skill calls `teamhero assess` to run the whole pipeline automatically. + +You don't have to install Team Hero — the skill is fully functional in pure-Claude mode. + +--- + +## Claude Code + +Skills live in `~/.claude/skills/<skill-name>/`. Drop the bundle in: + +```bash +mkdir -p ~/.claude/skills +cp -r path/to/agent-maturity-assessment ~/.claude/skills/ +rm ~/.claude/skills/agent-maturity-assessment/INSTALL.md # not needed at runtime +``` + +Restart Claude Code (or run `/skills reload` if your installation supports it). Test: + +``` +You: audit this repo's agent readiness +``` + +Claude should pick up the skill from its `description` (the trigger phrases include "agent readiness", "AI maturity", "audit the team", "score this repo", etc.). + +### From a Claude Code plugin + +If you'd rather ship the skill as part of a plugin, put it under `<plugin-root>/skills/agent-maturity-assessment/` and add the plugin to your `~/.claude/plugins/` directory (or install it via `claude plugin install`). The structure inside `skills/` is identical. + +--- + +## Cowork / Anthropic Workbench + +In Cowork sessions, skills load from the workspace skills directory. Upload the `agent-maturity-assessment/` folder via the skills UI, or commit it to a repo Cowork has access to. + +Trigger phrases work the same — the `description` frontmatter is what the runtime matches against. + +--- + +## Custom Claude harness (Anthropic SDK) + +If you're embedding Claude via the Anthropic SDK and using the [Managed Agents SDK](https://docs.anthropic.com) or rolling your own skill loader, point your loader at the directory. + +For raw API + skills: include the `SKILL.md` body as part of the system prompt (or as a tool-result message), and either inline the references or expose them via a file-reading tool the model can call.
+ +--- + +## Verifying the install + +Ask Claude something the skill should trigger on: + +> "Can you run an agent maturity assessment on this repo?" +> "How healthy is this engineering org?" +> "Score this codebase for AI readiness." + +You should see Claude: +1. Read `SKILL.md` and one or more `references/` files +2. Run a preflight probe (looking for `gh` CLI, GitHub MCP, or git-only) +3. Ask the 7 Phase-1 questions **one at a time** (this is a hard checkpoint — if it dumps all 7 in one message, the skill isn't loading correctly) +4. Gather evidence per criterion +5. Write the audit using the template + +If Claude doesn't pick up the skill, the most common cause is that the harness isn't loading skills from your install location. Check the harness's skill-loading docs. + +--- + +## Optional: Team Hero binary accelerator + +[Team Hero](https://github.com/asabaylus/teamhero.cli) is a CLI that automates the assessment pipeline end-to-end (preflight → adjacent-repo detection → interview → 12 deterministic evidence collectors → AI scoring → audit writer). The skill detects whether the binary is available and uses it when present: + +```bash +# Install +brew install asabaylus/teamhero/teamhero # or download from releases + +# Configure credentials (one-time) +teamhero setup + +# Then trigger the skill normally — it'll call `teamhero assess` under the hood +``` + +The binary is a **strict superset** of pure-Claude mode: same rubric, same interview wording, same output template, same preflight tiers. You can run `teamhero assess` directly without invoking the skill at all. + +--- + +## Customizing + +- **Trigger phrases** — edit the `description` field in `SKILL.md`'s frontmatter to add or remove trigger words. +- **Rubric** — fork the skill and edit `references/criteria.md`. If you change scoring math (weights, max), update `SKILL.md`'s *Scoring* section to match. +- **Interview questions** — edit `references/interview.md`. 
Keep the one-question-at-a-time rule — that's what produces useful answers vs. a hollow audit. + +Track your changes in a `CHANGELOG.md` next to `SKILL.md` so historical audit scores stay interpretable. + +--- + +## License & attribution + +The rubric, interview wording, and output template are derived from the upstream Agent Maturity Assessment skill. Redistribute freely with attribution to the original. From 9212796ddbc703fb01618446c16fe8fb2d2d3d6b Mon Sep 17 00:00:00 2001 From: "coderabbitai[bot]" <136622811+coderabbitai[bot]@users.noreply.github.com> Date: Sun, 10 May 2026 22:40:44 -0400 Subject: [PATCH 6/6] =?UTF-8?q?=F0=9F=93=9D=20Add=20docstrings=20to=20`cla?= =?UTF-8?q?ude/condescending-tereshkova-88a936`=20(#11)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Docstrings generation was requested by @asabaylus. * https://github.com/asabaylus/teamhero.cli/pull/6#issuecomment-4417042628 The following files were modified: * `scripts/run-assess.ts` * `src/cli/index.ts` * `src/services/maturity/adjacent-repos.ts` * `src/services/maturity/ai-scorer.ts` * `src/services/maturity/audit-store.ts` * `src/services/maturity/audit-writer.ts` * `src/services/maturity/evidence-collectors.ts` * `src/services/maturity/fs-utils.ts` * `src/services/maturity/interview.ts` * `src/services/maturity/maturity-prompts.ts` * `src/services/maturity/maturity.service.ts` * `src/services/maturity/preflight.ts` * `src/services/maturity/rubric.ts` * `src/services/maturity/scoring.ts` * `tui/assess.go` * `tui/assess_config.go` * `tui/assess_flags.go` * `tui/assess_preview.go` * `tui/assess_progress.go` * `tui/assess_runner.go` * `tui/assess_summary.go` * `tui/assess_wizard.go` * `tui/main.go` Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- scripts/run-assess.ts | 25 ++++++- src/cli/index.ts | 19 ++++++ src/services/maturity/adjacent-repos.ts | 19 ++++-- src/services/maturity/ai-scorer.ts | 25 +++++-- 
src/services/maturity/audit-store.ts | 28 ++++++-- src/services/maturity/audit-writer.ts | 69 +++++++++++++++++++- src/services/maturity/evidence-collectors.ts | 34 ++++++++++ src/services/maturity/fs-utils.ts | 51 +++++++++++++-- src/services/maturity/interview.ts | 13 ++++ src/services/maturity/maturity-prompts.ts | 35 ++++++++++ src/services/maturity/maturity.service.ts | 8 ++- src/services/maturity/preflight.ts | 28 ++++++-- src/services/maturity/rubric.ts | 14 ++++ src/services/maturity/scoring.ts | 58 ++++++++++++++-- tui/assess.go | 14 +++- tui/assess_config.go | 15 +++-- tui/assess_flags.go | 21 +++++- tui/assess_preview.go | 13 +++- tui/assess_progress.go | 16 ++++- tui/assess_runner.go | 15 ++++- tui/assess_summary.go | 28 +++++++- tui/assess_wizard.go | 21 +++++- tui/main.go | 8 +++ 23 files changed, 520 insertions(+), 57 deletions(-) diff --git a/scripts/run-assess.ts b/scripts/run-assess.ts index a0488a2..86659e9 100644 --- a/scripts/run-assess.ts +++ b/scripts/run-assess.ts @@ -43,6 +43,13 @@ const emit: JsonLineEmitter = (event) => { process.stdout.write(`${JSON.stringify(event)}\n`); }; +/** + * Emit a standardized progress event for the current audit step. + * + * @param step - Identifier or name of the progress step + * @param status - Progress state: `active`, `complete`, or `failed` + * @param message - Human-readable status message + */ function emitProgress( step: string, status: "active" | "complete" | "failed", @@ -52,7 +59,15 @@ function emitProgress( } // readConfigLine + interview answers share a single stdin reader so the // stdin pipe doesn't get half-consumed by an async iterator and then closed. +/** + * Load interview answers from a JSON file on disk. + * + * Attempts to read and parse interview answers from the filesystem; if reading or parsing fails the function logs a warning and returns an empty array.
+ * + * @param path - Filesystem path to the answers JSON file + * @returns An array of parsed `InterviewAnswer` objects, or an empty array if the file could not be read or parsed + */ async function loadInterviewAnswersFromFile( path: string, @@ -67,6 +82,15 @@ async function loadInterviewAnswersFromFile( } } +/** + * Run the headless maturity-assessment flow driven by a single JSON config line on stdin. + * + * Reads an `AssessCommandInput` object from the first stdin line, configures the assessment + * (interactive interview transport or preloaded answers, optional filesystem audit store, + * and the AI scorer), executes the assessment, and emits progress, result, and error events + * as JSON-lines on stdout while logging to stderr. Exits the process with code `0` on success + * or `1` on error or malformed/missing input. + */ async function main(): Promise { const logger = createConsola({ defaults: { tag: "maturity" } }); diff --git a/src/cli/index.ts b/src/cli/index.ts index 2758fe5..3ee990f 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -128,6 +128,14 @@ async function spawnTui(deps: CliDependencies, args: string[]): Promise { }); } +/** + * Build the CLI program with subcommands that forward execution to the Go TUI. + * + * @param deps - Runtime dependencies (auth and logger) used when delegating work to the TUI and reporting errors. + * @param options - Optional CLI construction flags. + * @param options.exitOverride - If set, configures Commander to throw instead of exiting on parse errors. + * @returns The configured Commander `Command` instance ready to parse CLI arguments. + */ export function createCli( deps: CliDependencies, options: CliOptions = {}, @@ -224,6 +232,17 @@ export async function createDefaultDependencies(): Promise { } satisfies CliDependencies; } +/** + * Parse CLI arguments and dispatch execution to the Commander program or the external TUI binary.
+ * + * When the first positional subcommand is one of "report", "doctor", "setup", or "assess" + * and the arguments include `--help`, this function forwards the raw arguments to the Go TUI + * binary instead of letting Commander render top-level help. Otherwise it delegates to the + * Commander program returned by `createCli`. + * + * @param argv - The argument vector to parse; defaults to `process.argv` + * @param deps - Optional runtime dependencies (logger and auth); if omitted, defaults are created + */ export async function run( argv: string[] = process.argv, deps?: CliDependencies, diff --git a/src/services/maturity/adjacent-repos.ts b/src/services/maturity/adjacent-repos.ts index b771ae6..3e2cf61 100644 --- a/src/services/maturity/adjacent-repos.ts +++ b/src/services/maturity/adjacent-repos.ts @@ -13,13 +13,12 @@ const STDLIB_OWNERS = new Set([ ]); /** - * Detect adjacent repos referenced from the local repo. Mirrors the four - * detection commands in references/preflight.md (multi-repo section): + * Discover external GitHub repositories referenced by the local repository. * - * 1. External GitHub Actions referenced in workflows (`uses: owner/repo@vX`) - * 2. Terraform modules sourced from external Git - * 3. Submodules - * 4. Generic cross-repo references in docs/scripts + * Scans the repository root (scope.localPath) for references via GitHub Actions `uses`, Terraform module sources, `.gitmodules` submodules, and `github.com` links in README.md; excludes known standard-library owners and de-duplicates results case-insensitively. 
+ * + * @param scope - Descriptor whose `localPath` is the repository root to scan; if `localPath` is falsy the function returns an empty array + * @returns An array of detected `AdjacentRepo` objects (`{ owner, name, reason }`), de-duplicated by `owner/name` (case-insensitive) */ export async function detectAdjacentRepos( scope: ScopeDescriptor, @@ -98,6 +97,14 @@ export async function detectAdjacentRepos( return [...found.values()]; } +/** + * Add an adjacent repository to the map if it is not already present. + * + * @param map - Map used for de-duplication, keyed by the lowercase `owner/name` + * @param owner - Repository owner (organization or user) + * @param name - Repository name + * @param reason - Short human-readable reason describing why the repository was detected + */ function addRepo( map: Map, owner: string, diff --git a/src/services/maturity/ai-scorer.ts b/src/services/maturity/ai-scorer.ts index 07b1d6c..d32eea6 100644 --- a/src/services/maturity/ai-scorer.ts +++ b/src/services/maturity/ai-scorer.ts @@ -28,6 +28,13 @@ export interface MaturityAIScorerOptions { const TIER3_CAPPED = new Set([2, 3, 9, 11]); +/** + * Convert a score string produced by the AI into a typed `ItemScoreValue`. + * + * @param raw - The raw score string (expected: `"0"`, `"1"`, `"0.5"`, or `"n/a"`). + * @returns `0`, `1`, `0.5`, or `"n/a"` corresponding to the input string. + * @throws Error if `raw` is not one of the expected strings. + */ function parseScore(raw: string): ItemScoreValue { if (raw === "0") return 0; if (raw === "1") return 1; @@ -37,9 +44,15 @@ function parseScore(raw: string): ItemScoreValue { } /** - * Enforce tier-3 caps: if an item is in TIER3_CAPPED and the AI awarded 1.0 - * on a git-only audit, downgrade to 0.5 and append a note. We do this - * post-hoc so the AI's reasoning is preserved but the rubric is honored. + * Apply tier-3 capping rules to item scores for git-only audits. 
+ * + * For `tier === "git-only"`, any item whose `itemId` is in `TIER3_CAPPED` and + * whose score equals `1` will be downgraded to `0.5` and have a marker + * appended to `whyThisScore`. Returns the adjusted items and explanatory notes. + * + * @param items - The list of item scores to process + * @param tier - The audit tier; caps are applied only when equal to `"git-only"` + * @returns An object containing `items` (the possibly modified scores) and `notes` (explanations for any caps applied) */ function applyTier3Caps( items: ItemScore[], @@ -64,9 +77,9 @@ function applyTier3Caps( } /** - * Validate that the AI returned exactly 12 items covering ids 1..12. - * Adds neutral 0/0.5/1 placeholders for any missing items so the audit - * always renders all rows. + * Ensures the returned item list includes every rubric item (IDs 1–12) by adding neutral placeholders for any missing entries. + * + * @returns An object with `items`: the original items augmented with placeholder `ItemScore` entries for missing rubric IDs (sorted by `itemId`), and `missing`: an array of rubric IDs that were absent from the input. */ function ensureAllItems(items: ItemScore[]): { items: ItemScore[]; diff --git a/src/services/maturity/audit-store.ts b/src/services/maturity/audit-store.ts index 6a7f969..91b194d 100644 --- a/src/services/maturity/audit-store.ts +++ b/src/services/maturity/audit-store.ts @@ -33,9 +33,13 @@ export class FileSystemAuditStore implements AuditStore { } /** - * Parse the `## Org-level answers` section of CONFIG.md. Heading mapping - * comes from interview.md verbatim. - */ + * Extracts org-level interview answers from a CONFIG.md document. + * + * Parses the "## Org-level answers" section, reading each `###` question heading and its following lines as the answer value. + * + * @param text - Full contents of a CONFIG.md file + * @returns An array of `InterviewAnswer` objects for recognized questions.
Headings are matched case-insensitively to known interview questions; multi-line answers are preserved and trimmed; empty answers and unknown headings are ignored. + */ export function parseConfigMd(text: string): InterviewAnswer[] { const answers: InterviewAnswer[] = []; const lines = text.split(/\r?\n/); @@ -83,6 +87,12 @@ export function parseConfigMd(text: string): InterviewAnswer[] { return answers; } +/** + * Finds the interview question id whose config heading matches the given heading (case-insensitive). + * + * @param heading - The heading text to match against question config headings. + * @returns The matching `InterviewQuestionId` if found, `null` otherwise. + */ function matchQuestionByHeading(heading: string): InterviewQuestionId | null { const q = INTERVIEW_QUESTIONS.find( (q) => q.configHeading.toLowerCase() === heading.toLowerCase(), @@ -90,6 +100,13 @@ function matchQuestionByHeading(heading: string): InterviewQuestionId | null { return q?.id ?? null; } +/** + * Render the contents of CONFIG.md's "Org-level answers" section from provided answers. + * + * @param answers - Collected interview answers to include in the document + * @param today - Date string written to the `last_updated` line + * @returns A CONFIG.md-formatted string containing the header, `last_updated: {today}`, and one `### {question}` section per interview question; unanswered questions are rendered as `unknown` + */ export function renderConfigMd( answers: InterviewAnswer[], today: string, @@ -109,8 +126,10 @@ export function renderConfigMd( } /** - * Read pre-supplied interview answers from a JSON file (used by --interview-answers - * in headless mode). Format: { "q1": "...", "q2": "...", ... }. + * Load pre-supplied interview answers from a JSON file for headless mode. + * + * @param path - Filesystem path to a JSON file shaped like `{ "q1": "…", "q2": "…", … }` + * @returns An array of `InterviewAnswer` for entries whose question IDs are recognized; unrecognized IDs are skipped.
*/ export async function readAnswersJson( path: string, diff --git a/src/services/maturity/audit-writer.ts b/src/services/maturity/audit-writer.ts index 1a3792a..324f5c3 100644 --- a/src/services/maturity/audit-writer.ts +++ b/src/services/maturity/audit-writer.ts @@ -8,6 +8,12 @@ import type { ItemScoreValue, } from "./types.js"; +/** + * Map an artifact evidence tier code to its formatted label. + * + * @param tier - One of `"gh"`, `"github-mcp"`, or `"git-only"` representing the evidence tier + * @returns A human-readable label for `tier` (e.g. `"1: gh"`, `"2: GitHub MCP"`, `"3: git-only"`) + */ function tierLabel(tier: AssessmentArtifact["tier"]): string { switch (tier) { case "gh": @@ -19,6 +25,12 @@ function tierLabel(tier: AssessmentArtifact["tier"]): string { } } +/** + * Format an ItemScoreValue into its display string. + * + * @param score - The score value (may be `1`, `0`, `"n/a"`, or another numeric value) + * @returns The display string: `"1"` for `1`, `"0"` for `0`, `"n/a"` for `"n/a"`, and `"0.5"` for any other numeric value + */ function formatScore(score: ItemScoreValue): string { if (score === "n/a") return "n/a"; if (score === 1) return "1"; @@ -26,10 +38,25 @@ function formatScore(score: ItemScoreValue): string { return "0.5"; } +/** + * Format a number as a string with a fixed number of decimal places. + * + * @param num - The number to format + * @param digits - The number of digits after the decimal point (defaults to 1) + * @returns The numeric value formatted as a string with exactly `digits` digits after the decimal point + */ function fixed(num: number, digits = 1): string { return num.toFixed(digits); } +/** + * Retrieve the ItemScore object for a specific item ID from a list. 
+ * + * @param items - Array of item scores to search + * @param itemId - The item identifier to find + * @returns The matching `ItemScore` + * @throws Error if no score for `itemId` is found (message: `Missing score for item `) + */ function findItemScore(items: ItemScore[], itemId: number): ItemScore { const score = items.find((s) => s.itemId === itemId); if (!score) { @@ -38,6 +65,15 @@ function findItemScore(items: ItemScore[], itemId: number): ItemScore { return score; } +/** + * Builds the Markdown section for a rubric category, including the category header, a table of items with scores and reasons, and a computed subtotal line. + * + * @param artifact - The assessment artifact containing item scores and category subtotals + * @param categoryId - The rubric category identifier to render + * @returns A Markdown-formatted string for the category section (header, item table, and subtotal) + * @throws Error If the `categoryId` is not defined in `RUBRIC_CATEGORIES` (`Unknown category `) + * @throws Error If the artifact has no subtotal for `categoryId` (`Missing subtotal for `) + */ function categoryTable( artifact: AssessmentArtifact, categoryId: CategoryId, @@ -69,6 +105,12 @@ function categoryTable( return lines.join("\n"); } +/** + * Render a complete Markdown maturity assessment report for an assessment artifact. + * + * @param artifact - The assessment artifact containing scope, scores, band/tier info, fixes, strengths, adjacent repos, notes, and rubric metadata + * @returns The full report as a Markdown-formatted string + */ export function renderAuditMarkdown(artifact: AssessmentArtifact): string { const lines: string[] = []; lines.push( @@ -159,10 +201,23 @@ export function renderAuditMarkdown(artifact: AssessmentArtifact): string { return lines.join("\n"); } +/** + * Serializes an assessment artifact to a pretty-printed JSON string. 
+ * + * @param artifact - The assessment artifact to serialize + * @returns The artifact as a JSON string formatted with 2-space indentation + */ export function renderAuditJson(artifact: AssessmentArtifact): string { return JSON.stringify(artifact, null, 2); } +/** + * Ensure the directory containing `filePath` exists, creating it recursively if needed. + * + * Skips creation when the directory portion is ".", "" or "/". + * + * @param filePath - The target file path whose parent directory should be ensured + */ async function ensureDir(filePath: string): Promise { const dir = dirname(filePath); if (dir === "." || dir === "" || dir === "/") return; @@ -175,6 +230,13 @@ export interface WriteAuditOptions { format: "markdown" | "json" | "both"; } +/** + * Writes an assessment artifact to disk in Markdown, JSON, or both formats. + * + * @param artifact - The assessment data to serialize and write + * @param options - Output options including destination path(s) and format + * @returns An object with `outputPath` set to the primary file written and `jsonOutputPath` set when a separate JSON file was written. If `options.format` is `"json"`, `outputPath` will point to the JSON file. + */ export async function writeAudit( artifact: AssessmentArtifact, options: WriteAuditOptions, @@ -211,8 +273,11 @@ export async function writeAudit( } /** - * Compute the default markdown output path from a scope + date, mirroring the - * report file convention (`teamhero-report--.md`). + * Build a default Markdown filename for an audit by slugifying the display name and appending the date. + * + * @param displayName - Source string to slugify: converted to lowercase, runs of non-alphanumeric characters replaced with `-`, and leading/trailing `-` removed + * @param date - Date portion to append (used verbatim, e.g. 
`YYYY-MM-DD`) + * @returns A relative path like `./teamhero-maturity--.md` */ export function defaultOutputPath(displayName: string, date: string): string { const slug = displayName diff --git a/src/services/maturity/evidence-collectors.ts b/src/services/maturity/evidence-collectors.ts index 0e5f0b1..ce9c422 100644 --- a/src/services/maturity/evidence-collectors.ts +++ b/src/services/maturity/evidence-collectors.ts @@ -27,10 +27,26 @@ interface CollectInput { adjacentRepos: AdjacentRepo[]; } +/** + * Retrieve the repository local filesystem path from a scope descriptor. + * + * @param scope - The scope descriptor containing contextual data about the repository + * @returns The `localPath` string from `scope`, or `null` if it is not set + */ function localPath(scope: ScopeDescriptor): string | null { return scope.localPath ?? null; } +/** + * Constructs an EvidenceFact object for a rubric item. + * + * @param itemId - The numeric rubric item identifier (e.g., 1–12) + * @param signal - The signal for the fact (`"positive"`, `"neutral"`, or `"negative"`) + * @param summary - A short, human-readable summary of the evidence + * @param source - Identifier of the evidence source (for example `"evidence-collectors"`) + * @param details - Optional additional metadata to attach to the fact + * @returns The assembled `EvidenceFact` containing the provided fields; `details` is included only if supplied + */ function fact( itemId: number, signal: EvidenceFact["signal"], @@ -708,6 +724,14 @@ class HiringCollector implements MaturityProvider { } } +/** + * Create the default set of evidence collectors for all rubric items in the canonical order. 
+ * + * @returns An array of `MaturityProvider` instances for items 1 through 12, ordered as: + * ReproducibleDevCollector, IntegrationCadenceCollector, TestabilityCollector, ObservabilityCollector, + * DesignDisciplineCollector, DeepModulesCollector, AgentContextCollector, SanctionedAiCollector, + * HumanReviewCollector, EvalsCollector, BlastRadiusCollector, HiringCollector. + */ export function defaultCollectors(): MaturityProvider[] { return [ new ReproducibleDevCollector(), @@ -725,6 +749,16 @@ export function defaultCollectors(): MaturityProvider[] { ]; } +/** + * Run each maturity collector in order and aggregate their emitted evidence facts. + * + * If a collector throws, its error is caught and a single neutral `EvidenceFact` is appended + * for that item with the error message as the summary. + * + * @param collectors - Array of maturity collectors to execute + * @param input - Collection input (scope and tier) supplied to each collector + * @returns The concatenated list of `EvidenceFact` objects produced by all collectors, in execution order + */ export async function runAllCollectors( collectors: MaturityProvider[], input: CollectInput, diff --git a/src/services/maturity/fs-utils.ts b/src/services/maturity/fs-utils.ts index 098d219..9e8f5ea 100644 --- a/src/services/maturity/fs-utils.ts +++ b/src/services/maturity/fs-utils.ts @@ -27,8 +27,18 @@ export interface FindOptions { } /** - * Walk a directory tree and return matching file paths (relative to root). - * Skips DEFAULT_IGNORES entries and symlinks. + * Recursively collect relative file paths under `root` that match the provided filters. + * + * Traversal stops at `options.maxDepth` (default 4) and after collecting `options.limit` matches (default 200). + * Skips entries listed in `DEFAULT_IGNORES` and symbolic links. If `root` is not a directory or cannot be read, returns an empty array. 
+ * + * @param root - The directory to scan; returned paths are relative to this root + * @param options - Optional filters and limits: + * - `maxDepth` — maximum recursion depth + * - `nameRegex` — only include files whose basename matches this regex + * - `pathContains` — only include files whose lowercased relative path contains at least one of these substrings + * - `limit` — maximum number of matches to return + * @returns An array of matching file paths relative to `root` */ export async function findFiles( root: string, @@ -38,6 +48,14 @@ export async function findFiles( const limit = options.limit ?? 200; const matches: string[] = []; + /** + * Recursively traverses a directory subtree and appends relative file paths that satisfy the configured filters to the surrounding `matches` collection. + * + * Traversal stops when `depth` exceeds the configured maximum, when the match `limit` is reached, or when a directory cannot be read. During iteration this function skips ignored entry names, symbolic links, non-matching file names (when `options.nameRegex` is set), and files whose lowercased relative path does not contain any of the `options.pathContains` needles. + * + * @param dir - Absolute path of the directory to walk + * @param depth - Current recursion depth (root call uses 0) + */ async function walk(dir: string, depth: number): Promise { if (depth > maxDepth || matches.length >= limit) return; let entries; @@ -79,7 +97,13 @@ export async function findFiles( return matches; } -/** Convenience: does any file matching options exist? */ +/** + * Check whether any file matching the given options exists under `root`. + * + * @param root - The directory path to search from + * @param options - Optional search filters and limits + * @returns `true` if at least one matching file exists, `false` otherwise. 
+ */ export async function anyFile( root: string, options: FindOptions = {}, @@ -88,7 +112,11 @@ export async function anyFile( return found.length > 0; } -/** Read a file or return null if it doesn't exist / can't be read. */ +/** + * Read a UTF-8 file and return its contents, or `null` if the file cannot be read. + * + * @returns The file contents as a UTF-8 string, or `null` if the file does not exist or is unreadable + */ export async function readIfExists(path: string): Promise { try { return await readFile(path, "utf8"); @@ -98,8 +126,11 @@ export async function readIfExists(path: string): Promise { } /** - * Check whether the file content matches a regex. Returns true if the file - * exists AND contains a match. + * Determine whether a file's contents match a regular expression. + * + * @param path - Filesystem path to the file to test + * @param pattern - Regular expression to test against the file contents + * @returns `true` if the file exists and its contents match `pattern`, `false` otherwise */ export async function fileContains( path: string, @@ -110,7 +141,13 @@ export async function fileContains( return pattern.test(content); } -/** Look for a substring across many candidate files; return first hit's path. */ +/** + * Finds the first path whose file contents match a regular expression. + * + * @param paths - Ordered list of file paths to check + * @param pattern - Regular expression to test against each file's contents + * @returns The first path whose file content matches `pattern`, or `null` if none match + */ export async function firstFileContaining( paths: string[], pattern: RegExp, diff --git a/src/services/maturity/interview.ts b/src/services/maturity/interview.ts index 5a9b4fa..61ead9a 100644 --- a/src/services/maturity/interview.ts +++ b/src/services/maturity/interview.ts @@ -109,10 +109,23 @@ const UNKNOWN_TOKENS = new Set( ), ); +/** + * Determine whether an answer string represents "unknown" or "not applicable". 
+ * + * @param value - The raw answer text to classify (may include surrounding whitespace or mixed case) + * @returns `true` if `value` matches a known unknown/not-applicable token (case- and whitespace-insensitive), `false` otherwise + */ export function isUnknownAnswer(value: string): boolean { return UNKNOWN_TOKENS.has(value.trim().toLowerCase()); } +/** + * Retrieve the interview question object for the given question identifier. + * + * @param id - The interview question id (e.g., `"q1"` through `"q7"`) to look up + * @returns The matching `InterviewQuestion` for `id` + * @throws Error if no question with the supplied `id` exists + */ export function getQuestion(id: InterviewQuestionId): InterviewQuestion { const q = INTERVIEW_QUESTIONS.find((q) => q.id === id); if (!q) { diff --git a/src/services/maturity/maturity-prompts.ts b/src/services/maturity/maturity-prompts.ts index 9b1fbb7..a461181 100644 --- a/src/services/maturity/maturity-prompts.ts +++ b/src/services/maturity/maturity-prompts.ts @@ -78,6 +78,15 @@ export interface MaturityScoringContext { interviewAnswers: InterviewAnswer[]; } +/** + * Render the full rubric as a Markdown-formatted text block. + * + * Includes category headings with their formatted weights and, for each rubric item, an item header, + * score level explanations for `1.0`, `0.5`, and `0.0`, an optional interview linkage line, + * an optional tier-3 cap note, and the item's "Why it matters" text. + * + * @returns A Markdown string containing the complete rubric organized by category and item. + */ function rubricBlock(): string { const lines: string[] = []; for (const cat of RUBRIC_CATEGORIES) { @@ -106,6 +115,17 @@ function rubricBlock(): string { return lines.join("\n"); } +/** + * Render deterministic evidence grouped by rubric item into a Markdown string. + * + * Groups the provided evidence facts by their `itemId` and produces a section for + * every rubric item. 
Each section contains either bullet lines in the form + * `- [] ` for facts or the placeholder + * `- (no deterministic evidence collected)` when no facts exist for that item. + * + * @param evidence - Collected deterministic evidence facts to include + * @returns A Markdown-formatted string with a section per rubric item listing its evidence or a placeholder when none is present + */ function evidenceBlock(evidence: EvidenceFact[]): string { const byItem = new Map(); for (const f of evidence) { @@ -129,11 +149,26 @@ function evidenceBlock(evidence: EvidenceFact[]): string { return lines.join("\n"); } +/** + * Formats interview answers into a Markdown bullet list. + * + * @param answers - Array of interview answers; each item should contain `questionId` and `value` + * @returns `_No interview answers supplied._` if `answers` is empty, otherwise a newline-separated list of `- <questionId>: <value>` lines + */ function interviewBlock(answers: InterviewAnswer[]): string { if (answers.length === 0) return "_No interview answers supplied._"; return answers.map((a) => `- ${a.questionId}: ${a.value}`).join("\n"); } +/** + * Builds the full audit prompt used to assess agent maturity, embedding scope, rules, + * the full rubric, collected deterministic evidence, and interview answers. + * + * @param context - Inputs that populate the prompt: scope (mode/displayName), evidence tier, + * adjacent repositories, deterministic evidence facts, and interview answers. + * @returns The complete Markdown prompt text instructing the auditor and ending with an + * instruction to return JSON matching the `agent_maturity_assessment` schema.
+ */ export function buildMaturityPrompt(context: MaturityScoringContext): string { const scopeLine = `${context.scope.mode} | ${context.scope.displayName}`; const adjacentLine = diff --git a/src/services/maturity/maturity.service.ts b/src/services/maturity/maturity.service.ts index 8a43e19..2c3c9b2 100644 --- a/src/services/maturity/maturity.service.ts +++ b/src/services/maturity/maturity.service.ts @@ -196,7 +196,13 @@ export class MaturityService { } } -/** Convenience: quick non-interactive run with default deps. */ +/** + * Run an assessment non-interactively using default dependencies. + * + * @param input - Assessment command input describing scope, tier, and output options + * @param overrides - Optional dependency overrides for collectors, scorer, transport, logger, or audit store + * @returns The assessment result containing the generated artifact and output path(s) + */ export async function runHeadlessAssessment( input: AssessCommandInput, overrides?: MaturityServiceDeps, diff --git a/src/services/maturity/preflight.ts b/src/services/maturity/preflight.ts index db77a2f..a27a392 100644 --- a/src/services/maturity/preflight.ts +++ b/src/services/maturity/preflight.ts @@ -5,12 +5,13 @@ import { getEnv } from "../../lib/env.js"; import type { EvidenceTier } from "./types.js"; /** - * Detects which evidence-fidelity tier we can operate at. + * Choose the evidence-fidelity tier the system should operate at. * - * Order: - * 1. `gh` CLI in PATH and authenticated → "gh" - * 2. Hint env var TEAMHERO_GITHUB_MCP=1 (set by the Go TUI when an MCP is wired) → "github-mcp" - * 3. Anything else → "git-only" + * Detection precedence (highest → lowest): explicit `override` (unless `"auto"`), authenticated `gh` CLI, `TEAMHERO_GITHUB_MCP="1"`, then git-only fallback. 
+ * + * @param cwd - Working directory used when probing for a Git repository + * @param override - Explicit tier to use or `"auto"` to perform detection + * @returns The selected evidence tier: `'gh'`, `'github-mcp'`, or `'git-only'` */ export async function detectTier( cwd: string, @@ -27,6 +28,12 @@ export async function detectTier( return "git-only"; } +/** + * Determines whether the given directory appears to be a Git repository by checking for a `.git` entry. + * + * @param cwd - Filesystem path to the directory to inspect + * @returns `true` if a `.git` entry exists and is a file or directory, `false` otherwise + */ async function isGitRepo(cwd: string): Promise { try { const s = await stat(join(cwd, ".git")); @@ -36,6 +43,11 @@ async function isGitRepo(cwd: string): Promise { } } +/** + * Detects whether the GitHub CLI is installed and currently authenticated. + * + * @returns `true` if the `gh` CLI is present and reports an authenticated session, `false` otherwise. + */ async function ghIsAuthenticated(): Promise { return new Promise((resolve) => { const child = spawn("gh", ["auth", "status"], { @@ -46,6 +58,12 @@ async function ghIsAuthenticated(): Promise { }); } +/** + * Provide a human-readable label for an evidence-fidelity tier. + * + * @param tier - The evidence tier to describe (`"gh"`, `"github-mcp"`, or `"git-only"`) + * @returns A descriptive label for `tier` indicating its name and relative fidelity. + */ export function describeTier(tier: EvidenceTier): string { switch (tier) { case "gh": diff --git a/src/services/maturity/rubric.ts b/src/services/maturity/rubric.ts index ff75641..cc6a873 100644 --- a/src/services/maturity/rubric.ts +++ b/src/services/maturity/rubric.ts @@ -314,6 +314,13 @@ export const RUBRIC_ITEMS: ReadonlyArray = [ }, ] as const; +/** + * Retrieves a rubric item by its numeric identifier. 
+ * + * @param id - The numeric identifier of the rubric item + * @returns The matching RubricItem + * @throws Error when no rubric item with the given `id` exists + */ export function getRubricItem(id: number): RubricItem { const item = RUBRIC_ITEMS.find((i) => i.id === id); if (!item) { @@ -322,6 +329,13 @@ export function getRubricItem(id: number): RubricItem { return item; } +/** + * Retrieve a rubric category by its ID. + * + * @param id - The category identifier (`"A"`, `"B"`, `"C"`, or `"D"`) + * @returns The `RubricCategory` matching `id` + * @throws Error if no category with the provided `id` exists + */ export function getCategory(id: "A" | "B" | "C" | "D"): RubricCategory { const cat = RUBRIC_CATEGORIES.find((c) => c.id === id); if (!cat) { diff --git a/src/services/maturity/scoring.ts b/src/services/maturity/scoring.ts index 4d5e8fa..c5de258 100644 --- a/src/services/maturity/scoring.ts +++ b/src/services/maturity/scoring.ts @@ -13,7 +13,10 @@ import type { } from "./types.js"; /** - * Per-item numeric value, treating "n/a" as null. + * Convert an item's score to a numeric value, returning `null` for `"n/a"`. + * + * @param score - The item's score, which may be a number or the string `"n/a"` + * @returns The numeric score, or `null` if `score` is `"n/a"` */ function scoreNumeric(score: ItemScore["score"]): number | null { if (score === "n/a") return null; @@ -28,6 +31,19 @@ export interface CategorySubtotal { maxWeighted: number; // adjusted for n/a } +/** + * Compute per-category subtotals for the provided item scores. + * + * Items with `"n/a"` scores are excluded from sums and assessment counts. + * + * @param items - Array of item scores to aggregate by rubric category + * @returns An array of `CategorySubtotal` objects (one per rubric category, in the same order as `RUBRIC_CATEGORIES`). 
Each subtotal includes: + * - `id`: category id + * - `rawSum`: sum of numeric scores in the category + * - `weighted`: `rawSum` multiplied by the category weight + * - `maxRaw`: number of assessed items in the category (each contributes at most 1.0) + * - `maxWeighted`: `maxRaw` multiplied by the category weight + */ export function categorySubtotals(items: ItemScore[]): CategorySubtotal[] { return RUBRIC_CATEGORIES.map((cat) => { const inCat = items.filter((s) => { @@ -67,6 +83,18 @@ export interface OverallScore { band: MaturityBand; } +/** + * Computes aggregated raw and weighted scores, the percent score, and its maturity band for the supplied item scores. + * + * @param items - Array of `ItemScore` entries to include; `"n/a"` scores are excluded from numeric aggregates. + * @returns An `OverallScore` object containing: + * - `rawScore`: sum of raw (unweighted) scores across categories + * - `rawScoreMax`: maximum possible raw score given assessed items + * - `weightedScore`: sum of category-weighted scores + * - `weightedScoreMax`: maximum possible weighted score given assessed items + * - `scorePercent`: weighted score expressed as a percentage of `weightedScoreMax` (0 when `weightedScoreMax` is 0) + * - `band`: the maturity band corresponding to `scorePercent` + */ export function computeOverallScore(items: ItemScore[]): OverallScore { const subtotals = categorySubtotals(items); @@ -89,6 +117,12 @@ export function computeOverallScore(items: ItemScore[]): OverallScore { }; } +/** + * Selects the maturity band whose inclusive range contains the given score percentage. 
+ * + * @param scorePercent - The score percentage (typically 0–100) to classify + * @returns The `MaturityBand` whose `min`..`max` range includes `scorePercent`; if no band matches, returns the last entry of `MATURITY_BANDS` as a fallback + */ export function classifyBand(scorePercent: number): MaturityBand { for (const band of MATURITY_BANDS) { if (scorePercent >= band.min && scorePercent <= band.max) { @@ -99,6 +133,13 @@ export function classifyBand(scorePercent: number): MaturityBand { return MATURITY_BANDS[MATURITY_BANDS.length - 1]; } +/** + * Get the maturity band for the given band name. + * + * @param name - The name of the maturity band to look up + * @returns The `MaturityBand` whose `name` matches `name` + * @throws Error if no maturity band with the provided name exists + */ export function bandByName(name: MaturityBandName): MaturityBand { const band = MATURITY_BANDS.find((b) => b.name === name); if (!band) { @@ -107,14 +148,23 @@ export function bandByName(name: MaturityBandName): MaturityBand { return band; } -/** Returns the unweighted-max constants for diagnostics. */ +/** + * Provide the maximum attainable raw and weighted scores for diagnostics. + * + * @returns An object with `raw` equal to the maximum raw score and `weighted` equal to the maximum weighted score + */ export function maxScores(): { raw: number; weighted: number } { return { raw: MAX_RAW_SCORE, weighted: MAX_WEIGHTED_SCORE }; } /** - * Validate that a list of ItemScores covers all 12 items exactly once. - * Returns missing item IDs (empty array if valid). + * Identify which of the 12 expected rubric item IDs (1–12) are not present in the provided scores. + * + * This compares against the fixed set of expected IDs {1..12} and returns those that never appear in `items`. + * Duplicate or extra entries in `items` are ignored; only the presence of an `itemId` matters. 
+ * + * @param items - Array of scored items to check for coverage + * @returns A sorted array of missing item IDs from 1 through 12; empty if all are present */ export function findMissingItems(items: ItemScore[]): number[] { const expected = new Set([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]); diff --git a/tui/assess.go b/tui/assess.go index 7479083..20bed20 100644 --- a/tui/assess.go +++ b/tui/assess.go @@ -8,6 +8,10 @@ import ( "github.com/charmbracelet/lipgloss" ) +// printAssessUsage prints the usage and help message for the `teamhero assess` subcommand to stderr. +// +// The message describes the assessment's purpose and outputs, saved configuration location, +// scope and run flags (including headless and interview options), examples, and exit codes. func printAssessUsage() { fmt.Fprintf(os.Stderr, `Usage: teamhero assess [flags] @@ -55,7 +59,11 @@ Exit codes: } // runAssess is the entry point for the "assess" subcommand. It dispatches to -// either the headless run loop or the interactive wizard based on environment. +// either the headless run loop or the interactive wizard. +// If `--show-assess-config` is set, it prints the saved assess configuration (exits with status 1 if none) and returns nil. +// Otherwise it loads or initializes the config, applies flag overrides, and fills defaults. +// In headless mode it verifies that minimal scope is present (exits with status 1 if missing) and runs the headless assessment, returning any error from that run. +// In interactive mode it runs the interactive assessment and returns any error from that run. func runAssess() error { if *flagAssessShowConfig { cfg, err := LoadAssessConfig() @@ -83,6 +91,8 @@ func runAssess() error { return runAssessInteractive(&cfg) } +// loadOrInitAssessConfig returns a previously saved AssessConfig if one exists. +// If no saved config is available or loading it fails, it returns the DefaultAssessConfig().
func loadOrInitAssessConfig() AssessConfig { saved, _ := LoadAssessConfig() if saved != nil { @@ -92,7 +102,7 @@ func loadOrInitAssessConfig() AssessConfig { } // runAssessHeadless drives the assess service runner without any TTY UI. -// Interview answers must come from --interview-answers or a CONFIG.md file. +// Interview answers must be supplied via --interview-answers or a CONFIG.md file. func runAssessHeadless(cfg AssessConfig) error { cfg.Mode = "headless" cfg.InteractiveInterview = false diff --git a/tui/assess_config.go b/tui/assess_config.go index dd6a935..896aae1 100644 --- a/tui/assess_config.go +++ b/tui/assess_config.go @@ -29,13 +29,16 @@ type AssessScope struct { DisplayName string `json:"displayName"` } -// assessConfigPath returns ~/.config/teamhero/assess-config.json (XDG-compliant). +// assessConfigPath returns the full path to the XDG-compliant assess-config.json file +// inside the application's configuration directory. func assessConfigPath() string { return filepath.Join(configDir(), "assess-config.json") } // LoadAssessConfig reads the saved assess configuration. Returns nil with no -// error if the file does not exist. +// error if the file does not exist, i.e. it returns (nil, nil). +// The configuration is read from the user's config directory. +// If reading the file or decoding the JSON fails it returns a non-nil error. func LoadAssessConfig() (*AssessConfig, error) { data, err := os.ReadFile(assessConfigPath()) if err != nil { @@ -51,7 +54,9 @@ func LoadAssessConfig() (*AssessConfig, error) { return &cfg, nil } -// SaveAssessConfig persists the assess configuration to disk. +// SaveAssessConfig writes cfg to the persistent assess configuration file. +// It creates the parent directory if necessary, writes the configuration as +// indented JSON with file mode 0600, and returns any error encountered.
func SaveAssessConfig(cfg *AssessConfig) error { if err := os.MkdirAll(filepath.Dir(assessConfigPath()), 0o755); err != nil { return err @@ -63,7 +68,9 @@ func SaveAssessConfig(cfg *AssessConfig) error { return os.WriteFile(assessConfigPath(), data, 0o600) } -// DefaultAssessConfig returns a sensible starting config for a new user. +// DefaultAssessConfig builds a sensible starting AssessConfig configured for a local repository. +// The returned config uses the current working directory as Scope.LocalPath and Scope.DisplayName, +// sets Scope.Mode to "local-repo", EvidenceTier to "auto", and OutputFormat to "both". func DefaultAssessConfig() AssessConfig { cwd, _ := os.Getwd() return AssessConfig{ diff --git a/tui/assess_flags.go b/tui/assess_flags.go index 5e48c2d..d597dad 100644 --- a/tui/assess_flags.go +++ b/tui/assess_flags.go @@ -23,7 +23,8 @@ var ( flagAssessShowConfig = flag.Bool("show-assess-config", false, "Print saved assess configuration as JSON and exit") ) -// applyAssessFlagsTo merges explicitly-set CLI flags into cfg. +// applyAssessFlagsTo updates cfg with values from assess CLI flags that were explicitly set. +// For each supported flag, if wasSet reports it was provided, the corresponding cfg field is overwritten with the flag's value. func applyAssessFlagsTo(cfg *AssessConfig, wasSet func(string) bool) { if wasSet("scope-mode") { cfg.Scope.Mode = strings.TrimSpace(*flagAssessScopeMode) @@ -61,7 +62,20 @@ func applyAssessFlagsTo(cfg *AssessConfig, wasSet func(string) bool) { } // fillAssessDefaults populates required fields if they're missing. Mirrors -// DefaultAssessConfig but applied to an already-loaded config. +// DefaultAssessConfig, but is applied to an already-loaded config.
+// +// If Scope.Mode is empty it is derived from the presence of Scope.Org and Scope.LocalPath: +// - only LocalPath present -> "local-repo" +// - only Org present -> "org" +// - both present -> "both" +// - neither present -> "local-repo" +// +// If Scope.DisplayName is empty it is set based on Scope.Mode: +// - "org" -> Scope.Org +// - "local-repo" -> base name of Scope.LocalPath (if set) +// - "both" -> Scope.Org if present, otherwise base name of Scope.LocalPath (if set) +// +// OutputFormat defaults to "both" and EvidenceTier defaults to "auto" when unset. func fillAssessDefaults(cfg *AssessConfig) { if cfg.Scope.Mode == "" { if cfg.Scope.LocalPath != "" && cfg.Scope.Org == "" { @@ -99,7 +113,8 @@ func fillAssessDefaults(cfg *AssessConfig) { } // hasMinimalAssessConfig returns true if enough config is present to run -// headless without further interactive input. +// headless without further interactive input. +// It returns true when cfg is non-nil, the scope mode is one of "org", "local-repo", or "both" with the corresponding required scope value present (org for "org"/"both", local path for "local-repo"), and Scope.DisplayName is non-empty after trimming whitespace. func hasMinimalAssessConfig(cfg *AssessConfig) bool { if cfg == nil { return false diff --git a/tui/assess_preview.go b/tui/assess_preview.go index eeeac4f..e59877a 100644 --- a/tui/assess_preview.go +++ b/tui/assess_preview.go @@ -46,6 +46,12 @@ type assessPreviewModel struct { spinner spinner.Model } +// newAssessPreviewModel creates and initializes an assessPreviewModel for the given audit and JSON inputs. +// +// It converts path to an absolute path and attempts to read the audit file into the model's markdown. +// On read failure the model's renderErr is set and markdown is left empty.
It also stores jsonPath and +// jsonData, allocates and sizes viewports for each tab, configures the initial spinner, sets the initial +// active tab to the audit tab, and marks the model as awaiting its initial render. func newAssessPreviewModel(path, jsonPath, jsonData string) assessPreviewModel { absPath, _ := filepath.Abs(path) @@ -308,7 +314,7 @@ func (m *assessPreviewModel) previewFrameHeight() int { // buildAssessEvidenceMarkdown extracts the evidence facts and per-item scores // from the audit JSON and renders them as a single markdown document for the -// Evidence tab. Falls back to a placeholder when no JSON is present. +// Evidence tab, followed by a "Notes for re-audit" section when present. Falls back to a placeholder when jsonData is empty. func buildAssessEvidenceMarkdown(jsonData string) string { if jsonData == "" { return "## Evidence\n\n_No JSON data available — re-run with `--audit-output-format both`._\n" @@ -357,7 +363,10 @@ func buildAssessEvidenceMarkdown(jsonData string) string { } // RunAssessPreview displays the audit markdown in a tabbed Glamour-rendered -// preview matching the report flow's RunReportPreviewFull look-and-feel. +// preview: a full-screen interactive UI for an audit file and its associated JSON. +// It presents three tabs — Audit (rendered markdown from path), Evidence (markdown built from jsonData), and JSON Data (pretty-printed jsonData) — and handles resizing, tab switching, scrolling, and exiting. +// path is the path to the audit markdown file to display. jsonPath, if provided, is shown in the info panel. jsonData is the raw audit JSON used to populate the Evidence and JSON Data tabs. +// It returns any error encountered while running the TUI program.
func RunAssessPreview(path, jsonPath, jsonData string) error { m := newAssessPreviewModel(path, jsonPath, jsonData) p := tea.NewProgram(m, tea.WithOutput(os.Stderr), tea.WithAltScreen()) diff --git a/tui/assess_progress.go b/tui/assess_progress.go index 5418dc0..c55905d 100644 --- a/tui/assess_progress.go +++ b/tui/assess_progress.go @@ -98,6 +98,12 @@ var canonicalAssessSteps = []string{ "complete", } +// newAssessProgressModel creates an assessProgressModel configured for the TUI run. +// +// The returned model is initialized with a styled spinner, a progress bar (width derived +// from terminal width), two viewports (content and shell) sized for the terminal, and +// populated fields: cfg, title, canonical expected steps, totalQuestions, sendAnswer, +// and initial width/height. func newAssessProgressModel( title string, cfg *AssessConfig, @@ -675,7 +681,7 @@ type AssessProgressResult struct { // don't release the terminal — so the framed two-pane layout is continuous. // // sendAnswer is invoked when each embedded interview form completes. It -// must write the answer JSON line back to the runner's stdin. +// must write the answer JSON line back to the runner's stdin. The result reports an error message if the display/runtime failed, and whether the run was cancelled. func RunAssessProgressDisplay( title string, cfg *AssessConfig, @@ -710,6 +716,11 @@ func RunAssessProgressDisplay( } } +// assessStepElapsed returns a formatted elapsed-time string for the given step. +// If the step has no start time, it returns an empty string. If the step has a +// finish time, it returns the duration from start to finish. If the step is +// still running, it returns the elapsed duration only after at least 3 seconds +// have passed since the start; otherwise it returns an empty string.
func assessStepElapsed(s assessStepState, now time.Time) string { if s.startedAt.IsZero() { return "" @@ -724,7 +735,8 @@ func assessStepElapsed(s assessStepState, now time.Time) string { } // humanizeStep maps the lower-kebab step name to a label that fits the -// existing report's tone (capitalized verb-phrases). +// existing report's tone (capitalized verb-phrases). +// If the step is unknown, it returns the input unchanged. func humanizeStep(step string) string { switch step { case "startup": diff --git a/tui/assess_runner.go b/tui/assess_runner.go index eeffa50..bd8ac75 100644 --- a/tui/assess_runner.go +++ b/tui/assess_runner.go @@ -13,7 +13,11 @@ import ( ) // assessScriptPath returns the path to scripts/run-assess.ts. Mirrors -// resolveScriptPath but for the assess service runner. +// resolveScriptPath but for the assess service runner, using a series of fallbacks. +// It first checks for ../scripts/run-assess.ts relative to the running executable, then checks +// "scripts/run-assess.ts" and "./scripts/run-assess.ts" in the current working directory, and finally +// checks $HOME/teamhero.cli/scripts/run-assess.ts when the home directory is available. If none of the +// candidates exist, it returns "scripts/run-assess.ts" as a final fallback (which may not exist). func assessScriptPath() string { exePath, err := os.Executable() if err == nil { @@ -61,7 +65,9 @@ func (r *AssessRunResult) Close() { // RunAssessServiceRunner spawns the TS service runner for the maturity // assessment. The first stdin write is the AssessConfig JSON; the stream is -// kept open so the TUI can send subsequent interview-answer JSON lines. +// kept open so the TUI can send subsequent interview-answer JSON lines. Parsed JSON events are streamed from the runner's stdout.
+// It writes the config as the first newline-delimited JSON line to the runner's stdin and keeps stdin open so callers may send subsequent interview-answer messages. +// The returned AssessRunResult exposes channels for streamed events and a single termination error, a buffer capturing the runner's stderr, and a writer for stdin; callers should call Close on the result to run cleanup. func RunAssessServiceRunner(input AssessConfig) (*AssessRunResult, error) { configJSON, err := json.Marshal(input) if err != nil { @@ -147,7 +153,10 @@ func RunAssessServiceRunner(input AssessConfig) (*AssessRunResult, error) { }, nil } -// SendInterviewAnswer writes a JSON-line answer event to the runner's stdin. +// SendInterviewAnswer writes an `interview-answer` JSON-line event for the given +// question and value to the runner's stdin. +// +// It returns an error if marshaling the event or writing to the runner's stdin fails. func SendInterviewAnswer(r *AssessRunResult, questionID, value string, isOption bool) error { evt := InterviewAnswerEvent{ Type: "interview-answer", diff --git a/tui/assess_summary.go b/tui/assess_summary.go index c416bd0..7edd2c1 100644 --- a/tui/assess_summary.go +++ b/tui/assess_summary.go @@ -11,7 +11,12 @@ import ( // // Each field shows a value when it has been resolved, "—" (dim) otherwise. // The "Assessment Setup" header includes an AI badge on the right when an -// AI model has been selected (matches the report's "Report Setup" header). +// AI model has been selected. renderAssessSummary renders a right-pane, bordered summary of an assessment configuration sized to the provided width. +// +// The rendered box contains labeled fields for Scope, Target, Display name, Evidence tier, Output format, Output path, +// Interview answers, and Mode. If cfg is nil the box contains "No configuration". A minimum width of 20 is enforced. +// Empty or whitespace-only values are shown as a dim em dash ("—").
When cfg.DryRun is true a right-aligned "dry-run" +// badge is shown and is placed on the same header line if there is sufficient space. func renderAssessSummary(cfg *AssessConfig, width int) string { if width < 20 { width = 20 @@ -89,6 +94,9 @@ func renderAssessSummary(cfg *AssessConfig, width int) string { return boxStyle.Width(innerWidth).Render(content) } +// fmtAssessScopeMode converts the assessment scope mode in cfg into a human-readable label. +// It maps "org" to "GitHub org", "local-repo" to "Local repository", and "both" to "Org + local checkout". +// For any other mode it returns an empty string. func fmtAssessScopeMode(cfg *AssessConfig) string { switch cfg.Scope.Mode { case "org": @@ -101,6 +109,12 @@ func fmtAssessScopeMode(cfg *AssessConfig) string { return "" } +// fmtAssessTarget returns a human-readable target string based on cfg.Scope.Mode. +// For "org" it returns "" when Org is empty, the Org name when no repos are listed, +// or "Org (repos...)" when repos are present (repos rendered compactly inside parentheses). +// For "local-repo" it returns cfg.Scope.LocalPath. +// For "both" it joins any non-empty Org and LocalPath with " · ". +// For any other mode it returns an empty string. func fmtAssessTarget(cfg *AssessConfig) string { switch cfg.Scope.Mode { case "org": @@ -126,6 +140,10 @@ func fmtAssessTarget(cfg *AssessConfig) string { return "" } +// fmtAssessTier converts an evidence tier identifier into a human-readable label. +// It maps "" and "auto" to "auto-detect", "gh" to "1 — gh CLI", "github-mcp" to +// "2 — GitHub MCP", and "git-only" to "3 — git-only". For any other input it +// returns the original tier string unchanged. func fmtAssessTier(tier string) string { switch tier { case "", "auto": @@ -140,6 +158,9 @@ func fmtAssessTier(tier string) string { return tier } +// fmtAssessOutputFormat returns a user-facing label for an output format key. 
+// It maps the empty string to "both", "both" to "both (md + json)", and preserves +// "markdown" and "json" as-is. For any other input it returns the input unchanged. func fmtAssessOutputFormat(format string) string { switch format { case "": @@ -154,6 +175,8 @@ func fmtAssessOutputFormat(format string) string { return format } +// fmtAssessAnswersFile provides a display string for the interview answers file. +// If path is empty it returns "interactive"; otherwise it returns the provided path. func fmtAssessAnswersFile(path string) string { if path == "" { return "interactive" @@ -161,6 +184,9 @@ func fmtAssessAnswersFile(path string) string { return path } +// fmtAssessRunMode determines the assessment run mode based on the provided configuration. +// If cfg.Mode is non-empty that value is used; otherwise it returns "interactive" when +// cfg.InteractiveInterview is true and "headless" otherwise. func fmtAssessRunMode(cfg *AssessConfig) string { if cfg.Mode != "" { return cfg.Mode diff --git a/tui/assess_wizard.go b/tui/assess_wizard.go index 94d8e73..35040c9 100644 --- a/tui/assess_wizard.go +++ b/tui/assess_wizard.go @@ -15,7 +15,11 @@ import ( // 1. Runs the framed scope wizard (matches the report's two-pane layout). // 2. Spawns the service runner and drives the Bubble Tea progress display. // 3. Round-trips interview questions through huh prompts (one at a time). -// 4. Opens the tabbed Glamour preview when the audit is written. +// runAssessInteractive runs an interactive assessment flow: it prompts for scope and settings via a framed wizard, starts the assessment service, streams progress and interview prompts to that service, and opens a preview of the resulting audit when available. +// +// The function prints cancellation and error notes to stderr, attempts to persist the updated assessment configuration (best-effort), and logs a note if preview rendering is unavailable. 
+// +// It returns an error if the assessment service reports a fatal error or if progress reports an unrecoverable error; otherwise it returns nil. func runAssessInteractive(cfg *AssessConfig) error { res, err := runAssessScopeWizard(cfg) if err != nil { @@ -122,7 +126,9 @@ type AssessWizardResult struct { // runAssessScopeWizard runs the scope-selection wizard inside a Bubble Tea // program. The View() renders the same shell-header + two-pane layout as -// the report wizard, with the right pane showing renderAssessSummary(). +// the report wizard, with the right pane showing renderAssessSummary(). The wizard collects and confirms assessment scope settings. +// It returns an AssessWizardResult containing a possibly-updated AssessConfig and flags indicating whether the +// user confirmed or aborted the wizard. An error is returned if the terminal UI failed to run. func runAssessScopeWizard(cfg *AssessConfig) (*AssessWizardResult, error) { cwd, _ := os.Getwd() @@ -152,6 +158,8 @@ func runAssessScopeWizard(cfg *AssessConfig) (*AssessWizardResult, error) { }, nil } +// defaultScopeMode returns cfg.Scope.Mode when it is non-empty; otherwise it returns "local-repo". +// The cwd parameter is accepted for callers' convenience but is not used. func defaultScopeMode(cfg *AssessConfig, cwd string) string { if cfg.Scope.Mode != "" { return cfg.Scope.Mode @@ -160,6 +168,7 @@ func defaultScopeMode(cfg *AssessConfig, cwd string) string { return "local-repo" } +// defaultLocalPath returns the configured local path for the assessment scope if set; otherwise it falls back to the provided working directory.
func defaultLocalPath(cfg *AssessConfig, cwd string) string { if cfg.Scope.LocalPath != "" { return cfg.Scope.LocalPath @@ -429,7 +438,9 @@ func (m *assessWizardModel) formWidth() int { // --------------------------------------------------------------------------- // Helpers -// --------------------------------------------------------------------------- +// --------------------------------------------------------------------------- +// validateLocalPath reports an error when the provided path string is empty, does not exist, or exists but is not a directory. +// It trims surrounding whitespace before performing the checks and returns an explanatory error for each failure case. func validateLocalPath(s string) error { trimmed := strings.TrimSpace(s) @@ -446,6 +457,8 @@ func validateLocalPath(s string) error { return nil } +// requireNonEmpty returns a validator function that ensures a string is not empty. +// The returned function trims whitespace and returns an error formatted as "<field> is required" when the result is empty, or nil otherwise. func requireNonEmpty(field string) func(string) error { return func(s string) error { if strings.TrimSpace(s) == "" { @@ -455,6 +468,8 @@ func requireNonEmpty(field string) func(string) error { } } +// parseRepoCSV splits s on commas and returns a slice of non-empty, trimmed segments. +// It trims leading and trailing whitespace from the input and from each segment; empty segments are omitted. func parseRepoCSV(s string) []string { parts := strings.Split(strings.TrimSpace(s), ",") out := make([]string, 0, len(parts)) diff --git a/tui/main.go b/tui/main.go index 7199f00..a4f5824 100644 --- a/tui/main.go +++ b/tui/main.go @@ -14,6 +14,8 @@ import ( // version is injected at build time via -ldflags "-X main.version=X.Y.Z" var version = "dev" +// printUsage writes the top-level help text to stderr, listing available subcommands +// and global flags.
func printUsage() { fmt.Fprintf(os.Stderr, `Usage: teamhero [flags] @@ -129,6 +131,12 @@ Examples: `) } +// main is the CLI entrypoint; it parses command-line arguments, routes `--help` to +// subcommand-specific usage, and dispatches execution for `setup`, `doctor`, +// `assess`, headless, or interactive modes. +// It also handles global flags such as `--version` (prints build version) and +// `--show-config` (prints the saved configuration), and maps common cancellation +// or error conditions to appropriate exit codes. func main() { // Detect subcommand first so --help can be routed to the right usage. subcommand := ""