diff --git a/.coderabbit.yaml b/.coderabbit.yaml new file mode 100644 index 0000000..b724b59 --- /dev/null +++ b/.coderabbit.yaml @@ -0,0 +1,19 @@ +# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json +# CodeRabbit configuration — https://docs.coderabbit.ai/configuration/auto-review +# +# Goal: auto-review PRs opened by Claude (branches named claude/*). +# CodeRabbit auto-reviews every PR targeting the default branch (main) by default, +# so PRs from claude/* branches are covered automatically. +# +# Note: the CodeRabbit schema has no source/head-branch filter. `base_branches` +# only matches the TARGET branch, not the source. The configuration below simply +# ensures auto-review stays on for the default branch. + +language: en-US + +reviews: + auto_review: + enabled: true + drafts: false + auto_incremental_review: true + auto_pause_after_reviewed_commits: 5 diff --git a/.gitignore b/.gitignore index 1b6c7f8..d93a618 100644 --- a/.gitignore +++ b/.gitignore @@ -66,6 +66,13 @@ dist/ teamhero-report-* docs/maintenance_results.md +# Interview kit scaffolds — `teamhero interview bootstrap` defaults to +# ./interviews/. Both entries are listed: `interviews/` is the current +# default; `roles/` remains so users with content from the prior default +# don't accidentally commit it on upgrade. +interviews/ +roles/ + # Cache out claude-plugin/bin/ diff --git a/.mcp.json b/.mcp.json deleted file mode 100644 index 2779415..0000000 --- a/.mcp.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "mcpServers": { - "agentvibes": { - "command": "npx", - "args": [ - "-y", - "--package=agentvibes", - "agentvibes-mcp-server" - ] - } - } -} diff --git a/README.md b/README.md index 61c64e6..3575ae1 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,35 @@ teamhero report --headless --since 2026-03-01 --until 2026-03-14 --sections loc, Run `teamhero report --help` for all flags. +### 4. 
Review candidate interviews + +TeamHero also includes a candidate AI-collaboration interview reviewer. Run +the interactive wizard to configure a role: + +```bash +teamhero interview bootstrap +``` + +The wizard walks you through role slug, tech stack, business domain, feature +description, time-box, project mode, analysis mode, and rubric mode (with +conditional follow-ups for a custom prompt or a job-description file). + +Once a candidate has submitted their repository, review it: + +```bash +teamhero interview review --candidate "Jane Doe" --repo https://github.com/jane/submission +``` + +The review run prints a phased progress display (clone → collect-evidence → +extract-measurements → observe → audit-write) and finishes with a +glamour-rendered preview of the audit. **Every audit ships with a mandatory +ADVISORY banner** — the audit is advisory; hiring decisions are made by +humans. See `docs/interview-classification-rationale.md` for the full +ethical framing. + +For scripting or agents, the headless flag list documented at +`teamhero interview bootstrap --help` is fully equivalent to the wizard. + --- ## Use with Claude Code diff --git a/docs/2026-05-09-candidate-interview-reviewer-plan.md b/docs/2026-05-09-candidate-interview-reviewer-plan.md new file mode 100644 index 0000000..8947f30 --- /dev/null +++ b/docs/2026-05-09-candidate-interview-reviewer-plan.md @@ -0,0 +1,809 @@ +# Plan — Candidate Interview Reviewer + +> **Status:** Design landed via grilling session 2026-05-09 (revised same day across multiple architectural and product refinements; final ethical revision drops all numerical scoring in favor of observations + raw measurements). **All open questions are resolved.** +> +> **Implementation status (2026-05-11):** +> - **MVP (5 slices) shipped** on branch `claude/slice-1-foundation` / PR #10. Beads tracked as `teamhero-scripts-{3sh,7qe,h13,bt2,9v2}` — all closed. 
Ships the rubric module, headless bootstrap, kit, full assessment pipeline (collectors, extractors, AI observer, audit writer), and cohort summary + Claude skill. +> - **v1.5 (3 slices) shipped** on same PR. Beads `teamhero-scripts-{92a,upo,rjp}` — all closed. Adds the interactive `huh` wizard for `bootstrap`, shared interview style module, glamour-rendered audit preview, phased progress display consuming the JSON-lines protocol, and a manual end-to-end smoke script + doc refresh. +> - **Still deferred:** `list-roles` / `list-candidates` verbs, multi-model bias diversification, periodic anonymized bias audit. See the "Out of Scope" section below for the full deferred list. + +## Context + +We are adding a **candidate-screening tool** that surfaces structured observations about how engineers collaborate with generative AI during a live-observed coding interview. The tool produces: + +- A per-candidate two-tier audit (lightweight observation summary + drill-down audit log), and +- A cohort-level summary that lists all candidates for a role with sign-off status — no comparative ranking. + +This feature is the **implementation side of maturity criterion D12** ("Interviews assess judgment under AI augmentation") from the Agent Maturity Assessment work on `claude/condescending-tereshkova-88a936`. A team that ships this feature is *how* a team scores 1.0 on D12. 
+ +Strategically, it joins `report` and `assess` as the third pillar of an **engineering-org-effectiveness suite**: + +| Tool | What it produces | Granularity | +|---|---|---| +| `teamhero report` | Team output / activity report | Per-team, per-window | +| `teamhero assess` | Org readiness for agentic AI (12 criteria, scored) | Per-org, scored | +| `teamhero interview` *(this feature)* | Per-candidate AI-collaboration observations (9 dimensions, no scores) | Per-candidate, observed | + +Note: `assess` produces scores because it scores an *organization* (a non-human entity); `interview` does NOT produce scores because it observes a *person*. The ethical floor for evaluating humans is higher. + +## TL;DR — Headline Decisions + +1. **Build the skill, don't buy.** None of TestGorilla / Eximius / Coderbyte / HackerEarth / Glider.ai / CoderPad / Codility Cody ship rubric-based AI-collaboration assessment. Codility Cody is the closest commercial fit (provides runtime + transcript) but lacks the analyzer. +2. **Kit + service, not hosted sandbox.** Build effort drops ~85% (from 3–4 months to 3–4 weeks). +3. **Candidate uses their own stack.** Live observation by the interviewer absorbs cheating risk. Interviewer-side captures audio transcript via Granola/Fireflies/Otter. +4. **$50 gift card per candidate** covers any token / subscription expense regardless of stack. +5. **macOS / Linux / WSL only at MVP.** +6. **DDD namespace: `teamhero interview <verb>`.** All interview-related actions live under a single bounded-context command. +7. **One skill handles the entire bounded context.** `~/.claude/skills/teamhero-interview/SKILL.md` — a thin wrapper that knows about all verbs and invokes the CLI accordingly. **The skill must handle all of the things we need to do within the interview** — exactly one skill, not one per verb. +8. **Two project modes per role:** (A) AI-bootstrapped extension project, or (B) greenfield from brief. Hiring manager chooses at bootstrap time. +9.
**Time box presets** 60 / 90 / 120 min, fixed per-project. +10. **CLI is load-bearing; skill is thin wrapper.** Same pattern as `agent-maturity-assessment` skill wrapping `teamhero assess`. +11. **Hybrid evidence collection per dimension:** 4 deterministic (raw measurements), 2 hybrid (measurements + narrative observation), 3 LLM-judge (narrative observation only). +12. **One mega-call** to the Responses API generates narrative observations across all LLM-judge dimensions in a single strict `json_schema` request, with cited evidence excerpts. **No scores are produced — only observations and evidence.** +13. **Two-tier output:** lightweight `summary.md` (hiring manager's first read, observation-formatted) + drill-down `audit.md` + `audit.json` + raw `evidence/`. +14. **TUI parity required:** the implementer MUST mirror existing `tui/report*.go` and `tui/assess*.go` patterns. Hard requirement, not stylistic preference. +15. **Standalone classification-rationale document** (`docs/interview-classification-rationale.md`) explains the methodology — including why we collect observations rather than scores — for hiring-decision defensibility. +16. **AI analysis is opt-in per role.** Bootstrap wizard asks the hiring manager whether they want AI-assisted observation generation or human-only review. The hiring manager — not the tool — decides which review style this role uses. +17. **AI output is advisory, never determinative.** The TUI, every audit document, and the cohort summary MUST prominently display a human-stakes warning. The candidate is a person; the hiring manager's professional evaluation is the primary, first, and most important factor. +18. **No numerical scoring; observations and raw measurements only.** No per-dimension scores, no weighted total, no high/mid/low band, no ranking math. The AI produces narrative observations per LLM-judge dimension and raw measurements per deterministic dimension. 
The hiring manager evaluates the observations and evidence and makes all comparative judgments themselves. **This is an ethical decision driven by cognitive anchoring, false precision, comparative drift, bias amplification, and legal exposure concerns** — see the [Why Observations, Not Scores](#why-observations-not-scores) section for the full rationale. The rubric becomes a structured evaluation lens, not a measurement instrument; this aligns with how Matt Pocock teaches good AI-coding practices — through observation and pattern recognition, not numerical reduction. +19. **No pre-production calibration against past hires.** The rubric IS the methodology framework; the LLM produces observations within that framework; the human signs off. +20. **Three rubric modes per role**, selected by the hiring manager at bootstrap time: + - **Custom prompt** — manager provides their own observation prompt + - **Default rubric** — canonical 9-dimension framework, no JD + - **Default rubric + job description** — canonical framework + markdown JD as additional context +21. **Job description as input** — markdown only at MVP (no URL fetching, no PDF parsing). +22. **Two-step cohort workflow at MVP, skill-orchestrated.** Step 1: per-candidate observation generation. Step 2: cohort summary roll-up. +23. **Local disk only at MVP, no cloud storage.** +24. **Privacy release as a kit gate.** Candidate must complete `PRIVACY_RELEASE.md` before `start.sh` will proceed. +25. **Retention is manager-discretion.** No automated retention policy enforced by the tool. +26. **Session recording URL captured in frontmatter.** Live-session video stays at the conferencing platform; URL is reference material for the human reviewer only — NOT fed to the LLM. + +## Why Observations, Not Scores + +The decision to produce observations rather than scores is **the central ethical commitment of this feature.** It is also the decision that makes the tool genuinely useful rather than performatively rigorous. 
+ +### What scoring would have introduced + +We considered (and initially designed) a scoring system: per-dimension scores of 1.0 / 0.5 / 0.0, a weighted total in the 0.0–1.0 range, and a high / mid / low band classification. We built robust safeguards: opt-in mode, human-stakes warning banners, mandatory sign-off, defensibility documentation. The safeguards mitigate harm but do not eliminate it. Five problems remained: + +1. **Cognitive anchoring.** A reviewer who sees "0.6" forms an impression before reading the evidence. Reading is sequential; judgment is not. The framing of "advisory" doesn't undo the anchoring effect — research on numerical anchoring is robust across decades. Even careful reviewers are influenced by the number they saw first. + +2. **False precision.** We rejected pre-production calibration. The score, however well-structured the rubric, would be — fundamentally — an LLM's vibe check expressed as a number. Presenting unreliable judgment with three significant figures (or even one) implies measurement quality the methodology does not have. False precision is worse than no precision because it triggers the human's "this looks rigorous" heuristic. + +3. **Comparative drift.** We tried to prevent comparative ranking by removing score-ordering from the cohort view. But two `summary.md` files side by side that show "0.7" vs "0.85" invite comparison regardless of how we frame the cohort view. You cannot tell humans not to compare numbers in front of them. The compactness of a numerical score makes it cognitively cheap to compare; the cost of choosing not to compare doesn't enter. + +4. **Bias amplification at scale.** Numerical scoring across many candidates enables averaging, thresholding, filtering, and other operations where systemic bias compounds. Even when not intended, the existence of comparable numbers invites operations that amplify whatever bias the rubric encodes. 
This is exactly the failure mode regulators have flagged in algorithmic hiring tools (NYC Local Law 144, EU AI Act high-risk classification). + +5. **Legal exposure asymmetry.** "The AI scored her at 0.6 and we didn't hire her" is a legally fraught artifact. "The AI noted she didn't write tests after accepting AI-suggested code, and we didn't hire her based on multiple factors" is much more defensible. The first invites the discrimination challenge "is the AI biased?" The second invites the challenge "did the human make a fair decision?" — easier to defend with the human's own reasoning artifacts. + +### What observations provide instead + +Observations + evidence + raw measurements provide **all the value of the rubric structure without these harms.** Compare: + +> "She scored 0.7 on architectural quality." + +vs. + +> "The candidate added the new rate-limiting feature inside the existing `middleware/` module, respecting the existing public interface. However, two helper functions ended up in a new top-level `utils.ts` file rather than within an existing deep module. Tests cover the new behavior but not the helper functions. Cited excerpts: prompt at 00:34:21, file diff at commit `a3f9c0b`." + +The second is harder for the AI to produce, more useful for the human, more defensible legally, more ethical, more actionable, and more aligned with how engineers actually develop AI-collaboration skill. The number adds nothing the observation does not already convey better. + +### Alignment with how AI-coding practices are actually taught + +The rubric's 9 dimensions are grounded in the framing Matt Pocock and other AI-coding educators have articulated: context engineering, "tasting" output for quality, knowing when to drive vs. delegate, treating AI as a junior engineer you mentor and verify. 
**Pocock does not teach these practices through scoring.** He teaches through observation and pattern recognition — by demonstrating examples of strong and weak practice, articulating the principles, letting engineers internalize the patterns through repeated exposure. + +A tool that surfaces observations against a structured rubric is doing exactly what Pocock-style teaching does: structured pattern recognition. A tool that reduces those observations to a number is doing something different — and arguably less useful for the engineer being evaluated, the engineer doing the evaluation, and the integrity of the practice. + +### The AI as a structurally different perspective — NOT a "non-biased" perspective + +A common (and wrong) framing of AI-assisted hiring tools is that the AI provides a "non-biased" or "objective" viewpoint that compensates for human bias. **This claim is factually incorrect** and the design here does not rest on it. + +LLMs trained via RLHF carry systematic biases: + +- **Training-data bias** — overrepresentation of certain demographics, languages, and cultural contexts in the training corpus +- **Preference-tuning bias** — the human raters during fine-tuning encode their own demographic and aesthetic preferences into the model's "what is preferred" signal +- **Sycophancy bias** — LLMs are trained to agree with their user, which can subtly slant observations toward what the prompt or framing implies the user expects +- **Familiarity bias** — the model is more familiar with mainstream tools and patterns, which disadvantages candidates using less-mainstream alternatives +- **Verbosity preference** — models tend to view verbose output more favorably than concise output, even when concise is better engineering +- **Name and demographic-cue bias** — empirically documented disparate treatment of candidates based on name alone, even when other inputs are identical + +The honest claim about what this tool provides is **bias diversification, not bias 
elimination**: + +- The AI's biases are *different* from any individual hiring manager's biases +- Different biases mean the *overlap set* of biases is smaller than either alone — biases the manager has but the AI lacks can be caught by the AI's perspective; biases the AI has but the manager lacks can be caught by the manager's +- Two imperfect perspectives covering different blind spots are better than one + +However: + +- AI bias is **systematic across all candidates** (every candidate gets evaluated by the same biased model), while individual manager biases are local +- This means AI bias scales harm more efficiently than human bias +- The observations-not-scores design is one mitigation — numbers compound bias; observations let humans interrogate the reasoning +- The mandatory sign-off, the human-in-the-loop framing, and the reasoning-preserved-in-summary all reinforce that the AI's output is one biased perspective being offered alongside another biased perspective, NOT a corrective ground truth + +The classification-rationale doc must capture this honestly. A defense of the methodology that claims "the AI is unbiased" is indefensible when challenged because it is factually wrong. The defensible claim is: "the AI provides a structurally different perspective with different biases, used as one input to a human-led decision process." + +### What the human reviewer keeps + +The hiring manager's role becomes more demanding, not less. Without a number to anchor on, the manager must: + +- Read the observations +- Examine the evidence +- Form their own judgment about each dimension +- Weigh dimensions against each other based on the role's needs +- Make a `Hire / Hire with notes / No hire` recommendation + +This is harder than reading a score. It is also exactly what professional hiring judgment looks like. The tool does not replace the manager's thinking; it gives the manager structured material to think with.
+ +### What we lose + +| What we lose | Acceptable trade-off? | +|---|---| +| Compactness — cohort view is several lines per candidate, not a single row of numbers | Yes. Hiring decisions should not be one-row-scannable. | +| Apparent rigor of a numerical artifact | Yes. The apparent rigor was misleading; the underlying judgment was not numerical. | +| Override-pattern tracking via score deltas (was a quality-floor mechanism for scoring) | Yes. Replaced with qualitative tracking: when the manager's hire/no-hire diverges from the AI's narrative slant, flag for rubric review. | +| The maturity-assessment-style "score / total / band" output familiarity | Acceptable. The `assess` command scores an organization; `interview` observes a person. Different ethical floor justifies different output. | + +### What we keep + +- The 9-dimension rubric as a structured evaluation lens +- Cited evidence excerpts pulled from agent log, terminal recording, transcript, and repo +- Raw measurements presented as facts (test counts, timing, destructive-op detection) +- Two-tier output (summary + drill-down audit) +- Privacy release, sign-off step, warning banner +- Single source of truth for the rubric framework +- Comparable rigor across candidates — through structure, not through numbers + +## Why Build Lost the Buy Comparison + +| Vendor | Candidate uses agent during session | Transcript / prompt log | Automated AI-collab analysis | Defensible-by-design | +|---|---|---|---|---| +| TestGorilla | ❌ AI is the *interviewer* | n/a | n/a | n/a | +| Eximius | ❌ Resume / chat-voice screening | n/a | n/a | n/a | +| HackerEarth | ❌ AI is the *interviewer* | Replay exists; AI-prompt logging not specified | Not documented | Not documented | +| Glider.ai | ⚠️ "AI Assistant" is a hint bot, not a coding agent | Session replay | Not documented | Not documented | +| Coderbyte | ✅ Agent mode (Claude/GPT can edit/create files) | Not documented for AI prompts | Not documented | Not documented | +| 
**CoderPad** | ✅ AI-enabled IDE | Keystroke playback + prompt history | Not documented (philosophy only) | Not documented | +| **Codility Cody** | ✅ Chat / Agent / Autocomplete; gpt-4o-mini, gpt-5-mini | Full transcripts in post-interview report | ❌ Manual review only — provides scores, not observations | ❌ Score-based | + +**Gap nobody fills:** structured AI-generated observations (no scores) on AI-collaboration practices, with cited evidence and a documented human-in-the-loop protocol. Closest commercial fits all produce numerical scores with limited safeguards. + +## The Rubric (9 dimensions, two thematic groups) + +The rubric is grounded in maturity criteria one level down (org → per-engineer): + +### Process dimensions + +| # | Dimension | What it observes | Maturity criterion lineage | +|---|---|---|---| +| 1 | **Upfront design & decomposition** | Whether the candidate plans and decomposes before prompting, or prompts straight into code. | B5 | +| 2 | **Context engineering** | Whether the candidate feeds repo context (CLAUDE.md, glossary, file paths, constraints) to the agent. | B7 + Pocock | +| 3 | **Critical evaluation / "tasting"** | Whether the candidate catches AI errors before running them. Whether they reject hallucinated APIs or bad logic. | C9 + Pocock | +| 4 | **Verification discipline** | Whether the candidate writes/runs tests, reads diffs, checks outputs. Or accept-and-pray. | C9 + C10 | +| 5 | **Course-correction** | When stuck, whether the candidate rolls back, reframes, switches approaches. Or thrashes. | Pocock | +| 6 | **Risk awareness** | Whether the candidate pauses on destructive operations. Whether they prefer reversible actions. | C11 | + +### Outcome dimensions + +| # | Dimension | What it observes | Maturity criterion lineage | +|---|---|---|---| +| 7 | **Architectural quality** | The final artifact: deep modules, clean interfaces, sprawl. | B6 | +| 8 | **Test pass / spec satisfaction** | Whether the work meets the brief. 
Whether tests pass. | C10 | +| 9 | **Throughput** | Time-to-working solution; how the candidate paced their work. | — | + +The dimensions are grouped thematically (process / outcome) for organization, but **there is no weighting** — that would imply scores can be combined, and we don't produce scores. The hiring manager weighs dimensions against each other in their own judgment, informed by the role's needs. + +## Evidence Collection Strategy + +Each rubric dimension generates evidence using one of three approaches. The output is observations and measurements, not scores. + +### Hybrid classification per dimension + +| # | Dimension | Approach | Output produced | +|---|---|---|---| +| 1 | Upfront design & decomposition | **LLM-judge** | Narrative observation (1–3 sentences) + cited evidence excerpts | +| 2 | Context engineering | **Hybrid** | Raw signal counts (e.g. "3 CLAUDE.md references in prompts; 7 glossary terms used") + narrative observation | +| 3 | Critical evaluation / "tasting" | **LLM-judge** | Narrative observation + cited diff excerpts showing kept-vs-rejected suggestions | +| 4 | Verification discipline | **Deterministic** | Raw measurements as facts: test-run counts, frequency, interleaving with prompts | +| 5 | Course-correction | **Hybrid** | Detected signals (git resets, prompt re-asks, file rollbacks) + narrative observation | +| 6 | Risk awareness | **Deterministic** | Raw measurements as facts: detected destructive commands, pause-before-Enter timing | +| 7 | Architectural quality | **LLM-judge** | Narrative observation on the final artifact + cited code excerpts | +| 8 | Test pass / spec satisfaction | **Deterministic** | Raw measurements as facts: pass/fail per acceptance criterion | +| 9 | Throughput | **Deterministic** | Raw measurements as facts: timestamps from asciinema + git + agent log | + +Totals: **4 deterministic (4, 6, 8, 9), 2 hybrid (2, 5), 3 LLM-judge (1, 3, 7)**. 
+ +### Call structure: single mega-call + +All LLM-judge dimensions (and the LLM half of hybrid dimensions) generate observations in **one OpenAI Responses API call** using strict `json_schema` returning an array of `Observation` objects. Same shape pattern as the maturity assessment AI scorer (`src/services/maturity/ai-scorer.ts`), but the produced artifacts are observations and reasoning rather than scores. + +Rationale unchanged from the prior design: prompt-cache hits perfectly across the rubric definition + evidence package; holistic context (the LLM sees all dimensions at once); single trace; ~5× cheaper than per-dimension calls. + +Deterministic dimensions skip the LLM call entirely; their outputs are computed from extracted signals and presented as plain facts. + +### Prompt-level guard against interviewer-bias injection + +The audio transcript and interviewer notes feed the LLM observer. A biased interviewer remark (e.g., "she seemed nervous," "he was hesitant") can propagate into the AI's narrative observation if not guarded against. The observation prompt MUST include the following instruction verbatim (or a close paraphrase): + +> "The audio transcript and interviewer notes are provided as context about what was happening during the session. Treat the interviewer's verbal commentary as situational context only — do NOT weight it as evidence of the candidate's skill, competence, or character. Your observations must be grounded in the candidate's *actions* (prompts they wrote, tools they used, code they produced, tests they ran, decisions they made) — not in the interviewer's framing of those actions. If an interviewer remark could be interpreted multiple ways, do not let it bias your observation; rely on the directly observable artifacts (interview.log, terminal.cast, repo state)." + +This instruction tightens the input/output boundary so interviewer bias doesn't compound into AI bias. 
Implementation: add this paragraph to the LLM-observer prompt template. Validate by inspecting the first 10 candidates' observations for any phrasing that echoes interviewer commentary verbatim — if found, tighten the instruction further. + +### Strict JSON schema (for the Responses API call) + +```typescript +type Observation = { + dimension_id: string; // e.g. "context-engineering" + observation: string; // narrative, 1-3 sentences — primary artifact + reasoning: string; // unconstrained text; chain-of-thought — preserved in BOTH tiers + evidence_excerpts: Array<{ // cited evidence supporting the observation + timestamp?: string; // ISO8601 if from terminal.cast / interview.log + source: "terminal.cast" | "interview.log" | "transcript" | "git" | "repo"; + content: string; // the cited excerpt (truncated to ~200 chars in summary; full in audit) + }>; + caveats?: string; // optional; populated when the observation is uncertain +}; + +type Measurement = { + dimension_id: string; + facts: Array<{ // raw measurements presented as facts + label: string; // e.g. "Test runs total" + value: string | number; // e.g. 5 or "8/8 passing" + context?: string; // optional surrounding info + }>; +}; + +type ReviewResult = { + rubric_version: string; + candidate_id: string; + role_slug: string; + observed_at: string; // ISO8601 — note: not "scored_at" + observations: Observation[]; // for LLM-judge and hybrid dims + measurements: Measurement[]; // for deterministic and hybrid dims +}; +``` + +Note what's NOT in the schema: `score`, `weighted_total`, `raw_total`, `band`, `signal_count`. The LLM is instructed never to produce a numerical assessment of the candidate. If a future prompt drift produces one anyway, validation rejects the response. + +The `reasoning` field is preserved in BOTH tiers (summary and audit). This is intentional: showing the AI's chain-of-thought lets the manager interrogate "why did the AI reach this observation?" without drilling into a separate file. 
It also reinforces transparency — observations are presented alongside their reasoning so the manager can weigh both. Trade-off: `summary.md` becomes longer (~3–4 pages instead of 1–2). This is acceptable; observation-based output is necessarily longer than score-based output, and we already accepted that trade-off when dropping numerical scoring. + +## Two-Tier Output and Defensibility + +### Per-candidate output layout + +``` +docs/interviews/<role-slug>/<candidate>-<date>/ +├── summary.md ← TIER 1: lightweight (~3–4 pages) +│ Per-dim observation + measurements + cited evidence excerpts. +│ Sign-off section. Hiring manager reads first. +├── audit.md ← TIER 2: full reasoning trace per dimension +│ + complete evidence excerpts + raw signal values +│ + LLM chain-of-thought reasoning text. Opens only if questions arise. +├── audit.json ← TIER 2: machine-readable; same content as audit.md +└── evidence/ ← TIER 2: raw inputs preserved verbatim + ├── interview.log + ├── terminal.cast + ├── transcript.txt + ├── PRIVACY_RELEASE.md (signed) + └── interviewer-notes.md (if provided) +``` + +Single observation generation run produces both tiers; no double work. + +### Cohort-level output + +The cohort summary lists candidates with sign-off status only — no scores, no totals, no ordering by anything that implies ranking. + +``` +docs/interviews/<role-slug>/COHORT.md +``` + +Format — one row per candidate, alphabetical or chronological order: + +```markdown +⚠ THIS REPORT IS ADVISORY. Hiring decisions are made by humans using + professional judgment. The candidate is a person, not a score. ...
+ +# Cohort: Senior Backend Engineer (2026 Q2) + +| Candidate | Interviewed | Sign-off | Recommendation | Audit | +|--------------|---------------|----------------|------------------------|----------------------------------------------------| +| Alice Chen | 2026-05-12 | ✅ Reviewed | Hire with notes | [link to summary.md](alice-2026-05-12/summary.md) | +| Bob Park | 2026-05-13 | ⏳ Pending | — | [link to summary.md](bob-2026-05-13/summary.md) | +| Carol Singh | 2026-05-14 | ✅ Reviewed | Hire | [link to summary.md](carol-2026-05-14/summary.md) | +``` + +Hiring manager clicks through to per-candidate `summary.md` for the per-dimension observations. The cohort view has zero numerical content. The `Recommendation` column is the manager's categorical choice from sign-off (Hire / Hire with notes / No hire), not anything the AI produced. + +### Per-candidate `summary.md` template + +```markdown +--- +tags: [hiring, candidate, ] +candidate: +role: +date: +rubric_version: +rubric_mode: default | custom | default-with-jd +signed_off: true | false +session_recording_url: +session_platform: zoom | teams | meet | other | none +session_date: +--- + +⚠ THIS AUDIT IS ADVISORY. Hiring decisions are made by humans using + professional judgment. The candidate is a person, not a score. + This rubric is one factor among many; your evaluation is the + primary, first, and most important basis for your decision. + +# Candidate observations: + +## Process dimensions + +### 1. Upfront design & decomposition +**Observation:** [LLM narrative, 1-3 sentences] +**Reasoning:** [LLM chain-of-thought explaining the observation] +**Evidence:** +- [excerpt 1, with source citation] +- [excerpt 2, with source citation] + +### 2. Context engineering +**Measurements:** +- CLAUDE.md references in prompts: [n] +- Glossary terms used in prompts: [n] +- Files referenced explicitly in prompts: [n] +**Observation:** [LLM narrative] +**Reasoning:** [LLM chain-of-thought] +**Evidence:** +- [excerpts] + +### 3. 
Critical evaluation / "tasting" +**Observation:** [LLM narrative] +**Reasoning:** [LLM chain-of-thought] +**Evidence:** +- [diff excerpts showing kept-vs-rejected suggestions] + +### 4. Verification discipline +**Measurements:** +- Test runs: [n] total, interleaved with [m] prompts +- Diff/grep commands: [n] +- Final test state: [pass/fail counts] + +### 5. Course-correction +**Detected signals:** +- Git resets: [n], at [timestamps] +- Prompt re-asks: [n] +- File rollbacks: [n] +**Observation:** [LLM narrative] +**Reasoning:** [LLM chain-of-thought] +**Evidence:** +- [excerpts] + +### 6. Risk awareness +**Measurements:** +- Destructive commands detected: [list of commands + timestamps + pause durations] + +## Outcome dimensions + +### 7. Architectural quality +**Observation:** [LLM narrative on the final artifact] +**Reasoning:** [LLM chain-of-thought] +**Evidence:** +- [code excerpts] + +### 8. Test pass / spec satisfaction +**Measurements:** +- Acceptance criteria: [n/m passing] +- Test suite: [n/m passing] + +### 9. Throughput +**Measurements:** +- Total elapsed: [HH:MM] +- Time to first passing test: [HH:MM] + +## Reviewer sign-off + +I have personally reviewed this audit, weighed it alongside my own +professional evaluation of the candidate, and made my hiring +recommendation based on my judgment — not solely on the rubric +observations. + +Reviewer name: ___________________________ +Date: ___________________________ +Recommendation: [ ] Hire [ ] Hire with notes [ ] No hire + +**Reasoning summary (required, written in your own words):** +Why did you reach this recommendation? What did you weigh most heavily? +Was there anything in the AI's observations you disagreed with, and why? 
+ +[ ____________________________________________________________________ ] +[ ____________________________________________________________________ ] +[ ____________________________________________________________________ ] + +Additional notes (optional): +[ ____________________________________________________________________ ] + +--- +*The reasoning summary is required to complete sign-off. The TUI will not +accept a blank field. Its purpose is to ensure the manager has genuinely +engaged with the audit rather than rubber-stamping the AI's observations.* + +--- +*Generated using rubric v; see [interview-classification-rationale.md](../../interview-classification-rationale.md) for methodology.* +``` + +### Defensibility document + +`docs/interview-classification-rationale.md` covers, for each of the 9 dimensions: + +- Why it's classified as deterministic / hybrid / LLM-judge +- What signals are extracted, in what order +- The kinds of observations the LLM is instructed to produce +- Known limitations and observed failure modes +- Version history + +**Top-section preamble** establishes the human-in-the-loop principle and the rationale for observations-not-scores before any methodology details. Auditors should encounter the ethical framing first. + +## Privacy, Consent, and Storage + +[unchanged from prior version — see Privacy Release wording, local-disk-only storage, Obsidian conventions, retention guidance, right-to-erasure] + +### Candidate consent (privacy release) + +The kit includes `PRIVACY_RELEASE.md` — a consent template that the candidate must complete before `start.sh` will proceed. Default placeholder wording: + +```markdown +# Submission Consent + +By submitting this work and participating in this interview session, I grant +[Company Name] a non-exclusive, royalty-free license to retain, review, and +analyze: + +- This submission, including any captured logs, transcripts, screen recordings, + and terminal recordings. 
+- The full audio and video recording of the live interview session conducted + via [Zoom / Microsoft Teams / Google Meet / other platform], for the purpose + of evaluating my candidacy. + +I acknowledge that: + +- Submissions and session recordings may be reviewed by Company personnel. +- AI tools may be used to generate observations about my AI-collaboration + practices, with all observations reviewed by humans before any hiring + decision is made. +- AI tools will NOT produce numerical scores about me; they produce + observations and citations of evidence that humans evaluate. +- AI tools will NOT be given access to the session video recording — it is + reserved for human reviewer reference. +- **My submission and recordings will NOT be used to train any AI models.** + Submitted artifacts are used only for the purpose of evaluating my + candidacy for this role. +- This evaluation does not create an employment relationship. +- I may request deletion of my submission and any associated recordings at any + time after the evaluation process concludes. +- **If I believe the evaluation contains factual errors or unfair characterization, + I may contact [Company contact email] within 30 days of receiving a hiring + decision to request review.** A human reviewer will respond and document + any corrections. 
+ +Signed: ___________________________ +Date: ___________________________ +``` + +**The kit ships this file marked "REVIEW WITH LEGAL BEFORE USE."** + +### Storage: local disk only at MVP + +| Artifact | Location | Persistence | +|---|---|---| +| Candidate cloned repos | `~/.cache/teamhero/interview-clones/<candidate>/` | Temporary — deleted after observation generation | +| Per-candidate audits | `<output-dir>/<role>/<candidate>-<date>/{summary.md, audit.md, audit.json, evidence/}` | Persistent — at manager-configured path | +| Cohort summary | `<output-dir>/<role>/COHORT.md` | Persistent | +| Role config | `<output-dir>/<role>/role.json` | Persistent | +| Privacy release | Inside candidate's audit `evidence/` directory | Persistent — preserved as legal record | + +`<output-dir>` is configured by the hiring manager at bootstrap time. The default is the teamhero project's `docs/interviews/`, but it can point anywhere on local disk — including Obsidian vault subfolders, NAS shares, encrypted volumes, etc. + +### Obsidian-friendly conventions + +- **YAML frontmatter** on `summary.md` and `audit.md` (see template above) — Obsidian indexes these +- **Wikilinks** between artifacts where helpful +- **Internal-link relative paths** so files render in any markdown viewer + +### Retention and right-to-erasure + +Manager-discretion. Tool ships with guidance: retain at least until the hiring decision is finalized; longer retention follows company HR policy. GDPR right-to-erasure is trivial — manager deletes the candidate's audit folder.
+ +## Architecture + +### Three audiences, three flows + +| Audience | When | Tool used | Frequency | +|---|---|---|---| +| **Hiring manager** | Once per role (before candidates start) | `teamhero interview bootstrap` (wizard) | 1× per role | +| **Candidate** | During the interview session | The kit's `start.sh` / `end.sh` | 1× per candidate | +| **Interviewer** | After each candidate's session | `teamhero interview review` | 1× per candidate | + +### CLI namespace (DDD-organized) + +``` +teamhero interview # (no verb) → prints help + verb menu +teamhero interview bootstrap # MVP: bootstrap a role's project +teamhero interview review # MVP: produce observations for a single candidate +teamhero interview cohort # MVP: produce cohort summary roll-up for a role +teamhero interview list-roles # v1.5 +teamhero interview list-candidates # v1.5 +``` + +Cohort iteration across candidates is orchestrated by the **`teamhero-interview` Claude skill**, not by a CLI batch mode. + +### Repo / module layout + +``` +src/services/interview/ ← all interview logic, organized by verb +├── bootstrap/ +│ ├── project-generator.ts +│ ├── validator.ts +│ └── prompts.ts +├── review/ ← observation generation, not scoring +│ ├── evidence-collectors.ts # per-input adapters (asciinema, JSONL, markdown, audio) +│ ├── deterministic-extractors.ts # raw measurements for the 4 deterministic dims +│ ├── ai-observer.ts # mega-call to Responses API for LLM-judge dimensions +│ └── prompts.ts # observation-generation prompt + REVIEW_RESULT schema +├── cohort/ +│ ├── summary.ts # cohort listing — no ranking math +│ └── audit-store.ts # cohort persistence (COHORT.md per role) +└── shared/ + ├── rubric.ts # single source of truth, RUBRIC_VERSION + ├── audit-writer.ts # both tiers from the same ReviewResult + └── types.ts + +scripts/run-interview-bootstrap.ts +scripts/run-interview-review.ts ← invoked by the Go TUI when the operator runs `teamhero interview review` + +tui/interview.go 
+tui/interview_bootstrap_*.go +tui/interview_review_*.go ← UI displays observations, not scores +tui/interview_cohort_*.go + +teamhero-interview-kit/ ← candidate-facing recording/logging kit +├── start.sh +├── end.sh +├── INTERVIEW_RULES.md +├── RUBRIC_OVERVIEW.md ← plain-language summary of the 9 dimensions for the candidate +├── PRIVACY_RELEASE.md ← placeholder consent template +└── .claude/ + ├── settings.json + └── CLAUDE.md + +~/.claude/skills/teamhero-interview/SKILL.md + +docs/ +├── 2026-05-09-candidate-interview-reviewer-plan.md ← THIS FILE +├── interview-rubric.md ← formal rubric (9 dims, observation framework) +├── interview-classification-rationale.md ← defensibility doc, leads with ethics preamble +└── interviews// ← per-role audit outputs +``` + +The `review/` subdirectory naming reflects that we're reviewing — observing and producing an advisory audit, not scoring. The CLI verb (`review`) and the directory name match deliberately: the language is consistent end-to-end with the ethics floor (observations, not scores). + +### The single skill + +There is exactly **one** skill for the whole interview domain. It must handle **everything** the engineering manager and interviewer need to do — bootstrap, observation generation, cohort viewing, listing roles/candidates. No business logic in the skill — all logic lives in `src/services/interview/`. + +## TUI Implementation Constraints + +The implementer **MUST** mirror the existing `report` and `assess` TUI patterns. 
Before writing any new TUI code, review and match: + +| Concern | Reference file(s) | Pattern to preserve | +|---|---|---| +| Wizard layout | `tui/report_wizard.go`, `tui/assess_config.go` | `huh` form prompts, single-column flow, consistent labeling | +| Progress display | `tui/assess_progress.go` | Phase-based framed display, consistent styling | +| Preview tabs | `tui/assess_preview.go`, `tui/preview.go` | Tabbed layout, Glamour-rendered markdown | +| Color scheme & styling | All existing `tui/*.go` | Same lipgloss palette, borders, padding, spacing | +| Headless mode | `src/cli/index.ts` (`report`, `assess` flags) | Same `--headless --foreground --no-confirm` shape | +| JSON-lines protocol | `tui/assess_protocol.go`, `scripts/run-assess.ts` | Same bidirectional event types, line-buffered transport | +| **Human-stakes warning banner** | new — see [Why Observations, Not Scores](#why-observations-not-scores) | High-visibility banner at top of every observation-display screen; cannot be suppressed | +| **Sign-off field rendering** | new | Always rendered at end of `summary.md`; cohort viewer surfaces missing sign-offs | +| **No numerical artifacts in TUI** | new | TUI displays observations and measurements; no scores, no totals, no bands. If a developer is tempted to render a number, double-check it's a *measurement* (e.g. "5 test runs"), not a *score* (e.g. "0.7 architectural quality"). | +| **Sign-off requires manager-written reasoning summary** | new | The sign-off form must include a non-blank free-text field where the manager writes (in their own words) why they reached their recommendation. The TUI rejects empty submission. Forces engagement; prevents rubber-stamping. | + +This is an **explicit guard**. AI-driven implementations of new commands have historically lost TUI fidelity. Treat as hard requirement, not stylistic preference. + +## Project Bootstrap Modes + +When the hiring manager runs `teamhero interview bootstrap`, they choose modes per role. 
+ +### Wizard prompt sequence + +1. Role title +2. Stack +3. Domain +4. Feature to add +5. **Project mode**: (A) AI-bootstrap extension or (B) Greenfield brief +6. Time box (60 / 90 / 120 / custom) +7. **Analysis mode**: AI-assisted observation generation, or human-only review +8. **Rubric mode**: + - **Custom prompt** — manager's own observation prompt + - **Default rubric** — canonical 9-dim framework + - **Default rubric + job description** — canonical framework + markdown JD +9. **Job description path** (only if rubric mode includes JD) +10. Output directory + +Project mode, analysis mode, and rubric mode are first-class active choices — no defaults. + +### Mode A: AI-bootstrap extension project (recommended for IC roles) + +The bootstrap wizard uses the OpenAI Responses API with a prompt encoding the rubric's structural requirements: + +- README.md, CLAUDE.md (agent context + module map), GLOSSARY.md (5–8 ubiquitous-language terms) +- Deep-module structure (≥2 well-encapsulated modules with clean interfaces) +- Deliberate "shallow vs deep" architectural trap +- Failing-but-skipped tests describing the new feature (5–8 acceptance criteria) +- All existing tests passing +- ~400–700 LOC total +- Idiomatic for the chosen stack + +Wizard validates structural checks; regenerates up to 3× on failure. + +### Mode B: Greenfield from brief (recommended for staff/architect roles) + +Wizard generates a `BRIEF.md` template; no starter code; candidate creates everything from scratch. Tests bootstrap discipline. 
+ +### Mode coverage of rubric dimensions + +[unchanged from prior version — Mode A tests "uses provided context discipline" strongly; Mode B tests "bootstraps own context discipline" strongly] + +## Time Box + +Configurable per-project (set during bootstrap, fixed for all candidates of that role): + +| Preset | Duration | Recommended for | +|---|---|---| +| Focused | 60 min | Smaller scaffolds, junior screens | +| **Standard (default)** | 90 min | Most scaffolds, senior IC and team-lead screens | +| Extended | 120 min | Larger scaffolds, senior architect / staff screens | + +Time-box is fixed per-project, not per-candidate. + +## End-to-end Flow + +### Hiring manager (1× per role) + +``` +$ teamhero interview bootstrap +[wizard prompts as above] +[service generates per-role repo via Responses API + validates structural requirements] +$ cd <role-repo> && git push -u origin main +``` + +### Candidate (1× per session) + +``` +$ git clone <role-repo-url> && cd <role-repo> +$ # read INTERVIEW_RULES.md and RUBRIC_OVERVIEW.md so you know the rules +$ # and the dimensions you'll be evaluated on +$ # edit PRIVACY_RELEASE.md to sign +$ ./start.sh # checks privacy release signed → asciinema + hooks +[code the interview using their own AI stack] +$ ./end.sh # stop recording, commit +$ git push (their fork) +``` + +### Interviewer (1× per candidate) + +``` +$ teamhero interview review \ + --transcript <path> \ + [--recording <path>] \ + [--interviewer-notes <path>] \ + [--session-recording-url <url>] \ + [--session-platform zoom|teams|meet|other] \ + [--mode ai-assisted|human-only] +[evidence-collectors → deterministic-extractors + ai-observer (mega-call) → audit-writer] +[outputs: docs/interviews/<role>/<candidate>-<date>/{summary.md, audit.md, audit.json, evidence/}] +[appends to docs/interviews/<role>/COHORT.md] +``` + +## Inputs the Reviewer Consumes + +| Input | Source | Captures | Required? 
| Used by | +|---|---|---|---|---| +| Git repo (with no-squash hygiene) | Candidate's GitHub fork | Final artifact, commit chronology | ✅ Required | LLM observer + deterministic extractors | +| `interview.log` | Claude Code hooks (or per-tool native log) | Verbatim prompts, tool calls, AI responses | ✅ Required | Both | +| `terminal.cast` | `asciinema rec` (started by `start.sh`) | Every command, every output, with timing | ✅ Required | Both | +| Audio transcript | Granola / Fireflies / Otter (interviewer-side) | Verbalized planning, course-correction commentary | ✅ Required | LLM observer | +| Screen recording | Loom / Zoom recording | Multimodal context for non-Claude-Code tools | ⚠️ Optional | LLM observer (if provided) | +| Interviewer notes | Structured form | Metacognitive signal automation can't extract | ⚠️ Optional | LLM observer (if provided) | +| **Session recording URL** | Zoom / Teams / Meet platform recording link | Full live-session video — interviewer-candidate dynamics, body language, pressure handling | ⚠️ Optional | **Human reviewer reference only — NOT fed to LLM** | + +**Session recording URL is intentionally scoped as human-reviewer reference material, not LLM input.** The recording's value is for the human sign-off step, particularly for borderline interpretations of observations — questions like "did the candidate handle pressure well?" are answered by watching, not by transcript-reading. + +## Funding Model + +**$50 gift card per candidate.** Universal coverage across all AI tools. Zero infrastructure burden. Net positive on hiring brand. Tool-choice signal stays pure. + +## Out of Scope for MVP + +- **Numerical scoring of any kind.** Per ethical decision in [Why Observations, Not Scores](#why-observations-not-scores). +- **Comparative ranking math.** Same reasoning. +- **Windows-native support.** WSL is the documented path. +- **Hosted runtime / sandbox.** +- **API-key provisioning for candidates.** Replaced by gift card. 
+- **Replay UI.** Use raw asciinema playback + the audit tier markdown at MVP. +- **Real-time cheat detection.** Live observation absorbs the cheating risk. +- **Per-role rubric customization beyond default presets.** Single rubric framework at MVP plus the three rubric modes from #20. +- **`teamhero interview list-roles | list-candidates`.** Still deferred (skill documents `ls docs/interviews/` workaround). +- **Cross-role comparison.** Tool stays within-role. +- **Per-dimension calls (vs mega-call).** Mega-call is canonical. +- **Cloud storage / multi-manager sync.** Local disk only at MVP. +- **Candidate-facing audit access.** Candidates do NOT receive the audit by default — only the appeal channel (privacy release) is provided. **Important caveat:** GDPR Article 15 may require disclosure for EU/UK candidates. The classification-rationale doc must call this out so the company knows to revisit if hiring expands to GDPR jurisdictions. Withholding access in those jurisdictions is the legal risk, not granting it. +- **Multi-model evaluation (bias diversification across LLMs).** Deferred to v1.5+. Currently single-model (Opus 4.7). +- **Periodic anonymized bias audit of accumulated observations.** Deferred to v1.5+. Worth planning the data structure now to support it later. + +## Open Questions + +All open questions are resolved. Implementation can begin when the user gives explicit go-ahead. + +## Implementation Progress + +### MVP — shipped via PR #10 (commits prior to `c7ff78f`) + +| # | Step | Status | Beads issue | +|---|---|---|---| +| 1 | Formal rubric document `docs/interview-rubric.md` (9 dims, observation framework) | ✅ shipped | `teamhero-scripts-3sh` | +| 2 | Classification-rationale `docs/interview-classification-rationale.md` (ethics preamble, GDPR Art. 
15 caveat, appeal mechanism, single-model bias limitation) | ✅ shipped | `teamhero-scripts-3sh` | +| 3 | Bootstrap prompt template encoding scaffold requirements | ✅ shipped (`src/services/interview/bootstrap/project-generator.ts`) | `teamhero-scripts-7qe` | +| 4 | `teamhero-interview-kit/` (POSIX privacy gate, start/end scripts, templates) | ✅ shipped | `teamhero-scripts-h13` | +| 5 | `src/services/interview/{bootstrap,review,cohort,shared}/` + `teamhero interview` CLI subcommand | ✅ shipped (4 evidence collectors, 4 deterministic extractors, AI observer with bias guard + scoreless strict schema, audit writer with sign-off validator) | `teamhero-scripts-{7qe,bt2}` | +| 6 | `skills/teamhero-interview/SKILL.md` (single bounded-context skill) | ✅ shipped | `teamhero-scripts-9v2` | + +### v1.5 — shipped via PR #10 commit `c7ff78f` + +| # | Step | Status | Beads issue | +|---|---|---|---| +| v1 | Interactive `huh` wizard for `teamhero interview bootstrap` (no-flag invocation drops into wizard; rubric-mode branching for custom + JD; shared validator gate with headless path) | ✅ shipped | `teamhero-scripts-92a` | +| v2 | TUI style polish — shared `interviewStyles` palette, glamour-rendered audit preview with pinned ADVISORY banner, phased progress display (`clone → collect-evidence → extract-measurements → observe → audit-write`) consuming the JSON-lines protocol | ✅ shipped | `teamhero-scripts-upo` | +| v3 | Manual end-to-end smoke script `scripts/manual-test-interview.sh` (TTY-only, no OpenAI spend via `--mode-analysis human-only`) + README and SKILL.md doc refresh | ✅ shipped | `teamhero-scripts-rjp` | + +Note on `tui/assess_*.go` patterns: the TUI Implementation Constraints table above lists `tui/assess_progress.go`, `tui/assess_config.go`, `tui/assess_preview.go`, `tui/assess_protocol.go` as reference files. **Those files do not exist on `main` yet** (they live on `claude/condescending-tereshkova-88a936`).
The v1.5 implementation aligned with the existing primary — `tui/wizard.go` and `tui/setup.go` — instead. When the maturity-assessment branch lands on `main`, a small follow-up can re-align if any palette/header differences remain. + +### v2 — post-shipping refinements on PR #10 (May 2026) + +A run of usability and integrity refinements after a hiring-manager-led +review of the v1.5 wizard. Each change was driven by a specific +observation about what was hinting at the answer, what was opaque, or +what didn't actually serve the evaluation. Captured here so the +feature's evolution stays visible without spelunking through `git log`. + +| Theme | Change | Why | +|---|---|---| +| Wizard rendering | Picker no longer clips the first option on initial paint (the `huh.Select` cursor was anchoring on the empty-string Cancel sentinel and scrolling the viewport off the first option). | Reported regression: first option invisible on boot; user had to hit up-arrow to reveal it. | +| Wizard defaults | `--output-dir` defaults to `./interviews/` (gitignored); `--time-box` defaults to 60 minutes. | Proctors were re-typing the obvious; default time-box should match the recommended length in the spec. | +| Wizard polish | Confirm screen no longer duplicates the right-side summary as the form description. Static descriptions kept under one line to dodge a `huh.ThemeCharm` left-bar wrap bug. | "Ready to bootstrap?" screen was cluttered enough that the Yes/Cancel buttons got lost. | +| Kit completeness | `applyBootstrapDefaults` fills `--kit-dir` to `teamhero-interview-kit` so the kit overlay ships on every run regardless of mode A vs B. | Proctors who forgot the flag got the AI output but none of the proctor/candidate scaffolding. | +| Headless UX | Successful runs print an OSC 8 file:// hyperlink to the generated repo and, on a TTY, offer to publish to GitHub via the user-configured token. 
| Generated repos were getting lost on disk; publishing was a manual `git init && git remote add` ritual. | +| Validator scope | Polyglot test-file detection added (xUnit Skip, JUnit @Disabled, pytest skip, RSpec pending, Rust `#[ignore]`, …) so non-TS/Go starter projects pass validation. | Closed `teamhero-scripts-hto`. | +| Logging | `--debug` flag on the headless dispatcher and the bun subprocess; structured `bootstrap.start` / `bootstrap.ok` / `bootstrap.fail` records with elapsed-ms. Light run context (role/mode/stack/domain/rubric/jd/output) always prints to stderr. | Generation failures were impossible to triage post-hoc; one-line context summary makes a failure ticket actionable without rerun. | +| Wizard simplification | Removed the late-stage "How should the AI prompt be supplied?" + "Project prompt (optional)" steps. Replaced with a single either/or feature-source step after Domain: type the description yourself OR pick from AI-suggested ideas. `projectPrompt` field and `--project-prompt` flag retired. | The "Feature description" and "Project prompt" steps were asking the proctor to say the same thing twice; the prompt-source select then asked again. One source of truth wins. | +| Question descriptions | Every wizard step now carries a one-line description that tells the hiring manager how their answer shapes the generated project (stack → source language, domain → glossary vocab, feature → central focus, etc.). | Hiring managers couldn't predict which inputs would surface in the candidate's view. | +| Project type taxonomy | "Project mode" (A/B) renamed to "Project type" with three options: Brownfield (AI scaffolds), Greenfield (use your stack), Greenfield (candidate picks stack). The third option flips a new `stackByCandidate` flag so the BRIEF.md's "Tech stack" section becomes a contextual hint rather than a hard requirement. | Hiring managers think in greenfield/brownfield terms, not A/B. The candidate-picks-stack variant didn't exist at all before. 
| +| Rubric label | "Rubric mode" renamed to "How should AI share observations?" to signal what the answer controls (the rubric the AI references when writing up the recorded interview). | Pure label change — same three options, same internal values. | +| Size validator removed | Dropped the 400-700 LOC and ≥2-deep-modules rules from `validateModeAProject` and the corresponding "ABSOLUTE SIZE REQUIREMENTS" block from the OpenAI prompt. Retry-feedback helpers (`extractLocFromFailure`, "DOUBLE the previous output") removed; retry budget reverted 5 → 3. | The size rules were a heuristic inherited from slice 2 — they weren't in any product spec, and they were rejecting perfectly serviceable 300-LOC projects. | +| Candidate-facing strip | Removed `GLOSSARY.md` from required files and the AI's output. Removed the failing/skipped-test requirement and explicitly forbid the AI from generating test files. Deleted `teamhero-interview-kit/.claude/CLAUDE.md` from the kit overlay. The only required structural file is now `README.md`. | All three were hinting at the answer: GLOSSARY listed concepts the candidate should identify; skipped tests like `describe.skip("addUser", …)` leaked function names; the kit's CLAUDE.md literally told the candidate's agent to "read GLOSSARY.md and failing tests first." | +| JD as standalone input | Retired the `default+jd` rubric value. JD attachment is now three questions early in the wizard: "Will you provide a JD?" → path → "Should the JD influence the project being generated?". The post-interview observer references the JD whenever it's been provided (independent of rubric); the project-generation prompt reads the JD body and tailors the project's complexity/domain only when the influence flag is on. New `--jd-influences-project` headless flag. | Coupling JD to rubric forced the proctor to choose between custom-rubric guidance and JD context. 
The user wanted (a) the JD to optionally shape what the candidate sees (e.g., a junior healthtech JD → EHR-flavoured feature), and (b) the rubric to be its own decision. | + +### Still deferred + +7. **Pilot with first batch of real candidates.** Manually spot-check observation output for the first 10. Refine the observation prompt based on observed failure modes (e.g., LLM accidentally producing a numerical assessment despite instructions — validation must reject this). Update the classification-rationale doc when the prompt changes meaningfully. +8. **`list-roles` and `list-candidates` verbs.** Workaround documented in `skills/teamhero-interview/SKILL.md` (`ls docs/interviews/`). +9. **Multi-model bias diversification.** Currently single-model. +10. **Periodic anonymized bias audit of accumulated observations.** Data structure exists; the audit pipeline does not. + +## References + +- `claude/condescending-tereshkova-88a936` branch — Agent Maturity Assessment implementation; this feature reuses much of its scaffolding. +- `docs/2026-05-03-agent-maturity-assessment-plan.md` — sibling plan; same architectural pattern. Note: `assess` produces scores because it scores an organization (a non-human entity); `interview` produces observations because it observes a person (different ethical floor). +- `docs/maturity-skill-ref/references/criteria.md` — full maturity rubric; the 9-dimension interview framework maps to a subset (B5, B6, B7, C9, C10, C11, D12). +- `tui/assess_progress.go`, `tui/assess_config.go`, `tui/assess_preview.go`, `tui/assess_protocol.go` — TUI pattern references. +- `src/services/maturity/ai-scorer.ts` — pattern reference for mega-call to OpenAI Responses API with strict json_schema. +- Matt Pocock — external authority on AI-coding skills for engineers; framing referenced throughout the rubric (context engineering, "tasting" output, knowing when to drive vs delegate). 
His teaching approach — observation and pattern recognition over numerical reduction — is the model the observation-output approach follows. diff --git a/docs/interview-classification-rationale.md b/docs/interview-classification-rationale.md new file mode 100644 index 0000000..014ec29 --- /dev/null +++ b/docs/interview-classification-rationale.md @@ -0,0 +1,263 @@ +# Interview Classification — Methodology and Ethics Rationale + +This document accompanies the interview rubric. It exists for two reasons: + +1. **Defensibility.** If a hiring decision informed by this tool is ever + legally challenged, this document is the methodology of record. It states + what the AI does, what it deliberately does *not* do, and why. +2. **Internal honesty.** Anyone running an interview through this tool should + read the preamble before they trust any output. + +The preamble is binding. The per-dimension methodology details that follow are +descriptions of the implementation; they do not soften, qualify, or contradict +the preamble. + +--- + +## Preamble — Ethical Commitments + +These four commitments shape every part of the system. They are non-negotiable +and any change to them requires explicit org sign-off. + +### 1. Observations, not scores + +The AI does not produce a score for a candidate. It does not produce a +per-dimension score, a weighted total, a band ("Strong Hire / Mixed / No +Hire"), or any other reductive label. The output is: + +- **Narrative observations** for dimensions where the LLM is the judge — + 1–3 sentences, paired with the reasoning chain that produced them and the + cited evidence excerpts that ground them. +- **Raw measurements** for dimensions that are deterministically observable — + e.g. "ran tests 5 times, interleaved with 12 prompts" presented as a fact. + +The categorical decision is the hiring manager's, captured in the sign-off +section of each candidate's `summary.md` (Hire / Hire with notes / No hire). +That decision is *theirs*. 
The AI's output is one input among many. + +**Why this matters:** numerical scoring of humans creates harms that +safeguards (opt-in, banners, sign-off) do not fully address — cognitive +anchoring on a number, false precision without true calibration, comparative +drift across candidates, bias amplification when scores are averaged or +thresholded, and increased legal exposure. Observations + evidence + +measurements provide the structure of a rubric without the harms of a score. + +### 2. Bias diversification, NOT bias elimination + +This tool **never** claims the AI is "non-biased," "objective," "neutral," or +"bias-free." Those claims are factually wrong and indefensible when +challenged. + +LLMs trained via RLHF carry well-documented systematic biases: + +- **Training-data bias** — overrepresented demographics, languages, cultural + contexts in the corpus. +- **Preference-tuning bias** — RLHF raters' demographic and aesthetic + preferences encoded in the "preferred response" signal. +- **Sycophancy** — LLMs tend to agree with their user, including subtly + approving of what the framing implies they should approve of. +- **Familiarity bias** — model is more familiar with mainstream tools and + patterns; less-mainstream alternatives are systematically disadvantaged. +- **Verbosity preference** — verbose output rated more favorably than + concise output, even when concise is better. +- **Name- and demographic-cue bias** — empirically documented disparate + treatment based on names alone replicates in LLM evaluators. + +The defensible claim about the AI is this: **the AI offers a structurally +different perspective with different biases than the human reviewer.** Two +imperfect perspectives covering different blind spots is genuinely better +than one — but only because the *overlap set* of biases is smaller, not +because either perspective is unbiased. 
+ +Critically, AI bias is *systematic across all evaluations* — every candidate +faces the same biased model — while individual reviewer biases are local. +This means AI bias can scale harm more efficiently than individual bias if +deployed without the safeguards in commitment #3. + +### 3. Human-in-the-loop is mandatory + +Every interview run **requires** a human hiring manager to read the AI's +observations and write a sign-off. The sign-off has three categorical +outcomes (Hire / Hire with notes / No hire) plus a free-form reasoning field +where the manager explains their decision in their own words. + +The tool refuses to consider an interview "complete" without this sign-off. +The cohort report displays sign-off status and the manager's recommendation +only — it does not display anything the AI produced as a verdict. + +The standing copy at the top of every per-candidate audit and the cohort +report reads: + +> ⚠ THIS AUDIT IS ADVISORY. Hiring decisions are made by humans using +> professional judgment. The candidate is a person, not a score. This rubric +> is one factor among many; your evaluation is the primary, first, and most +> important basis for your decision. + +This is not boilerplate; it is the load-bearing framing of the tool. + +### 4. GDPR Article 15 caveat — candidate audit access (MVP) + +GDPR Article 15 ("right of access by the data subject") grants candidates in +the EU/EEA the right to obtain confirmation of, and access to, personal data +processed about them. The observations and measurements this tool produces +about a candidate fall within scope. + +**MVP behavior:** candidate-facing audit access is **not** included. The +audit artifacts are stored locally on the hiring manager's disk and shared +only within the company. This is a *deliberate constraint*, not an oversight: +exposing the audit externally introduces legal review burden the MVP cannot +absorb. 
+ +**Implications the company must accept when running the tool in EU/EEA +contexts:** + +- A candidate filing an Article 15 request must be served via the company's + existing data-subject-request process. The company is responsible for + producing the audit artifacts on request, not the tool. +- The candidate must be informed at the start of the interview that AI + observation is occurring (consent / transparency obligation under Article + 13). This is implemented as the opt-in privacy gate in `bootstrap` and is + reproduced in the per-candidate `PRIVACY_RELEASE.md`. +- Candidates do not see the AI's narrative observation about them as part of + the standard hiring process. If a request is made, the audit is shared in + full — the reasoning chain is preserved precisely so this is possible + without redaction surprises. + +A future enhancement may add a candidate-facing audit-access flow. Until that +is built and legally reviewed, the MVP default stands: company-only access, +candidate-served-on-request via existing processes. + +--- + +## Per-dimension methodology + +The implementation details below describe *how* observations and measurements +are produced for each dimension. They do not change anything in the preamble. + +### 1. Upfront design & decomposition (`upfront-design`) + +**Evidence mode:** llm-judge. + +The LLM observer reads the interview log and terminal recording, looking for +evidence of decomposition behavior before the candidate began prompting: +explicit problem framing, identification of constraints, sketching of +interfaces or data flow, alignment on approach. + +Output: narrative observation (1–3 sentences), reasoning chain, and 1–3 +evidence excerpts cited from the interview log or transcript. + +### 2. Context engineering (`context-engineering`) + +**Evidence mode:** hybrid. + +Deterministic extractor counts: CLAUDE.md references in prompts, glossary +terms used, file paths cited verbatim, examples provided as context. 
+ +LLM observer interprets the counts in context: high counts with poor +relevance are different from low counts with high relevance. Narrative +observation pairs with the raw counts. + +### 3. Critical evaluation / "tasting" (`critical-evaluation`) + +**Evidence mode:** llm-judge. + +LLM observer scans the diff stream and prompt log for evidence of the +candidate rejecting, modifying, or pushing back on AI suggestions versus +accepting them verbatim. Reasoning chain preserved alongside the +observation. + +### 4. Verification discipline (`verification`) + +**Evidence mode:** deterministic. + +Deterministic extractor counts: test invocations, type-check invocations, +diff/grep commands, manual verification commands. Reports the count and +interleaving rhythm (e.g. "8 test runs, alternating roughly every other +prompt"). + +No LLM observation is generated for this dimension. The facts speak for +themselves. + +### 5. Course-correction (`course-correction`) + +**Evidence mode:** hybrid. + +Deterministic extractor detects course-correction signals: `git reset`, +`git checkout --`, file rollbacks, prompt re-asks, abandoned branches. + +LLM observer pairs the detected signals with a narrative observation about +whether they reflect productive correction or thrashing. + +### 6. Risk awareness (`risk-awareness`) + +**Evidence mode:** deterministic. + +Deterministic extractor detects destructive operations (`rm -rf`, `git push +--force`, schema-altering migrations, prod-affecting commands) and reports +them with timestamps and the pause-before-Enter duration if available. + +No LLM observation is generated. The detected events and timings are the +output. + +### 7. Architectural quality (`architectural-quality`) + +**Evidence mode:** llm-judge. + +LLM observer reads the final repo state and produces a narrative observation +on modularity, naming, separation of concerns, and depth of abstraction. +Reasoning chain preserved. Cited evidence excerpts from the produced code. 
+ +### 8. Test pass / spec satisfaction (`test-pass`) + +**Evidence mode:** deterministic. + +Deterministic extractor runs the role-specific acceptance tests against the +candidate's final repo state and reports pass/fail per acceptance criterion. + +No LLM observation is generated. Pass/fail is a fact. + +### 9. Throughput (`throughput`) + +**Evidence mode:** deterministic. + +Deterministic extractor reports timestamps from the terminal recording, git +log, and agent log. Reports time-to-first-passing-test, commits within the +time-box, and total elapsed time. No LLM interpretation. + +--- + +## Interviewer-bias guard (binding) + +Audio transcripts and interviewer notes are fed to the LLM observer as +context. A biased interviewer remark ("she seemed nervous", "he was +hesitant") can propagate into the AI's narrative observation if not guarded +against. + +The observation prompt MUST include this instruction verbatim: + +> The audio transcript and interviewer notes are provided as context about +> what was happening during the session. Treat the interviewer's verbal +> commentary as situational context only — do NOT weight it as evidence of +> the candidate's skill, competence, or character. Your observations must be +> grounded in the candidate's *actions* (prompts they wrote, tools they used, +> code they produced, tests they ran, decisions they made) — not in the +> interviewer's framing of those actions. If an interviewer remark could be +> interpreted multiple ways, do not let it bias your observation; rely on the +> directly observable artifacts (interview.log, terminal.cast, repo state). + +Validation: the first 10 candidates run through the tool will have their +observations inspected for phrasing that echoes interviewer commentary +verbatim. If found, the instruction is tightened further before broader use. 
+ +--- + +## Schema-level guard against scoring drift + +The LLM is called via the OpenAI Responses API with a strict `json_schema` +that explicitly omits `score`, `weighted_total`, `raw_total`, `band`, +`signal_count`, and similar reductive fields. The schema is `strict: true`, +which means a response containing any unlisted field is rejected at the +provider level — the LLM cannot drift into scoring even if prompted to. + +If the schema is ever relaxed, this document and the rubric must be +re-reviewed in the same change. This guard is load-bearing. diff --git a/docs/interview-rubric.md b/docs/interview-rubric.md new file mode 100644 index 0000000..5097882 --- /dev/null +++ b/docs/interview-rubric.md @@ -0,0 +1,138 @@ +# Interview Rubric — AI-Collaboration Coding Interviews + +**Rubric version:** 1.0.0 + +This rubric describes the 9 dimensions along which an AI observer and a human +hiring manager will jointly examine a candidate's behavior during a +live-observed coding interview. + +It is an **observation framework, not a scoring framework**. There are no +scoring levels (no Strong Hire / Mixed / No Hire bands per dimension; no +numerical scores; no weighted totals). The AI produces narrative observations +and raw measurements; the hiring manager produces the categorical decision in +sign-off. See `interview-classification-rationale.md` for why the system is +deliberately scoreless. + +The dimensions are organized into two thematic groups (process / outcome) +**for navigation only**. There is no weighting, and the groups are not summed. + +--- + +## Process dimensions + +How the candidate works with the AI. + +### 1. Upfront design & decomposition + +- **id:** `upfront-design` +- **Evidence mode:** llm-judge +- **What it observes:** Whether the candidate plans and decomposes the problem + before prompting, or prompts straight into code. +- **Observation output:** Narrative observation (1–3 sentences) plus cited + evidence excerpts. 
AI reasoning chain preserved alongside the observation. +- **Measurement output:** None. + +### 2. Context engineering + +- **id:** `context-engineering` +- **Evidence mode:** hybrid +- **What it observes:** How effectively the candidate primes the AI with + relevant repository context, constraints, and intent before each significant + prompt (CLAUDE.md, glossary terms, file paths, examples). +- **Observation output:** Narrative observation plus reasoning. +- **Measurement output:** Raw signal counts (e.g. CLAUDE.md references in + prompts, glossary terms used, files referenced explicitly). + +### 3. Critical evaluation / "tasting" + +- **id:** `critical-evaluation` +- **Evidence mode:** llm-judge +- **What it observes:** Whether the candidate reads, interrogates, and + challenges AI-generated code rather than accepting it on faith. +- **Observation output:** Narrative observation plus cited diff excerpts + showing kept-vs-rejected suggestions. Reasoning chain preserved. +- **Measurement output:** None. + +### 4. Verification discipline + +- **id:** `verification` +- **Evidence mode:** deterministic +- **What it observes:** Frequency and rigor of test runs, type checks, and + manual verification interleaved between AI exchanges. +- **Observation output:** None (deterministic facts only). +- **Measurement output:** Test run count, interleaving with prompts, diff/grep + commands, final test state. + +### 5. Course-correction + +- **id:** `course-correction` +- **Evidence mode:** hybrid +- **What it observes:** How the candidate notices, names, and recovers from + AI mistakes or their own missteps mid-task. +- **Observation output:** Narrative observation plus reasoning. +- **Measurement output:** Detected signals — git resets, prompt re-asks, file + rollbacks with timestamps. + +### 6. 
Risk awareness + +- **id:** `risk-awareness` +- **Evidence mode:** deterministic +- **What it observes:** Recognition of destructive operations, security + implications, and reversibility before acting on AI suggestions. +- **Observation output:** None (deterministic facts only). +- **Measurement output:** Detected destructive commands, pause-before-Enter + timing, irreversible-action attempts. + +--- + +## Outcome dimensions + +What the candidate produced. + +### 7. Architectural quality + +- **id:** `architectural-quality` +- **Evidence mode:** llm-judge +- **What it observes:** Whether the final code reflects sound modularity, + naming, and separation of concerns. +- **Observation output:** Narrative observation on the final artifact plus + cited code excerpts. Reasoning chain preserved. +- **Measurement output:** None. + +### 8. Test pass / spec satisfaction + +- **id:** `test-pass` +- **Evidence mode:** deterministic +- **What it observes:** Whether the candidate's submitted solution passes the + role-specific acceptance tests. +- **Observation output:** None (deterministic facts only). +- **Measurement output:** Pass/fail per acceptance criterion. + +### 9. Throughput + +- **id:** `throughput` +- **Evidence mode:** deterministic +- **What it observes:** Volume of meaningful progress within the time-box, + measured as commits, completed features, or tests passed. +- **Observation output:** None (deterministic facts only). +- **Measurement output:** Timestamps from terminal recording, git, and agent + log; time-to-first-passing-test; commits within time-box. + +--- + +## Evidence mode summary + +| Mode | Count | Dimensions | +|---|---|---| +| Deterministic | 4 | verification, risk-awareness, test-pass, throughput | +| Hybrid | 2 | context-engineering, course-correction | +| LLM-judge | 3 | upfront-design, critical-evaluation, architectural-quality | + +## Output shape + +Observation and Measurement records emitted per dimension. 
Combined into a +per-candidate `summary.md` (Tier 1, ~1–2 pages) and `audit.md` / `audit.json` +(Tier 2, full reasoning). See the implementation plan for the JSON schema. + +No `score`, no `weighted_total`, no `raw_total`, no `band` is produced at any +tier. Validation rejects LLM responses that include such fields. diff --git a/scripts/manual-test-interview.sh b/scripts/manual-test-interview.sh new file mode 100755 index 0000000..c327eef --- /dev/null +++ b/scripts/manual-test-interview.sh @@ -0,0 +1,107 @@ +#!/usr/bin/env bash +# +# manual-test-interview.sh — manual end-to-end smoke test for `teamhero interview` +# +# This script is for a HUMAN running on a TTY. It walks through the v1.5 +# interactive flows that automated tests cannot meaningfully cover: +# the wizard's branching, the glamour preview's readability, and the +# phased progress display's responsiveness. +# +# It is intended NOT to spend money on OpenAI: only the headless bootstrap step +# passes --mode-analysis human-only; review does not (TODO confirm no key needed). +# +# Usage: +# ./scripts/manual-test-interview.sh +# +# Requirements: +# - A TTY (this script bails out on piped/non-interactive stdin) +# - A built teamhero-tui binary (run `just tui-build` first) +# - bun available on PATH (the headless path spawns a TS subprocess) + +set -euo pipefail + +if [[ ! -t 0 ]] || [[ ! -t 1 ]]; then + echo "ERROR: this script is for interactive manual testing on a TTY." >&2 + echo " Refusing to run with piped stdin/stdout." >&2 + exit 1 +fi + +REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +BIN="$REPO_ROOT/tui/teamhero-tui" +TMPDIR="$(mktemp -d -t teamhero-manual-XXXXXX)" +trap 'rm -rf "$TMPDIR"' EXIT + +if [[ ! -x "$BIN" ]]; then + echo "ERROR: teamhero-tui binary not found at $BIN" >&2 + echo " Run \`just tui-build\` first." >&2 + exit 1 +fi + +pause() { + echo + echo "──────────────────────────────────────────────────────────" + echo " $1" + echo " Press Enter to continue, or Ctrl+C to abort."
+ echo "──────────────────────────────────────────────────────────" + read -r _ +} + +step() { + echo + echo "═══ STEP $1: $2 ═══" + echo +} + +step 1 "Interactive wizard for \`teamhero interview bootstrap\`" +echo "You should see a series of huh.Form prompts asking for role slug, stack," +echo "domain, feature, time-box, project mode, analysis mode, rubric mode, and" +echo "output directory. Try selecting the 'Custom prompt' rubric mode on one" +echo "run and the 'Default + Job Description' mode on another to verify both" +echo "conditional branches work. Cancel out with Ctrl+C — it must exit cleanly." +pause "Verify the wizard branches correctly and Ctrl+C exits cleanly." +"$BIN" interview bootstrap || true + +step 2 "Headless bootstrap (no wizard) — agents/CI path" +echo "This invocation should bypass the wizard entirely and behave exactly like" +echo "it always has." +pause "Run the headless bootstrap and verify no TUI is rendered." +"$BIN" interview bootstrap \ + --headless --no-confirm \ + --role manual-test-role --stack TypeScript --domain Manual \ + --feature "Verify the headless path still works" \ + --mode-project A --mode-analysis human-only --mode-rubric default \ + --output-dir "$TMPDIR/manual-test-role" || true +echo "Bootstrap output (if any) is in: $TMPDIR/manual-test-role" + +step 3 "Review flow — phased progress display + glamour preview" +echo "The review step is invoked with --mode-analysis human-only so no OpenAI" +echo "key is needed. Watch for:" +echo " - Phased progress display: clone → collect-evidence → extract-measurements → observe → audit-write" +echo " - The ADVISORY warning banner at the start" +echo " - A glamour-rendered preview of summary.md after the run completes" +pause "Run review and verify the progress display + glamour preview." 
+"$BIN" interview review \ + --candidate "Manual Test Candidate" \ + --repo "https://example.com/fake-repo" \ + --output-dir "$TMPDIR/review-output" || true + +step 4 "Sign-off file gating" +echo "Open the summary.md and audit.md in $TMPDIR/review-output/ and verify:" +echo " - The ADVISORY banner is at the top of BOTH files" +echo " - The sign-off section is present and requires a categorical decision" +echo " - The session recording URL appears in frontmatter only (not in body)" +pause "Confirm the sign-off section exists and the ADVISORY banner is present." + +step 5 "Cohort rendering for the manual-test-role" +echo "Run cohort to confirm it lists the single candidate without any" +echo "Score/Total/Rank columns (rankless by design)." +pause "Run cohort and verify the output is rankless." +"$BIN" interview cohort --role manual-test-role || true + +echo +echo "═══ MANUAL TEST COMPLETE ═══" +echo +echo "If all five steps behaved as expected, the v1.5 interactive surfaces" +echo "are working correctly. Output artifacts (auto-cleaned on exit):" +echo " $TMPDIR" +echo diff --git a/scripts/run-interview-bootstrap.ts b/scripts/run-interview-bootstrap.ts new file mode 100644 index 0000000..b31a664 --- /dev/null +++ b/scripts/run-interview-bootstrap.ts @@ -0,0 +1,222 @@ +#!/usr/bin/env bun +// CLI entry point invoked by the Go TUI for `teamhero interview bootstrap --headless`. +// Reads role-config fields from argv flags, runs the bootstrap orchestrator +// using the OpenAI-backed generator client, and exits with code 0 on success +// or 1 on failure. + +import { consola } from "consola"; +import { config as loadDotenv } from "dotenv"; + +// Do not override: CI/production env vars (real OPENAI_API_KEY, etc.) must +// win over anything that happens to be in a local .env. 
+loadDotenv(); + +import { OpenAIGeneratorClient } from "../src/services/interview/bootstrap/openai-generator-client.js"; +import { runBootstrap } from "../src/services/interview/bootstrap/orchestrator.js"; +import type { + AnalysisMode, + ProjectMode, + RoleConfig, + RubricMode, +} from "../src/services/interview/bootstrap/role-config.js"; + +interface FlagSpec { + flag: string; + target: keyof ParsedFlags; +} + +interface ParsedFlags { + role?: string; + roleTitle?: string; + stack?: string; + domain?: string; + feature?: string; + timeBox?: string; + modeProject?: string; + modeAnalysis?: string; + modeRubric?: string; + jdPath?: string; + customPrompt?: string; + outputDir?: string; + kitDir?: string; + model?: string; + maxAttempts?: string; + debug?: boolean; + stackByCandidate?: boolean; + jdInfluencesProject?: boolean; +} + +const FLAGS: readonly FlagSpec[] = [ + { flag: "--role", target: "role" }, + { flag: "--role-title", target: "roleTitle" }, + { flag: "--stack", target: "stack" }, + { flag: "--domain", target: "domain" }, + { flag: "--feature", target: "feature" }, + { flag: "--time-box", target: "timeBox" }, + { flag: "--mode-project", target: "modeProject" }, + { flag: "--mode-analysis", target: "modeAnalysis" }, + { flag: "--mode-rubric", target: "modeRubric" }, + { flag: "--jd-path", target: "jdPath" }, + { flag: "--custom-prompt", target: "customPrompt" }, + { flag: "--output-dir", target: "outputDir" }, + { flag: "--kit-dir", target: "kitDir" }, + { flag: "--model", target: "model" }, + { flag: "--max-attempts", target: "maxAttempts" }, +]; + +function parseArgs(argv: readonly string[]): ParsedFlags { + const out: ParsedFlags = {}; + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (arg === "--debug" || arg === "-d") { + out.debug = true; + continue; + } + if (arg === "--stack-by-candidate") { + out.stackByCandidate = true; + continue; + } + if (arg === "--jd-influences-project") { + out.jdInfluencesProject = true; + continue; 
+ } + const spec = FLAGS.find((f) => f.flag === arg); + if (spec && i + 1 < argv.length) { + (out as Record)[spec.target] = argv[i + 1]; + i++; + } + } + return out; +} + +function buildConfig(flags: ParsedFlags): RoleConfig | string { + if (!flags.role) return "Missing required flag --role"; + if (!flags.stack) return "Missing required flag --stack"; + // --domain is required UNLESS --jd-path is supplied. The JD + // describes the business domain; the OpenAI prompt and role-config + // validator both accept an empty domain when a JD is attached. + if (!flags.domain && !flags.jdPath) { + return "Missing required flag --domain (or attach a --jd-path so the JD can describe the domain)"; + } + if (!flags.feature) return "Missing required flag --feature"; + if (!flags.modeProject) return "Missing required flag --mode-project"; + if (!flags.modeAnalysis) return "Missing required flag --mode-analysis"; + if (!flags.modeRubric) return "Missing required flag --mode-rubric"; + if (!flags.outputDir) return "Missing required flag --output-dir"; + + const timeBox = Number.parseInt(flags.timeBox ?? 
"60", 10); + if (!Number.isFinite(timeBox)) { + return `--time-box must be an integer number of minutes (got ${String(flags.timeBox)})`; + } + + const validModeProjects = ["A", "B"]; + if (!validModeProjects.includes(flags.modeProject)) { + return `--mode-project must be one of ${validModeProjects.join("/")} (got ${flags.modeProject})`; + } + const validModeAnalyses = ["ai-assisted", "human-only"]; + if (!validModeAnalyses.includes(flags.modeAnalysis)) { + return `--mode-analysis must be one of ${validModeAnalyses.join("/")} (got ${flags.modeAnalysis})`; + } + const validModeRubrics = ["default", "custom"]; + if (!validModeRubrics.includes(flags.modeRubric)) { + return `--mode-rubric must be one of ${validModeRubrics.join("/")} (got ${flags.modeRubric})`; + } + + if (flags.stackByCandidate && flags.modeProject !== "B") { + return "--stack-by-candidate is only valid with --mode-project B"; + } + + if (flags.jdInfluencesProject && !flags.jdPath) { + return "--jd-influences-project requires --jd-path"; + } + + const config: RoleConfig = { + roleSlug: flags.role, + roleTitle: flags.roleTitle ?? flags.role, + stack: flags.stack, + // Empty domain is allowed when --jd-path is supplied; the + // OpenAI prompt and the validator both fall back to the JD. + domain: flags.domain ?? "", + featureDescription: flags.feature, + timeBoxMinutes: timeBox, + projectMode: flags.modeProject as ProjectMode, + analysisMode: flags.modeAnalysis as AnalysisMode, + rubricMode: flags.modeRubric as RubricMode, + outputDir: flags.outputDir, + ...(flags.jdPath ? { jdPath: flags.jdPath } : {}), + ...(flags.customPrompt ? { customPrompt: flags.customPrompt } : {}), + ...(flags.stackByCandidate ? { stackByCandidate: true } : {}), + ...(flags.jdInfluencesProject ? { jdInfluencesProject: true } : {}), + }; + return config; +} + +// truncateForLog clips long strings so a stray multi-KB feature description +// (or rubric custom-prompt) can't blow up the debug log. 
300 chars is +// enough to recognize the input while staying readable in a terminal. +function truncateForLog(s: string | undefined, max = 300): string { + if (!s) return ""; + const t = s.replace(/\s+/g, " ").trim(); + if (t.length <= max) return t; + return `${t.slice(0, max - 1)}…`; +} + +async function main() { + const flags = parseArgs(process.argv.slice(2)); + // consola's default log level (3) hides .debug() output. When the + // proctor passes --debug we want the per-field truncated body logs + // to actually print, so raise the threshold. Lifted once here so + // every consola.debug below benefits without re-checking the flag. + if (flags.debug) { + consola.level = 4; + } + const built = buildConfig(flags); + if (typeof built === "string") { + consola.error(built); + process.exit(1); + } + const maxAttempts = flags.maxAttempts + ? Number.parseInt(flags.maxAttempts, 10) + : undefined; + + // Always log: enough run context to triage a failure ticket without + // repro. Skip feature/prompt text bodies — those go in --debug. + consola.info( + `bootstrap.start role=${built.roleSlug} mode=${built.projectMode} stack=${built.stack} stack-by-candidate=${built.stackByCandidate ?? false} domain=${built.domain} time-box=${built.timeBoxMinutes}m rubric=${built.rubricMode} jd=${built.jdPath ?? "(none)"} jd-influences-project=${built.jdInfluencesProject ?? false} max-attempts=${maxAttempts ?? "(default)"} model=${flags.model ?? "(default)"}`, + ); + if (flags.debug) { + consola.debug( + `bootstrap.debug.feature ${truncateForLog(built.featureDescription)}`, + ); + if (built.customPrompt) { + consola.debug( + `bootstrap.debug.custom-prompt ${truncateForLog(built.customPrompt)}`, + ); + } + consola.debug(`bootstrap.debug.output-dir ${built.outputDir}`); + consola.debug(`bootstrap.debug.kit-dir ${flags.kitDir ?? 
"(none)"}`); + } + + const startedAt = Date.now(); + const result = await runBootstrap(built, { + client: new OpenAIGeneratorClient(undefined, flags.model), + kitTemplateDir: flags.kitDir, + ...(Number.isFinite(maxAttempts) ? { maxAttempts } : {}), + }); + const elapsedMs = Date.now() - startedAt; + if (!result.ok) { + consola.error( + `bootstrap.fail attempts=${result.attempts} elapsed=${elapsedMs}ms`, + ); + for (const f of result.failures) consola.error(` - ${f}`); + process.exit(1); + } + consola.success( + `bootstrap.ok attempts=${result.attempts} elapsed=${elapsedMs}ms output=${built.outputDir}`, + ); +} + +main().catch((err) => { + consola.error(err); + process.exit(1); +}); diff --git a/scripts/run-interview-cohort.ts b/scripts/run-interview-cohort.ts new file mode 100644 index 0000000..ffe90fe --- /dev/null +++ b/scripts/run-interview-cohort.ts @@ -0,0 +1,60 @@ +#!/usr/bin/env bun +// CLI entry for `teamhero interview cohort`. Spawned by the Go TUI. + +import { consola } from "consola"; +import { existsSync } from "node:fs"; +import { join } from "node:path"; +import { writeCohortSummary } from "../src/services/interview/cohort/cohort-summary.js"; + +interface Flags { + role?: string; + roleDir?: string; + order?: "alphabetical" | "chronological"; +} + +function parseFlags(argv: readonly string[]): Flags { + const out: Flags = {}; + for (let i = 0; i < argv.length; i++) { + const a = argv[i]; + if (a === "--role" && i + 1 < argv.length) { + out.role = argv[i + 1]; + i++; + } else if (a === "--role-dir" && i + 1 < argv.length) { + out.roleDir = argv[i + 1]; + i++; + } else if (a === "--order" && i + 1 < argv.length) { + const v = argv[i + 1]; + if (v === "alphabetical" || v === "chronological") out.order = v; + i++; + } + } + return out; +} + +async function main() { + const flags = parseFlags(process.argv.slice(2)); + if (!flags.role) { + consola.error("Missing required flag: --role "); + process.exit(1); + } + const roleDir = + flags.roleDir ?? 
+ join(process.cwd(), "docs", "interviews", flags.role); + if (!existsSync(roleDir)) { + consola.error(`Role directory does not exist: ${roleDir}`); + process.exit(1); + } + const out = writeCohortSummary({ + roleDir, + roleSlug: flags.role, + order: flags.order, + }); + consola.success( + `COHORT.md written for ${out.recordCount} candidate(s): ${out.path}`, + ); +} + +main().catch((err) => { + consola.error(err); + process.exit(1); +}); diff --git a/scripts/run-interview-review.ts b/scripts/run-interview-review.ts new file mode 100644 index 0000000..d73a54f --- /dev/null +++ b/scripts/run-interview-review.ts @@ -0,0 +1,93 @@ +#!/usr/bin/env bun +// CLI entry for `teamhero interview review`. Spawned by the Go TUI. + +import { consola } from "consola"; +import { config as loadDotenv } from "dotenv"; + +// Do not override: CI/production env vars (real OPENAI_API_KEY, etc.) must +// win over anything that happens to be in a local .env. +loadDotenv(); + +import { OpenAIObserverClient } from "../src/services/interview/review/ai-observer.js"; +import { reviewCandidate } from "../src/services/interview/review/review-orchestrator.js"; + +interface Flags { + repo?: string; + candidate?: string; + transcript?: string; + interviewerNotes?: string; + sessionRecordingUrl?: string; + sessionPlatform?: string; + sessionDate?: string; + outputDir?: string; + localRepoPath?: string; +} + +function parseFlags(argv: readonly string[]): Flags { + const out: Flags = {}; + const map: Record = { + "--repo": "repo", + "--candidate": "candidate", + "--transcript": "transcript", + "--interviewer-notes": "interviewerNotes", + "--session-recording-url": "sessionRecordingUrl", + "--session-platform": "sessionPlatform", + "--session-date": "sessionDate", + "--output-dir": "outputDir", + "--local-repo-path": "localRepoPath", + }; + for (let i = 0; i < argv.length; i++) { + const key = map[argv[i]]; + if (key && i + 1 < argv.length) { + (out as Record)[key] = argv[i + 1]; + i++; + } + } + return 
out; +} + +async function main() { + const flags = parseFlags(process.argv.slice(2)); + if (!flags.candidate) { + consola.error("Missing required flag: --candidate "); + process.exit(1); + } + if (!flags.repo && !flags.localRepoPath) { + consola.error("Need either --repo or --local-repo-path "); + process.exit(1); + } + if (flags.repo && flags.localRepoPath) { + consola.error( + "--repo and --local-repo-path are mutually exclusive; supply exactly one.", + ); + process.exit(1); + } + const result = await reviewCandidate( + { + repoUrl: flags.repo ?? "", + candidateName: flags.candidate, + transcriptPath: flags.transcript, + interviewerNotesPath: flags.interviewerNotes, + sessionRecordingUrl: flags.sessionRecordingUrl, + sessionPlatform: flags.sessionPlatform, + sessionDate: flags.sessionDate, + outputDir: flags.outputDir, + localRepoPath: flags.localRepoPath, + }, + { observer: new OpenAIObserverClient() }, + ); + if (!result.ok) { + consola.error("Review failed:"); + for (const f of result.failures) consola.error(` - ${f}`); + process.exit(1); + } + consola.success(`Audit written:`); + consola.info(` summary: ${result.outputs?.summaryPath}`); + consola.info(` audit: ${result.outputs?.auditPath}`); + consola.info(` json: ${result.outputs?.auditJsonPath}`); +} + +main().catch((err) => { + consola.error(err); + process.exit(1); +}); diff --git a/scripts/run-tests.sh b/scripts/run-tests.sh index 653a44c..fcefb9d 100755 --- a/scripts/run-tests.sh +++ b/scripts/run-tests.sh @@ -52,9 +52,13 @@ for f in "${FILES[@]}"; do output=$(bun test "$f" 2>&1) fi - file_pass=$(echo "$output" | grep -oP '\d+(?= pass)' || echo 0) - file_fail=$(echo "$output" | grep -oP '\d+(?= fail)' || echo 0) - file_skip=$(echo "$output" | grep -oP '\d+(?= skip)' || echo 0) + # Bun may emit "N pass" / "N fail" / "N skip" multiple times (per-file lines + # plus a summary). 
Take the LAST match to get the file's summary totals; + # without `tail -1` the value is multi-line and bash arithmetic fails with + # "syntax error in expression" downstream. + file_pass=$(echo "$output" | grep -oP '\d+(?= pass)' | tail -1) + file_fail=$(echo "$output" | grep -oP '\d+(?= fail)' | tail -1) + file_skip=$(echo "$output" | grep -oP '\d+(?= skip)' | tail -1) [[ -z "$file_pass" ]] && file_pass=0 [[ -z "$file_fail" ]] && file_fail=0 diff --git a/skills/teamhero-interview/SKILL.md b/skills/teamhero-interview/SKILL.md new file mode 100644 index 0000000..191f897 --- /dev/null +++ b/skills/teamhero-interview/SKILL.md @@ -0,0 +1,159 @@ +--- +name: teamhero-interview +description: Run candidate AI-collaboration coding interviews end-to-end — configure a role and generate the project (bootstrap), review a single candidate's submission with structured observations (review), produce a cohort roll-up across all candidates for a role (cohort). Use this when the user wants to set up an interview, evaluate a candidate's submitted repo, see the full cohort for a role, or asks anything about hiring through the `teamhero interview` CLI. +--- + +# teamhero-interview — bounded-context skill + +This skill is the conversational wrapper for the `teamhero interview` CLI. It +translates the user's natural-language intent into the right verb invocation +and surfaces results back in plain English. **It contains no business logic +of its own.** All hiring-specific logic lives in `src/services/interview/` +in the teamhero.cli repo; this skill only routes. + +The bounded context is hiring: configure a role, evaluate a candidate, +review a cohort. Anything outside that belongs to a different skill or no +skill at all. + +## Ethical framing — always present these to the user + +Before running review or cohort, remind the user of the three commitments +that govern this tool. These are not boilerplate; they shape how the +output should be read: + +1. 
**Observations, not scores.** The tool produces narrative observations + and raw measurements per dimension, never numerical scores. The hiring + decision is the human's. +2. **AI bias diversification, not elimination.** The AI carries its own + biases (training-data, RLHF preference-tuning, name/demographic-cue, + verbosity-preference). Its perspective is *different* from any + individual reviewer's, not unbiased. +3. **Human-in-the-loop is mandatory.** Every audit requires a human + sign-off with a manager-written reasoning summary in their own words + before the audit is considered complete. + +If a user asks the skill to "score the candidate" or "rank these +candidates," redirect: explain the tool does not produce scores, then +offer to render the observations + measurements + sign-off status, which +is the appropriate output for the request. + +## Available verbs + +### Production verbs (MVP) + +#### `teamhero interview bootstrap` +Configures a role and generates the candidate coding project. + +**Interactive wizard (primary path for humans in a terminal session).** When +the user is at a TTY and not scripting, recommend they just run: + +``` +teamhero interview bootstrap +``` + +with no flags. They will be walked through role slug, stack, domain, feature, +time-box, project mode, analysis mode, and rubric mode (with conditional +follow-ups for custom prompt or job-description file). The wizard hands the +chosen configuration to the same validator the headless path uses. 
+ +**Headless flags (agents and CI use this form).** If the user is scripting, +asks for a one-liner, or is invoking via an agent like Claude Code, use the +explicit flag list: + +``` +teamhero interview bootstrap --headless \ + --role <slug> --stack <stack> --domain <domain> --feature "<description>" \ + --time-box <minutes> \ + --mode-project A|B \ + --mode-analysis ai-assisted|human-only \ + --mode-rubric default|custom|default+jd \ + --output-dir <path> \ + [--jd-path <file>] [--custom-prompt "<prompt>"] [--role-title "<title>"] +``` + +When user says: "set up an interview for a senior backend role", "create a +new role", "I need a coding project for candidates", "bootstrap a hiring +round" → run bootstrap. + +#### `teamhero interview review --candidate <name> --repo <url>` +Reviews a single candidate's submitted repository. The repo can also be +supplied positionally (`teamhero interview review --candidate "Jane" <url>`) +or as a local path (`--local-repo-path <dir>` instead of `--repo`). + +```bash +teamhero interview review --candidate "Jane Doe" --repo <url> \ + [--transcript <file>] [--interviewer-notes <file>] \ + [--session-recording-url <url>] [--session-platform zoom|teams|meet|other] \ + [--session-date YYYY-MM-DD] [--output-dir <path>] +``` + +When user says: "review Alice's submission", "evaluate this candidate's +repo", "produce the audit for X" → run review. + +Always print the ADVISORY warning banner before reporting results, and +always end by reminding the user the audit is not complete until they +write the sign-off section with their own categorical recommendation +and reasoning summary. + +#### `teamhero interview cohort --role <slug>` +Produces a `COHORT.md` roll-up of all candidates for a role. + +``` +teamhero interview cohort --role <slug> [--order alphabetical|chronological] +``` + +When user says: "show me the cohort", "list all candidates for the +backend role", "roll up the interviews" → run cohort. + +### Stubs (v1.5; not yet implemented) + +- `teamhero interview list-roles` — show all configured roles. 
+- `teamhero interview list-candidates --role <slug>` — show all candidates + for a role. + +If the user asks for these, explain they are planned for v1.5 and offer +the workaround (`ls docs/interviews/`). + +## Cohort orchestration — when invoked conversationally + +If the user says "review the whole cohort" or "review all candidates for +role X": + +1. Locate the role config (typically `docs/interviews/<slug>/role-config.json` + or `<slug>/role-config.json` in the project root). If candidate URLs are + listed there, use them. Otherwise ask the user for the list. +2. For each candidate URL the user provides: + a. Ask the user which transcript file (if any) belongs to that candidate. + Look in `~/Downloads/`, the project's `transcripts/` directory, or + anywhere the user indicates. Do NOT guess — ask. + b. Invoke `teamhero interview review --candidate "<name>" --repo <url> \ + --transcript <path>` and capture the audit. + c. Report which audit was written (path). +3. After all candidates are reviewed, invoke `teamhero interview cohort + --role <slug>` and report the path to `COHORT.md`. +4. Remind the user that **each candidate's audit needs a separate sign-off + from the manager** before it counts as complete, and that no hiring + decision should be made from the cohort report alone. + +## Output style + +- Always include the path(s) to written audits/cohort files. +- Never reproduce the AI observer's narrative observations in chat verbatim + without also including a pointer to the audit.md (the reasoning chain is + preserved there for a reason — managers should read the full chain, not + just the summary). +- If a review run fails, surface the failure list literally; do not + paraphrase the diagnostic. + +## What NOT to do + +- **Do not produce scores.** If a user prompt would result in a numerical + rating ("rate Alice 7/10 on context engineering"), refuse and explain. 
+- **Do not bypass the privacy gate.** If start.sh refuses because + PRIVACY_RELEASE.md is unsigned, do not work around it — explain to the + user that the candidate must sign first. +- **Do not feed session_recording_url to the AI observer.** The CLI + already enforces this; do not paste meeting links into prompt fields. +- **Do not generate any code or schema changes that re-introduce score + fields.** Slices 1–4 deliberately strip them; the strict json_schema + validates them out; doing so would violate the tool's ethics floor. diff --git a/src/cli/index.ts b/src/cli/index.ts index 453cf13..794cdb2 100644 --- a/src/cli/index.ts +++ b/src/cli/index.ts @@ -144,7 +144,7 @@ export function createCli( .version(VERSION); program - .command("report") + .command("report [args...]") .description("Generate a developer contribution report") .helpOption(false) // Let the Go TUI binary handle --help .allowUnknownOption() // Pass through any flags to the TUI binary @@ -154,7 +154,7 @@ export function createCli( reportArgIndex >= 0 ? process.argv.slice(reportArgIndex + 1) : []; // Reject subcommands that are top-level — don't allow `teamhero report doctor`. - const subcommands = ["doctor", "setup"]; + const subcommands = ["doctor", "setup", "interview"]; if (argsToPass.length > 0 && subcommands.includes(argsToPass[0])) { deps.logger.error( `Unknown argument: ${argsToPass[0]}. 
Did you mean \`teamhero ${argsToPass[0]}\`?`, @@ -166,7 +166,7 @@ export function createCli( }); program - .command("setup") + .command("setup [args...]") .description("Configure credentials and preferences") .helpOption(false) .allowUnknownOption() @@ -178,7 +178,7 @@ export function createCli( }); program - .command("doctor") + .command("doctor [args...]") .description("Validate installation health") .helpOption(false) .allowUnknownOption() @@ -189,6 +189,22 @@ export function createCli( await spawnTui(deps, argsToPass); }); + program + .command("interview [args...]") + .description( + "Review candidate AI-collaboration interviews (bootstrap, review, cohort)", + ) + .helpOption(false) + .allowUnknownOption() + .action(async function (this: Command) { + const interviewArgIndex = process.argv.indexOf("interview"); + const argsToPass = + interviewArgIndex >= 0 + ? process.argv.slice(interviewArgIndex) + : ["interview"]; + await spawnTui(deps, argsToPass); + }); + return program; } @@ -210,6 +226,8 @@ export async function createDefaultDependencies(): Promise<CliDependencies> { } satisfies CliDependencies; } +const KNOWN_SUBCOMMANDS = ["report", "doctor", "setup", "interview"]; + export async function run( argv: string[] = process.argv, deps?: CliDependencies, @@ -217,13 +235,29 @@ export async function run( const resolvedDeps = deps ?? (await createDefaultDependencies()); const program = createCli(resolvedDeps); + const args = argv.slice(2); + const first = args[0]; + + // Reject unknown subcommands explicitly. Without this guard, commander + // silently prints the top-level help and exits 0, which makes a stale + // build look like a missing-command bug. + if ( + typeof first === "string" && + first.length > 0 && + !first.startsWith("-") && + !KNOWN_SUBCOMMANDS.includes(first) + ) { + resolvedDeps.logger.error( + `Unknown subcommand: ${first}. 
Run \`teamhero --help\` to see available commands.`, + ); + process.exit(1); + } + // If a subcommand is followed by --help, pass through to the Go binary // instead of letting Commander handle it (which prints the top-level help). - const args = argv.slice(2); - const subcommands = ["report", "doctor", "setup"]; if ( args.length >= 1 && - subcommands.includes(args[0]) && + KNOWN_SUBCOMMANDS.includes(args[0]) && args.includes("--help") ) { await spawnTui(resolvedDeps, args); diff --git a/src/services/interview/bootstrap/openai-generator-client.ts b/src/services/interview/bootstrap/openai-generator-client.ts new file mode 100644 index 0000000..549a808 --- /dev/null +++ b/src/services/interview/bootstrap/openai-generator-client.ts @@ -0,0 +1,224 @@ +import { readFileSync } from "node:fs"; +import { consola } from "consola"; +import OpenAI from "openai"; +import { getEnv } from "../../../lib/env.js"; +import type { + GeneratedProject, + GeneratorClient, +} from "./project-generator.js"; +import type { RoleConfig } from "./role-config.js"; + +// readJobDescription returns the JD body when the role config has +// asked the JD to shape project generation. Empty string when either +// the influence flag is off or the file can't be read — the +// generation prompt simply omits the context block in those cases. +// +// We trust the validator to have asserted the file exists before +// reaching this point; the try/catch is defence in depth so a +// transient FS error (e.g., the proctor moved the file between +// validation and generation) downgrades to "no JD context" rather +// than killing the whole run. 
+function readJobDescription(config: RoleConfig): string { + if (!config.jdInfluencesProject) return ""; + if (!config.jdPath || config.jdPath.trim().length === 0) return ""; + try { + return readFileSync(config.jdPath, "utf8").trim(); + } catch { + return ""; + } +} + +interface GeneratedFileResponse { + path: string; + content: string; +} + +interface ProjectResponse { + files: GeneratedFileResponse[]; +} + +const PROJECT_RESPONSE_SCHEMA = { + type: "object", + additionalProperties: false, + required: ["files"], + properties: { + files: { + type: "array", + items: { + type: "object", + additionalProperties: false, + required: ["path", "content"], + properties: { + path: { type: "string" }, + content: { type: "string" }, + }, + }, + }, + }, +} as const; + +function buildPrompt( + config: RoleConfig, + attempt: number, + previousFailures: readonly string[] = [], +): string { + // The full 9-dimension rubric block used to be included verbatim here + // so the model could "build for observability against each + // dimension." In practice the dimensions are how the AI REVIEWER + // scores the candidate — the generator just needs to produce a + // substantive project. Inlining ~600 input tokens of review-side + // context per call wasn't earning its keep, especially after a + // proctor reported a single Mode B run cost $1.36. A one-line + // summary preserves the intent without the bloat; the full rubric + // still drives the review-side prompt in ai-observer.ts. + const rubricSummary = + "Build something the candidate can engage with thoughtfully — enough decision points, naming choices, and edge cases for them to demonstrate engineering judgment under AI augmentation. Don't pad or over-design."; + + const retryNote = + previousFailures.length > 0 + ? `\n\nPrevious attempt failed validation with: ${previousFailures.join("; ")}\nPlease address these specific failures in this attempt.\n` + : ""; + + const modeSpec = + config.projectMode === "A" + ? 
`Generate a Mode A "AI-bootstrap extension" project — a realistic starter codebase the candidate extends within the time-box. + +REQUIRED FILES: +- README.md at the root — written FOR THE CANDIDATE in plain language. Sections: + (1) "What you're building" — what this project is and the feature/extension the candidate will implement (${config.featureDescription}). + (2) "Time-box" — state the time-box explicitly as ${config.timeBoxMinutes} minutes. + (3) "Getting started" — exact command(s) to install deps. Tell the candidate they are expected to write their own tests; do NOT reference any pre-existing failing test. + (4) "Acceptance criteria" — bullet list of what "done" looks like for this slice. + (5) "Process" — one sentence pointing to INTERVIEW_RULES.md for the recording/interview workflow. + DO NOT write agent operating instructions. DO NOT mention rubric dimensions or what the observer is looking for. DO NOT coach the candidate on how to work with their AI agent. Agent guidance is shipped separately by the kit; the AI generator must not author it. +- Source files under src/ — split the work into a few cohesive modules (domain types, a service/orchestrator, helpers as appropriate for ${config.stack}). Right-size for the ${config.timeBoxMinutes}-minute time-box: substantive enough that a candidate can demonstrate judgment about architecture and naming, not so large that they can't read it in the first 10 minutes. +- A working test framework setup (package.json/go.mod/etc as appropriate for ${config.stack}) so the candidate can immediately write and run their own tests. Include only the dependency manifest and any required config — NO test files. + +DO NOT GENERATE (these would hint at the answer or break the evaluation): +- Any test files. The candidate writes their own tests as part of the evaluation; pre-existing tests (even skipped ones) would leak the API shape, function names, or expected behaviors. +- GLOSSARY.md. 
A glossary would hint at the domain concepts the candidate is expected to identify themselves. +- CLAUDE.md or AGENTS.md. Those are provided by the interview kit at copy time; writing one would overwrite carefully-authored proctor content with hallucinated instructions.` + : `Generate a Mode B "greenfield brief" project. The output MUST include only: +- BRIEF.md with required sections: ## Time-box (state ${config.timeBoxMinutes} minutes), ## Acceptance criteria, ## Deliverables +${ + config.stackByCandidate + ? `- A "## Tech stack" section in BRIEF.md that EXPLICITLY states the candidate selects their own tech stack. Mention "${config.stack}" only as context for the kind of tooling the hiring team uses internally — do NOT require it. The candidate's stack choice is itself evaluated.` + : `- A "## Tech stack" section in BRIEF.md that REQUIRES the candidate to use ${config.stack}.` +} +- No starter code at all. The candidate writes everything from scratch. +- DO NOT generate a CLAUDE.md or an AGENTS.md. Those files are provided by the interview kit at copy time.`; + + // Job description context — included only when the proctor opted to + // let the JD influence project generation. The model uses this to + // calibrate the project's complexity and domain character: e.g., a + // junior healthtech JD nudges toward an EHR-flavoured feature; a + // staff platform-engineering JD nudges toward systems-level + // concerns. Placed BEFORE the rubric so the structural rules + // (README required, no tests/glossary/CLAUDE.md) remain the final + // authoritative instruction the model reads. + const jd = readJobDescription(config); + const jdContext = + jd.length > 0 + ? `\n\nJob description context — use this to calibrate the project's complexity, seniority, and domain character. 
Do not echo it back to the candidate or reference it in the README; treat it as background that shapes what you build:\n---\n${jd}\n---\n` + : ""; + + // Domain: when a JD is attached the wizard skips the Domain question + // (the JD already describes it), so render an instruction to the + // model rather than an empty "Domain: ." line. The jdContext block + // below has the actual JD body for inference. + const domainLine = + config.domain && config.domain.trim().length > 0 + ? `Domain: ${config.domain}.` + : "Domain: infer from the job description context below."; + + return `You are generating a candidate coding interview project for the role: ${config.roleTitle}. +Stack: ${config.stack}. ${domainLine} Feature focus: ${config.featureDescription}. +Time-box: ${config.timeBoxMinutes} minutes. + +This is attempt ${attempt}.${retryNote}${jdContext} + +${modeSpec} + +Project surface area: ${rubricSummary} + +Return a JSON object with a "files" array. Each entry has "path" (repo-relative) and "content" (full file content).`; +} + +export class OpenAIGeneratorClient implements GeneratorClient { + private readonly client: OpenAI; + public readonly model: string; + + constructor(client?: OpenAI, model?: string) { + this.client = client ?? new OpenAI({ apiKey: getEnv("OPENAI_API_KEY") }); + this.model = model ?? getEnv("AI_MODEL") ?? "gpt-5-mini"; + } + + async generate(input: { + readonly config: RoleConfig; + readonly attempt: number; + readonly previousFailures?: readonly string[]; + }): Promise<GeneratedProject> { + const prompt = buildPrompt( + input.config, + input.attempt, + input.previousFailures, + ); + const response = await this.client.responses.create({ + model: this.model, + input: prompt, + // gpt-5-mini and gpt-5 default to "medium" reasoning effort, + // which spends a large number of internal reasoning tokens on + // a structured-output generation task. 
A proctor reported a + // single Mode B run cost $1.36 / 143k tokens; reasoning was + // the dominant share. "low" still produces high-quality + // scaffolds for this kind of file-list task and meaningfully + // shortens the billed tokens. Override via AI_REASONING_EFFORT + // if a future use case wants medium/high. + reasoning: { + effort: + (getEnv("AI_REASONING_EFFORT") as + | "minimal" + | "low" + | "medium" + | "high" + | undefined) ?? "low", + }, + text: { + format: { + type: "json_schema", + name: "interview_project", + schema: PROJECT_RESPONSE_SCHEMA, + strict: true, + }, + }, + }); + // Log token usage at info level so a proctor can see the + // per-attempt cost without --debug. The Responses API returns a + // usage object with input/output/(reasoning) counts; field + // shape is the standard OpenAI v2 shape (input_tokens, + // output_tokens, output_tokens_details.reasoning_tokens). The + // cast is defensive — older mocks may omit usage entirely. + const usage = ( + response as { + usage?: { + input_tokens?: number; + output_tokens?: number; + total_tokens?: number; + output_tokens_details?: { reasoning_tokens?: number }; + }; + } + ).usage; + if (usage) { + const reasoning = usage.output_tokens_details?.reasoning_tokens ?? 0; + consola.info( + `openai.usage model=${this.model} attempt=${input.attempt} input=${usage.input_tokens ?? 0} output=${usage.output_tokens ?? 0} reasoning=${reasoning} total=${usage.total_tokens ?? 
0}`, + ); + } + const text = (response as { output_text?: string }).output_text; + if (!text) { + throw new Error("OpenAI Responses API returned no output_text"); + } + const parsed = JSON.parse(text) as ProjectResponse; + return { files: parsed.files }; + } +} diff --git a/src/services/interview/bootstrap/orchestrator.ts b/src/services/interview/bootstrap/orchestrator.ts new file mode 100644 index 0000000..c868abe --- /dev/null +++ b/src/services/interview/bootstrap/orchestrator.ts @@ -0,0 +1,109 @@ +import { existsSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { + generateProject, + type GeneratorClient, +} from "./project-generator.js"; +import { + type RoleConfig, + validateRoleConfig, + writeRoleConfig, +} from "./role-config.js"; + +// Static index.html stub written for Mode B (greenfield) runs. The +// candidate opens this file in their browser to get a friendly +// "where do I start" landing pad even when they've chosen to write +// everything from scratch. Kept deliberately small — no styling, no +// dependencies, no framework markers — so it doesn't influence the +// candidate's tooling choices. +const MODE_B_INDEX_HTML_STUB = `<!doctype html> +<html lang="en"> + <head> + <meta charset="utf-8" /> + <meta name="viewport" content="width=device-width, initial-scale=1" /> + <title>Interview project — getting started + + +

Interview project

+

+ Welcome. This is a placeholder. Open + BRIEF.md for what you're + being asked to build and the acceptance criteria. +

+

+ Replace this file with your own work. You choose the stack and + tooling. +

+ + +`; + +// writeModeBStub drops a minimal index.html into a Mode B output so the +// candidate has something concrete to open in a browser. No-op for +// Mode A (which already has source files) and no-op when the AI's +// own output already contains an index.html so we don't clobber it. +function writeModeBStub(config: RoleConfig): void { + if (config.projectMode !== "B") return; + const target = join(config.outputDir, "index.html"); + if (existsSync(target)) return; + writeFileSync(target, MODE_B_INDEX_HTML_STUB, "utf8"); +} + +export interface RunBootstrapOptions { + readonly client: GeneratorClient; + readonly kitTemplateDir?: string; + readonly maxAttempts?: number; +} + +export interface RunBootstrapResult { + readonly ok: boolean; + readonly attempts: number; + readonly failures: readonly string[]; +} + +export async function runBootstrap( + config: RoleConfig, + options: RunBootstrapOptions, +): Promise { + const configValidation = validateRoleConfig(config); + if (!configValidation.ok) { + return { + ok: false, + attempts: 0, + failures: configValidation.failures, + }; + } + + // Wrap so an unexpected throw inside generation / writeRoleConfig (e.g. + // network failure, disk-full, path-guard rejection) surfaces as a + // structured failure on RunBootstrapResult instead of an unhandled + // rejection at the CLI boundary. + try { + const generation = await generateProject(config, options.client, { + kitTemplateDir: options.kitTemplateDir, + maxAttempts: options.maxAttempts, + }); + if (!generation.ok) { + return { + ok: false, + attempts: generation.attempts, + failures: generation.failures, + }; + } + + writeRoleConfig(config.outputDir, config); + writeModeBStub(config); + + return { + ok: true, + attempts: generation.attempts, + failures: [], + }; + } catch (err) { + return { + ok: false, + attempts: 0, + failures: [err instanceof Error ? 
err.message : String(err)], + }; + } +} diff --git a/src/services/interview/bootstrap/project-generator.ts b/src/services/interview/bootstrap/project-generator.ts new file mode 100644 index 0000000..516be9a --- /dev/null +++ b/src/services/interview/bootstrap/project-generator.ts @@ -0,0 +1,307 @@ +import { homedir } from "node:os"; +import { + lstatSync, + mkdirSync, + readdirSync, + readFileSync, + realpathSync, + rmSync, + statSync, + writeFileSync, +} from "node:fs"; +import { dirname, isAbsolute, join, relative, resolve, sep } from "node:path"; +import { + validateModeAProject, + validateModeBProject, + type ValidationResult, +} from "./project-validator.js"; +import type { RoleConfig } from "./role-config.js"; + +export interface GeneratedFile { + readonly path: string; // relative to outputDir + readonly content: string; +} + +export interface GeneratedProject { + readonly files: readonly GeneratedFile[]; +} + +export interface GeneratorClient { + generate(input: { + readonly config: RoleConfig; + readonly attempt: number; + readonly previousFailures?: readonly string[]; + }): Promise; +} + +export interface GenerateOptions { + /** Source directory for the embedded interview kit (copied verbatim into outputDir). */ + readonly kitTemplateDir?: string; + /** Maximum number of generation attempts before giving up. */ + readonly maxAttempts?: number; +} + +export interface GenerateResult { + readonly ok: boolean; + readonly attempts: number; + readonly failures: readonly string[]; +} + +// Three attempts handle the structural failure modes the validator +// still enforces (missing README.md; missing or incomplete BRIEF.md). +// One structured Responses API call per attempt, so the budget stays +// cheap; bump if a future validator rule produces more retry pressure. 
+const DEFAULT_MAX_ATTEMPTS = 3; + +/** + * Resolves `relPath` relative to `rootAbs` and refuses paths that escape the + * root via `..`, absolute components, drive letters, or symlinks that point + * outside the root. The generator client returns file paths from an LLM + * response, which is untrusted input. + * + * Two-stage containment: (1) string-level relative-path resolution catches + * obvious traversal; (2) realpath check on the parent directory catches a + * symlink planted by a previous attempt that points outside the root. + * Without (2), an attacker who can leave `subdir -> /etc` in the output + * tree could redirect a later write to `/etc/passwd` via `subdir/passwd`. + */ +function resolveWithinRoot(rootAbs: string, relPath: string): string { + if (relPath.includes("\0")) { + throw new Error( + `Generated file path contains a null byte, refusing: ${JSON.stringify(relPath)}`, + ); + } + if (isAbsolute(relPath)) { + throw new Error( + `Generated file path is absolute, refusing: ${relPath}`, + ); + } + const target = resolve(rootAbs, relPath); + const rel = relative(rootAbs, target); + if (rel.startsWith("..") || rel.startsWith(`..${sep}`) || isAbsolute(rel)) { + throw new Error( + `Generated file path escapes output directory, refusing: ${relPath}`, + ); + } + + // Walk path segments from the root toward the target; refuse if any + // existing intermediate component is a symlink. This neutralizes a + // `subdir -> /etc` symlink planted by a prior generation attempt. + let cursor = rootAbs; + const parts = rel.split(sep).filter((p) => p.length > 0); + // Drop the final segment — it's the file name, which may not exist yet. 
+ for (let i = 0; i < parts.length - 1; i++) { + cursor = join(cursor, parts[i]); + try { + const st = lstatSync(cursor); + if (st.isSymbolicLink()) { + throw new Error( + `Generated file path traverses a symlink at ${cursor}, refusing: ${relPath}`, + ); + } + } catch (err) { + // ENOENT is expected for new directories; rethrow anything else. + if ( + err instanceof Error && + (err as NodeJS.ErrnoException).code !== "ENOENT" + ) { + throw err; + } + } + } + + return target; +} + +function writeGenerated(outputDir: string, project: GeneratedProject): void { + // realpath the root once so subsequent containment math is symlink-stable. + // If outputDir itself doesn't exist yet, fall back to its resolved path — + // clearOutputDir runs immediately before writeGenerated and creates it. + let rootAbs = resolve(outputDir); + try { + rootAbs = realpathSync(rootAbs); + } catch { + // not yet created — resolved path is the best we can do + } + for (const file of project.files) { + const target = resolveWithinRoot(rootAbs, file.path); + mkdirSync(dirname(target), { recursive: true }); + writeFileSync(target, file.content, "utf8"); + } +} + +// Known dangerous absolute paths we refuse to recursively delete even when +// they technically pass the depth and root-segment checks. Adding entries +// here is preferable to adding new heuristics — it forces an explicit +// decision for each system-relevant path. +const DANGEROUS_ROOTS: ReadonlySet = new Set([ + "/", + "/bin", + "/boot", + "/dev", + "/etc", + "/home", + "/lib", + "/lib32", + "/lib64", + "/mnt", + "/opt", + "/proc", + "/root", + "/run", + "/sbin", + "/srv", + "/sys", + "/tmp", + "/usr", + "/usr/local", + "/var", +]); + +/** + * Refuses to clear paths that are obviously dangerous to recursively delete: + * filesystem roots, the user's home directory, single-segment paths + * (`/foo`), or well-known system directories like `/tmp` and `/var` even + * when they technically resolve fine. 
Subdirectories of those system + * directories (`/tmp/my-output`) are permitted — that's where mkdtemp + * lives and where tests stage fixtures. + * + * Stronger than "is this filesystem root?" because a misconfigured + * `outputDir: "/tmp"` would previously have been accepted and would have + * deleted every other process's tempfiles. + */ +function assertSafeToClear(outputDir: string): void { + const abs = resolve(outputDir); + if (abs === sep) { + throw new Error(`Refusing to clear filesystem root: ${abs}`); + } + + const home = homedir(); + if (home && abs === resolve(home)) { + throw new Error(`Refusing to clear home directory: ${abs}`); + } + + if (DANGEROUS_ROOTS.has(abs)) { + throw new Error( + `Refusing to clear well-known system directory: ${abs}. ` + + `Pick an output directory inside your workspace.`, + ); + } + + // Refuse a single-segment absolute path like `/foo` or `/anything`. + // Real workspace paths have at least two segments after the root. + const parts = abs.split(sep).filter((p) => p.length > 0); + if (parts.length < 2) { + throw new Error( + `Refusing to clear single-segment path: ${abs}. ` + + `Output directories must be a subdirectory, not a top-level path.`, + ); + } +} + +function clearOutputDir(outputDir: string): void { + assertSafeToClear(outputDir); + rmSync(outputDir, { recursive: true, force: true }); + mkdirSync(outputDir, { recursive: true }); +} + +/** + * Recursively copies `src` into `dest`, substituting `{{KEY}}` tokens in + * every file body using `vars`. The substitution is intentionally + * minimal — single `String.replaceAll` per key, no escaping or + * conditionals — so the kit's template grammar is "the literal text + * that appears in the source files." + * + * Files that don't contain any placeholder pay only one read+write, no + * regex compilation per file. 
+ */ +function copyDir( + src: string, + dest: string, + vars: Readonly> = {}, +): void { + const tokenKeys = Object.keys(vars); + for (const entry of readdirSync(src)) { + const s = join(src, entry); + const d = join(dest, entry); + const st = statSync(s); + if (st.isDirectory()) { + mkdirSync(d, { recursive: true }); + copyDir(s, d, vars); + } else { + mkdirSync(dirname(d), { recursive: true }); + if (tokenKeys.length === 0) { + writeFileSync(d, readFileSync(s)); + continue; + } + // readFileSync as utf8 — template files (.md, .sh, .json) are text. + // If the kit ever includes a binary asset we'll need to whitelist + // extensions; today there are none. + let body = readFileSync(s, "utf8"); + for (const key of tokenKeys) { + body = body.replaceAll(`{{${key}}}`, vars[key]); + } + writeFileSync(d, body); + } + } +} + +function validateOutput( + config: RoleConfig, + outputDir: string, +): ValidationResult { + if (config.projectMode === "A") return validateModeAProject(outputDir); + return validateModeBProject(outputDir); +} + +export async function generateProject( + config: RoleConfig, + client: GeneratorClient, + options: GenerateOptions = {}, +): Promise { + const maxAttempts = options.maxAttempts ?? DEFAULT_MAX_ATTEMPTS; + let lastFailures: readonly string[] = []; + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + clearOutputDir(config.outputDir); + + const project = await client.generate({ + config, + attempt, + previousFailures: lastFailures, + }); + writeGenerated(config.outputDir, project); + + // Copy kit templates after the generated files so kit files take precedence + // when paths overlap (intentional: kit is the canonical wiring). + // {{TIME_BOX}} placeholders in kit text files are substituted with the + // configured minutes — INTERVIEW_RULES.md reads this so candidates see + // a real number instead of the literal placeholder. 
+ if (options.kitTemplateDir) { + copyDir(options.kitTemplateDir, config.outputDir, { + TIME_BOX: String(config.timeBoxMinutes), + }); + } + + const validation = validateOutput(config, config.outputDir); + if (validation.ok) { + return { ok: true, attempts: attempt, failures: [] }; + } + lastFailures = validation.failures; + } + + return { + ok: false, + attempts: maxAttempts, + failures: lastFailures, + }; +} + +/** + * Validation-only helper exposed for callers that want to re-check a previously + * generated project (e.g., after a manual override). + */ +export function validateGenerated(config: RoleConfig): ValidationResult { + return validateOutput(config, config.outputDir); +} + diff --git a/src/services/interview/bootstrap/project-validator.ts b/src/services/interview/bootstrap/project-validator.ts new file mode 100644 index 0000000..34fb3a3 --- /dev/null +++ b/src/services/interview/bootstrap/project-validator.ts @@ -0,0 +1,60 @@ +import { existsSync, readFileSync } from "node:fs"; +import { join } from "node:path"; + +export interface ValidationResult { + readonly ok: boolean; + readonly failures: readonly string[]; +} + +export function validateModeAProject(dir: string): ValidationResult { + const failures: string[] = []; + + // README.md is the only required file. It is the candidate-facing + // brief — what they're building, the time-box, and how to run tests + // they're about to write themselves. + // + // Notably absent (by design, not oversight): + // - GLOSSARY.md — would hint at domain concepts the candidate + // should think about. Removed. + // - Failing/skipped sample tests — would hint at the API surface + // or function names the candidate is expected to implement. + // Removed. + // - .claude/CLAUDE.md — would coach the candidate's agent about + // the structure of the work. Removed from the kit overlay. + // The candidate writes their own tests, picks their own glossary, + // and works with their agent on their own terms. 
// Headings a Mode B brief must contain. Matching is case-insensitive and
// unanchored, so deeper heading levels (`###`) also satisfy a pattern.
const MODE_B_REQUIRED_SECTIONS: readonly RegExp[] = [
  /##\s+Time-?box/i,
  /##\s+Acceptance criteria/i,
  /##\s+Deliverables/i,
];

/**
 * Checks that `dir` holds a usable Mode B brief.
 *
 * Fails fast (single failure) when BRIEF.md is absent or blank; otherwise
 * reports one failure per required section pattern that does not match.
 */
export function validateModeBProject(dir: string): ValidationResult {
  const brief = join(dir, "BRIEF.md");
  if (!existsSync(brief)) {
    return { ok: false, failures: ["Missing BRIEF.md at project root."] };
  }

  const text = readFileSync(brief, "utf8").trim();
  if (text === "") {
    return { ok: false, failures: ["BRIEF.md is empty."] };
  }

  const failures = MODE_B_REQUIRED_SECTIONS.filter(
    (pattern) => !pattern.test(text),
  ).map((pattern) => `BRIEF.md is missing required section matching ${pattern}.`);

  return { ok: failures.length === 0, failures };
}
+export type RubricMode = "default" | "custom"; + +export interface RoleConfig { + readonly roleSlug: string; + readonly roleTitle: string; + readonly stack: string; + readonly domain: string; + readonly featureDescription: string; + readonly timeBoxMinutes: number; + readonly projectMode: ProjectMode; + readonly analysisMode: AnalysisMode; + readonly rubricMode: RubricMode; + readonly outputDir: string; + readonly customPrompt?: string; + // jdPath is the absolute or relative path to a markdown/text job + // description. Optional in all rubric modes. When supplied, the + // AI observer references it during post-interview analysis. When + // jdInfluencesProject is also true, the project-generation prompt + // reads the JD content and tailors the generated repo to match + // the seniority and domain it implies (e.g., a junior healthtech + // JD nudges the generator toward an EHR-flavoured feature). + readonly jdPath?: string; + readonly jdInfluencesProject?: boolean; + // stackByCandidate flips Mode B's brief from "use the named stack" + // to "candidate picks their own stack". Only meaningful when + // projectMode === "B"; validation rejects the combination otherwise. 
/**
 * Appends a failure message to `failures` unless `value` is a string with at
 * least one non-whitespace character. `field` is used only for the message.
 */
function requireNonEmpty(
  field: keyof RoleConfig,
  value: unknown,
  failures: string[],
): void {
  const isFilled = typeof value === "string" && value.trim() !== "";
  if (isFilled) return;
  failures.push(`${String(field)} must be a non-empty string`);
}
+ const hasJD = + typeof config.jdPath === "string" && config.jdPath.trim().length > 0; + if (!hasJD) { + requireNonEmpty("domain", config.domain, failures); + } + + const t = config.timeBoxMinutes; + const inRange = t >= CUSTOM_TIME_BOX_MIN && t <= CUSTOM_TIME_BOX_MAX; + if (!Number.isFinite(t) || !inRange) { + failures.push( + `timeBoxMinutes must be a finite number between ${CUSTOM_TIME_BOX_MIN} and ${CUSTOM_TIME_BOX_MAX} (standard values are ${[...STANDARD_TIME_BOXES].join("/")})`, + ); + } + + if (config.projectMode !== "A" && config.projectMode !== "B") { + failures.push("projectMode must be 'A' or 'B'"); + } + if (config.stackByCandidate && config.projectMode !== "B") { + failures.push( + "stackByCandidate requires projectMode 'B' — Mode A scaffolds in a specific stack, so 'candidate picks the stack' is incoherent there.", + ); + } + if ( + config.analysisMode !== "ai-assisted" && + config.analysisMode !== "human-only" + ) { + failures.push("analysisMode must be 'ai-assisted' or 'human-only'"); + } + + switch (config.rubricMode) { + case "default": + break; + case "custom": + if ( + typeof config.customPrompt !== "string" || + config.customPrompt.trim().length === 0 + ) { + failures.push( + "rubricMode 'custom' requires a non-empty customPrompt field", + ); + } + break; + default: + failures.push( + `rubricMode must be one of 'default', 'custom' (got ${String(config.rubricMode)})`, + ); + } + + // jdPath is now an independent optional field — validated regardless + // of rubric mode. When provided, the file must exist; when + // jdInfluencesProject is set, jdPath becomes mandatory because the + // generator has nothing to read otherwise. 
/**
 * Loads and validates `role-config.json` from `dir`.
 *
 * @returns the parsed config, or `null` when the file does not exist.
 * @throws Error when the file is not valid JSON, when its top-level value is
 *         not a plain JSON object (this includes arrays, which `typeof`
 *         reports as "object"), or when `validateRoleConfig` reports failures.
 */
export function readRoleConfig(dir: string): RoleConfig | null {
  const path = join(dir, ROLE_CONFIG_FILENAME);
  if (!existsSync(path)) return null;

  const body = readFileSync(path, "utf8");
  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`Malformed role-config.json at ${path}: ${reason}`);
  }

  // Arrays are rejected here so the caller gets the precise "not an object"
  // message instead of a long list of per-field validation failures.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new Error(
      `Malformed role-config.json at ${path}: top-level value is not an object`,
    );
  }

  const candidate = parsed as RoleConfig;
  const validation = validateRoleConfig(candidate);
  if (!validation.ok) {
    throw new Error(
      `Invalid role-config.json at ${path}: ${validation.failures.join("; ")}`,
    );
  }
  return candidate;
}
/**
 * Loads candidate audit records from a role directory.
 *
 * Every immediate subdirectory of `roleDir` that contains an `audit.json`
 * with a well-formed `frontmatter` object yields one record whose
 * `summaryPath` is `<entry>/summary.md` (relative to `roleDir`).
 *
 * Loading is best-effort per entry: anything that cannot be inspected or
 * parsed is skipped silently so one bad entry never hides the rest of the
 * cohort. That covers dangling symlinks (where `statSync` throws), invalid
 * JSON, missing required fields, and optional fields of the wrong type.
 * The optional `session_date` and `recommendation` are checked because the
 * cohort renderer calls string methods on them.
 *
 * @returns the usable records in directory-listing order; `[]` when
 *          `roleDir` does not exist.
 */
export function loadCohort(roleDir: string): readonly CandidateAuditRecord[] {
  if (!existsSync(roleDir)) return [];

  const isOptionalString = (v: unknown): boolean =>
    v === undefined || v === null || typeof v === "string";

  const records: CandidateAuditRecord[] = [];
  for (const entry of readdirSync(roleDir)) {
    try {
      const entryPath = join(roleDir, entry);
      // Inside the try: statSync throws ENOENT for a dangling symlink.
      if (!statSync(entryPath).isDirectory()) continue;

      const auditJson = join(entryPath, "audit.json");
      if (!existsSync(auditJson)) continue;

      const parsed = JSON.parse(readFileSync(auditJson, "utf8")) as unknown;
      const fm = (parsed as { frontmatter?: unknown } | null)?.frontmatter;
      if (!fm || typeof fm !== "object") continue;

      const ff = fm as Partial<AuditFrontmatter>;
      if (
        typeof ff.candidate !== "string" ||
        typeof ff.role !== "string" ||
        typeof ff.date !== "string" ||
        typeof ff.signed_off !== "boolean"
      ) {
        continue;
      }
      if (
        !isOptionalString(ff.session_date) ||
        !isOptionalString(ff.recommendation)
      ) {
        continue;
      }

      records.push({
        frontmatter: fm as AuditFrontmatter,
        summaryPath: join(entry, "summary.md"),
      });
    } catch {
      // Skip unreadable or malformed entries silently.
    }
  }
  return records;
}
/**
 * Renders the COHORT.md body for a role.
 *
 * Output layout: block-quoted advisory banner, `# Cohort: <slug>` heading,
 * a one-line tally (total / pending / reviewed / order), then either a
 * "no candidates" note or a markdown table with one row per record.
 * Records are sorted by candidate name when `order` is "alphabetical" (the
 * default) and by `session_date ?? date` for any other value. The input
 * array is not mutated.
 */
export function renderCohortSummary(
  roleSlug: string,
  records: readonly CandidateAuditRecord[],
  options: CohortSummaryOptions = {},
): string {
  const order = options.order ?? "alphabetical";

  const interviewedOn = (r: CandidateAuditRecord): string =>
    r.frontmatter.session_date ?? r.frontmatter.date;
  const byName = (a: CandidateAuditRecord, b: CandidateAuditRecord): number =>
    a.frontmatter.candidate.localeCompare(b.frontmatter.candidate);
  const byDate = (a: CandidateAuditRecord, b: CandidateAuditRecord): number =>
    interviewedOn(a).localeCompare(interviewedOn(b));

  const sorted = [...records].sort(order === "alphabetical" ? byName : byDate);

  const total = sorted.length;
  const pending = sorted.reduce(
    (count, r) => (r.frontmatter.signed_off ? count : count + 1),
    0,
  );

  const header: string[] = [
    `> ${WARNING_BANNER.split("\n").join("\n> ")}`,
    "",
    `# Cohort: ${roleSlug}`,
    "",
    `Candidates: ${total} (${pending} pending sign-off, ${total - pending} reviewed). Order: ${order}.`,
    "",
  ];

  if (total === 0) {
    return [...header, "_No candidates yet for this role._", ""].join("\n");
  }

  return [
    ...header,
    "| Candidate | Interviewed | Sign-off | Recommendation | Audit |",
    "|-----------|-------------|----------|----------------|-------|",
    ...sorted.map((rec) => renderRow(rec)),
    "",
  ].join("\n");
}
// Dimensions the AI observer writes narrative observations for. Also used
// as the `enum` for `dimension_id` in the response schema below.
const OBSERVABLE_DIMENSION_IDS = [
  "upfront-design",
  "context-engineering",
  "critical-evaluation",
  "course-correction",
  "architectural-quality",
] as const;

/**
 * JSON schema sent to the model as a strict `json_schema` response format.
 *
 * It deliberately has no scoring fields (`score`, `band`, ...): the only
 * things the model may return are observations, reasoning, evidence
 * excerpts and caveats.
 *
 * Strict structured outputs require every key listed in `properties` to
 * appear in `required`, with `additionalProperties: false` on each object.
 * Fields that are semantically optional (`timestamp`, `caveats`) are
 * therefore declared as `string | null` and listed as required; the model
 * returns `null` when it has nothing to say. Consumers should treat `null`
 * the same as an absent value (a truthiness check does this).
 */
export const OBSERVATION_RESPONSE_SCHEMA = {
  type: "object",
  additionalProperties: false,
  required: ["observations"],
  properties: {
    observations: {
      type: "array",
      items: {
        type: "object",
        additionalProperties: false,
        required: [
          "dimension_id",
          "observation",
          "reasoning",
          "evidence_excerpts",
          "caveats",
        ],
        properties: {
          dimension_id: {
            type: "string",
            enum: OBSERVABLE_DIMENSION_IDS,
          },
          observation: { type: "string" },
          reasoning: { type: "string" },
          evidence_excerpts: {
            type: "array",
            items: {
              type: "object",
              additionalProperties: false,
              required: ["timestamp", "source", "content"],
              properties: {
                // Nullable: not every excerpt has a timestamp.
                timestamp: { type: ["string", "null"] },
                source: { type: "string" },
                content: { type: "string" },
              },
            },
          },
          // Nullable: caveats are optional commentary.
          caveats: { type: ["string", "null"] },
        },
      },
    },
  },
} as const;
/**
 * Best-effort text read: returns the UTF-8 contents of `path`, or the empty
 * string when `path` is missing/empty or the file cannot be read for any
 * reason. Never throws.
 */
function readIfExists(path: string | undefined): string {
  let contents = "";
  if (path) {
    try {
      contents = readFileSync(path, "utf8");
    } catch {
      // Unreadable input is treated the same as "not provided".
    }
  }
  return contents;
}
/**
 * Flattens evidence events into one text line each, joined with "\n".
 *
 * Per-type format:
 *   prompt          -> "[ts] PROMPT: <first 500 chars, newlines -> spaces>"
 *   tool-use        -> "[ts] TOOL: <tool>"
 *   command         -> "[ts] $ <first 200 chars>"
 *   commit          -> "[ts] COMMIT <sha7> (+ins/-del): <message>"
 *   transcript-line -> "[ts] (transcript) <speaker>: <first 400 chars>"
 * Events of any other type contribute no line.
 */
function summarizeEvents(events: readonly EvidenceEvent[]): string {
  const describe = (ev: EvidenceEvent): string | undefined => {
    if (ev.type === "prompt") {
      const flattened = ev.text.slice(0, 500).replace(/\n/g, " ");
      return `[${ev.timestamp}] PROMPT: ${flattened}`;
    }
    if (ev.type === "tool-use") {
      return `[${ev.timestamp}] TOOL: ${ev.tool}`;
    }
    if (ev.type === "command") {
      return `[${ev.timestamp}] $ ${ev.command.slice(0, 200)}`;
    }
    if (ev.type === "commit") {
      const shortSha = ev.sha.slice(0, 7);
      return `[${ev.timestamp}] COMMIT ${shortSha} (+${ev.insertions}/-${ev.deletions}): ${ev.message}`;
    }
    if (ev.type === "transcript-line") {
      return `[${ev.timestamp}] (transcript) ${ev.speaker}: ${ev.text.slice(0, 400)}`;
    }
    return undefined;
  };

  const rendered: string[] = [];
  for (const ev of events) {
    const line = describe(ev);
    if (line !== undefined) rendered.push(line);
  }
  return rendered.join("\n");
}
/**
 * Throws if any object anywhere inside `response` owns a key named in
 * `FORBIDDEN_FIELDS`.
 *
 * The walk is depth-first in key order and inspects property *names* only,
 * so a forbidden word that merely appears inside a string value (for
 * example in an evidence excerpt) does not trip the check. Primitives and
 * `null` are ignored; arrays are traversed element by element.
 */
export function rejectIfScored(response: unknown): void {
  const banned = new Set<string>(FORBIDDEN_FIELDS);

  function walk(value: unknown): void {
    if (typeof value !== "object" || value === null) return;

    if (Array.isArray(value)) {
      for (const element of value) walk(element);
      return;
    }

    const record = value as Record<string, unknown>;
    for (const name of Object.keys(record)) {
      if (banned.has(name)) {
        throw new Error(
          `AI observer response contained forbidden field '${name}'. ` +
            "This rubric is observation-only; numerical scoring is rejected at the schema and validator layers.",
        );
      }
      walk(record[name]);
    }
  }

  walk(response);
}
+ */ +export function humanOnlyObservations(): readonly Observation[] { + return OBSERVABLE_DIMENSION_IDS.map((id) => ({ + dimension_id: id, + observation: "(human-only mode — manager to write observation)", + reasoning: "(human-only mode — manager to write reasoning)", + evidence_excerpts: [], + })); +} diff --git a/src/services/interview/review/audit-writer.ts b/src/services/interview/review/audit-writer.ts new file mode 100644 index 0000000..45e27eb --- /dev/null +++ b/src/services/interview/review/audit-writer.ts @@ -0,0 +1,328 @@ +import { mkdirSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { getDimensions } from "../shared/rubric.js"; +import type { ReviewResult, Measurement, Observation } from "./types.js"; + +/** + * Audit writer. Pure transformation from ReviewResult to the per-candidate + * audit folder layout (summary.md / audit.md / audit.json). Reasoning chains + * are preserved in BOTH summary.md and audit.md. + */ + +export interface AuditFrontmatter { + readonly tags: readonly string[]; + readonly candidate: string; + readonly role: string; + readonly date: string; // YYYY-MM-DD + readonly rubric_version: string; + readonly rubric_mode: string; + readonly signed_off: boolean; + /** Categorical sign-off result. Present only when signed_off is true. */ + readonly recommendation?: "Hire" | "Hire with notes" | "No hire"; + readonly session_recording_url?: string; + readonly session_platform?: string; + readonly session_date?: string; +} + +export interface WriteAuditInput { + readonly result: ReviewResult; + readonly frontmatter: AuditFrontmatter; + readonly outputDir: string; +} + +const WARNING_BANNER = `⚠ THIS AUDIT IS ADVISORY. Hiring decisions are made by humans using +professional judgment. The candidate is a person, not a score. 
This rubric +is one factor among many; your evaluation is the primary, first, and most +important basis for your decision.`; + +const SIGN_OFF_PLACEHOLDER = `## Sign-off (MANDATORY) + +This audit is not complete until a hiring manager has read the +observations above and written a categorical recommendation along with +a reasoning summary in their own words. The TUI rejects empty submissions. + +**Recommendation:** \`\` (Hire | Hire with notes | No hire) + +**Manager reasoning (write your own summary; do not leave blank):** + +> _Write 3–6 sentences in your own words. What stood out? What gave you +> pause? Which observations did you weigh most heavily and why? Do not +> simply restate the AI's observations — give your own read._ +`; + +// Quote any YAML scalar that could be misparsed by a YAML reader: contains +// colons, # comments, leading/trailing whitespace, or starts with a YAML +// indicator. Within the double-quoted form we escape ALL ASCII control +// characters (C0: \x00–\x1F, plus \x7F) as `\xHH` per YAML 1.2 §5.7 — a +// raw control character in a double-quoted scalar is a parse error in +// strict YAML parsers (js-yaml FAILSAFE, libyaml). Candidate names with +// stray control characters from copy-paste otherwise corrupt the +// audit.json round-trip silently. 
/**
 * Escapes `value` for use inside a YAML double-quoted scalar (without the
 * surrounding quotes). Backslash and double quote are backslash-escaped;
 * every C0 control character and DEL is written as a YAML escape, using the
 * short forms where YAML defines one and `\xHH` otherwise.
 */
function escapeYamlDoubleQuoted(value: string): string {
  let out = value.replace(/\\/g, "\\\\").replace(/"/g, '\\"');
  out = out.replace(/[\x00-\x1F\x7F]/g, (c) => {
    switch (c) {
      case "\n":
        return "\\n";
      case "\r":
        return "\\r";
      case "\t":
        return "\\t";
      case "\0":
        return "\\0";
      case "\b":
        return "\\b";
      case "\f":
        return "\\f";
      case "\v":
        return "\\v";
      default: {
        const hex = c.charCodeAt(0).toString(16).padStart(2, "0");
        return `\\x${hex}`;
      }
    }
  });
  return out;
}

/**
 * True when an unquoted `value` would be resolved by a YAML reader to
 * something other than a string: null/boolean keywords (including the
 * YAML 1.1 yes/no/on/off family) or a numeric literal. Date-shaped values
 * such as `2026-05-09` are intentionally NOT matched, so frontmatter dates
 * keep their existing unquoted form.
 */
function resolvesToNonStringYaml(value: string): boolean {
  return (
    /^(?:~|null|true|false|yes|no|on|off|y|n)$/i.test(value) ||
    /^[-+]?(?:\d[\d_]*(?:\.\d*)?|\.\d+)(?:e[-+]?\d+)?$/i.test(value) ||
    /^[-+]?0(?:x[\da-f]+|o[0-7]+|b[01]+)$/i.test(value) ||
    /^[-+]?\.(?:inf|nan)$/i.test(value)
  );
}

/**
 * Formats a string as a YAML scalar for a `key: value` line.
 *
 * Returns the value unchanged when it is safe as a plain scalar, otherwise
 * a double-quoted, escaped form. Quoting is triggered by: empty string; any
 * of `: # " \`; any control character (not just newline/CR/tab, since a raw
 * control character is invalid in YAML); a leading YAML indicator or
 * whitespace; surrounding whitespace; or a value that would be read back as
 * null, a boolean or a number (e.g. a candidate whose surname is "No", or a
 * rubric version "1.0").
 */
function yamlScalar(value: string): string {
  if (
    value.length === 0 ||
    /[:#"\\\x00-\x1F\x7F]/.test(value) ||
    /^[\s\-?\[\]{},&*!|>'%@`]/.test(value) ||
    value.trim() !== value ||
    resolvesToNonStringYaml(value)
  ) {
    return `"${escapeYamlDoubleQuoted(value)}"`;
  }
  return value;
}

/**
 * Formats a string as an element of a YAML flow sequence (`[a, b]`).
 * Stricter than `yamlScalar` about flow indicators and whitespace; also
 * quotes empty tags (which would otherwise emit nothing between commas),
 * control characters, and keyword/number look-alikes.
 */
function yamlTag(value: string): string {
  if (
    value.length === 0 ||
    /[:,\[\]{}#&*!|>'"%@`\s\\\x00-\x1F\x7F]/.test(value) ||
    resolvesToNonStringYaml(value)
  ) {
    return `"${escapeYamlDoubleQuoted(value)}"`;
  }
  return value;
}

/**
 * Serializes audit frontmatter as a `---`-delimited YAML block. Required
 * fields are always written, in a fixed order; optional fields are written
 * only when they are not `undefined`.
 */
function yaml(value: AuditFrontmatter): string {
  const lines: string[] = ["---"];
  lines.push(`tags: [${value.tags.map(yamlTag).join(", ")}]`);
  lines.push(`candidate: ${yamlScalar(value.candidate)}`);
  lines.push(`role: ${yamlScalar(value.role)}`);
  lines.push(`date: ${yamlScalar(value.date)}`);
  lines.push(`rubric_version: ${yamlScalar(value.rubric_version)}`);
  lines.push(`rubric_mode: ${yamlScalar(value.rubric_mode)}`);
  lines.push(`signed_off: ${value.signed_off}`);
  if (value.recommendation !== undefined) {
    lines.push(`recommendation: ${yamlScalar(value.recommendation)}`);
  }
  if (value.session_recording_url !== undefined) {
    lines.push(
      `session_recording_url: ${yamlScalar(value.session_recording_url)}`,
    );
  }
  if (value.session_platform !== undefined) {
    lines.push(`session_platform: ${yamlScalar(value.session_platform)}`);
  }
  if (value.session_date !== undefined) {
    lines.push(`session_date: ${yamlScalar(value.session_date)}`);
  }
  lines.push("---");
  return lines.join("\n");
}
/**
 * Renders one observation for summary.md.
 *
 * Emits the observation and reasoning lines, then (only when excerpts
 * exist) an "**Evidence:**" list in which each excerpt's content is cut to
 * 200 characters plus an ellipsis, then a caveats line when `caveats` is
 * truthy. Lines are joined with a single newline.
 */
function renderObservation(o: Observation): string {
  const MAX_EXCERPT_CHARS = 200;
  const clip = (text: string): string =>
    text.length > MAX_EXCERPT_CHARS
      ? `${text.slice(0, MAX_EXCERPT_CHARS)}…`
      : text;

  const evidence: string[] =
    o.evidence_excerpts.length === 0
      ? []
      : [
          "**Evidence:**",
          ...o.evidence_excerpts.map((excerpt) => {
            const stamp = excerpt.timestamp ? ` [${excerpt.timestamp}]` : "";
            return `- (${excerpt.source})${stamp} ${clip(excerpt.content)}`;
          }),
        ];

  return [
    `**Observation:** ${o.observation}`,
    `**Reasoning:** ${o.reasoning}`,
    ...evidence,
    ...(o.caveats ? [`**Caveats:** ${o.caveats}`] : []),
  ].join("\n");
}
/**
 * Renders one rubric dimension as a markdown section.
 *
 * The section is a `###` heading, an italic id line, then the measurement
 * block (if any) followed by the observation block (if any, rendered with
 * the supplied `rendererObservation`). When neither is present a single
 * "no evidence" placeholder is emitted instead. Parts are separated by a
 * blank line.
 */
function renderDimensionSection(
  titlePrefix: string,
  dimensionId: string,
  dimensionTitle: string,
  observation: Observation | undefined,
  measurement: Measurement | undefined,
  rendererObservation: (o: Observation) => string,
): string {
  const parts: string[] = [
    `### ${titlePrefix} ${dimensionTitle}`,
    `*(id: ${dimensionId})*`,
  ];

  if (measurement) {
    parts.push(renderMeasurement(measurement));
  }
  if (observation) {
    parts.push(rendererObservation(observation));
  }

  const hasEvidence = Boolean(measurement) || Boolean(observation);
  if (!hasEvidence) {
    parts.push("_(No evidence captured for this dimension.)_");
  }

  return parts.join("\n\n");
}
/**
 * Writes the three audit artifacts (summary.md, audit.md, audit.json) into
 * `input.outputDir` and creates an empty `evidence/` directory next to them.
 *
 * Both directories are created recursively, so existing directories are not
 * an error. All files are written as UTF-8; the JSON file is pretty-printed
 * with two-space indentation and ends with a newline.
 *
 * @returns the absolute-or-relative paths (as joined from `outputDir`) of
 *          everything that was written, plus the evidence directory
 */
export function writeAudit(input: WriteAuditInput): AuditWriteOutputs {
  const { outputDir, frontmatter, result } = input;
  mkdirSync(outputDir, { recursive: true });

  const summaryMarkdown = renderSummary(input);
  const auditMarkdown = renderAudit(input);
  const serialized = JSON.stringify({ frontmatter, result }, null, 2);

  const outputs: AuditWriteOutputs = {
    summaryPath: join(outputDir, "summary.md"),
    auditPath: join(outputDir, "audit.md"),
    auditJsonPath: join(outputDir, "audit.json"),
    evidenceDir: join(outputDir, "evidence"),
  };

  mkdirSync(outputs.evidenceDir, { recursive: true });
  writeFileSync(outputs.summaryPath, summaryMarkdown, "utf8");
  writeFileSync(outputs.auditPath, auditMarkdown, "utf8");
  writeFileSync(outputs.auditJsonPath, `${serialized}\n`, "utf8");
  return outputs;
}
/**
 * Checks a manager's sign-off before it is accepted.
 *
 * Two rules are enforced independently, so both failures can be reported at
 * once: the recommendation must be one of the three accepted verdicts, and
 * the reasoning (after trimming whitespace) must be at least 20 characters.
 *
 * @returns `ok` is true only when `failures` is empty
 */
export function validateSignOff(input: {
  readonly recommendation: string;
  readonly reasoning: string;
}): { readonly ok: boolean; readonly failures: readonly string[] } {
  const accepted = ["Hire", "Hire with notes", "No hire"];
  const problems: string[] = [];

  const recommendationOk = accepted.some((rec) => rec === input.recommendation);
  if (!recommendationOk) {
    const quoted = accepted.map((v) => `"${v}"`).join(", ");
    problems.push(`recommendation must be one of: ${quoted}`);
  }

  // `reasoning` may be missing at runtime despite the type, hence the fallback.
  const reasoningLength = (input.reasoning ?? "").trim().length;
  if (reasoningLength < 20) {
    problems.push(
      "reasoning summary must be at least 20 characters — write your own words, do not leave blank or restate the AI's observations",
    );
  }

  return { ok: problems.length === 0, failures: problems };
}
/** First line of an asciinema v2 cast file. */
interface CastHeader {
  readonly version: number;
  readonly width: number;
  readonly height: number;
  /** Unix seconds at which the recording started; absent in some casts. */
  readonly timestamp?: number;
  readonly env?: Record<string, string>;
}

/** One `[seconds, kind, data]` event line of the cast. */
interface AsciinemaEvent {
  readonly delta: number;
  readonly kind: "i" | "o";
  readonly data: string;
}

export interface AsciinemaParseResult {
  readonly header: CastHeader;
  readonly events: readonly AsciinemaEvent[];
  readonly commands: readonly CommandEvent[];
}

function isoFromUnix(unixSeconds: number): string {
  return new Date(unixSeconds * 1000).toISOString();
}

/**
 * Returns the index just past the escape sequence that starts with the ESC
 * at `escIndex`. CSI sequences (ESC `[` … final byte 0x40–0x7e) are consumed
 * whole; any other escape consumes ESC plus one following character when
 * one is available.
 */
function escapeSequenceEnd(chunk: string, escIndex: number): number {
  let j = escIndex + 1;
  if (j < chunk.length && chunk[j] === "[") {
    j++;
    while (j < chunk.length) {
      const c = chunk.charCodeAt(j);
      j++;
      if (c >= 0x40 && c <= 0x7e) break;
    }
    return j;
  }
  if (j < chunk.length) j++;
  return j;
}

/**
 * Parses an asciinema v2 cast file and reconstructs the shell commands the
 * user typed.
 *
 * Only input ("i") events are used. Printable characters and tabs are
 * buffered; Enter (CR or LF) submits the buffer as a command when it is not
 * blank; backspace (BS 0x08 or DEL 0x7f) removes the last buffered
 * character; escape sequences and other control codes are ignored.
 *
 * `pauseSecondsBeforeEnter` is the gap between the last edit keystroke and
 * the Enter that submitted the command. Command timestamps are the header's
 * `timestamp` (0 when absent) plus the event time.
 *
 * @throws Error when the file is empty; JSON errors from the header or any
 *         event line propagate to the caller.
 */
export function parseAsciinemaCast(path: string): AsciinemaParseResult {
  const body = readFileSync(path, "utf8").trim();
  if (body.length === 0) {
    throw new Error(`Empty asciinema cast file: ${path}`);
  }
  const lines = body.split("\n");
  const header = JSON.parse(lines[0]) as CastHeader;
  const events: AsciinemaEvent[] = [];
  for (let i = 1; i < lines.length; i++) {
    const raw = lines[i].trim();
    if (raw.length === 0) continue;
    const parsed = JSON.parse(raw) as [number, "i" | "o", string];
    events.push({ delta: parsed[0], kind: parsed[1], data: parsed[2] });
  }

  const baseEpoch = header.timestamp ?? 0;
  const commands: CommandEvent[] = [];

  // evt.data is usually one character but can be a multi-character chunk on
  // paste or rapid input, so every character is examined individually.
  let buffer = "";
  let lastKeyDelta = 0;
  for (const evt of events) {
    if (evt.kind !== "i") continue;
    const chunk = evt.data;
    for (let i = 0; i < chunk.length; i++) {
      const ch = chunk[i];
      if (ch === "\x1b") {
        // Skip the whole sequence so its bracket/letter are not buffered.
        i = escapeSequenceEnd(chunk, i) - 1;
        continue;
      }
      if (ch === "\r" || ch === "\n") {
        if (buffer.trim().length > 0) {
          commands.push({
            type: "command",
            timestamp: isoFromUnix(baseEpoch + evt.delta),
            source: "terminal.cast",
            command: buffer,
            pauseSecondsBeforeEnter: Math.max(0, evt.delta - lastKeyDelta),
          });
        }
        buffer = "";
      } else if (ch === "\x7f" || ch === "\b") {
        // DEL is written as an explicit escape: it is what most terminals
        // send for Backspace, and its code (127) would otherwise pass the
        // ">= 32" printable test below and pollute the command text.
        buffer = buffer.slice(0, -1);
        lastKeyDelta = evt.delta;
      } else if (ch.charCodeAt(0) >= 32 || ch === "\t") {
        buffer += ch;
        lastKeyDelta = evt.delta;
      }
      // Else: lone control codes — skip silently.
    }
  }

  return { header, events, commands };
}
/** Runs `git` with `args` in `cwd` and returns its stdout. Injectable for tests. */
export type GitRunner = (args: string[], cwd: string) => string;

// `git log --numstat` prints one line per touched file, so a commit that adds
// a dependency tree can exceed Node's default 1 MiB maxBuffer. Hitting that
// limit throws and would degrade the whole history to "no commits".
const GIT_MAX_BUFFER_BYTES = 64 * 1024 * 1024;

const realRunner: GitRunner = (args, cwd) =>
  execFileSync("git", args, {
    cwd,
    encoding: "utf8",
    maxBuffer: GIT_MAX_BUFFER_BYTES,
  });

// <hash> TAB <author date, strict ISO 8601> TAB <subject>
const FORMAT = "%H%x09%aI%x09%s";

// A commit header line starts with a full object id followed by a tab:
// 40 hex digits for SHA-1 repositories, 64 for SHA-256 repositories.
const COMMIT_BOUNDARY = /\n(?=(?:[0-9a-f]{40}|[0-9a-f]{64})\t)/;

/**
 * Reads the commit history of `repoDir` and returns one CommitEvent per
 * commit, in the order `git log` printed them (newest first by default).
 *
 * The git argv is built from constants only, so callers cannot inject ref or
 * pathspec values that begin with `-`. If this function is ever extended to
 * accept caller-supplied refs, keep them after the literal "--" separator.
 *
 * Insertions and deletions are summed over the commit's numstat lines;
 * binary files (reported by git as `-`) contribute nothing.
 *
 * @param repoDir working directory passed to the runner
 * @param runner  git executor; defaults to the real `git` binary
 * @returns the parsed commits, or an empty list when the runner throws. An
 *          "empty repository" failure is silent; any other failure is
 *          written to stderr before the empty list is returned.
 */
export function parseGitHistory(
  repoDir: string,
  runner: GitRunner = realRunner,
): readonly CommitEvent[] {
  let logOut = "";
  try {
    logOut = runner(["log", `--format=${FORMAT}`, "--numstat", "--"], repoDir);
  } catch (err) {
    // "No commits yet" is expected for a freshly scaffolded project; anything
    // else (git missing, permission denied, …) should at least be visible.
    const message = err instanceof Error ? err.message : String(err);
    const benign =
      /does not have any commits|fatal: your current branch .* does not have/i.test(
        message,
      );
    if (!benign) {
      process.stderr.write(`[git-history] ${message}\n`);
    }
    return [];
  }

  const result: CommitEvent[] = [];
  for (const block of logOut.trim().split(COMMIT_BOUNDARY)) {
    const lines = block.split("\n");
    const head = lines[0]?.split("\t");
    if (!head || head.length < 3) continue;
    // The subject may itself contain tabs; rejoin everything after the date.
    const [sha, ts, ...subjectParts] = head;
    const subject = subjectParts.join("\t");
    let insertions = 0;
    let deletions = 0;
    for (let i = 1; i < lines.length; i++) {
      const numstat = lines[i].trim();
      if (!numstat) continue;
      const cols = numstat.split("\t");
      const ins = Number.parseInt(cols[0], 10);
      const del = Number.parseInt(cols[1], 10);
      if (Number.isFinite(ins)) insertions += ins;
      if (Number.isFinite(del)) deletions += del;
    }
    result.push({
      type: "commit",
      timestamp: ts,
      source: "git",
      sha,
      message: subject,
      insertions,
      deletions,
    });
  }
  return result;
}
export interface InterviewLogParseResult {
  readonly prompts: readonly PromptEvent[];
  readonly toolUses: readonly ToolUseEvent[];
}

/** Loose shape of one JSONL record; every field is validated before use. */
interface LogLine {
  event?: string;
  timestamp?: string;
  prompt?: string;
  tool_name?: string;
  tool_input?: unknown;
}

/**
 * Parses the candidate's `interview.log` (JSONL, one object per line) into
 * prompt and tool-use events.
 *
 * The parser is deliberately forgiving so that one bad line can never abort
 * a review: a missing or unreadable file yields empty lists, and any line
 * that is blank, is not valid JSON, is valid JSON but not an object (for
 * example `null` or `42`), or lacks the expected string fields is skipped.
 *
 * Recognised records:
 *  - `event: "user-prompt-submit"` with string `timestamp` and `prompt`
 *  - `event: "pre-tool-use"` with string `timestamp` and `tool_name`
 *    (`tool_input` is passed through unvalidated)
 */
export function parseInterviewLog(path: string): InterviewLogParseResult {
  if (!existsSync(path)) {
    return { prompts: [], toolUses: [] };
  }
  let body: string;
  try {
    body = readFileSync(path, "utf8");
  } catch {
    return { prompts: [], toolUses: [] };
  }

  const prompts: PromptEvent[] = [];
  const toolUses: ToolUseEvent[] = [];
  for (const line of body.split("\n")) {
    if (line.trim().length === 0) continue;

    let decoded: unknown;
    try {
      decoded = JSON.parse(line);
    } catch {
      continue;
    }
    // JSON.parse happily returns null and primitives; reading `.event` off
    // null would throw outside the try/catch above.
    if (decoded === null || typeof decoded !== "object") continue;
    const parsed = decoded as LogLine;

    if (
      parsed.event === "user-prompt-submit" &&
      typeof parsed.timestamp === "string" &&
      typeof parsed.prompt === "string"
    ) {
      prompts.push({
        type: "prompt",
        timestamp: parsed.timestamp,
        source: "interview.log",
        text: parsed.prompt,
      });
    } else if (
      parsed.event === "pre-tool-use" &&
      typeof parsed.timestamp === "string" &&
      typeof parsed.tool_name === "string"
    ) {
      toolUses.push({
        type: "tool-use",
        timestamp: parsed.timestamp,
        source: "interview.log",
        tool: parsed.tool_name,
        input: parsed.tool_input,
      });
    }
  }
  return { prompts, toolUses };
}
// "[HH:MM:SS] Speaker: text" or "[MM:SS] Speaker: text".
const TIMESTAMPED = /^\[(\d{1,2}:\d{2}(?::\d{2})?)\]\s+([^:]+?):\s+(.+)$/;
// "Speaker: text" with no timestamp. The label must start with a Unicode
// letter and may continue with letters, digits, spaces, periods,
// apostrophes, underscores, and hyphens.
const BARE = /^(\p{L}[\p{L}\p{N} .'_-]*?):\s+(.+)$/u;

const EPOCH_ISO = "1970-01-01T00:00:00.000Z";

/**
 * Converts an "HH:MM:SS" or "MM:SS" offset into an absolute ISO timestamp
 * relative to `sessionStartIso`. Offsets with any other number of fields
 * count as zero; an unparseable session start is treated as the unix epoch.
 */
function toIsoFromHMS(hms: string, sessionStartIso: string): string {
  const fields = hms.split(":").map(Number);
  let offsetSeconds: number;
  switch (fields.length) {
    case 3:
      offsetSeconds = fields[0] * 3600 + fields[1] * 60 + fields[2];
      break;
    case 2:
      offsetSeconds = fields[0] * 60 + fields[1];
      break;
    default:
      offsetSeconds = 0;
  }
  const anchorMs = new Date(sessionStartIso).getTime();
  const originMs = Number.isFinite(anchorMs) ? anchorMs : 0;
  return new Date(originMs + offsetSeconds * 1000).toISOString();
}

export interface TranscriptParseOptions {
  /** Anchor wall-clock for "[00:00:00]"-style timestamps in the transcript. */
  readonly sessionStartIso?: string;
}

/**
 * Reads a transcript file and returns one event per recognised speaker line.
 *
 * Blank lines, lines starting with "#", and lines containing "-->" (VTT cue
 * timings) are ignored. Timestamped lines are stamped at session start plus
 * their offset; untimestamped "Speaker: text" lines are stamped with the
 * session start itself. Anything else is dropped. When `sessionStartIso` is
 * missing or unparseable, the unix epoch is used as the anchor.
 */
export function parseTranscript(
  path: string,
  options: TranscriptParseOptions = {},
): readonly TranscriptLineEvent[] {
  const requestedAnchor = options.sessionStartIso ?? EPOCH_ISO;
  const anchorIsValid = Number.isFinite(new Date(requestedAnchor).getTime());
  const anchor = anchorIsValid ? requestedAnchor : EPOCH_ISO;

  const parsed: TranscriptLineEvent[] = [];
  for (const rawLine of readFileSync(path, "utf8").split("\n")) {
    const text = rawLine.trim();
    const ignorable =
      text.length === 0 || text.startsWith("#") || text.includes("-->");
    if (ignorable) continue;

    const stamped = TIMESTAMPED.exec(text);
    if (stamped !== null) {
      const [, offset, speaker, utterance] = stamped;
      parsed.push({
        type: "transcript-line",
        timestamp: toIsoFromHMS(offset, anchor),
        source: "transcript",
        speaker: speaker.trim(),
        text: utterance.trim(),
      });
      continue;
    }

    const unstamped = BARE.exec(text);
    if (unstamped !== null) {
      const [, speaker, utterance] = unstamped;
      parsed.push({
        type: "transcript-line",
        timestamp: anchor,
        source: "transcript",
        speaker: speaker.trim(),
        text: utterance.trim(),
      });
    }
  }
  return parsed;
}
/**
 * Ordered table of destructive-command detectors. The first pattern that
 * matches a command wins, so more general entries (e.g. `sudo`) shadow the
 * ones after them.
 *
 * Flags must appear as their own argument: the `git push` patterns require
 * the flag to follow whitespace so that a ref name such as `feature-f` is
 * not mistaken for `-f`. The `rm` pattern accepts any short-flag cluster
 * containing `r` or `R` (`-rf`, `-fr`, `-Rf`, …) as well as `--recursive`.
 */
const DESTRUCTIVE_PATTERNS: ReadonlyArray<{
  readonly pattern: RegExp;
  readonly label: string;
}> = [
  { pattern: /^\s*rm\s+(?:-[a-zA-Z]*[rR]|--recursive)/, label: "rm -rf" },
  { pattern: /^\s*sudo\s+/, label: "sudo" },
  { pattern: /^\s*git\s+push\s+(?:.*\s)?--force/, label: "git push --force" },
  { pattern: /^\s*git\s+push\s+(?:.*\s)?-f\b/, label: "git push -f" },
  { pattern: /^\s*git\s+reset\s+--hard/, label: "git reset --hard" },
  { pattern: /^\s*git\s+clean\s+(-f|-d)/, label: "git clean -f" },
  { pattern: /^\s*git\s+branch\s+-D/, label: "git branch -D" },
  { pattern: /^\s*git\s+checkout\s+--\s/, label: "git checkout --" },
  { pattern: /^\s*dropdb\b/, label: "dropdb" },
  { pattern: /^\s*DROP\s+(TABLE|DATABASE|SCHEMA)/i, label: "DROP TABLE" },
  { pattern: /^\s*kill\s+-9\b/, label: "kill -9" },
  { pattern: /^\s*mkfs\b/, label: "mkfs" },
  { pattern: /^\s*dd\s+/, label: "dd" },
];

/**
 * Risk-awareness extractor: reports every destructive shell command found
 * among the "command" events, in event order.
 *
 * Each detected command becomes one fact whose label is the matching
 * pattern's label, whose value is the trimmed command text, and whose
 * context records the command's timestamp plus — when the event carries a
 * numeric `pauseSecondsBeforeEnter` — the pause before Enter.
 *
 * When nothing is detected, a single "Destructive commands detected: 0"
 * fact is returned so the dimension is never empty.
 */
export function extractRiskAwareness(
  events: readonly EvidenceEvent[],
): Measurement {
  const commands = events.filter(
    (e): e is CommandEvent => e.type === "command",
  );
  const detected: Array<{
    readonly label: string;
    readonly value: string | number;
    readonly context?: string;
  }> = [];

  for (const cmd of commands) {
    const hit = DESTRUCTIVE_PATTERNS.find(({ pattern }) =>
      pattern.test(cmd.command),
    );
    if (!hit) continue;
    const pause = cmd.pauseSecondsBeforeEnter;
    const pauseNote =
      typeof pause === "number"
        ? `, paused ${pause.toFixed(2)}s before Enter`
        : "";
    detected.push({
      label: hit.label,
      value: cmd.command.trim(),
      context: `at ${cmd.timestamp}${pauseNote}`,
    });
  }

  if (detected.length === 0) {
    return {
      dimension_id: "risk-awareness",
      facts: [{ label: "Destructive commands detected", value: 0 }],
    };
  }
  return { dimension_id: "risk-awareness", facts: detected };
}
/** True when a `timeout` binary that understands `--version` is on PATH. */
function hasCoreutilsTimeout(): boolean {
  const probe = spawnSync(TIMEOUT_BIN, ["--version"], { encoding: "utf8" });
  return !probe.error && probe.status === 0;
}

/**
 * Runs `cmd args…` in `cwd` with a wall-clock cap of TEST_TIMEOUT_S seconds.
 *
 * When a `timeout` binary is available the command is wrapped in
 * `timeout --kill-after=10s <cap>s …`. Otherwise Node's own spawnSync
 * `timeout` option is used with SIGKILL; that fallback only signals the
 * direct child, so grandchildren may outlive the run.
 */
function runWithTimeout(
  cmd: string,
  args: readonly string[],
  cwd: string,
): ReturnType<typeof spawnSync> {
  if (hasCoreutilsTimeout()) {
    return spawnSync(
      TIMEOUT_BIN,
      ["--kill-after=10s", `${TEST_TIMEOUT_S}s`, cmd, ...args],
      { cwd, encoding: "utf8" },
    );
  }
  return spawnSync(cmd, [...args], {
    cwd,
    encoding: "utf8",
    timeout: TEST_TIMEOUT_S * 1000,
    killSignal: "SIGKILL",
  });
}

/**
 * Joins stdout, stderr, any spawn-error summary, and — when the wrapper
 * exited with status 124 — a timeout note into one text blob.
 */
function describeRun(r: ReturnType<typeof spawnSync>): string {
  const errSummary = summarizeSpawnError(r.error);
  const timedOut = r.status === 124; // `timeout` exit code when the cap is hit
  const timeoutNote = timedOut
    ? "\nTest run timed out and was killed (process group)."
    : "";
  return `${r.stdout ?? ""}\n${r.stderr ?? ""}${errSummary ? `\n${errSummary}` : ""}${timeoutNote}`;
}

/**
 * Default test runner. Picks `go test ./...` when go.mod exists, otherwise
 * `bun test` when package.json exists, otherwise reports that no test setup
 * was recognised. Go counts are the number of stdout lines starting with
 * "ok" / "FAIL"; bun counts come from the first "<n> pass" / "<n> fail" in
 * the combined output.
 */
const realRunner: TestRunner = (repoDir) => {
  if (existsSync(join(repoDir, "go.mod"))) {
    const r = runWithTimeout("go", ["test", "./..."], repoDir);
    const stdout = String(r.stdout ?? "");
    return {
      passed: countMatches(stdout, /^ok\s+/gm),
      failed: countMatches(stdout, /^FAIL\s+/gm),
      output: describeRun(r),
    };
  }
  if (existsSync(join(repoDir, "package.json"))) {
    const r = runWithTimeout("bun", ["test"], repoDir);
    const combined = describeRun(r);
    const passMatch = combined.match(/(\d+)\s+pass\b/);
    const failMatch = combined.match(/(\d+)\s+fail\b/);
    return {
      passed: passMatch ? Number.parseInt(passMatch[1], 10) : 0,
      failed: failMatch ? Number.parseInt(failMatch[1], 10) : 0,
      output: combined,
    };
  }
  return { passed: 0, failed: 0, output: "No recognized test setup found." };
};

/**
 * Counts the non-overlapping matches of `re` in `s`.
 *
 * The pattern is copied with the global flag forced on, so the caller's
 * regex is never mutated, a non-global regex cannot cause an endless loop,
 * and zero-width matches advance correctly (handled by `matchAll`).
 */
function countMatches(s: string, re: RegExp): number {
  const flags = re.flags.includes("g") ? re.flags : `${re.flags}g`;
  const scanner = new RegExp(re.source, flags);
  let count = 0;
  for (const _match of s.matchAll(scanner)) count += 1;
  return count;
}

/**
 * Test-pass extractor: runs the candidate's tests through `runner` and
 * reports passing count, failing count, and a "passed/total" pass rate
 * ("n/a" when no tests were counted).
 *
 * @param repoDir directory handed to the runner
 * @param runner  injectable for tests; defaults to the real go/bun runner
 */
export function extractTestPass(
  repoDir: string,
  runner: TestRunner = realRunner,
): Measurement {
  const r = runner(repoDir);
  const total = r.passed + r.failed;
  const facts: Array<{
    readonly label: string;
    readonly value: string | number;
    readonly context?: string;
  }> = [
    { label: "Passing tests", value: r.passed },
    { label: "Failing tests", value: r.failed },
    {
      label: "Pass rate",
      value: total === 0 ? "n/a" : `${r.passed}/${total}`,
    },
  ];
  return { dimension_id: "test-pass", facts };
}
const TEST_RUN = /^\s*(bun|npm|yarn|pnpm)\s+(run\s+)?test\b|^\s*go\s+test\b|^\s*pytest\b/;

/** Milliseconds since the unix epoch for an ISO timestamp; NaN when unparseable. */
function isoEpoch(ts: string): number {
  return new Date(ts).getTime();
}

/** Formats a duration as "<minutes>m<seconds>s"; "n/a" for NaN or negative input. */
function fmtDuration(ms: number): string {
  if (!Number.isFinite(ms) || ms < 0) return "n/a";
  const s = Math.round(ms / 1000);
  const m = Math.floor(s / 60);
  const rem = s % 60;
  return `${m}m${rem.toString().padStart(2, "0")}s`;
}

/**
 * Orders two events by the instant their timestamps denote.
 *
 * Timestamps arrive with different UTC offsets (git author dates carry the
 * author's local offset, other collectors emit "Z"), so comparing the ISO
 * strings themselves would misorder them. Unparseable timestamps sort after
 * all parseable ones, and among themselves by string value.
 */
function compareByInstant(
  a: { readonly timestamp: string },
  b: { readonly timestamp: string },
): number {
  const ea = isoEpoch(a.timestamp);
  const eb = isoEpoch(b.timestamp);
  const aOk = Number.isFinite(ea);
  const bOk = Number.isFinite(eb);
  if (aOk && bOk) return ea - eb;
  if (aOk) return -1;
  if (bOk) return 1;
  return a.timestamp.localeCompare(b.timestamp);
}

/**
 * Throughput extractor. Reports, as raw facts:
 *  - session start / end: timestamps of the earliest and latest event
 *    ("unknown" when there are no events)
 *  - elapsed time between them
 *  - total number of commits
 *  - time from session start to the first test-runner command, when one exists
 *  - average gap between consecutive commits, when there are at least two
 */
export function extractThroughput(
  events: readonly EvidenceEvent[],
): Measurement {
  const sorted = [...events].sort(compareByInstant);
  const commits = sorted.filter(
    (e): e is CommitEvent => e.type === "commit",
  );
  const commands = sorted.filter(
    (e): e is CommandEvent => e.type === "command",
  );

  const start = sorted[0]?.timestamp ?? null;
  const end = sorted[sorted.length - 1]?.timestamp ?? null;

  const elapsedMs =
    start && end ? isoEpoch(end) - isoEpoch(start) : Number.NaN;

  const firstTestRun = commands.find((c) => TEST_RUN.test(c.command));

  const facts: Array<{
    readonly label: string;
    readonly value: string | number;
    readonly context?: string;
  }> = [
    { label: "Session start", value: start ?? "unknown" },
    { label: "Session end", value: end ?? "unknown" },
    { label: "Elapsed", value: fmtDuration(elapsedMs) },
    { label: "Total commits", value: commits.length },
  ];
  if (firstTestRun && start) {
    facts.push({
      label: "Time to first test run",
      value: fmtDuration(isoEpoch(firstTestRun.timestamp) - isoEpoch(start)),
    });
  }
  if (commits.length > 1) {
    const intervals: number[] = [];
    for (let i = 1; i < commits.length; i++) {
      intervals.push(
        isoEpoch(commits[i].timestamp) - isoEpoch(commits[i - 1].timestamp),
      );
    }
    const avg = intervals.reduce((a, b) => a + b, 0) / intervals.length;
    facts.push({
      label: "Average gap between commits",
      value: fmtDuration(avg),
    });
  }

  return { dimension_id: "throughput", facts };
}
const TEST_PATTERNS: readonly RegExp[] = [
  /^\s*(bun|npm|yarn|pnpm)\s+(run\s+)?test\b/,
  /^\s*go\s+test\b/,
  /^\s*pytest\b/,
  /^\s*jest\b/,
  /^\s*vitest\b/,
  /^\s*cargo\s+test\b/,
  /^\s*just\s+test\b/,
];

const TYPECHECK_PATTERNS: readonly RegExp[] = [
  /^\s*tsc\b/,
  /^\s*npx\s+tsc\b/,
  /^\s*just\s+typecheck\b/,
  /^\s*pyright\b/,
  /^\s*mypy\b/,
];

const READ_PATTERNS: readonly RegExp[] = [
  /^\s*git\s+diff\b/,
  /^\s*grep\b/,
  /^\s*rg\b/,
  /^\s*cat\b/,
  /^\s*less\b/,
];

/** True when `cmd` matches at least one of `patterns`. */
function matches(cmd: string, patterns: readonly RegExp[]): boolean {
  return patterns.some((p) => p.test(cmd));
}

/**
 * Verification-discipline extractor. Counts test, typecheck, and
 * diff/grep/cat commands, plus how many test runs directly follow a prompt
 * within 30 seconds.
 *
 * "Directly follow" means adjacent in the time-ordered stream of prompts and
 * test runs. That stream is ordered by the instant each timestamp denotes,
 * not by its text: prompt and command timestamps may carry different UTC
 * offsets, and comparing the strings would put them in the wrong order.
 */
export function extractVerification(
  events: readonly EvidenceEvent[],
): Measurement {
  const commands = events.filter(
    (e): e is CommandEvent => e.type === "command",
  );
  const prompts = events.filter((e): e is PromptEvent => e.type === "prompt");

  const testRuns = commands.filter((c) => matches(c.command, TEST_PATTERNS));
  const typechecks = commands.filter((c) =>
    matches(c.command, TYPECHECK_PATTERNS),
  );
  const reads = commands.filter((c) => matches(c.command, READ_PATTERNS));

  const PROMPT_TEST_WINDOW_MS = 30_000;
  const epochOf = (ts: string): number => new Date(ts).getTime();
  const merged = [...prompts, ...testRuns].sort((a, b) => {
    const ea = epochOf(a.timestamp);
    const eb = epochOf(b.timestamp);
    const aOk = Number.isFinite(ea);
    const bOk = Number.isFinite(eb);
    if (aOk && bOk) return ea - eb;
    // Unparseable timestamps go last; they can never satisfy the window
    // check below, so their relative order does not affect the count.
    if (aOk) return -1;
    if (bOk) return 1;
    return a.timestamp.localeCompare(b.timestamp);
  });

  let interleavedAfterPrompt = 0;
  for (let i = 1; i < merged.length; i++) {
    const cur = merged[i];
    const prev = merged[i - 1];
    if (cur.type !== "command" || prev.type !== "prompt") continue;
    const dt = epochOf(cur.timestamp) - epochOf(prev.timestamp);
    if (Number.isFinite(dt) && dt >= 0 && dt <= PROMPT_TEST_WINDOW_MS) {
      interleavedAfterPrompt += 1;
    }
  }

  return {
    dimension_id: "verification",
    facts: [
      { label: "Total test runs", value: testRuns.length },
      { label: "Total typecheck runs", value: typechecks.length },
      { label: "Diff/grep/cat reads", value: reads.length },
      {
        label: "Test runs immediately after a prompt (within prompt chain)",
        value: interleavedAfterPrompt,
      },
    ],
  };
}
/**
 * Turns a display name into a lowercase, dash-separated identifier: every
 * run of characters outside a–z / 0–9 becomes a single "-", and a leading or
 * trailing dash is removed. Names with no ASCII letters or digits yield "".
 */
function slug(name: string): string {
  const lowered = name.toLowerCase();
  const dashed = lowered.replace(/[^a-z0-9]+/g, "-");
  const trimmed = dashed.replace(/^-|-$/g, "");
  return trimmed;
}
/**
 * Flattens the collectors' event streams into one list ordered by time.
 *
 * Ordering uses the instant each ISO timestamp denotes rather than its text:
 * git author dates carry the author's UTC offset while other collectors emit
 * "Z" timestamps, so a plain string comparison would interleave them
 * incorrectly. Events whose timestamp cannot be parsed are placed after all
 * parseable ones (ordered among themselves by string). The sort is stable,
 * so events at the same instant keep their stream order.
 */
function mergeEvents(
  streams: readonly (readonly EvidenceEvent[])[],
): readonly EvidenceEvent[] {
  // Parse each timestamp once instead of on every comparison.
  const keyed = streams.flat().map((event) => ({
    event,
    epoch: new Date(event.timestamp).getTime(),
  }));
  keyed.sort((a, b) => {
    const aOk = Number.isFinite(a.epoch);
    const bOk = Number.isFinite(b.epoch);
    if (aOk && bOk) return a.epoch - b.epoch;
    if (aOk) return -1;
    if (bOk) return 1;
    return a.event.timestamp.localeCompare(b.event.timestamp);
  });
  return keyed.map((k) => k.event);
}
`${input.sessionDate}T09:00:00Z` + : undefined, + }), + ]); + } + + streams.push([...parseGitHistory(repoDir)]); + + return mergeEvents(streams); +} + +function computeMeasurements( + repoDir: string, + events: readonly EvidenceEvent[], + deps: ReviewDependencies, +): readonly Measurement[] { + return [ + extractVerification(events), + extractRiskAwareness(events), + extractTestPass(repoDir, deps.testRunner), + extractThroughput(events), + ]; +} + +export async function reviewCandidate( + input: ReviewInput, + deps: ReviewDependencies, +): Promise { + const cleanupPaths: string[] = []; + try { + let repoDir: string; + if (input.localRepoPath) { + repoDir = input.localRepoPath; + } else { + repoDir = mkdtempSync(join(tmpdir(), "iv-clone-")); + cleanupPaths.push(repoDir); + (deps.clone ?? defaultCloner)(input.repoUrl, repoDir); + } + + const roleConfig = readRoleConfig(repoDir); + if (!roleConfig) { + return { + ok: false, + failures: [ + "Candidate repo does not contain role-config.json. Run `teamhero interview bootstrap` first to produce one.", + ], + }; + } + + const events = collectEvents(repoDir, input); + const measurements = computeMeasurements(repoDir, events, deps); + + const useAI = roleConfig.analysisMode === "ai-assisted"; + let observations; + if (useAI) { + const prompt = buildObserverPrompt({ + config: roleConfig, + events, + interviewerNotesPath: input.interviewerNotesPath, + sessionRecordingUrl: input.sessionRecordingUrl, + }); + const obs = await deps.observer.observe(prompt); + observations = obs.observations; + } else { + observations = humanOnlyObservations(); + } + + const result: ReviewResult = { + rubric_version: getRubricVersion(), + candidate_id: `${slug(input.candidateName)}-${todayIso()}-${timeSuffix()}`, + role_slug: roleConfig.roleSlug, + observed_at: new Date().toISOString(), + observations, + measurements, + }; + + const outputDir = + input.outputDir ?? 
+ join( + process.cwd(), + "docs", + "interviews", + roleConfig.roleSlug, + result.candidate_id, + ); + + const frontmatter = buildFrontmatter(input, roleConfig); + const outputs = writeAudit({ result, frontmatter, outputDir }); + + // Copy raw evidence into evidence/. + const evidenceDir = outputs.evidenceDir; + for (const file of [ + "PRIVACY_RELEASE.md", + "terminal.cast", + "interview.log", + ]) { + copyIfExists(join(repoDir, file), join(evidenceDir, file)); + } + if (input.transcriptPath) { + copyIfExists(input.transcriptPath, join(evidenceDir, "transcript.txt")); + } + if (input.interviewerNotesPath) { + copyIfExists( + input.interviewerNotesPath, + join(evidenceDir, "interviewer-notes.md"), + ); + } + + return { ok: true, outputs, result, failures: [] }; + } catch (err) { + return { + ok: false, + failures: [err instanceof Error ? err.message : String(err)], + }; + } finally { + for (const p of cleanupPaths) { + rmSync(p, { recursive: true, force: true }); + } + } +} + +function buildFrontmatter( + input: ReviewInput, + role: RoleConfig, +): AuditFrontmatter { + return { + tags: ["hiring", "candidate", role.roleSlug], + candidate: input.candidateName, + role: role.roleSlug, + date: todayIso(), + rubric_version: getRubricVersion(), + rubric_mode: role.rubricMode, + signed_off: false, + session_recording_url: input.sessionRecordingUrl, + session_platform: input.sessionPlatform, + session_date: input.sessionDate, + }; +} + +// Suppress unused-export lint helper for narrow type re-export +export type { ReviewResult, Measurement }; +export { readFileSync as _readFileSync }; diff --git a/src/services/interview/review/types.ts b/src/services/interview/review/types.ts new file mode 100644 index 0000000..9f70656 --- /dev/null +++ b/src/services/interview/review/types.ts @@ -0,0 +1,110 @@ +/** + * Evidence event types. These are the normalized stream that all four + * collectors emit into. Downstream extractors and the AI observer consume + * this stream uniformly. 
/**
 * Evidence event types: the normalized stream the collectors emit into and
 * that downstream extractors and the AI observer consume uniformly.
 */

import type { DimensionId } from "../shared/rubric.js";

/** Where a piece of evidence came from. */
export type EvidenceSource =
  | "terminal.cast"
  | "interview.log"
  | "transcript"
  | "git"
  | "repo";

/** A single user-typed prompt to the AI agent. */
export interface PromptEvent {
  readonly type: "prompt";
  readonly timestamp: string; // ISO-8601
  readonly source: EvidenceSource;
  readonly text: string;
}

/** A tool call the agent issued. */
export interface ToolUseEvent {
  readonly type: "tool-use";
  readonly timestamp: string;
  readonly source: EvidenceSource;
  readonly tool: string;
  readonly input?: unknown;
}

/** A shell command observed in the terminal recording. */
export interface CommandEvent {
  readonly type: "command";
  readonly timestamp: string;
  readonly source: EvidenceSource;
  readonly command: string;
  /** Pause before the user hit Enter, in seconds. Useful for risk-awareness. */
  readonly pauseSecondsBeforeEnter?: number;
}

/** A git commit. */
export interface CommitEvent {
  readonly type: "commit";
  readonly timestamp: string;
  readonly source: "git";
  readonly sha: string;
  readonly message: string;
  readonly insertions: number;
  readonly deletions: number;
}

/** A line of the audio transcript. */
export interface TranscriptLineEvent {
  readonly type: "transcript-line";
  readonly timestamp: string;
  readonly source: "transcript";
  readonly speaker: string;
  readonly text: string;
}

/** Discriminated union (on `type`) of every event a collector can emit. */
export type EvidenceEvent =
  | PromptEvent
  | ToolUseEvent
  | CommandEvent
  | CommitEvent
  | TranscriptLineEvent;

/**
 * Per-dimension observation produced by the AI observer (LLM-judge + hybrid
 * dimensions). Mirrors the strict json_schema we validate the LLM response
 * against. No `score` field — by design.
 */
export interface Observation {
  readonly dimension_id: DimensionId;
  readonly observation: string;
  readonly reasoning: string;
  readonly evidence_excerpts: ReadonlyArray<{
    readonly timestamp?: string;
    readonly source: EvidenceSource;
    readonly content: string;
  }>;
  readonly caveats?: string;
}

/**
 * Per-dimension measurement produced by deterministic extractors and the
 * deterministic half of hybrid dimensions. Raw facts.
 */
export interface Measurement {
  readonly dimension_id: DimensionId;
  readonly facts: ReadonlyArray<{
    readonly label: string;
    readonly value: string | number;
    readonly context?: string;
  }>;
}

/** The top-level result emitted per candidate before audit-writer renders it. */
export interface ReviewResult {
  readonly rubric_version: string;
  readonly candidate_id: string;
  readonly role_slug: string;
  readonly observed_at: string; // ISO-8601 — never "scored_at"
  readonly observations: readonly Observation[];
  readonly measurements: readonly Measurement[];
  /** Free-form metadata captured at review time (e.g., interviewer notes path). */
  readonly metadata?: Readonly<Record<string, unknown>>;
}
/**
 * Progress notification exchanged as one JSON object per line.
 * `step` names the phase; `status` is one of the four lifecycle markers;
 * `message` and `progress` are optional.
 */
export interface InterviewProgressEvent {
  readonly type: "progress";
  readonly step: string;
  readonly status: "start" | "update" | "done" | "error";
  readonly message?: string;
  readonly progress?: number;
}

/** Union of every wire event, discriminated by `type`. */
export type InterviewEvent = InterviewProgressEvent;

const KNOWN_EVENT_TYPES: readonly InterviewEvent["type"][] = ["progress"];

const VALID_STATUSES: readonly InterviewProgressEvent["status"][] = [
  "start",
  "update",
  "done",
  "error",
];

/** Encodes an event as a single newline-terminated JSON line. */
export function serializeInterviewEvent(event: InterviewEvent): string {
  return `${JSON.stringify(event)}\n`;
}

/**
 * Decodes and validates one wire line.
 *
 * @param line a single JSON document (trailing newline is tolerated by JSON.parse)
 * @returns the event, or `null` when the line is not valid JSON, is not an
 *          object, has an unknown `type`, or fails the per-type field checks
 */
export function parseInterviewEvent(line: string): InterviewEvent | null {
  let value: unknown;
  try {
    value = JSON.parse(line);
  } catch {
    return null;
  }
  if (!value || typeof value !== "object") return null;
  // Arrays also pass the typeof check; they are rejected just below because
  // their `type` property is not a string.
  const obj = value as Record<string, unknown>;
  const type = obj.type;
  if (typeof type !== "string") return null;
  if (!KNOWN_EVENT_TYPES.includes(type as InterviewEvent["type"])) return null;

  if (type === "progress") {
    if (typeof obj.step !== "string" || obj.step.length === 0) return null;
    if (
      typeof obj.status !== "string" ||
      !VALID_STATUSES.includes(obj.status as InterviewProgressEvent["status"])
    ) {
      return null;
    }
    if (obj.message !== undefined && typeof obj.message !== "string") {
      return null;
    }
    if (obj.progress !== undefined && typeof obj.progress !== "number") {
      return null;
    }
  }

  return value as InterviewEvent;
}
/** Version string reported in every review result. */
export const RUBRIC_VERSION = "1.0.0";

/** Identifier of each of the nine rubric dimensions. */
export type DimensionId =
  | "upfront-design"
  | "context-engineering"
  | "critical-evaluation"
  | "verification"
  | "course-correction"
  | "risk-awareness"
  | "architectural-quality"
  | "test-pass"
  | "throughput";

/** How evidence for a dimension is produced. */
export type EvidenceMode = "deterministic" | "hybrid" | "llm-judge";

/** Whether a dimension describes the workflow or the final artifact. */
export type DimensionGroup = "process" | "outcome";

/** Static description of one rubric dimension. */
export interface Dimension {
  readonly id: DimensionId;
  readonly title: string;
  readonly description: string;
  readonly evidenceMode: EvidenceMode;
  readonly group: DimensionGroup;
  readonly maturityLineage: readonly string[];
}

// The rubric table: six process dimensions followed by three outcome ones.
const DIMENSIONS: readonly Dimension[] = [
  {
    id: "upfront-design",
    title: "Upfront Design Discipline",
    description:
      "Does the candidate sketch architecture, identify constraints, and align on approach before generating code?",
    evidenceMode: "llm-judge",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "context-engineering",
    title: "Context Engineering",
    description:
      "How effectively the candidate primes the AI with relevant repository context, constraints, and intent before each significant prompt.",
    evidenceMode: "hybrid",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "critical-evaluation",
    title: "Critical Evaluation of AI Output",
    description:
      "How the candidate reads, interrogates, and challenges AI-generated code rather than accepting it on faith.",
    evidenceMode: "llm-judge",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "verification",
    title: "Verification Behavior",
    description:
      "Frequency and rigor of test runs, type checks, and manual verification interleaved between AI exchanges.",
    evidenceMode: "deterministic",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "course-correction",
    title: "Course Correction",
    description:
      "How the candidate notices, names, and recovers from AI mistakes or their own missteps mid-task.",
    evidenceMode: "hybrid",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "risk-awareness",
    title: "Risk Awareness",
    description:
      "Recognition of destructive operations, security implications, and reversibility before acting on AI suggestions.",
    evidenceMode: "deterministic",
    group: "process",
    maturityLineage: ["D12"],
  },
  {
    id: "architectural-quality",
    title: "Architectural Quality of Output",
    description:
      "Whether the final code reflects sound modularity, naming, and separation of concerns.",
    evidenceMode: "llm-judge",
    group: "outcome",
    maturityLineage: ["D12"],
  },
  {
    id: "test-pass",
    title: "Test Outcome",
    description:
      "Whether the candidate's submitted solution passes the role-specific acceptance tests.",
    evidenceMode: "deterministic",
    group: "outcome",
    maturityLineage: ["D12"],
  },
  {
    id: "throughput",
    title: "Throughput",
    description:
      "Volume of meaningful progress within the time-box, measured as commits, completed features, or tests passed.",
    evidenceMode: "deterministic",
    group: "outcome",
    maturityLineage: ["D12"],
  },
];

/** Returns the rubric version constant. */
export function getRubricVersion(): string {
  return RUBRIC_VERSION;
}

/** Returns the full dimension table (the shared, read-only array). */
export function getDimensions(): readonly Dimension[] {
  return DIMENSIONS;
}

/** Looks a dimension up by id; `undefined` when no entry matches. */
export function getDimension(id: DimensionId): Dimension | undefined {
  for (const candidate of DIMENSIONS) {
    if (candidate.id === id) return candidate;
  }
  return undefined;
}

/** Evidence mode of the given dimension, or `undefined` when it is unknown. */
export function getEvidenceMode(id: DimensionId): EvidenceMode | undefined {
  const match = getDimension(id);
  return match === undefined ? undefined : match.evidenceMode;
}
a/teamhero-interview-kit/AGENTS.md b/teamhero-interview-kit/AGENTS.md new file mode 100644 index 0000000..c78c03e --- /dev/null +++ b/teamhero-interview-kit/AGENTS.md @@ -0,0 +1,56 @@ +# Candidate agent context + +You are assisting a software engineer who is the *candidate* in a coding +job interview. The candidate is using you as a coding collaborator the +way they would on any normal work day. + +This file mirrors `.claude/CLAUDE.md` so non-Claude AI tools (Codex, +Cursor, Aider, etc.) that read the cross-tool `AGENTS.md` convention +pick up the same operating instructions. It is shipped by the interview +kit, not authored by the project generator — proctors own this content. + +## Your role + +- The candidate is the engineer. You are the AI collaborator. Follow + their direction. +- Be genuinely helpful. Write code, run tools, read files, suggest + approaches when asked. +- Respect when they push back, abandon an approach, or take a + direction you wouldn't have chosen. The interview is observing + *their* judgment, not yours. +- When asked to verify, verify. When asked to back out, back out. + +## What this interview observes + +The session is recorded and structurally observed by a second AI process +after the interview ends. That observer looks at *how* the candidate +works with you — not whether you produced the right answer on the first +try. Treat that as background, not as instruction: + +- Do not pre-emptively coach the candidate on "what the rubric is + looking for." +- Do not modify your normal behavior to make the candidate look better + or worse. Be the assistant you would normally be. +- Do not refuse work because "the candidate should write it + themselves." Help when asked. + +## What you should not do + +- Don't run destructive operations without explicit confirmation + (`rm -rf`, force-pushes, mass file deletes, schema-altering + migrations). 
The candidate's pause before destructive ops is one of + the things the observer measures; running them silently undermines + the interview. +- Don't access the candidate's broader environment beyond the repo + unless explicitly asked. +- Don't send any part of this session to external services beyond what + your normal tool calls would do. + +## House style + +- This repo includes a `GLOSSARY.md` (if applicable) and a `BRIEF.md` + or failing tests describing what to build. Read those first. +- Standard project hygiene applies: tests should pass before they're + committed; commits should be small enough to revert independently. +- The interview is time-boxed (see `INTERVIEW_RULES.md`). Help the + candidate make the most of the time, not push toward maximum scope. diff --git a/teamhero-interview-kit/INTERVIEW_RULES.md b/teamhero-interview-kit/INTERVIEW_RULES.md new file mode 100644 index 0000000..2a25f50 --- /dev/null +++ b/teamhero-interview-kit/INTERVIEW_RULES.md @@ -0,0 +1,102 @@ +# Interview Rules + +Welcome. This is a recorded, AI-collaboration coding interview. The hiring +team wants to see *how* you work with AI coding tools — not just the final +code. Use whatever AI agent, IDE, and tools you would normally use. + +## Time-box + +Your interview is time-boxed. The exact duration is filled in by your +interviewer in this section. **`{{TIME_BOX}}`** minutes. + +## Before you begin + +1. **Read `PRIVACY_RELEASE.md` carefully.** It describes what is + recorded, how your submission may and may not be used, and your + rights (access, appeal, deletion). +2. **If you consent**, fill in the **## Signed** and **## Date** + sections of `PRIVACY_RELEASE.md`. If you don't consent, contact your + interviewer — the session can continue without AI observation. +3. **Run `./start.sh`** from this repo's root. It checks the release + is signed and starts terminal recording. 
+ +## During the interview + +- The brief for what to build is in either this README, in `BRIEF.md`, + or as comments in failing tests under `tests/`. Your interviewer will + point you to the right place. +- Use your normal AI coding workflow. Talk to your AI agent the way you + would on any work day. +- **Verify the AI's output.** Run tests. Read diffs. Check edge cases. +- It's fine to abandon an approach. It's fine to ask the AI to back out + changes. It's fine to push back on the AI's suggestions. +- If you finish early, polish — refactor, add tests, write docs. + +## Finishing + +1. Exit the asciinema recording shell (Ctrl-D or `exit`). +2. Run `./end.sh` from this repo's root. It stages the recording, the + log, and your signed release into a single git commit. +3. Push the repo to the URL your interviewer provided: + ``` + git push origin HEAD + ``` +4. Share the URL with your interviewer. + +## Setup notes + +### macOS + +``` +brew install asciinema +``` + +### Linux + +``` +sudo apt install asciinema # or your distro's package manager +``` + +### Windows — WSL is required + +The recording kit uses POSIX shell scripts and `asciinema`. Both are +WSL-native. To set up: + +1. Install WSL2 (Ubuntu is fine) — `wsl --install` in an Admin + PowerShell terminal. +2. Inside your WSL distro: + ``` + sudo apt update + sudo apt install asciinema git + ``` +3. Clone this repo *inside* WSL (under `~/`, not `/mnt/c/`) so file + permissions and git work correctly. +4. Run `./start.sh` from a WSL terminal — not from PowerShell or + Command Prompt. + +If WSL is not an option for you, contact your interviewer; an +alternative recording method may be available. 
+ +## What the AI sees about your session + +The AI observer reads: +- `terminal.cast` (asciinema recording) +- `interview.log` (your AI agent's prompts and tool calls) +- Git history during the session +- Final state of this repository +- (Optional, only if your interviewer has set this up) the audio + transcript from the video call + +The AI observer does **not** see any screen video of your desktop. + +## What we're observing + +See `RUBRIC_OVERVIEW.md` for a plain-language summary of the nine +dimensions we observe. We do not produce a numerical score about you; +we produce narrative observations and raw measurements that the human +hiring manager reads alongside the rest of the interview signal. + +## Questions + +If anything is unclear about the rules, the release, or the brief — ask +your interviewer. There are no trick questions and no hidden criteria. diff --git a/teamhero-interview-kit/PRIVACY_RELEASE.md b/teamhero-interview-kit/PRIVACY_RELEASE.md new file mode 100644 index 0000000..8ebebc2 --- /dev/null +++ b/teamhero-interview-kit/PRIVACY_RELEASE.md @@ -0,0 +1,79 @@ +> ⚠ **REVIEW WITH LEGAL BEFORE USE.** This file is a placeholder. The +> hiring company is responsible for replacing the language below with +> release terms reviewed by qualified legal counsel for the candidate's +> jurisdiction. The bracketed sections below indicate the substantive +> commitments the hiring company must make to the candidate. + +# Privacy Release — Interview Recording + +I, the undersigned, am participating in a recorded coding interview for +the role described in `INTERVIEW_RULES.md` of this repository. + +## What is recorded + +- **Terminal recording (`terminal.cast`)** — a text-based recording of + everything visible in my terminal during the interview, captured by + asciinema. +- **AI agent log (`interview.log`)** — a JSONL log of prompts I sent to + my AI coding agent and the tool calls the agent made. +- **Git history** — all commits I make during the session. 
+- **(Optional, only if I am informed at the start of the session) Audio + transcript** — from the video conference platform's transcription + service. + +## What is NOT recorded + +- Screen video of my desktop outside this terminal session. +- The AI agent's full responses (only the tool calls it makes). +- Anything I do outside this repository. + +## How my submission may be used + +- **Hiring decision only.** The recording, log, and submitted code are + used by the hiring team to evaluate this interview for this role. +- **NO training use.** My submission, recording, and log will not be + used to train any AI model. The company commits not to submit any + part of my submission to vendor model-improvement programs. +- **Retention:** the artifacts will be retained no longer than required + by the company's lawful retention period for hiring records in my + jurisdiction, and not less than 30 days after the hiring decision is + communicated to me (to support the appeal mechanism below). + +## My rights + +- **Right of access.** I may request a copy of the AI's observations and + measurements about my interview at any time during the retention + period via the company's data-subject-request process. +- **Right of appeal.** Within 30 days of being notified of the hiring + decision, I may submit a written appeal asking for human re-review of + the AI-generated observations. The company commits to respond within + a further 30 days. +- **Right of deletion.** I may request deletion of my recording and log + at any time after the hiring decision is communicated. (The company + may retain a minimal hiring-record after deletion as required by law.) + +## What the AI does and does not see + +- The AI observer reads `terminal.cast`, `interview.log`, my git + history, the final state of this repository, and (if present) the + audio transcript. +- The AI observer **does not** see any video recording of the session. 
+- The AI's role is to produce structured observations, not a score. The + hiring decision is made by humans. + +## Bias acknowledgement + +I acknowledge that AI systems carry their own biases (including bias +based on names, language style, and code patterns). The AI's +observations are one input among many; the hiring decision is made by +human reviewers who exercise their own judgment. + +--- + +## Signed + +(placeholder — candidate signs here) + +## Date + +(placeholder — candidate dates here, YYYY-MM-DD) diff --git a/teamhero-interview-kit/RUBRIC_OVERVIEW.md b/teamhero-interview-kit/RUBRIC_OVERVIEW.md new file mode 100644 index 0000000..ae5b081 --- /dev/null +++ b/teamhero-interview-kit/RUBRIC_OVERVIEW.md @@ -0,0 +1,92 @@ +# Rubric Overview — What the AI observes + +The AI observer looks at nine dimensions during your interview. **There are +no scores.** For each dimension the AI either writes a short narrative +observation (3–5 sentences) or reports raw measurements as facts. The +human hiring manager reads the observations and decides — your interview +is one factor among many, and the manager weighs each dimension in their +own judgment. + +## Process dimensions — how you work + +These dimensions describe your *workflow* during the session, not the +quality of any specific artifact. + +### 1. Upfront design & decomposition + +Whether you sketch out the problem, identify constraints, or talk through +the approach before generating code. Either path is OK; the AI is +observing what you do, not enforcing a single style. + +### 2. Context engineering + +How effectively you give the AI agent the context it needs to help you +— e.g. pointing it at the right files, citing project conventions, +attaching examples. Both raw counts (how many files referenced, how +many glossary terms used) and a narrative observation. + +### 3. Critical evaluation of AI output + +Whether you read and challenge what the AI gives you, or accept it as +written. 
Rejecting, modifying, or correcting AI suggestions is a +positive signal — it shows judgment, not unfamiliarity. + +### 4. Verification discipline + +How often you run tests, read diffs, and check outputs between AI +exchanges. Raw counts only; no narrative. + +### 5. Course correction + +How you respond when an approach isn't working — rolling back, asking +the AI to reframe, switching strategies. Productive correction looks +different from thrashing; the AI tries to distinguish. + +### 6. Risk awareness + +Whether you notice and pause on destructive or hard-to-reverse +operations (force pushes, `rm -rf`, schema-altering migrations). Raw +counts of any detected destructive commands plus the pause time before +you confirmed. + +## Outcome dimensions — what you produced + +These dimensions describe the *final state* of your repo at the end of +the time-box. + +### 7. Architectural quality + +A narrative observation on the modularity, naming, and separation of +concerns in your final code. + +### 8. Test pass / spec satisfaction + +Whether your submission passes the acceptance tests for the project. +Reported as pass/fail per criterion. + +### 9. Throughput + +How you paced the work — time-to-first-passing-test, total elapsed +time, commit cadence. Reported as raw timestamps and durations. + +--- + +## What the AI does not observe + +- Anything you do outside this repository. +- Video of you or your desktop. +- The interviewer's verbal commentary as evidence about you (the AI is + explicitly instructed to treat interviewer remarks as situational + context only, not as judgments about your skill). +- Comparisons with other candidates. Each candidate is observed + independently. + +## Why no scoring + +Numerical scores compound bias when averaged across candidates and +anchor reviewers cognitively before they read the evidence. Narrative +observations let the manager interrogate the AI's reasoning instead of +the score. 
#!/usr/bin/env bash
# Interview kit: end.sh
# Verifies the recording exists, stages all interview artifacts, and creates
# a single commit. Run this AFTER you have exited the asciinema recording.
#
# Environment overrides:
#   SKIP_COMMIT=1   skip the actual git commit (used by smoke tests)
#   SKIP_RECORD=1   do not require terminal.cast to exist (mirrors start.sh)

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$SCRIPT_DIR"

# start.sh writes this marker once its preflight checks pass.
if [ ! -f "$REPO_ROOT/.interview-state/started-at" ]; then
  echo "✖ No record of start.sh having been run. Did you start the interview?" >&2
  exit 1
fi

CAST_PATH="$REPO_ROOT/terminal.cast"
if [ ! -f "$CAST_PATH" ] && [ "${SKIP_RECORD:-0}" != "1" ]; then
  echo "✖ terminal.cast not found at $CAST_PATH. Did you finish the recording?" >&2
  exit 1
fi

RELEASE_PATH="$REPO_ROOT/PRIVACY_RELEASE.md"
LOG_PATH="$REPO_ROOT/interview.log"
RULES_PATH="$REPO_ROOT/INTERVIEW_RULES.md"

# Stage everything that exists.
STAGE=()
[ -f "$CAST_PATH" ] && STAGE+=("$CAST_PATH")
[ -f "$RELEASE_PATH" ] && STAGE+=("$RELEASE_PATH")
[ -f "$LOG_PATH" ] && STAGE+=("$LOG_PATH")
[ -f "$RULES_PATH" ] && STAGE+=("$RULES_PATH")

if [ "${SKIP_COMMIT:-0}" = "1" ]; then
  echo "✓ end.sh: artifacts ready (SKIP_COMMIT=1 — no commit made)."
  # Expanding an empty array under `set -u` aborts on bash < 4.4 (macOS ships
  # 3.2), so only expand it when it has at least one entry.
  if [ "${#STAGE[@]}" -gt 0 ]; then
    printf '  - %s\n' "${STAGE[@]}"
  fi
  exit 0
fi

cd "$REPO_ROOT"

# -e rather than -d: in a linked worktree or submodule, .git is a file.
if [ ! -e .git ]; then
  echo "✖ Not a git repository. Was the project cloned correctly?" >&2
  exit 1
fi

if [ "${#STAGE[@]}" -eq 0 ]; then
  echo "✖ No interview artifacts found to stage in $REPO_ROOT." >&2
  exit 1
fi

git add "${STAGE[@]}"

# Single combined commit so the timestamp matches the recording window.
git commit -m "Interview session: signed release + terminal recording + agent log" \
  --no-verify

echo "✓ Committed. Push instructions:"
echo "    git push origin HEAD"
echo
echo "Share the resulting repo URL with the interviewer."
+ if printf '%s' "$body" | grep -qiE '^\(placeholder.*\)$'; then + return 1 + fi + return 0 +} + +check_privacy_release_signed() { + local file="${1:-}" + if [ -z "$file" ]; then + return 2 + fi + if [ ! -f "$file" ]; then + return 3 + fi + if [ ! -s "$file" ]; then + return 4 + fi + local signed_body + local date_body + signed_body=$(extract_section "$file" "Signed") + date_body=$(extract_section "$file" "Date") + if ! is_real_value "$signed_body"; then + return 5 + fi + if ! is_real_value "$date_body"; then + return 6 + fi + return 0 +} + +# When invoked directly (not sourced), run the check on $1 and exit with the +# function's return code. nounset is enabled here, scoped to this block, so +# callers that source this library don't inherit it. +if [ "${BASH_SOURCE[0]:-$0}" = "$0" ]; then + set -u + check_privacy_release_signed "${1:-}" + exit $? +fi diff --git a/teamhero-interview-kit/start.sh b/teamhero-interview-kit/start.sh new file mode 100755 index 0000000..0ea316f --- /dev/null +++ b/teamhero-interview-kit/start.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Interview kit: start.sh +# Checks the privacy gate, verifies the Claude Code hooks are wired, and +# begins terminal recording with asciinema. Run this BEFORE you start the +# interview project. +# +# Environment overrides (for testing or non-standard installs): +# ASCIINEMA_BIN path to the asciinema binary (default: asciinema) +# SKIP_RECORD=1 skip the actual recording step (used by smoke tests) + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$SCRIPT_DIR" + +# shellcheck source=lib/privacy-gate.sh +. "$SCRIPT_DIR/lib/privacy-gate.sh" + +RELEASE_PATH="$REPO_ROOT/PRIVACY_RELEASE.md" + +if ! check_privacy_release_signed "$RELEASE_PATH"; then + cat <<'EOF' >&2 +✖ Privacy release is not signed. + +This interview is recorded. Before you begin, you must: + + 1. Open PRIVACY_RELEASE.md in this repo. + 2. 
Read it carefully — it covers what is recorded, who sees it, + the no-training clause, your deletion right, and your appeal + mechanism (30 days after the hiring decision). + 3. Fill in your name in the "## Signed" section. + 4. Fill in today's date in the "## Date" section. + 5. Re-run ./start.sh. + +If you do not consent to the recording, do not sign — and let the +interviewer know. The session can continue without AI-generated +observations. +EOF + exit 1 +fi + +# Verify the Claude Code hooks are wired so the interview.log can be produced. +HOOKS_FILE="$REPO_ROOT/.claude/settings.json" +if [ ! -f "$HOOKS_FILE" ]; then + echo "✖ Missing $HOOKS_FILE — kit is incomplete. Re-clone the project." >&2 + exit 1 +fi +if ! grep -q '"UserPromptSubmit"' "$HOOKS_FILE"; then + echo "✖ $HOOKS_FILE does not declare a UserPromptSubmit hook. Kit is incomplete." >&2 + exit 1 +fi +if ! grep -q '"PreToolUse"' "$HOOKS_FILE"; then + echo "✖ $HOOKS_FILE does not declare a PreToolUse hook. Kit is incomplete." >&2 + exit 1 +fi + +if [ "${SKIP_RECORD:-0}" = "1" ]; then + # We still write started-at in this path since there's no preflight + # that could fail after this point. + mkdir -p "$REPO_ROOT/.interview-state" + date -u +%FT%TZ > "$REPO_ROOT/.interview-state/started-at" + echo "✓ Privacy gate passed. SKIP_RECORD=1 — not starting asciinema." + exit 0 +fi + +ASCIINEMA_BIN="${ASCIINEMA_BIN:-asciinema}" +if ! command -v "$ASCIINEMA_BIN" >/dev/null 2>&1; then + cat <<EOF >&2 +✖ asciinema is not installed (looked for: $ASCIINEMA_BIN). + +Install: + macOS: brew install asciinema + Linux: sudo apt install asciinema (or your distro's package manager) + WSL: sudo apt install asciinema (inside your WSL distro) + +Then re-run ./start.sh. +EOF + exit 1 +fi + +# Mark "started" only after asciinema preflight passes. Writing it earlier +# would leave a stale marker behind if preflight failed, and end.sh would +# then mistakenly think a session was completed.
+mkdir -p "$REPO_ROOT/.interview-state" +date -u +%FT%TZ > "$REPO_ROOT/.interview-state/started-at" + +CAST_PATH="$REPO_ROOT/terminal.cast" +echo "✓ Privacy gate passed." +echo " Starting terminal recording: $CAST_PATH" +echo " When you finish, exit this shell, then run ./end.sh" +exec "$ASCIINEMA_BIN" rec --overwrite "$CAST_PATH" diff --git a/tests/unit/cli/interview.spec.ts b/tests/unit/cli/interview.spec.ts new file mode 100644 index 0000000..ef534bb --- /dev/null +++ b/tests/unit/cli/interview.spec.ts @@ -0,0 +1,35 @@ +import { describe, expect, it, mock } from "bun:test"; +import { createConsola } from "consola"; +import { createCli } from "../../../src/cli/index.js"; + +function makeDeps() { + return { + auth: { + ensureAuthenticated: mock(async () => ({ + authenticated: true, + provider: "pat" as const, + message: "ok", + })), + login: mock(async () => ({ + authenticated: true, + provider: "pat" as const, + message: "ok", + })), + }, + logger: createConsola({ level: 0 }), + }; +} + +describe("teamhero interview CLI registration", () => { + it("registers an `interview` subcommand on the program", () => { + const program = createCli(makeDeps()); + const command = program.commands.find((c) => c.name() === "interview"); + expect(command).toBeDefined(); + }); + + it("the `interview` subcommand has a non-empty description", () => { + const program = createCli(makeDeps()); + const command = program.commands.find((c) => c.name() === "interview"); + expect(command?.description().length).toBeGreaterThan(0); + }); +}); diff --git a/tests/unit/cli/unknown-subcommand.spec.ts b/tests/unit/cli/unknown-subcommand.spec.ts new file mode 100644 index 0000000..eacbf8c --- /dev/null +++ b/tests/unit/cli/unknown-subcommand.spec.ts @@ -0,0 +1,101 @@ +import { describe, expect, it, mock } from "bun:test"; +import { createConsola } from "consola"; +import { run } from "../../../src/cli/index.js"; + +function makeDeps(loggerErrorSpy: ReturnType<typeof mock>) { + const logger = createConsola({
level: 0 }); + logger.error = loggerErrorSpy as unknown as typeof logger.error; + return { + auth: { + ensureAuthenticated: mock(async () => ({ + authenticated: true, + provider: "pat" as const, + message: "ok", + })), + login: mock(async () => ({ + authenticated: true, + provider: "pat" as const, + message: "ok", + })), + }, + logger, + }; +} + +describe("teamhero CLI rejects unknown subcommands", () => { + it("logs an actionable error when given an unknown subcommand", async () => { + const errorSpy = mock(() => {}); + const deps = makeDeps(errorSpy); + const exitSpy = mock((_code?: number) => { + throw new Error("__exit__"); + }); + const originalExit = process.exit; + process.exit = exitSpy as unknown as typeof process.exit; + + try { + await run(["node", "teamhero", "definitely-not-a-command"], deps).catch( + () => {}, + ); + } finally { + process.exit = originalExit; + } + + expect(exitSpy).toHaveBeenCalled(); + const calledWith = exitSpy.mock.calls[0]?.[0]; + expect(calledWith).not.toBe(0); + expect(errorSpy).toHaveBeenCalled(); + const errorMessage = String(errorSpy.mock.calls[0]?.[0] ?? ""); + expect(errorMessage).toContain("definitely-not-a-command"); + }); + + it("does not reject when no subcommand is given (top-level invocation)", async () => { + const errorSpy = mock(() => {}); + const deps = makeDeps(errorSpy); + const exitSpy = mock((_code?: number) => { + throw new Error("__exit__"); + }); + const originalExit = process.exit; + process.exit = exitSpy as unknown as typeof process.exit; + + try { + // `teamhero --help` lets commander print top-level help and exit 0. + // We just want to confirm the unknown-subcommand guard is not triggered. + await run(["node", "teamhero", "--help"], deps).catch(() => {}); + } finally { + process.exit = originalExit; + } + + const errorCalls = errorSpy.mock.calls.map((c) => String(c[0] ?? 
"")); + const unknownSubcommandError = errorCalls.find((m) => + m.toLowerCase().includes("unknown"), + ); + expect(unknownSubcommandError).toBeUndefined(); + }); + + it("does not reject known subcommands at the guard stage", async () => { + const errorSpy = mock(() => {}); + const deps = makeDeps(errorSpy); + // We don't want spawnTui to actually fire; force an exit that bypasses it. + const exitSpy = mock((_code?: number) => { + throw new Error("__exit__"); + }); + const originalExit = process.exit; + process.exit = exitSpy as unknown as typeof process.exit; + + const knownSubcommands = ["report", "setup", "doctor", "interview"]; + + try { + for (const sub of knownSubcommands) { + errorSpy.mockClear(); + await run(["node", "teamhero", sub, "--help"], deps).catch(() => {}); + const errorCalls = errorSpy.mock.calls.map((c) => String(c[0] ?? "")); + const unknownSubcommandError = errorCalls.find((m) => + m.toLowerCase().includes("unknown subcommand"), + ); + expect(unknownSubcommandError).toBeUndefined(); + } + } finally { + process.exit = originalExit; + } + }); +}); diff --git a/tests/unit/services/discrepancy.spec.ts b/tests/unit/services/discrepancy.spec.ts index a4b47d3..751aff0 100644 --- a/tests/unit/services/discrepancy.spec.ts +++ b/tests/unit/services/discrepancy.spec.ts @@ -5,16 +5,28 @@ * mapAuditResultToDiscrepancyReport(), and normalizeRule(). */ -import { describe, expect, it } from "bun:test"; +import { describe, expect, it, mock } from "bun:test"; import type { ContributorDiscrepancy, SectionDiscrepancy, } from "../../../src/core/types.js"; +import * as envMod from "../../../src/lib/env.js"; import type { ReportMemberMetrics, ReportRenderInput, } from "../../../src/lib/report-renderer.js"; +// Other spec files mock src/lib/env.js with a stub that ignores process.env. +// mock.module() registrations leak across files when bun test runs without +// per-file isolation. 
Re-pin the real getEnv here so this file's env-driven +// tests work whether or not a prior file left a stub installed. Per +// CLAUDE.md, do NOT call mock.restore() in afterAll — it undoes the +// pinning and re-introduces leakage from other files. +mock.module("../../../src/lib/env.js", () => ({ + ...envMod, + getEnv: (key: string) => process.env[key], +})); + const { buildSectionAuditContexts, mapAuditResultToDiscrepancyReport, diff --git a/tests/unit/services/interview/bootstrap/openai-generator-client.spec.ts b/tests/unit/services/interview/bootstrap/openai-generator-client.spec.ts new file mode 100644 index 0000000..16c6179 --- /dev/null +++ b/tests/unit/services/interview/bootstrap/openai-generator-client.spec.ts @@ -0,0 +1,396 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { OpenAIGeneratorClient } from "../../../../../src/services/interview/bootstrap/openai-generator-client.js"; +import type { RoleConfig } from "../../../../../src/services/interview/bootstrap/role-config.js"; + +function role(overrides: Partial<RoleConfig> = {}): RoleConfig { + return { + roleSlug: "senior-backend", + roleTitle: "Senior Backend Engineer", + stack: "TypeScript", + domain: "Payments", + featureDescription: "Add idempotency keys to the refunds endpoint", + timeBoxMinutes: 90, + projectMode: "A", + analysisMode: "ai-assisted", + rubricMode: "default", + outputDir: "/tmp/test-output", + ...overrides, + }; +} + +/** + * Builds a minimal OpenAI client fake that captures the prompt and returns a + * valid project JSON string. We test the internal prompt-building logic + * indirectly through the public `generate` method.
+ */ +function fakeOpenAI( + outputFiles: Array<{ path: string; content: string }> = [ + { path: "README.md", content: "# Project\n" }, + ], + capturedPrompts?: { calls: Array<{ input: string; model: string }> }, +) { + return { + responses: { + create: async (opts: { + model: string; + input: string; + text: unknown; + }) => { + if (capturedPrompts) { + capturedPrompts.calls.push({ input: opts.input, model: opts.model }); + } + return { + output_text: JSON.stringify({ files: outputFiles }), + }; + }, + }, + }; +} + +describe("OpenAIGeneratorClient.generate", () => { + it("returns the files array from the API response", async () => { + const files = [ + { path: "README.md", content: "# Project\n" }, + { path: "src/main.ts", content: "export const x = 1;\n" }, + ]; + const client = new OpenAIGeneratorClient(fakeOpenAI(files) as never); + const result = await client.generate({ config: role(), attempt: 1 }); + expect(result.files).toHaveLength(2); + expect(result.files[0].path).toBe("README.md"); + expect(result.files[1].path).toBe("src/main.ts"); + }); + + it("throws when the API returns no output_text", async () => { + const badOpenAI = { + responses: { + create: async () => ({}), + }, + }; + const client = new OpenAIGeneratorClient(badOpenAI as never); + await expect( + client.generate({ config: role(), attempt: 1 }), + ).rejects.toThrow(/no output_text/); + }); + + it("passes the role title, stack, and domain in the prompt input", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role(), attempt: 1 }); + expect(captured.calls).toHaveLength(1); + const prompt = captured.calls[0].input; + expect(prompt).toContain("Senior Backend Engineer"); + expect(prompt).toContain("TypeScript"); + expect(prompt).toContain("Payments"); + expect(prompt).toContain("Add idempotency keys"); + }); + + it("includes Mode A 
scaffold requirements in prompt for projectMode=A", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "A" }), attempt: 1 }); + const prompt = captured.calls[0].input; + // README.md is the only required candidate-facing file. Right-sizing + // hint nudges the model toward a substantive decomposition without + // encoding a hard LOC band that the validator would auto-reject. + expect(prompt).toContain("README.md"); + expect(prompt).toMatch(/cohesive modules/i); + }); + + it("explicitly forbids the AI from generating test files (Mode A)", async () => { + // A pre-existing skipped test like `describe.skip("addUser", ...)` + // leaks the API shape and function names the candidate is + // expected to design themselves. The prompt must tell the model + // not to author tests; the candidate writes their own as part of + // the evaluation. + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "A" }), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("DO NOT GENERATE"); + expect(prompt).toContain("Any test files"); + // Regression: the old prompt said "include a failing or skipped + // test under tests/". That phrasing must not return — it's + // exactly what we just removed. + expect(prompt).not.toMatch(/include.*(failing|skipped) test/i); + }); + + it("explicitly forbids the AI from generating GLOSSARY.md (Mode A)", async () => { + // A glossary lists domain concepts; identifying those concepts + // is part of what's being evaluated, so a pre-baked GLOSSARY.md + // gives away the answer. 
+ const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "A" }), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("DO NOT GENERATE"); + expect(prompt).toMatch(/GLOSSARY\.md\.\s+A glossary/); + }); + + it("does NOT encode a hard LOC band or deep-module quota in the prompt", async () => { + // Regression guard for the removed size validator: prior versions + // of this prompt asserted a 400-700 LOC range and an "AT LEAST 2 + // source files of 80+ lines" rule, mirrored on the validator side. + // Both have been removed because they weren't in the product spec + // and they were producing real friction (retries on + // perfectly-serviceable 300-LOC outputs). If a future prompt edit + // reintroduces these phrases, this test fails so we revisit + // whether the matching validator rule should come back too. + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "A" }), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).not.toContain("400 and 700"); + expect(prompt).not.toContain("ABSOLUTE SIZE REQUIREMENTS"); + expect(prompt).not.toMatch(/AT LEAST 2 source files must each contain 80/); + }); + + it("explicitly forbids the AI from authoring CLAUDE.md or AGENTS.md (Mode A)", async () => { + // The prompt MUST tell the model not to write agent-facing files; those + // are owned by the kit. Otherwise the model hallucinates "Agent guidance" + // blocks that the candidate's agent then reads at run time. 
+ const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "A" }), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("DO NOT GENERATE"); + expect(prompt).toContain("CLAUDE.md or AGENTS.md"); + }); + + it("explicitly forbids the AI from authoring CLAUDE.md or AGENTS.md (Mode B)", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ projectMode: "B" }), attempt: 1 }); + const prompt = captured.calls[0].input; + // Mode B uses a different prompt shape (single-line directives in + // the BRIEF.md spec); the original regex still matches there. + expect(prompt).toMatch(/DO NOT (generate|write).*CLAUDE\.md/i); + expect(prompt).toMatch(/DO NOT (generate|write).*AGENTS\.md/i); + }); + + it("includes Mode B brief requirements in prompt for projectMode=B", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ projectMode: "B" }), + attempt: 1, + }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("BRIEF.md"); + // Mode B must NOT include Mode A scaffolding requirements + expect(prompt).not.toContain("deep module"); + }); + + it("Mode B with stackByCandidate=false REQUIRES the named stack in BRIEF.md", async () => { + // Wizard's "Greenfield (use your stack)" option lands here. The + // brief must constrain the candidate to the stack the proctor + // already chose at Q3 — otherwise the proctor's stack signal is + // wasted. 
+ const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ projectMode: "B", stack: "Go" }), + attempt: 1, + }); + const prompt = captured.calls[0].input; + expect(prompt).toMatch(/REQUIRES the candidate to use Go/); + // The stack-by-candidate path must NOT activate here. + expect(prompt).not.toMatch(/candidate selects their own tech stack/); + }); + + it("renders 'Domain: infer from the job description' when domain is empty (JD-supplied path)", async () => { + // The wizard skips the Domain question when a JD is attached + // (the JD describes the domain). The prompt must NOT render a + // bare "Domain: ." — instead it tells the model to derive the + // domain from the JD context block. + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ domain: "" }), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).toMatch(/Domain:\s+infer from the job description/i); + expect(prompt).not.toMatch(/Domain:\s*\./); + }); + + it("renders explicit 'Domain: X.' when the proctor supplied a domain", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ domain: "Payments" }), + attempt: 1, + }); + expect(captured.calls[0].input).toMatch(/Domain:\s+Payments\./); + }); + + it("injects JD content into the generation prompt when jdInfluencesProject is true", async () => { + // The user's example: a junior healthtech JD should nudge the + // generator toward an EHR-flavoured feature. 
The mechanism is + // the project-generation prompt reading the JD body and giving + // it to the model as background context. This test pins that + // the JD content actually reaches the prompt. + const dir = mkdtempSync(join(tmpdir(), "iv-jd-gen-")); + try { + const jdPath = join(dir, "jd.md"); + writeFileSync( + jdPath, + "# Junior Healthcare Engineer\nFamiliarity with FHIR, HL7, EHR concepts.", + ); + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ + jdPath, + jdInfluencesProject: true, + }), + attempt: 1, + }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("Job description context"); + expect(prompt).toContain("FHIR, HL7, EHR"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("omits the JD when jdInfluencesProject is false even if jdPath is set", async () => { + // JD-without-influence: the JD goes only to the post-interview + // observer, not the project-generation prompt. The proctor might + // want the observer to see the JD without letting it leak + // EHR-flavoured features into the candidate-facing project. 
+ const dir = mkdtempSync(join(tmpdir(), "iv-jd-no-influence-")); + try { + const jdPath = join(dir, "jd.md"); + writeFileSync(jdPath, "Sensitive JD content the candidate should not see."); + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ jdPath, jdInfluencesProject: false }), + attempt: 1, + }); + const prompt = captured.calls[0].input; + expect(prompt).not.toContain("Sensitive JD content"); + expect(prompt).not.toContain("Job description context"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("Mode B with stackByCandidate=true tells the BRIEF.md the candidate picks the stack", async () => { + // Wizard's "Greenfield (candidate picks stack)" option lands + // here. The brief must EXPLICITLY tell the candidate they + // choose the tooling — that's part of what's being evaluated. + // The proctor-stated stack should appear only as context, not + // as a requirement, so the candidate's choice itself is judged. + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role({ projectMode: "B", stack: "Go", stackByCandidate: true }), + attempt: 1, + }); + const prompt = captured.calls[0].input; + expect(prompt).toMatch(/candidate selects their own tech stack/); + // Must not also demand the proctor's stack. 
+ expect(prompt).not.toMatch(/REQUIRES the candidate to use Go/); + }); + + it("includes the attempt number in the prompt", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role(), attempt: 2 }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("attempt 2"); + }); + + it("includes previousFailures in the prompt on retry", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ + config: role(), + attempt: 2, + previousFailures: [ + "Missing README.md at project root.", + "No failing or skipped tests found.", + ], + }); + const prompt = captured.calls[0].input; + expect(prompt).toContain("Missing README.md"); + expect(prompt).toContain("No failing or skipped tests"); + }); + + it("does NOT include previousFailures section on the first attempt", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role(), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).not.toContain("Previous attempt failed"); + }); + + it("does NOT inline the full rubric (token-cost regression guard)", async () => { + // The full 9-dimension rubric used to be inlined here so the + // model could "build for observability." It was ~600 input + // tokens of review-side context per call; removing it after a + // proctor reported $1.36 for a single Mode B run saves that + // per attempt. The summary-style one-liner that replaced it + // still nudges the model to leave decision points for the + // candidate. 
ai-observer.ts still injects the full rubric on + // the review side, where it actually drives scoring. + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role(), attempt: 1 }); + const prompt = captured.calls[0].input; + expect(prompt).not.toContain("upfront-design"); + expect(prompt).not.toContain("context-engineering"); + expect(prompt).not.toContain("interview-reviewer v"); + // The replacement one-liner is what nudges the model now. + expect(prompt).toMatch(/engineering judgment under AI augmentation/i); + }); + + it("uses the custom model when specified in the constructor", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never, "gpt-4o"); + await client.generate({ config: role(), attempt: 1 }); + expect(captured.calls[0].model).toBe("gpt-4o"); + }); + + it("passes the time-box in the prompt", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role({ timeBoxMinutes: 60 }), attempt: 1 }); + expect(captured.calls[0].input).toContain("60"); + }); + + it("does NOT include any 'hiring manager addendum' block (the projectPrompt addendum was removed — the feature description is the single source)", async () => { + const captured: { calls: Array<{ input: string; model: string }> } = { calls: [] }; + const client = new OpenAIGeneratorClient(fakeOpenAI([], captured) as never); + await client.generate({ config: role(), attempt: 1 }); + // Regression guard: the proctor-addendum block used to wrap the + // projectPrompt field. 
After collapsing the wizard's redundant + // "Project prompt" step into the single feature-description either/or, + // the addendum block must never resurface — otherwise the prompt + // implies a second free-form field the user can't actually set. + expect(captured.calls[0].input).not.toContain("Additional instructions from the hiring manager"); + }); +}); + +describe("OpenAIGeneratorClient — JSON schema guard (PROJECT_RESPONSE_SCHEMA)", () => { + it("the API call uses strict json_schema with a 'files' array", async () => { + let capturedTextFormat: unknown; + const interceptOpenAI = { + responses: { + create: async (opts: { text: { format: unknown } }) => { + capturedTextFormat = opts.text; + return { + output_text: JSON.stringify({ files: [] }), + }; + }, + }, + }; + const client = new OpenAIGeneratorClient(interceptOpenAI as never); + await client.generate({ config: role(), attempt: 1 }); + const format = (capturedTextFormat as { format: { type: string; strict: boolean; schema: { properties: { files: unknown } } } }).format; + expect(format.type).toBe("json_schema"); + expect(format.strict).toBe(true); + expect(format.schema.properties.files).toBeDefined(); + }); +}); \ No newline at end of file diff --git a/tests/unit/services/interview/bootstrap/orchestrator.spec.ts b/tests/unit/services/interview/bootstrap/orchestrator.spec.ts new file mode 100644 index 0000000..7ce64bb --- /dev/null +++ b/tests/unit/services/interview/bootstrap/orchestrator.spec.ts @@ -0,0 +1,198 @@ +import { describe, expect, it } from "bun:test"; +import { + existsSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { runBootstrap } from "../../../../../src/services/interview/bootstrap/orchestrator.js"; +import type { + GeneratedProject, + GeneratorClient, +} from "../../../../../src/services/interview/bootstrap/project-generator.js"; +import type { RoleConfig } from 
"../../../../../src/services/interview/bootstrap/role-config.js"; +import { readRoleConfig } from "../../../../../src/services/interview/bootstrap/role-config.js"; + +function modeAStub(loc = 500): GeneratedProject { + const padLines = Array.from( + { length: 100 }, + (_, k) => `export const v${k} = ${k};`, + ).join("\n"); + const files = [ + { path: "README.md", content: "# Project\n" }, + { path: "GLOSSARY.md", content: "# Glossary\n" }, + { path: "src/a.ts", content: padLines }, + { path: "src/b.ts", content: padLines }, + { + path: "tests/x.spec.ts", + content: + 'import { describe, it } from "bun:test";\ndescribe.skip("x", () => { it("todo", () => {}); });\n', + }, + ]; + const cur = padLines.split("\n").length * 2; + if (loc > cur) { + files.push({ + path: "src/pad.ts", + content: Array.from({ length: loc - cur }, (_, k) => `// ${k}`).join( + "\n", + ), + }); + } + return { files }; +} + +function client(...projects: GeneratedProject[]): GeneratorClient { + let i = 0; + return { + async generate() { + return projects[Math.min(i++, projects.length - 1)]; + }, + }; +} + +function baseConfig(outputDir: string): RoleConfig { + return { + roleSlug: "senior-backend", + roleTitle: "Senior Backend Engineer", + stack: "TypeScript", + domain: "Payments", + featureDescription: "Add idempotency keys", + timeBoxMinutes: 90, + projectMode: "A", + analysisMode: "ai-assisted", + rubricMode: "default", + outputDir, + }; +} + +describe("runBootstrap", () => { + it("validates config, generates the project, and writes role-config.json", async () => { + const dir = mkdtempSync(join(tmpdir(), "iv-orch-")); + try { + const result = await runBootstrap(baseConfig(dir), { + client: client(modeAStub()), + }); + expect(result.ok).toBe(true); + expect(existsSync(join(dir, "role-config.json"))).toBe(true); + expect(existsSync(join(dir, "README.md"))).toBe(true); + const persisted = readRoleConfig(dir); + expect(persisted?.roleSlug).toBe("senior-backend"); + } finally { + rmSync(dir, { 
recursive: true, force: true }); + } + }); + + it("rejects an invalid config without calling the generator", async () => { + const dir = mkdtempSync(join(tmpdir(), "iv-orch-")); + try { + let called = false; + const tracer: GeneratorClient = { + async generate() { + called = true; + return modeAStub(); + }, + }; + const result = await runBootstrap( + { ...baseConfig(dir), rubricMode: "custom" }, + { client: tracer }, + ); + expect(result.ok).toBe(false); + expect(called).toBe(false); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("writes an index.html placeholder for Mode B (greenfield) runs", async () => { + // The user picking Mode B (candidate brings their own) needs + // something concrete to open after running `bun run`. The + // orchestrator drops a minimal index.html so a candidate has a + // landing pad — without it Mode B output is just a markdown + // brief and the empty role-config.json. The stub deliberately + // references BRIEF.md and avoids prescribing any framework. + const dir = mkdtempSync(join(tmpdir(), "iv-orch-mode-b-")); + try { + const briefStub: GeneratedProject = { + files: [ + { + path: "BRIEF.md", + content: + "# Brief\n\n## Time-box\n60 minutes\n\n## Acceptance criteria\n- Works.\n\n## Deliverables\n- A repo.\n", + }, + ], + }; + const cfg: RoleConfig = { ...baseConfig(dir), projectMode: "B" }; + const result = await runBootstrap(cfg, { client: client(briefStub) }); + expect(result.ok).toBe(true); + expect(existsSync(join(dir, "index.html"))).toBe(true); + const html = readFileSync(join(dir, "index.html"), "utf8"); + expect(html).toContain("BRIEF.md"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("does NOT write an index.html stub for Mode A (would clobber generated source)", async () => { + // Mode A already has full source files; auto-writing an + // index.html could overwrite an AI-generated one or confuse + // the candidate about what's source vs scaffold. 
Pin this so + // a refactor can't accidentally extend the stub to Mode A. + const dir = mkdtempSync(join(tmpdir(), "iv-orch-mode-a-noindex-")); + try { + const result = await runBootstrap(baseConfig(dir), { + client: client(modeAStub()), + }); + expect(result.ok).toBe(true); + expect(existsSync(join(dir, "index.html"))).toBe(false); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("preserves an AI-generated index.html instead of clobbering it", async () => { + // If the AI's Mode B output happens to include its own + // index.html, the orchestrator must NOT overwrite it. The + // stub is a fallback for when the AI didn't author one. + const dir = mkdtempSync(join(tmpdir(), "iv-orch-mode-b-keep-")); + try { + const customHtml = "

AI-authored landing

"; + const project: GeneratedProject = { + files: [ + { + path: "BRIEF.md", + content: + "# Brief\n\n## Time-box\n60 minutes\n\n## Acceptance criteria\n- Works.\n\n## Deliverables\n- A repo.\n", + }, + { path: "index.html", content: customHtml }, + ], + }; + const cfg: RoleConfig = { ...baseConfig(dir), projectMode: "B" }; + const result = await runBootstrap(cfg, { client: client(project) }); + expect(result.ok).toBe(true); + const html = readFileSync(join(dir, "index.html"), "utf8"); + expect(html).toBe(customHtml); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("uses an embedded kit template directory when provided", async () => { + const projectDir = mkdtempSync(join(tmpdir(), "iv-orch-")); + const kitDir = mkdtempSync(join(tmpdir(), "iv-kit-")); + try { + writeFileSync(join(kitDir, "INTERVIEW_RULES.md"), "# Rules\n"); + const result = await runBootstrap(baseConfig(projectDir), { + client: client(modeAStub()), + kitTemplateDir: kitDir, + }); + expect(result.ok).toBe(true); + expect(existsSync(join(projectDir, "INTERVIEW_RULES.md"))).toBe(true); + } finally { + rmSync(projectDir, { recursive: true, force: true }); + rmSync(kitDir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/unit/services/interview/bootstrap/project-generator-security.spec.ts b/tests/unit/services/interview/bootstrap/project-generator-security.spec.ts new file mode 100644 index 0000000..8c9b487 --- /dev/null +++ b/tests/unit/services/interview/bootstrap/project-generator-security.spec.ts @@ -0,0 +1,262 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { homedir, tmpdir } from "node:os"; +import { join } from "node:path"; +import { + type GeneratedProject, + type GeneratorClient, + generateProject, +} from "../../../../../src/services/interview/bootstrap/project-generator.js"; +import type { RoleConfig } from 
"../../../../../src/services/interview/bootstrap/role-config.js";
+
+// role builds the baseline Mode A RoleConfig used throughout this file.
+// Every test overrides outputDir; the placeholder value is never written to.
+function role(overrides: Partial<RoleConfig> = {}): RoleConfig {
+  return {
+    roleSlug: "security-test",
+    roleTitle: "Security Test Role",
+    stack: "TypeScript",
+    domain: "Testing",
+    featureDescription: "Security boundary testing",
+    timeBoxMinutes: 90,
+    projectMode: "A",
+    analysisMode: "ai-assisted",
+    rubricMode: "default",
+    outputDir: "(set per test)",
+    ...overrides,
+  };
+}
+
+// validModeAProject returns a generator payload with a README, a glossary,
+// two 100-line source files, a skipped sample test and a 300-line pad file.
+// The tests below rely on generateProject accepting it for Mode A.
+function validModeAProject(): GeneratedProject {
+  const padLines = Array.from(
+    { length: 100 },
+    (_, k) => `export const v${k} = ${k};`,
+  ).join("\n");
+  return {
+    files: [
+      { path: "README.md", content: "# Project\n" },
+      { path: "GLOSSARY.md", content: "# Glossary\n" },
+      { path: "src/deep-one.ts", content: padLines },
+      { path: "src/deep-two.ts", content: padLines },
+      {
+        path: "tests/feature.spec.ts",
+        content:
+          'import { describe, it } from "bun:test";\ndescribe.skip("feature", () => { it("todo", () => {}); });\n',
+      },
+      {
+        path: "src/pad.ts",
+        content: Array.from({ length: 300 }, (_, k) => `// line ${k}`).join(
+          "\n",
+        ),
+      },
+    ],
+  };
+}
+
+// clientFor wraps a fixed project in a GeneratorClient that returns it on
+// every call, regardless of the attempt number.
+function clientFor(project: GeneratedProject): GeneratorClient {
+  return { async generate() { return project; } };
+}
+
+describe("generateProject — path traversal security (resolveWithinRoot)", () => {
+  it("rejects absolute paths returned by the LLM generator", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-"));
+    try {
+      const malicious: GeneratedProject = {
+        files: [
+          // Absolute path — must be rejected
+          { path: "/etc/passwd", content: "evil" },
+        ],
+      };
+      await expect(
+        generateProject(role({ outputDir: dir }), clientFor(malicious)),
+      ).rejects.toThrow(/absolute/i);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("rejects path-traversal sequences that escape the output directory", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-"));
+    try {
+      const malicious: GeneratedProject = {
+        files: [
+          // Relative traversal escaping outputDir
+          { path: "../../etc/passwd", content: "evil" },
+        ],
+      };
+      await expect(
+        generateProject(role({ outputDir: dir }), clientFor(malicious)),
+      ).rejects.toThrow(/escapes/i);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("accepts deeply-nested paths that stay within the output directory", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-"));
+    try {
+      const safe: GeneratedProject = {
+        files: [
+          ...validModeAProject().files,
+          { path: "src/nested/deep/module.ts", content: "export const x = 1;" },
+        ],
+      };
+      const result = await generateProject(role({ outputDir: dir }), clientFor(safe));
+      expect(result.ok).toBe(true);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("rejects paths containing null bytes (potential injection vector)", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-"));
+    try {
+      // An embedded NUL followed by a traversal sequence. Which layer
+      // rejects it (path validation or the filesystem call) is deliberately
+      // not pinned here.
+      const malicious: GeneratedProject = {
+        files: [
+          { path: "src/a.ts\0../../etc/evil", content: "evil" },
+        ],
+      };
+      // Any error is acceptable — the important thing is that we never
+      // silently succeed.
+      await expect(
+        generateProject(role({ outputDir: dir }), clientFor(malicious)),
+      ).rejects.toThrow();
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("generateProject — assertSafeToClear guards", () => {
+  it("refuses to use a filesystem-root path as outputDir", async () => {
+    // Pointing outputDir at "/" would be destructive if the guard were
+    // missing, so the tracer client also proves the generator is never
+    // reached: the call must reject before generate() runs.
+    let called = false;
+    const tracer: GeneratorClient = {
+      async generate() {
+        called = true;
+        return validModeAProject();
+      },
+    };
+    await expect(
+      generateProject(role({ outputDir: "/" }), tracer),
+    ).rejects.toThrow(/root/i);
+    expect(called).toBe(false);
+  });
+
+  it("refuses to use the home directory as outputDir", async () => {
+    const home = homedir();
+    if (!home) return; // skip in pathological environments
+    let called = false;
+    const tracer: GeneratorClient = {
+      async generate() {
+        called = true;
+        return validModeAProject();
+      },
+    };
+    await expect(
+      generateProject(role({ outputDir: home }), tracer),
+    ).rejects.toThrow(/home/i);
+    expect(called).toBe(false);
+  });
+
+  it("accepts a fresh directory under os.tmpdir() as outputDir", async () => {
+    // The guards exercised above reject "/" and the user's home directory.
+    // This test never passes the temp root itself (which is not
+    // necessarily /tmp on every platform): it only checks that a
+    // mkdtemp-created child of os.tmpdir() is not mistaken for a
+    // protected location.
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-validate-"));
+    try {
+      // Use a valid project so we can check the full flow succeeds
+      const result = await generateProject(
+        role({ outputDir: dir }),
+        clientFor(validModeAProject()),
+      );
+      expect(result.ok).toBe(true);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("succeeds with a safe output dir that is not root/home", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-sec-safe-"));
+    try {
+      // A temporary directory nested under tmpdir is safe.
+      const result = await generateProject(role({ outputDir: dir }), clientFor(validModeAProject()));
+      expect(result.ok).toBe(true);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("generateProject — kit template conflict resolution", () => {
+  it("kit files take precedence over generator-produced files at the same path", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-kit-prio-"));
+    const kitDir = mkdtempSync(join(tmpdir(), "iv-kit-src-"));
+    try {
+      // The generator writes GLOSSARY.md with "Generator content"
+      // (using GLOSSARY rather than README so we don't model the AI
+      // authoring something it must not author per the new contract).
+      const overriddenProject: GeneratedProject = {
+        files: [
+          ...validModeAProject().files.filter(f => f.path !== "GLOSSARY.md"),
+          { path: "GLOSSARY.md", content: "Generator content\n" },
+        ],
+      };
+      // The kit also has GLOSSARY.md with "Kit content" — kit wins
+      writeFileSync(join(kitDir, "GLOSSARY.md"), "Kit content\n");
+
+      const result = await generateProject(
+        role({ outputDir: dir }),
+        clientFor(overriddenProject),
+        { kitTemplateDir: kitDir },
+      );
+      expect(result.ok).toBe(true);
+      const content = readFileSync(join(dir, "GLOSSARY.md"), "utf8");
+      expect(content).toBe("Kit content\n");
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+      rmSync(kitDir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("generateProject — retry passes previous failures to the client", () => {
+  it("provides previousFailures from the last attempt on retry", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-retry-"));
+    try {
+      const callLog: Array<{ attempt: number; previousFailures: readonly string[] | undefined }> = [];
+      const trackingClient: GeneratorClient = {
+        async generate(input) {
+          callLog.push({ attempt: input.attempt, previousFailures: input.previousFailures });
+          if (input.attempt < 2) {
+            // Return a malformed project on first attempt
+            return { files: [{ path: "NOTES.md", content: "incomplete" }] };
+          }
+          return validModeAProject();
+        },
+      };
+
+      const result = await generateProject(
+        role({ outputDir: dir }),
+        trackingClient,
+        { maxAttempts: 3 },
+      );
+      expect(result.ok).toBe(true);
+      expect(callLog).toHaveLength(2);
+      // First call: no previous failures (undefined and [] both qualify)
+      expect(callLog[0].attempt).toBe(1);
+      expect(callLog[0].previousFailures ?? []).toHaveLength(0);
+      // Second call: receives the failures from attempt 1
+      expect(callLog[1].attempt).toBe(2);
+      expect((callLog[1].previousFailures ?? []).length).toBeGreaterThan(0);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/tests/unit/services/interview/bootstrap/project-generator.spec.ts b/tests/unit/services/interview/bootstrap/project-generator.spec.ts
new file mode 100644
index 0000000..0b6a48b
--- /dev/null
+++ b/tests/unit/services/interview/bootstrap/project-generator.spec.ts
@@ -0,0 +1,225 @@
+import { describe, expect, it } from "bun:test";
+import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  type GeneratorClient,
+  type GeneratedProject,
+  generateProject,
+  validateGenerated,
+} from "../../../../../src/services/interview/bootstrap/project-generator.js";
+import type { RoleConfig } from "../../../../../src/services/interview/bootstrap/role-config.js";
+
+function role(overrides: Partial<RoleConfig> = {}): RoleConfig {
+  return {
+    roleSlug: "senior-backend",
+    roleTitle: "Senior Backend Engineer",
+    stack: "TypeScript",
+    domain: "Payments",
+    featureDescription: "Add idempotency keys",
+    timeBoxMinutes: 90,
+    projectMode: "A",
+    analysisMode: "ai-assisted",
+    rubricMode: "default",
+    outputDir: "(set per test)",
+    ...overrides,
+  };
+}
+
+// stubModeAProject returns a minimal Mode A project that passes the
+// current validator: README.md and a source file.
GLOSSARY.md and
+// sample tests were removed from the requirements (they leaked hints
+// to the candidate), and the kit-overlaid CLAUDE.md was removed for
+// the same reason.
+function stubModeAProject(): GeneratedProject {
+  return {
+    files: [
+      { path: "README.md", content: "# Project\nWhat you're building: a thing.\n" },
+      { path: "src/main.ts", content: "export const main = () => {};\n" },
+    ],
+  };
+}
+
+// stubModeBProject returns a Mode B payload: a single BRIEF.md carrying
+// Time-box, Acceptance criteria and Deliverables sections.
+function stubModeBProject(): GeneratedProject {
+  return {
+    files: [
+      {
+        path: "BRIEF.md",
+        content: `# Brief\n\n## Time-box\n90 minutes\n\n## Acceptance criteria\n- Works.\n\n## Deliverables\n- A repo with passing tests.\n`,
+      },
+    ],
+  };
+}
+
+// clientReturning yields the given projects one per generate() call, in
+// order, and keeps returning the last one once the list is exhausted.
+function clientReturning(...projects: GeneratedProject[]): GeneratorClient {
+  let i = 0;
+  return {
+    async generate() {
+      const project = projects[Math.min(i, projects.length - 1)];
+      i += 1;
+      return project;
+    },
+  };
+}
+
+describe("generateProject (Mode A)", () => {
+  it("writes generated files into outputDir and reports success", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      const client = clientReturning(stubModeAProject());
+      const result = await generateProject(role({ outputDir: dir }), client);
+      expect(result.ok).toBe(true);
+      expect(existsSync(join(dir, "README.md"))).toBe(true);
+      expect(existsSync(join(dir, "src", "main.ts"))).toBe(true);
+      expect(result.attempts).toBe(1);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("retries when validation fails, then succeeds on a later attempt within the default budget", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      // First attempt returns a malformed project (no README.md);
+      // second attempt succeeds. Default budget is 3 attempts.
+      const malformed: GeneratedProject = {
+        files: [{ path: "NOTES.md", content: "incomplete" }],
+      };
+      const client = clientReturning(malformed, stubModeAProject());
+      const result = await generateProject(role({ outputDir: dir }), client);
+      expect(result.ok).toBe(true);
+      expect(result.attempts).toBe(2);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("reports failure with diagnostic after exhausting the default attempt budget", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      const malformed: GeneratedProject = {
+        files: [{ path: "NOTES.md", content: "incomplete" }],
+      };
+      // clientReturning clamps to the last project when it runs out, so
+      // this returns malformed for every attempt and exhausts the
+      // default 3-attempt budget. The single structural check
+      // (missing README.md) drives failure here.
+      const client = clientReturning(malformed);
+      const result = await generateProject(role({ outputDir: dir }), client);
+      expect(result.ok).toBe(false);
+      expect(result.attempts).toBe(3);
+      expect(result.failures.length).toBeGreaterThan(0);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("copies the embedded interview-kit templates into outputDir", async () => {
+    // Both temp dirs are created before the try so the single finally
+    // removes them even when staging the fake kit throws.
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    const kitSrc = mkdtempSync(join(tmpdir(), "iv-kit-"));
+    try {
+      const client = clientReturning(stubModeAProject());
+      // stage a fake kit (these two fs helpers are not in this file's
+      // static node:fs import, hence the dynamic import)
+      const { writeFileSync, mkdirSync } = await import("node:fs");
+      mkdirSync(join(kitSrc, ".claude"), { recursive: true });
+      writeFileSync(join(kitSrc, "start.sh"), "#!/usr/bin/env bash\n");
+      writeFileSync(
+        join(kitSrc, ".claude", "settings.json"),
+        '{"hooks":{}}\n',
+      );
+      const result = await generateProject(role({ outputDir: dir }), client, {
+        kitTemplateDir: kitSrc,
+      });
+      expect(result.ok).toBe(true);
+      expect(existsSync(join(dir, "start.sh"))).toBe(true);
+      expect(existsSync(join(dir, ".claude", "settings.json"))).toBe(true);
+    } finally {
+      rmSync(kitSrc, { recursive: true, force: true });
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("substitutes {{TIME_BOX}} placeholders when copying kit templates", async () => {
+    // Same cleanup shape as the kit-copy test above: one finally covers
+    // both directories, including a failure while staging the kit.
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-tb-"));
+    const kitSrc = mkdtempSync(join(tmpdir(), "iv-kit-tb-"));
+    try {
+      const client = clientReturning(stubModeAProject());
+      const { writeFileSync } = await import("node:fs");
+      writeFileSync(
+        join(kitSrc, "INTERVIEW_RULES.md"),
+        "# Rules\n\nTime-box: **`{{TIME_BOX}}`** minutes.\n",
+      );
+      writeFileSync(
+        join(kitSrc, "no-template.md"),
+        "This file has no placeholders.\n",
+      );
+      const result = await generateProject(
+        role({ outputDir: dir, timeBoxMinutes: 75 }),
+        client,
+        { kitTemplateDir: kitSrc },
+      );
+      expect(result.ok).toBe(true);
+      const body = readFileSync(join(dir, "INTERVIEW_RULES.md"), "utf8");
+      expect(body).toContain("**`75`** minutes");
+      expect(body).not.toContain("{{TIME_BOX}}");
+      // Files without placeholders should pass through unchanged.
+      const untouched = readFileSync(join(dir, "no-template.md"), "utf8");
+      expect(untouched).toBe("This file has no placeholders.\n");
+    } finally {
+      rmSync(kitSrc, { recursive: true, force: true });
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("validateGenerated", () => {
+  it("re-runs validation against an already-written output dir (Mode A)", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      const client = clientReturning(stubModeAProject());
+      const cfg = role({ outputDir: dir });
+      const result = await generateProject(cfg, client);
+      expect(result.ok).toBe(true);
+      const revalidation = validateGenerated(cfg);
+      expect(revalidation.ok).toBe(true);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+
+  it("returns the failure list when the output dir is malformed", () => {
+    // An empty directory validated as Mode B: nothing has been generated
+    // into it, so validation must report at least one failure.
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      const cfg = role({ outputDir: dir, projectMode: "B" });
+      const result = validateGenerated(cfg);
+      expect(result.ok).toBe(false);
+      expect(result.failures.length).toBeGreaterThan(0);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
+
+describe("generateProject (Mode B)", () => {
+  it("writes a BRIEF.md and reports success", async () => {
+    const dir = mkdtempSync(join(tmpdir(), "iv-gen-"));
+    try {
+      const client = clientReturning(stubModeBProject());
+      const result = await generateProject(
+        role({ outputDir: dir, projectMode: "B" }),
+        client,
+      );
+      expect(result.ok).toBe(true);
+      const brief = readFileSync(join(dir, "BRIEF.md"), "utf8");
+      expect(brief.length).toBeGreaterThan(0);
+    } finally {
+      rmSync(dir, { recursive: true, force: true });
+    }
+  });
+});
diff --git a/tests/unit/services/interview/bootstrap/project-validator.spec.ts b/tests/unit/services/interview/bootstrap/project-validator.spec.ts
new file mode 100644
index 0000000..c31f253
--- /dev/null
+++ 
b/tests/unit/services/interview/bootstrap/project-validator.spec.ts
@@ -0,0 +1,134 @@
+import { describe, expect, it } from "bun:test";
+import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  validateModeAProject,
+  validateModeBProject,
+} from "../../../../../src/services/interview/bootstrap/project-validator.js";
+
+// makeTempProject creates an empty scratch directory under os.tmpdir().
+function makeTempProject(): string {
+  return mkdtempSync(join(tmpdir(), "iv-validator-"));
+}
+
+// inTempProject runs `body` against a fresh scratch directory and removes
+// the directory afterwards, whether or not `body` throws.
+function inTempProject(body: (projectDir: string) => void): void {
+  const projectDir = makeTempProject();
+  try {
+    body(projectDir);
+  } finally {
+    rmSync(projectDir, { recursive: true, force: true });
+  }
+}
+
+// writeModeAFixture lays down src/main.ts and, unless withReadme is
+// switched off, a README.md.
+function writeModeAFixture(
+  dir: string,
+  opts: { withReadme?: boolean } = {},
+): void {
+  const settings = Object.assign({ withReadme: true }, opts);
+  if (settings.withReadme) {
+    writeFileSync(join(dir, "README.md"), "# Project\n");
+  }
+  const srcDir = join(dir, "src");
+  mkdirSync(srcDir, { recursive: true });
+  writeFileSync(join(srcDir, "main.ts"), "export const main = () => {};\n");
+}
+
+describe("project-validator (Mode A)", () => {
+  it("passes with only a README.md (the single structural requirement)", () => {
+    // README.md is the one file the validator insists on. GLOSSARY,
+    // sample tests and the kit's CLAUDE.md are no longer required —
+    // they hinted at the answer or coached the candidate's agent in
+    // ways that undermined the evaluation.
+    inTempProject((dir) => {
+      writeModeAFixture(dir);
+      const outcome = validateModeAProject(dir);
+      expect(outcome.ok).toBe(true);
+      expect(outcome.failures).toEqual([]);
+    });
+  });
+
+  it("fails when README.md is missing", () => {
+    inTempProject((dir) => {
+      writeModeAFixture(dir, { withReadme: false });
+      const outcome = validateModeAProject(dir);
+      expect(outcome.ok).toBe(false);
+      const mentionsReadme = outcome.failures.some((f) => /README\.md/i.test(f));
+      expect(mentionsReadme).toBe(true);
+    });
+  });
+
+  it("does NOT require GLOSSARY.md (regression: glossary leaked domain hints)", () => {
+    // Contract pin: README alone, no GLOSSARY.md, must validate. A
+    // glossary would enumerate the very domain concepts the candidate
+    // is supposed to identify on their own.
+    inTempProject((dir) => {
+      writeFileSync(join(dir, "README.md"), "# Project\n");
+      expect(validateModeAProject(dir).ok).toBe(true);
+    });
+  });
+
+  it("does NOT require sample tests (regression: pre-existing tests leaked the API shape)", () => {
+    // Contract pin: no test files at all must still validate. A
+    // pre-existing `describe.skip("addUser", ...)` would reveal the
+    // function name the candidate is expected to write; writing tests
+    // is part of the candidate's own work.
+    inTempProject((dir) => {
+      writeFileSync(join(dir, "README.md"), "# Project\n");
+      expect(validateModeAProject(dir).ok).toBe(true);
+    });
+  });
+});
+
+describe("project-validator (Mode B)", () => {
+  it("passes when BRIEF.md exists with required sections", () => {
+    inTempProject((dir) => {
+      const briefPath = join(dir, "BRIEF.md");
+      writeFileSync(
+        briefPath,
+        `# Brief\n\n## Time-box\n90 minutes\n\n## Acceptance criteria\n- Works\n\n## Deliverables\n- A repo\n`,
+      );
+      const outcome = validateModeBProject(dir);
+      expect(outcome.ok).toBe(true);
+      expect(outcome.failures).toEqual([]);
+    });
+  });
+
+  it("fails when BRIEF.md is missing", () => {
+    inTempProject((dir) => {
+      const outcome = validateModeBProject(dir);
+      expect(outcome.ok).toBe(false);
+      const mentionsBrief = outcome.failures.some((f) => /BRIEF\.md/i.test(f));
+      expect(mentionsBrief).toBe(true);
+    });
+  });
+
+  it("fails when BRIEF.md is empty", () => {
+    inTempProject((dir) => {
+      writeFileSync(join(dir, "BRIEF.md"), "");
+      expect(validateModeBProject(dir).ok).toBe(false);
+    });
+  });
+
+  it("fails when BRIEF.md is missing required sections", () => {
+    inTempProject((dir) => {
+      // Title only — none of the section headings are present.
+      writeFileSync(join(dir, "BRIEF.md"), "# Brief\n\nJust a title.\n");
+      const outcome = validateModeBProject(dir);
+      expect(outcome.ok).toBe(false);
+      const namesMissingSection = outcome.failures.some((f) =>
+        /acceptance|deliverable|time-box/i.test(f),
+      );
+      expect(namesMissingSection).toBe(true);
+    });
+  });
+});
diff --git a/tests/unit/services/interview/bootstrap/role-config-edge-cases.spec.ts b/tests/unit/services/interview/bootstrap/role-config-edge-cases.spec.ts
new file mode 100644
index 0000000..6ac0a5c
--- /dev/null
+++ 
b/tests/unit/services/interview/bootstrap/role-config-edge-cases.spec.ts
@@ -0,0 +1,236 @@
+import { describe, expect, it } from "bun:test";
+import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import {
+  type RoleConfig,
+  readRoleConfig,
+  validateRoleConfig,
+  writeRoleConfig,
+} from "../../../../../src/services/interview/bootstrap/role-config.js";
+
+// baseConfig is the known-good starting point each test mutates.
+function baseConfig(): RoleConfig {
+  return {
+    roleSlug: "senior-backend",
+    roleTitle: "Senior Backend Engineer",
+    stack: "TypeScript / Node",
+    domain: "Payments",
+    featureDescription: "Add idempotency keys to the refunds endpoint.",
+    timeBoxMinutes: 90,
+    projectMode: "A",
+    analysisMode: "ai-assisted",
+    rubricMode: "default",
+    outputDir: "./roles/senior-backend",
+  };
+}
+
+// expectRejected asserts that validation fails and, when a pattern is
+// given, that at least one failure message matches it.
+function expectRejected(candidate: RoleConfig, mention?: RegExp): void {
+  const verdict = validateRoleConfig(candidate);
+  expect(verdict.ok).toBe(false);
+  if (mention !== undefined) {
+    expect(verdict.failures.some((f) => mention.test(f))).toBe(true);
+  }
+}
+
+// withTempDirs creates one scratch directory per prefix (in order), hands
+// them to `body`, and removes them in the same order afterwards.
+function withTempDirs(
+  prefixes: readonly string[],
+  body: (...dirs: string[]) => void,
+): void {
+  const dirs = prefixes.map((p) => mkdtempSync(join(tmpdir(), p)));
+  try {
+    body(...dirs);
+  } finally {
+    for (const d of dirs) {
+      rmSync(d, { recursive: true, force: true });
+    }
+  }
+}
+
+describe("validateRoleConfig — boundary and negative cases", () => {
+  it("rejects an unknown projectMode (not A or B)", () => {
+    const bad = { ...baseConfig(), projectMode: "C" } as unknown as RoleConfig;
+    expectRejected(bad, /projectMode/i);
+  });
+
+  it("rejects an unknown analysisMode", () => {
+    const bad = {
+      ...baseConfig(),
+      analysisMode: "fully-automated",
+    } as unknown as RoleConfig;
+    expectRejected(bad, /analysisMode/i);
+  });
+
+  it("rejects an unknown rubricMode", () => {
+    const bad = {
+      ...baseConfig(),
+      rubricMode: "partial-jd",
+    } as unknown as RoleConfig;
+    expectRejected(bad, /rubricMode/i);
+  });
+
+  it("rejects a timeBoxMinutes of exactly 14 (one below minimum of 15)", () => {
+    expectRejected({ ...baseConfig(), timeBoxMinutes: 14 }, /time.?box/i);
+  });
+
+  it("accepts a timeBoxMinutes of exactly 15 (the minimum)", () => {
+    const atMinimum: RoleConfig = { ...baseConfig(), timeBoxMinutes: 15 };
+    expect(validateRoleConfig(atMinimum).ok).toBe(true);
+  });
+
+  it("accepts a timeBoxMinutes of exactly 240 (the maximum)", () => {
+    const atMaximum: RoleConfig = { ...baseConfig(), timeBoxMinutes: 240 };
+    expect(validateRoleConfig(atMaximum).ok).toBe(true);
+  });
+
+  it("rejects a timeBoxMinutes of exactly 241 (one above maximum)", () => {
+    expectRejected({ ...baseConfig(), timeBoxMinutes: 241 });
+  });
+
+  it("rejects a non-finite timeBoxMinutes (NaN)", () => {
+    expectRejected({ ...baseConfig(), timeBoxMinutes: Number.NaN });
+  });
+
+  it("rejects a non-finite timeBoxMinutes (Infinity)", () => {
+    expectRejected({
+      ...baseConfig(),
+      timeBoxMinutes: Number.POSITIVE_INFINITY,
+    });
+  });
+
+  it("rejects an empty string for featureDescription", () => {
+    expectRejected(
+      { ...baseConfig(), featureDescription: "" },
+      /featureDescription/i,
+    );
+  });
+
+  it("rejects a whitespace-only featureDescription", () => {
+    expectRejected({ ...baseConfig(), featureDescription: " " });
+  });
+
+  it("rejects rubricMode=custom when customPrompt is only whitespace", () => {
+    expectRejected(
+      { ...baseConfig(), rubricMode: "custom", customPrompt: " " },
+      /customPrompt/i,
+    );
+  });
+
+  it("collects multiple failures in a single validation pass", () => {
+    // Three independent problems at once: the validator should report
+    // all of them rather than stopping at the first.
+    const broken = {
+      ...baseConfig(),
+      roleSlug: "",
+      stack: "",
+      timeBoxMinutes: 0,
+    } as unknown as RoleConfig;
+    const verdict = validateRoleConfig(broken);
+    expect(verdict.ok).toBe(false);
+    expect(verdict.failures.length).toBeGreaterThanOrEqual(3);
+  });
+});
+
+describe("readRoleConfig — error cases", () => {
+  // Each row: test title, raw file contents, expected error pattern.
+  const unreadable: ReadonlyArray<readonly [string, string, RegExp]> = [
+    [
+      "throws when role-config.json contains invalid JSON",
+      "{ not valid json }",
+      /Malformed/i,
+    ],
+    [
+      "throws when role-config.json contains a non-object top-level value",
+      '"just a string"',
+      /Malformed/i,
+    ],
+    ["throws when role-config.json contains null", "null", /Malformed/i],
+    [
+      "throws when role-config.json parses to valid JSON but fails RoleConfig validation",
+      // Valid JSON object, but missing required fields
+      JSON.stringify({ roleSlug: "ok" }),
+      /Invalid/i,
+    ],
+  ];
+
+  for (const [title, contents, expected] of unreadable) {
+    it(title, () => {
+      withTempDirs(["iv-rc-err-"], (dir) => {
+        writeFileSync(join(dir, "role-config.json"), contents);
+        expect(() => readRoleConfig(dir)).toThrow(expected);
+      });
+    });
+  }
+});
+
+describe("writeRoleConfig — persistence edge cases", () => {
+  it("creates the directory when it does not exist yet", () => {
+    withTempDirs(["iv-rc-"], (base) => {
+      const target = join(base, "nested", "role-dir");
+      writeRoleConfig(target, baseConfig());
+      const stored = readRoleConfig(target);
+      expect(stored?.roleSlug).toBe("senior-backend");
+    });
+  });
+
+  it("overwrites an existing role-config.json", () => {
+    withTempDirs(["iv-rc-overwrite-"], (dir) => {
+      writeRoleConfig(dir, baseConfig());
+      const revised: RoleConfig = { ...baseConfig(), domain: "Logistics" };
+      writeRoleConfig(dir, revised);
+      const stored = readRoleConfig(dir);
+      expect(stored?.domain).toBe("Logistics");
+    });
+  });
+
+  it("stores optional jdPath when present (independent of rubric mode)", () => {
+    withTempDirs(["iv-rc-optionals-", "iv-jd-"], (dir, jdDir) => {
+      const jdPath = join(jdDir, "jd.md");
+      writeFileSync(jdPath, "# JD content\n");
+      const withJd: RoleConfig = {
+        ...baseConfig(),
+        rubricMode: "default",
+        jdPath,
+      };
+      writeRoleConfig(dir, withJd);
+      const stored = readRoleConfig(dir);
+      expect(stored?.jdPath).toBe(jdPath);
+    });
+  });
+
+  it("stores optional customPrompt when present", () => {
+    withTempDirs(["iv-rc-custom-"], (dir) => {
+      const withPrompt: RoleConfig = {
+        ...baseConfig(),
+        rubricMode: "custom",
+        customPrompt: "Score primarily on architectural decisions",
+      };
+      writeRoleConfig(dir, withPrompt);
+      const stored = readRoleConfig(dir);
+      expect(stored?.customPrompt).toBe(
+        "Score primarily on architectural decisions",
+      );
+    });
+  });
+});
\ No newline at end of file
diff --git a/tests/unit/services/interview/bootstrap/role-config.spec.ts b/tests/unit/services/interview/bootstrap/role-config.spec.ts
new file mode 100644
index 0000000..383f7bd
--- /dev/null
+++ b/tests/unit/services/interview/bootstrap/role-config.spec.ts
@@ -0,0 +1,235 @@
+import { describe, expect, it } from "bun:test";
+import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + type RoleConfig, + readRoleConfig, + validateRoleConfig, + writeRoleConfig, +} from "../../../../../src/services/interview/bootstrap/role-config.js"; + +function baseConfig(): RoleConfig { + return { + roleSlug: "senior-backend", + roleTitle: "Senior Backend Engineer", + stack: "TypeScript / Node", + domain: "Payments", + featureDescription: "Add idempotency keys to the refunds endpoint.", + timeBoxMinutes: 90, + projectMode: "A", + analysisMode: "ai-assisted", + rubricMode: "default", + outputDir: "./roles/senior-backend", + }; +} + +describe("role-config validation", () => { + it("accepts a valid default-rubric config", () => { + const r = validateRoleConfig(baseConfig()); + expect(r.ok).toBe(true); + }); + + it("rejects missing roleSlug", () => { + const c = baseConfig(); + // @ts-expect-error testing invalid input + c.roleSlug = ""; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /roleSlug/i.test(f))).toBe(true); + }); + + it("rejects rubricMode=custom without a non-empty customPrompt", () => { + const c: RoleConfig = { ...baseConfig(), rubricMode: "custom" }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /customPrompt/i.test(f))).toBe(true); + }); + + it("accepts rubricMode=custom with a non-empty customPrompt", () => { + const c: RoleConfig = { + ...baseConfig(), + rubricMode: "custom", + customPrompt: "Look for X and Y.", + }; + expect(validateRoleConfig(c).ok).toBe(true); + }); + + it("rejects retired rubricMode 'default+jd'", () => { + // "default+jd" was retired in favour of a standalone JD field + // (jdPath, jdInfluencesProject). The validator must surface a + // clear error if a stale config file still uses the old value. 
+ const c = { ...baseConfig(), rubricMode: "default+jd" as never } as RoleConfig; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /rubricMode/i.test(f))).toBe(true); + }); + + it("rejects a jdPath that does not exist on disk", () => { + // jdPath is now optional regardless of rubric mode, but when + // supplied it must point at a real file — otherwise the AI + // observer will read nothing. + const c: RoleConfig = { + ...baseConfig(), + rubricMode: "default", + jdPath: "/definitely/not/a/real/path/jd.md", + }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /jdPath/i.test(f))).toBe(true); + }); + + it("accepts a jdPath alongside the default rubric (independent inputs)", () => { + // The proctor can now combine ANY rubric with a JD — the old + // coupling was an unnecessary restriction. + const dir = mkdtempSync(join(tmpdir(), "iv-jd-")); + try { + const jdPath = join(dir, "jd.md"); + writeFileSync(jdPath, "# JD\n"); + const c: RoleConfig = { + ...baseConfig(), + rubricMode: "default", + jdPath, + }; + expect(validateRoleConfig(c).ok).toBe(true); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("rejects jdInfluencesProject=true without a jdPath", () => { + // The influence flag tells the generator to read the JD; without + // a path there's nothing to read. Caught at validation time so + // the bun subprocess never sees an inconsistent config. + const c: RoleConfig = { + ...baseConfig(), + rubricMode: "default", + jdInfluencesProject: true, + }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /jdInfluencesProject/i.test(f))).toBe(true); + }); + + it("accepts an empty domain when a jdPath is attached (JD describes the domain)", () => { + // The wizard skips the Domain question whenever a JD is + // provided; the role-config produced has an empty domain. 
+ // Validation must accept this — otherwise headless callers + // hitting the same shape would crash. The OpenAI prompt + // renders a "Domain: infer from the job description" line + // for the model. + const dir = mkdtempSync(join(tmpdir(), "iv-jd-no-domain-")); + try { + const jdPath = join(dir, "jd.md"); + writeFileSync(jdPath, "# Healthtech\nWe build EHR integrations.\n"); + const c: RoleConfig = { + ...baseConfig(), + domain: "", // skipped by the wizard because JD is attached + jdPath, + }; + expect(validateRoleConfig(c).ok).toBe(true); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("rejects an empty domain when no jdPath is attached", () => { + const c: RoleConfig = { ...baseConfig(), domain: "" }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /domain/i.test(f))).toBe(true); + }); + + it("accepts jdInfluencesProject=true paired with a real jdPath", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-jd-influence-")); + try { + const jdPath = join(dir, "jd.md"); + writeFileSync( + jdPath, + "# Junior Healthcare Engineer\nFamiliarity with FHIR, HL7, or EHR concepts.\n", + ); + const c: RoleConfig = { + ...baseConfig(), + rubricMode: "default", + jdPath, + jdInfluencesProject: true, + }; + expect(validateRoleConfig(c).ok).toBe(true); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("rejects unsupported timeBoxMinutes", () => { + const c: RoleConfig = { ...baseConfig(), timeBoxMinutes: 7 }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /time.?box/i.test(f))).toBe(true); + }); + + it("accepts custom timeBoxMinutes when explicitly provided", () => { + const c: RoleConfig = { ...baseConfig(), timeBoxMinutes: 45 }; + // 45 is allowed as a custom value as long as it is between 15 and 240 + expect(validateRoleConfig(c).ok).toBe(true); + }); + + it("accepts stackByCandidate=true when paired with projectMode 
'B'", () => { + // "Greenfield (candidate picks stack)" — the brief tells the + // candidate they pick the tooling. Only valid with Mode B (no + // starter code), where letting the candidate choose is coherent. + const c: RoleConfig = { + ...baseConfig(), + projectMode: "B", + stackByCandidate: true, + }; + expect(validateRoleConfig(c).ok).toBe(true); + }); + + it("rejects stackByCandidate=true with projectMode 'A'", () => { + // Mode A scaffolds code IN a specific stack, so "candidate picks + // the stack" is incoherent. Validator catches the misconfiguration + // before a confused brownfield project + greenfield brief ships. + const c: RoleConfig = { + ...baseConfig(), + projectMode: "A", + stackByCandidate: true, + }; + const r = validateRoleConfig(c); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /stackByCandidate/i.test(f))).toBe(true); + }); +}); + +describe("role-config persistence", () => { + it("round-trips through writeRoleConfig / readRoleConfig", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-roleio-")); + try { + const cfg = baseConfig(); + writeRoleConfig(dir, cfg); + const read = readRoleConfig(dir); + expect(read).toEqual(cfg); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("readRoleConfig returns null when no config file exists", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-roleio-")); + try { + expect(readRoleConfig(dir)).toBeNull(); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("writeRoleConfig refuses to write an invalid config", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-roleio-")); + try { + const bad = { ...baseConfig(), roleSlug: "" }; + // @ts-expect-error testing invalid input + expect(() => writeRoleConfig(dir, bad)).toThrow(); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/unit/services/interview/cohort/cohort-summary-edge-cases.spec.ts 
b/tests/unit/services/interview/cohort/cohort-summary-edge-cases.spec.ts new file mode 100644 index 0000000..9c750a0 --- /dev/null +++ b/tests/unit/services/interview/cohort/cohort-summary-edge-cases.spec.ts @@ -0,0 +1,222 @@ +import { describe, expect, it } from "bun:test"; +import { + mkdirSync, + mkdtempSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { AuditFrontmatter } from "../../../../../src/services/interview/review/audit-writer.js"; +import { + loadCohort, + renderCohortSummary, +} from "../../../../../src/services/interview/cohort/cohort-summary.js"; + +/** Fixture: creates roleDir/slug/ and writes audit.json holding default frontmatter overridden by `fm`, plus an empty `result`. */ function makeCandidate( + roleDir: string, + slug: string, + fm: Partial<AuditFrontmatter>, +): void { + const dir = join(roleDir, slug); + mkdirSync(dir, { recursive: true }); + const full: AuditFrontmatter = { + tags: ["hiring"], + candidate: slug, + role: "senior-backend", + date: "2026-05-10", + rubric_version: "1.0.0", + rubric_mode: "default", + signed_off: false, + ...fm, + }; + writeFileSync( + join(dir, "audit.json"), + JSON.stringify({ frontmatter: full, result: {} }), + ); +} + +describe("loadCohort — edge cases", () => { + it("skips non-directory entries (e.g. 
loose files) in the role directory", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + // Create a loose file (not a directory) at the top level of roleDir + writeFileSync(join(dir, "COHORT.md"), "# Cohort\n"); + writeFileSync(join(dir, "some-file.txt"), "random"); + // Create one valid candidate folder + makeCandidate(dir, "alice", { candidate: "Alice" }); + const records = loadCohort(dir); + // Only alice should be picked up; the loose files are not directories + expect(records).toHaveLength(1); + expect(records[0].frontmatter.candidate).toBe("Alice"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips subdirectories with malformed audit.json", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + // Good candidate + makeCandidate(dir, "alice", { candidate: "Alice" }); + // Bad candidate: malformed JSON + const badDir = join(dir, "bob"); + mkdirSync(badDir, { recursive: true }); + writeFileSync(join(badDir, "audit.json"), "{ not valid json }"); + const records = loadCohort(dir); + // Only alice (bad JSON silently skipped) + expect(records).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips audit.json files missing required frontmatter fields", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice" }); + // audit.json missing the 'candidate' field + const incompleteDir = join(dir, "bob"); + mkdirSync(incompleteDir, { recursive: true }); + writeFileSync( + join(incompleteDir, "audit.json"), + JSON.stringify({ + frontmatter: { role: "backend", date: "2026-05-10" }, + }), + ); + const records = loadCohort(dir); + expect(records).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips audit.json files where frontmatter is not an object", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); 
+ try { + makeCandidate(dir, "alice", { candidate: "Alice" }); + const badDir = join(dir, "carol"); + mkdirSync(badDir, { recursive: true }); + writeFileSync( + join(badDir, "audit.json"), + JSON.stringify({ frontmatter: "just a string" }), + ); + const records = loadCohort(dir); + expect(records).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("returns an empty list when the roleDir is empty", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + expect(loadCohort(dir)).toEqual([]); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("renderCohortSummary — session_date priority for display", () => { + it("uses session_date instead of date in the Interviewed column when available", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + // date = 2026-05-10, session_date = 2026-06-01 + makeCandidate(dir, "alice", { + candidate: "Alice", + date: "2026-05-10", + session_date: "2026-06-01", + }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toContain("2026-06-01"); + // The submission date should NOT appear in place of the session date + expect(body).not.toContain("2026-05-10"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("falls back to date when session_date is absent", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "alice", { + candidate: "Alice", + date: "2026-05-10", + }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toContain("2026-05-10"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("renderCohortSummary — sorting edge cases", () => { + it("orders chronologically using session_date when available", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "carol", { + 
candidate: "Carol", + date: "2026-05-20", + session_date: "2026-05-20", + }); + makeCandidate(dir, "alice", { + candidate: "Alice", + date: "2026-05-01", + session_date: "2026-05-01", + }); + const body = renderCohortSummary("senior-backend", loadCohort(dir), { + order: "chronological", + }); + expect(body.indexOf("Alice")).toBeLessThan(body.indexOf("Carol")); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("renders link to summary.md using the subfolder name", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "alice-2026-05-12", { candidate: "Alice" }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toContain("alice-2026-05-12/summary.md"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("shows recommendation as — for pending sign-offs", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "bob", { candidate: "Bob", signed_off: false }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + // Recommendation column for unsigned should be "—" + expect(body).toContain("| — |"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("does NOT include score, rank, or total columns regardless of data", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-edge-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice", signed_off: true, recommendation: "Hire" }); + makeCandidate(dir, "bob", { candidate: "Bob", signed_off: false }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).not.toMatch(/\|\s*Score\s*\|/i); + expect(body).not.toMatch(/\|\s*Total\s*\|/i); + expect(body).not.toMatch(/\|\s*Rank\s*\|/i); + expect(body).not.toMatch(/\d+\.\d+/); // no decimal numbers (scores) + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("renderCohortSummary 
— includes role name in the title", () => { + it("renders the role slug in the cohort heading", () => { + const body = renderCohortSummary("senior-backend-engineer", []); + expect(body).toContain("# Cohort: senior-backend-engineer"); + }); +}); \ No newline at end of file diff --git a/tests/unit/services/interview/cohort/cohort-summary.spec.ts b/tests/unit/services/interview/cohort/cohort-summary.spec.ts new file mode 100644 index 0000000..bebcfe7 --- /dev/null +++ b/tests/unit/services/interview/cohort/cohort-summary.spec.ts @@ -0,0 +1,182 @@ +import { describe, expect, it } from "bun:test"; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { AuditFrontmatter } from "../../../../../src/services/interview/review/audit-writer.js"; +import { + loadCohort, + renderCohortSummary, + writeCohortSummary, +} from "../../../../../src/services/interview/cohort/cohort-summary.js"; + +/** Fixture: creates roleDir/slug/, writes audit.json (default frontmatter overridden by `fm`, empty `result`) and a stub summary.md. */ function makeCandidate( + roleDir: string, + slug: string, + fm: Partial<AuditFrontmatter>, +): void { + const dir = join(roleDir, slug); + mkdirSync(dir, { recursive: true }); + const full: AuditFrontmatter = { + tags: ["hiring"], + candidate: slug, + role: "senior-backend", + date: "2026-05-10", + rubric_version: "1.0.0", + rubric_mode: "default", + signed_off: false, + ...fm, + }; + writeFileSync( + join(dir, "audit.json"), + JSON.stringify({ frontmatter: full, result: {} }), + ); + writeFileSync(join(dir, "summary.md"), "stub"); +} + +describe("loadCohort", () => { + it("returns an empty list when the role directory does not exist", () => { + expect(loadCohort("/tmp/definitely-not-real")).toEqual([]); + }); + + it("reads audit.json from each candidate subfolder", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice Chen" }); + makeCandidate(dir, "bob", { candidate: "Bob Park" }); + const records = 
loadCohort(dir); + expect(records).toHaveLength(2); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips subfolders without an audit.json", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice" }); + mkdirSync(join(dir, "incomplete")); + expect(loadCohort(dir)).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("renderCohortSummary", () => { + it("places the advisory warning banner at the top", () => { + const body = renderCohortSummary("senior-backend", []); + expect(body).toMatch(/THIS COHORT REPORT IS ADVISORY/); + expect(body).toMatch(/not a score/); + }); + + it("emits the required columns and no score column", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice" }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toContain( + "| Candidate | Interviewed | Sign-off | Recommendation | Audit |", + ); + // No score column or numerical totals column in the header. 
+ expect(body).not.toMatch(/\|\s*Score\s*\|/i); + expect(body).not.toMatch(/\|\s*Total\s*\|/i); + expect(body).not.toMatch(/\|\s*Rank\s*\|/i); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("marks pending sign-offs with ⏳ and reviewed with ✅", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { + candidate: "Alice", + signed_off: true, + recommendation: "Hire", + }); + makeCandidate(dir, "bob", { candidate: "Bob", signed_off: false }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toMatch(/Alice.*Reviewed/); + expect(body).toMatch(/Bob.*Pending/); + expect(body).toMatch(/Alice.*Hire/); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("orders alphabetically by default", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "zelda", { candidate: "Zelda" }); + makeCandidate(dir, "alice", { candidate: "Alice" }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body.indexOf("Alice")).toBeLessThan(body.indexOf("Zelda")); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("orders chronologically when requested", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "zelda", { + candidate: "Zelda", + date: "2026-05-01", + }); + makeCandidate(dir, "alice", { + candidate: "Alice", + date: "2026-05-15", + }); + const body = renderCohortSummary("senior-backend", loadCohort(dir), { + order: "chronological", + }); + expect(body.indexOf("Zelda")).toBeLessThan(body.indexOf("Alice")); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("renders a friendly message when there are no candidates", () => { + const body = renderCohortSummary("senior-backend", []); + expect(body).toContain("No candidates yet"); + }); + + it("counts pending vs reviewed in the header", 
() => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { signed_off: true, recommendation: "Hire" }); + makeCandidate(dir, "bob", { signed_off: false }); + makeCandidate(dir, "carol", { signed_off: false }); + const body = renderCohortSummary("senior-backend", loadCohort(dir)); + expect(body).toMatch(/3 \(2 pending sign-off, 1 reviewed\)/); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("writeCohortSummary", () => { + it("writes COHORT.md inside the role directory", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-cohort-")); + try { + makeCandidate(dir, "alice", { candidate: "Alice" }); + const out = writeCohortSummary({ + roleDir: dir, + roleSlug: "senior-backend", + }); + expect(existsSync(out.path)).toBe(true); + expect(out.recordCount).toBe(1); + const body = readFileSync(out.path, "utf8"); + expect(body).toContain("THIS COHORT REPORT IS ADVISORY"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/unit/services/interview/cohort/skill.spec.ts b/tests/unit/services/interview/cohort/skill.spec.ts new file mode 100644 index 0000000..3e367ad --- /dev/null +++ b/tests/unit/services/interview/cohort/skill.spec.ts @@ -0,0 +1,71 @@ +import { describe, expect, it } from "bun:test"; +import { existsSync, readFileSync } from "node:fs"; +import { resolve } from "node:path"; + +/** Absolute path to the skill document, resolved relative to this spec file's directory. */ const SKILL_PATH = resolve( + import.meta.dir, + "../../../../../skills/teamhero-interview/SKILL.md", +); + +/** Reads the file at SKILL_PATH as UTF-8 text; readFileSync throws if it cannot be read. */ function loadSkill(): string { + return readFileSync(SKILL_PATH, "utf8"); +} + +describe("teamhero-interview Claude skill", () => { + it("exists in skills/teamhero-interview/SKILL.md", () => { + expect(existsSync(SKILL_PATH)).toBe(true); + }); + + it("has YAML frontmatter declaring name and description", () => { + const body = loadSkill(); + const fm = body.match(/^---\n([\s\S]*?)\n---/); + expect(fm).not.toBeNull(); + const block = fm?.[1] ?? 
""; + expect(block).toMatch(/^name:\s*teamhero-interview\b/m); + expect(block).toMatch(/^description:\s+/m); + }); + + it("documents all 3 MVP verbs with example invocations", () => { + const body = loadSkill(); + for (const verb of ["bootstrap", "review", "cohort"]) { + expect(body).toContain(`teamhero interview ${verb}`); + } + }); + + it("mentions the v1.5 verb stubs (list-roles, list-candidates)", () => { + const body = loadSkill(); + expect(body).toContain("list-roles"); + expect(body).toContain("list-candidates"); + }); + + it("includes the ethical framing — observations not scores, bias diversification, human-in-the-loop", () => { + const body = loadSkill(); + expect(body).toMatch(/Observations, not scores/i); + expect(body).toMatch(/bias diversification/i); + expect(body).toMatch(/Human-in-the-loop/i); + }); + + it("instructs explicit refusal when the user asks for a numerical score", () => { + const body = loadSkill(); + expect(body).toMatch(/Do not produce scores/i); + }); + + it("describes cohort orchestration (read role config → review each → run cohort)", () => { + const body = loadSkill(); + expect(body).toMatch(/Cohort orchestration/); + expect(body).toMatch(/role config/); + expect(body).toMatch(/sign-off/); + }); + + it("explicitly states the skill contains no business logic", () => { + const body = loadSkill(); + expect(body).toMatch(/no business logic/i); + expect(body).toMatch(/src\/services\/interview/); + }); + + it("warns against feeding session_recording_url to the AI observer", () => { + const body = loadSkill(); + expect(body).toMatch(/session_recording_url/); + expect(body).toMatch(/not.*feed|do not.*observer/i); + }); +}); diff --git a/tests/unit/services/interview/kit/kit-smoke.spec.ts b/tests/unit/services/interview/kit/kit-smoke.spec.ts new file mode 100644 index 0000000..f31d7eb --- /dev/null +++ b/tests/unit/services/interview/kit/kit-smoke.spec.ts @@ -0,0 +1,146 @@ +import { describe, expect, it } from "bun:test"; +import { 
spawnSync } from "node:child_process"; +import { + cpSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; + +const KIT_DIR = resolve( + import.meta.dir, + "../../../../../teamhero-interview-kit", +); + +/** Copies KIT_DIR recursively into a fresh temp directory; returns that directory and a cleanup callback that removes it. */ function stageKit(): { dir: string; cleanup: () => void } { + const dir = mkdtempSync(join(tmpdir(), "iv-kit-smoke-")); + cpSync(KIT_DIR, dir, { recursive: true }); + return { dir, cleanup: () => rmSync(dir, { recursive: true, force: true }) }; +} + +/** Rewrites PRIVACY_RELEASE.md inside `dir`, replacing the first signature placeholder and the first date placeholder with fixed sample values. */ function sign(dir: string): void { + const path = join(dir, "PRIVACY_RELEASE.md"); + const body = readFileSync(path, "utf8") + .replace(/\(placeholder — candidate signs here\)/, "Jane Doe") + .replace(/\(placeholder — candidate dates here.*\)/, "2026-05-10"); + writeFileSync(path, body); +} + +describe("interview kit smoke", () => { + it("start.sh refuses to proceed when the release is unsigned", () => { + const { dir, cleanup } = stageKit(); + try { + const result = spawnSync("bash", [join(dir, "start.sh")], { + env: { ...process.env, SKIP_RECORD: "1" }, + encoding: "utf8", + }); + expect(result.status).not.toBe(0); + expect(result.stderr).toContain("not signed"); + } finally { + cleanup(); + } + }); + + it("start.sh proceeds when the release is signed", () => { + const { dir, cleanup } = stageKit(); + try { + sign(dir); + const result = spawnSync("bash", [join(dir, "start.sh")], { + env: { ...process.env, SKIP_RECORD: "1" }, + encoding: "utf8", + }); + expect(result.status).toBe(0); + expect(result.stdout).toContain("Privacy gate passed"); + } finally { + cleanup(); + } + }); + + it("settings.json declares both UserPromptSubmit and PreToolUse hooks", () => { + const body = readFileSync( + join(KIT_DIR, ".claude", "settings.json"), + "utf8", + ); + const cfg = JSON.parse(body); + expect(cfg.hooks.UserPromptSubmit).toBeDefined(); + expect(cfg.hooks.PreToolUse).toBeDefined(); + const hookCmd = 
cfg.hooks.UserPromptSubmit[0].hooks[0].command; + expect(hookCmd).toContain("interview.log"); + }); + + it("end.sh refuses when start.sh has not been run", () => { + const { dir, cleanup } = stageKit(); + try { + sign(dir); + const result = spawnSync("bash", [join(dir, "end.sh")], { + env: { ...process.env, SKIP_RECORD: "1", SKIP_COMMIT: "1" }, + encoding: "utf8", + }); + expect(result.status).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("start.sh → end.sh round-trip works when release is signed (SKIP modes)", () => { + const { dir, cleanup } = stageKit(); + try { + sign(dir); + const start = spawnSync("bash", [join(dir, "start.sh")], { + env: { ...process.env, SKIP_RECORD: "1" }, + encoding: "utf8", + }); + expect(start.status).toBe(0); + const end = spawnSync("bash", [join(dir, "end.sh")], { + env: { ...process.env, SKIP_RECORD: "1", SKIP_COMMIT: "1" }, + encoding: "utf8", + }); + expect(end.status).toBe(0); + expect(end.stdout).toContain("artifacts ready"); + } finally { + cleanup(); + } + }); + + it("INTERVIEW_RULES.md mentions WSL setup for Windows candidates", () => { + const body = readFileSync( + join(KIT_DIR, "INTERVIEW_RULES.md"), + "utf8", + ); + expect(body).toMatch(/WSL/); + }); + + it("RUBRIC_OVERVIEW.md mentions all 9 dimensions", () => { + const body = readFileSync( + join(KIT_DIR, "RUBRIC_OVERVIEW.md"), + "utf8", + ); + for (const heading of [ + "Upfront design", + "Context engineering", + "Critical evaluation", + "Verification", + "Course correction", + "Risk awareness", + "Architectural quality", + "Test pass", + "Throughput", + ]) { + expect(body).toContain(heading); + } + }); + + it("PRIVACY_RELEASE.md includes the no-training clause, appeal mechanism, and REVIEW WITH LEGAL warning", () => { + const body = readFileSync( + join(KIT_DIR, "PRIVACY_RELEASE.md"), + "utf8", + ); + expect(body).toMatch(/REVIEW WITH LEGAL/); + expect(body).toMatch(/NO training use|not be used to train/i); + expect(body).toMatch(/appeal/i); + 
expect(body).toMatch(/30 days/); + }); +}); diff --git a/tests/unit/services/interview/kit/privacy-gate.spec.ts b/tests/unit/services/interview/kit/privacy-gate.spec.ts new file mode 100644 index 0000000..715c43a --- /dev/null +++ b/tests/unit/services/interview/kit/privacy-gate.spec.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from "bun:test"; +import { spawnSync } from "node:child_process"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join, resolve } from "node:path"; + +const GATE_SCRIPT = resolve( + import.meta.dir, + "../../../../../teamhero-interview-kit/lib/privacy-gate.sh", +); + +/** Runs GATE_SCRIPT under bash, passing `filePath` as its only argument when one is given, and returns the exit status (-1 when spawnSync reports none). */ function runGate(filePath: string | undefined): number { + const args = filePath === undefined ? [GATE_SCRIPT] : [GATE_SCRIPT, filePath]; + const result = spawnSync("bash", args, { encoding: "utf8" }); + return result.status ?? -1; +} + +/** Writes `content` to PRIVACY_RELEASE.md in a fresh temp directory; returns the file path and a cleanup callback that removes the directory. */ function tempFile(content: string): { path: string; cleanup: () => void } { + const dir = mkdtempSync(join(tmpdir(), "iv-priv-")); + const path = join(dir, "PRIVACY_RELEASE.md"); + writeFileSync(path, content); + return { path, cleanup: () => rmSync(dir, { recursive: true, force: true }) }; +} + +describe("privacy gate", () => { + it("returns 0 when the release is properly signed", () => { + const { path, cleanup } = tempFile( + `# Privacy Release\n\n## Signed\n\nJane Doe\n\n## Date\n\n2026-05-10\n`, + ); + try { + expect(runGate(path)).toBe(0); + } finally { + cleanup(); + } + }); + + it("returns non-zero when the file is missing entirely", () => { + const code = runGate("/tmp/this-path-definitely-does-not-exist-xyz"); + expect(code).not.toBe(0); + }); + + it("returns non-zero when the file is empty", () => { + const { path, cleanup } = tempFile(""); + try { + expect(runGate(path)).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("returns non-zero when sections contain only the placeholder text", () => { + const { path, cleanup } = tempFile( + `# Privacy Release\n\n## 
Signed\n\n(placeholder — candidate signs here)\n\n## Date\n\n(placeholder — candidate dates here)\n`, + ); + try { + expect(runGate(path)).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("returns non-zero when the file has signature but no date", () => { + const { path, cleanup } = tempFile( + `# Privacy Release\n\n## Signed\n\nJane Doe\n\n## Date\n\n\n`, + ); + try { + expect(runGate(path)).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("returns non-zero when the file has date but no signature", () => { + const { path, cleanup } = tempFile( + `# Privacy Release\n\n## Signed\n\n\n\n## Date\n\n2026-05-10\n`, + ); + try { + expect(runGate(path)).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("returns non-zero when no path is provided", () => { + expect(runGate(undefined)).not.toBe(0); + }); + + it("returns non-zero when sections are missing entirely", () => { + const { path, cleanup } = tempFile(`# Privacy Release\n\nSome other text.\n`); + try { + expect(runGate(path)).not.toBe(0); + } finally { + cleanup(); + } + }); + + it("accepts when sections contain whitespace around the real value", () => { + const { path, cleanup } = tempFile( + `# Privacy Release\n\n## Signed\n\n Jane Doe \n\n## Date\n\n 2026-05-10 \n`, + ); + try { + expect(runGate(path)).toBe(0); + } finally { + cleanup(); + } + }); +}); diff --git a/tests/unit/services/interview/review/ai-observer.spec.ts b/tests/unit/services/interview/review/ai-observer.spec.ts new file mode 100644 index 0000000..39e0790 --- /dev/null +++ b/tests/unit/services/interview/review/ai-observer.spec.ts @@ -0,0 +1,333 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + buildObserverPrompt, + humanOnlyObservations, + INTERVIEWER_BIAS_GUARD, + OBSERVATION_RESPONSE_SCHEMA, + OpenAIObserverClient, + rejectIfScored, +} from 
"../../../../../src/services/interview/review/ai-observer.js"; +import type { RoleConfig } from "../../../../../src/services/interview/bootstrap/role-config.js"; +import type { EvidenceEvent } from "../../../../../src/services/interview/review/types.js"; + +/** Builds a RoleConfig fixture; any field supplied in `overrides` replaces the corresponding default. */ function role(overrides: Partial<RoleConfig> = {}): RoleConfig { + return { + roleSlug: "senior-backend", + roleTitle: "Senior Backend Engineer", + stack: "TypeScript", + domain: "Payments", + featureDescription: "Add idempotency keys", + timeBoxMinutes: 90, + projectMode: "A", + analysisMode: "ai-assisted", + rubricMode: "default", + outputDir: "/tmp/out", + ...overrides, + }; +} + +const evt: EvidenceEvent = { + type: "prompt", + timestamp: "2026-05-10T10:00:00Z", + source: "interview.log", + text: "add an idempotency middleware", +}; + +describe("buildObserverPrompt", () => { + it("includes the interviewer-bias guard verbatim", () => { + const prompt = buildObserverPrompt({ config: role(), events: [evt] }); + expect(prompt.instructions).toContain(INTERVIEWER_BIAS_GUARD); + }); + + it("includes role metadata in the user input", () => { + const prompt = buildObserverPrompt({ config: role(), events: [evt] }); + expect(prompt.input).toContain("Senior Backend Engineer"); + expect(prompt.input).toContain("Payments"); + }); + + it("includes the rubric for the 5 observable dimensions only", () => { + const prompt = buildObserverPrompt({ config: role(), events: [evt] }); + const observable = [ + "upfront-design", + "context-engineering", + "critical-evaluation", + "course-correction", + "architectural-quality", + ]; + for (const id of observable) { + expect(prompt.instructions).toContain(id); + } + // Deterministic dims should NOT appear in the observer rubric block. + // (They may still appear elsewhere as context, but not in the rubric list.) 
+ expect(prompt.instructions).not.toContain("- verification ("); + expect(prompt.instructions).not.toContain("- test-pass ("); + }); + + it("includes the custom prompt when rubricMode is custom", () => { + const prompt = buildObserverPrompt({ + config: role({ rubricMode: "custom", customPrompt: "watch for X" }), + events: [evt], + }); + expect(prompt.instructions).toContain("watch for X"); + }); + + it("includes the JD content whenever jdPath is set (independent of rubric mode)", () => { + // Standalone JD: the observer now references the JD whenever + // it's been provided, regardless of whether the rubric is + // "default" or "custom". The old "default+jd" coupling forced + // the proctor to choose between custom rubric guidance and JD + // context — now they can combine both. + const dir = mkdtempSync(join(tmpdir(), "iv-jd-")); + try { + const path = join(dir, "jd.md"); + writeFileSync(path, "Looking for someone with payments domain depth."); + const prompt = buildObserverPrompt({ + config: role({ rubricMode: "default", jdPath: path }), + events: [evt], + }); + expect(prompt.instructions).toContain("payments domain depth"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("combines custom rubric guidance AND JD content when both are supplied", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-jd-combo-")); + try { + const path = join(dir, "jd.md"); + writeFileSync(path, "Senior engineer, FHIR/HL7 background expected."); + const prompt = buildObserverPrompt({ + config: role({ + rubricMode: "custom", + customPrompt: "watch for X", + jdPath: path, + }), + events: [evt], + }); + expect(prompt.instructions).toContain("watch for X"); + expect(prompt.instructions).toContain("FHIR/HL7"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("does NOT include the session recording URL in the prompt", () => { + const prompt = buildObserverPrompt({ + config: role(), + events: [evt], + sessionRecordingUrl: 
"https://zoom.us/rec/secret-123", + }); + expect(prompt.input).not.toContain("zoom.us"); + expect(prompt.input).not.toContain("secret-123"); + expect(prompt.instructions).not.toContain("zoom.us"); + }); + + it("includes interviewer notes when provided", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-notes-")); + try { + const path = join(dir, "notes.md"); + writeFileSync(path, "Candidate seemed prepared."); + const prompt = buildObserverPrompt({ + config: role(), + events: [evt], + interviewerNotesPath: path, + }); + expect(prompt.input).toContain("Candidate seemed prepared."); + // And the bias guard is still in instructions, NOT in input + expect(prompt.instructions).toContain(INTERVIEWER_BIAS_GUARD); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("OBSERVATION_RESPONSE_SCHEMA", () => { + it("declares strict json_schema with NO score/weighted_total/band fields anywhere", () => { + const s = JSON.stringify(OBSERVATION_RESPONSE_SCHEMA); + expect(s).not.toContain('"score"'); + expect(s).not.toContain('"weighted_total"'); + expect(s).not.toContain('"raw_total"'); + expect(s).not.toContain('"band"'); + expect(s).not.toContain('"signal_count"'); + // Critical: additionalProperties must be false at every object level + expect(OBSERVATION_RESPONSE_SCHEMA.additionalProperties).toBe(false); + }); +}); + +describe("rejectIfScored", () => { + it("throws when the response contains a 'score' field", () => { + expect(() => + rejectIfScored({ + observations: [{ dimension_id: "upfront-design", score: 0.6 }], + }), + ).toThrow(/score/); + }); + + it("throws when the response contains a 'weighted_total' field", () => { + expect(() => + rejectIfScored({ + observations: [], + weighted_total: 0.8, + }), + ).toThrow(/weighted_total/); + }); + + it("accepts a clean response", () => { + expect(() => + rejectIfScored({ + observations: [ + { + dimension_id: "upfront-design", + observation: "x", + reasoning: "y", + evidence_excerpts: [], + }, 
+ ], + }), + ).not.toThrow(); + }); +}); + +describe("summarizeEvents (indirect via buildObserverPrompt)", () => { + it("renders all event types — prompt, tool-use, command, commit, transcript", () => { + const events: EvidenceEvent[] = [ + { + type: "prompt", + timestamp: "2026-05-10T10:00:00Z", + source: "interview.log", + text: "design the API", + }, + { + type: "tool-use", + timestamp: "2026-05-10T10:00:30Z", + source: "interview.log", + tool: "Edit", + }, + { + type: "command", + timestamp: "2026-05-10T10:01:00Z", + source: "terminal.cast", + command: "bun test", + }, + { + type: "commit", + timestamp: "2026-05-10T10:02:00Z", + source: "git", + sha: "abc1234", + message: "initial", + insertions: 10, + deletions: 2, + }, + { + type: "transcript-line", + timestamp: "2026-05-10T10:03:00Z", + source: "transcript", + speaker: "Interviewer", + text: "How are you thinking about this?", + }, + ]; + const prompt = buildObserverPrompt({ config: role(), events }); + expect(prompt.input).toContain("PROMPT: design the API"); + expect(prompt.input).toContain("TOOL: Edit"); + expect(prompt.input).toContain("$ bun test"); + expect(prompt.input).toContain("COMMIT abc1234"); + expect(prompt.input).toContain("(transcript) Interviewer:"); + }); + + it("shows '(no events recorded)' when given an empty event list", () => { + const prompt = buildObserverPrompt({ config: role(), events: [] }); + expect(prompt.input).toContain("(no events recorded)"); + }); +}); + +describe("OpenAIObserverClient", () => { + it("calls the OpenAI Responses API, parses output_text, and rejects scored responses", async () => { + const fakeOpenAI = { + responses: { + create: async () => ({ + output_text: JSON.stringify({ + observations: [ + { + dimension_id: "upfront-design", + observation: "Candidate sketched the API first.", + reasoning: "First prompt described data flow.", + evidence_excerpts: [ + { + source: "interview.log", + content: "design the API", + }, + ], + }, + ], + }), + }), + }, + }; + const 
client = new OpenAIObserverClient( + fakeOpenAI as unknown as ConstructorParameters[0], + ); + const result = await client.observe({ + instructions: "test", + input: "test", + }); + expect(result.observations).toHaveLength(1); + expect(result.observations[0].dimension_id).toBe("upfront-design"); + }); + + it("throws when output_text is missing", async () => { + const fakeOpenAI = { + responses: { + create: async () => ({}), + }, + }; + const client = new OpenAIObserverClient( + fakeOpenAI as unknown as ConstructorParameters[0], + ); + await expect( + client.observe({ instructions: "test", input: "test" }), + ).rejects.toThrow(/no output_text/); + }); + + it("rejects scored responses returned by the API (defense-in-depth)", async () => { + const fakeOpenAI = { + responses: { + create: async () => ({ + output_text: JSON.stringify({ + observations: [ + { dimension_id: "upfront-design", score: 0.7 }, + ], + }), + }), + }, + }; + const client = new OpenAIObserverClient( + fakeOpenAI as unknown as ConstructorParameters[0], + ); + await expect( + client.observe({ instructions: "test", input: "test" }), + ).rejects.toThrow(/forbidden field 'score'/); + }); +}); + +describe("humanOnlyObservations", () => { + it("returns exactly the 5 observable-dimension placeholders", () => { + const obs = humanOnlyObservations(); + expect(obs).toHaveLength(5); + const ids = obs.map((o) => o.dimension_id).sort(); + expect(ids).toEqual( + [ + "architectural-quality", + "context-engineering", + "course-correction", + "critical-evaluation", + "upfront-design", + ].sort(), + ); + for (const o of obs) { + expect(o.observation).toContain("manager to write"); + } + }); +}); diff --git a/tests/unit/services/interview/review/asciinema-edge-cases.spec.ts b/tests/unit/services/interview/review/asciinema-edge-cases.spec.ts new file mode 100644 index 0000000..7ea4c50 --- /dev/null +++ b/tests/unit/services/interview/review/asciinema-edge-cases.spec.ts @@ -0,0 +1,288 @@ +import { describe, expect, it } 
from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { parseAsciinemaCast } from "../../../../../src/services/interview/review/collectors/asciinema.js"; + +function tmp(): string { + return mkdtempSync(join(tmpdir(), "iv-asciinema-edge-")); +} + +function makeHeader(extra: object = {}): string { + return JSON.stringify({ + version: 2, + width: 80, + height: 24, + timestamp: 1700000000, + ...extra, + }); +} + +function ev(delta: number, kind: "i" | "o", data: string): string { + return JSON.stringify([delta, kind, data]); +} + +describe("asciinema parser — control code handling", () => { + it("ignores control codes (arrow keys, ESC sequences) without crashing", () => { + const dir = tmp(); + try { + const path = join(dir, "ctrl.cast"); + // ESC [ A = up-arrow escape sequence (3 chars); should be skipped + const esc = "\x1b[A"; + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "l"), + ev(0.2, "i", esc), + ev(0.3, "i", "s"), + ev(0.4, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // Control sequences skipped; buffer only has 'l' and 's' + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toBe("ls"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips output ('o') events entirely — only input ('i') events build commands", () => { + const dir = tmp(); + try { + const path = join(dir, "output.cast"); + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "o", "prompt$ "), + ev(0.2, "i", "l"), + ev(0.3, "i", "s"), + ev(0.4, "o", "file1 file2\n"), + ev(0.5, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toBe("ls"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("handles Enter typed as '\\n' (LF) as well as '\\r' (CR)", () => { 
+ const dir = tmp(); + try { + const path = join(dir, "lf.cast"); + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "p"), + ev(0.2, "i", "w"), + ev(0.3, "i", "d"), + ev(0.4, "i", "\n"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toBe("pwd"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips Enter keystrokes on an empty buffer (no command emitted)", () => { + const dir = tmp(); + try { + const path = join(dir, "empty-enter.cast"); + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "\r"), // Enter on empty buffer + ev(0.2, "i", "\r"), // Another bare Enter + ev(0.3, "i", "l"), + ev(0.4, "i", "s"), + ev(0.5, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // Only the 'ls' command should be emitted; bare Enters are ignored + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toBe("ls"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("handles tab characters in command input", () => { + const dir = tmp(); + try { + const path = join(dir, "tab.cast"); + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "b"), + ev(0.2, "i", "u"), + ev(0.3, "i", "n"), + ev(0.4, "i", "\t"), // tab (e.g. 
shell autocomplete attempt) + ev(0.5, "i", "t"), + ev(0.6, "i", "e"), + ev(0.7, "i", "s"), + ev(0.8, "i", "t"), + ev(0.9, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // Tab is captured in the buffer, resulting in "bun\ttest" + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toContain("bun"); + expect(result.commands[0].command).toContain("test"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("asciinema parser — header and metadata", () => { + it("falls back to epoch 0 when header has no timestamp field", () => { + const dir = tmp(); + try { + const path = join(dir, "no-ts.cast"); + const header = JSON.stringify({ version: 2, width: 80, height: 24 }); + writeFileSync( + path, + [ + header, + ev(1.5, "i", "l"), + ev(2.0, "i", "s"), + ev(3.0, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // With baseEpoch=0, the timestamp is derived from delta alone + expect(result.header.version).toBe(2); + expect(result.commands).toHaveLength(1); + // The timestamp should be a valid ISO string even without a header epoch + expect(result.commands[0].timestamp).toMatch(/^\d{4}-\d{2}-\d{2}T/); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("preserves all raw events in the events array", () => { + const dir = tmp(); + try { + const path = join(dir, "events.cast"); + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "l"), + ev(0.2, "o", "total 0\n"), + ev(0.3, "i", "s"), + ev(0.4, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // 4 events: 1 output, 3 input + expect(result.events).toHaveLength(4); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("returns the header env field when present", () => { + const dir = tmp(); + try { + const path = join(dir, "env.cast"); + writeFileSync( + path, + [ + makeHeader({ env: { SHELL: "/bin/zsh", TERM: 
"xterm-256color" } }), + ev(0.1, "i", "l"), + ev(0.2, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + expect(result.header.env?.SHELL).toBe("/bin/zsh"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("tolerates blank lines between event entries", () => { + const dir = tmp(); + try { + const path = join(dir, "blanks.cast"); + writeFileSync( + path, + [ + makeHeader(), + "", + ev(0.1, "i", "l"), + "", + ev(0.2, "i", "s"), + ev(0.3, "i", "\r"), + "", + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + expect(result.commands).toHaveLength(1); + expect(result.commands[0].command).toBe("ls"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("asciinema parser — pauseSecondsBeforeEnter calculation", () => { + it("computes pause relative to the last typed key, not the first", () => { + const dir = tmp(); + try { + const path = join(dir, "pause.cast"); + // l at t=0.1, s at t=5.0, Enter at t=5.2 → pause = 5.2 - 5.0 = 0.2 + writeFileSync( + path, + [ + makeHeader(), + ev(0.1, "i", "l"), + ev(5.0, "i", "s"), + ev(5.2, "i", "\r"), + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + expect(result.commands[0].command).toBe("ls"); + expect(result.commands[0].pauseSecondsBeforeEnter).toBeCloseTo(0.2, 1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("clamps pauseSecondsBeforeEnter to 0 when Enter delta is before last key delta", () => { + const dir = tmp(); + try { + const path = join(dir, "no-neg-pause.cast"); + // Pathological cast: key at 0.1, Enter at 0.0 (would give negative pause) + writeFileSync( + path, + [ + makeHeader({ timestamp: 0 }), + // Key 'l' at 0.1, then Enter at an earlier delta — pause should clamp to 0 + ev(0.1, "i", "l"), + ev(0.0, "i", "\r"), // delta less than lastKeyDelta + ].join("\n"), + ); + const result = parseAsciinemaCast(path); + // Result may have 0 or 1 command depending on ordering; either way, no
negative pause + for (const cmd of result.commands) { + expect(cmd.pauseSecondsBeforeEnter).toBeGreaterThanOrEqual(0); + } + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); \ No newline at end of file diff --git a/tests/unit/services/interview/review/audit-writer-edge-cases.spec.ts b/tests/unit/services/interview/review/audit-writer-edge-cases.spec.ts new file mode 100644 index 0000000..47a0328 --- /dev/null +++ b/tests/unit/services/interview/review/audit-writer-edge-cases.spec.ts @@ -0,0 +1,346 @@ +import { describe, expect, it } from "bun:test"; +import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + type AuditFrontmatter, + renderAudit, + renderSummary, + validateSignOff, + writeAudit, +} from "../../../../../src/services/interview/review/audit-writer.js"; +import type { ReviewResult } from "../../../../../src/services/interview/review/types.js"; + +function frontmatter( + overrides: Partial = {}, +): AuditFrontmatter { + return { + tags: ["hiring", "candidate", "senior-backend"], + candidate: "Jane Doe", + role: "senior-backend", + date: "2026-05-10", + rubric_version: "1.0.0", + rubric_mode: "default", + signed_off: false, + ...overrides, + }; +} + +function emptyResult(): ReviewResult { + return { + rubric_version: "1.0.0", + candidate_id: "jane-doe-2026-05-10", + role_slug: "senior-backend", + observed_at: "2026-05-10T11:00:00Z", + observations: [], + measurements: [], + }; +} + +describe("YAML frontmatter — yamlScalar quoting for special characters", () => { + it("quotes a candidate name containing a colon", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ candidate: "Doe, Jane: III" }), + outputDir: "/tmp/x", + }); + // The colon in the name requires YAML quoting; unquoted "Jane: III" would be parsed as mapping + expect(body).toMatch(/candidate:\s*"Doe, Jane: III"/); + }); + + 
it("quotes a candidate name containing a hash (comment marker)", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ candidate: "Jane #Doe" }), + outputDir: "/tmp/x", + }); + expect(body).toMatch(/candidate:\s*"Jane #Doe"/); + }); + + it("quotes a candidate name with leading whitespace", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ candidate: " Jane Doe" }), + outputDir: "/tmp/x", + }); + expect(body).toMatch(/candidate:\s*" Jane Doe"/); + }); + + it("does NOT quote a plain candidate name with no special chars", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ candidate: "Jane Doe" }), + outputDir: "/tmp/x", + }); + // Plain name — no quoting needed + expect(body).toContain("candidate: Jane Doe"); + expect(body).not.toContain('candidate: "Jane Doe"'); + }); + + it("quotes a tag containing commas or brackets", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ tags: ["hiring", "role: senior-backend"] }), + outputDir: "/tmp/x", + }); + // The tag "role: senior-backend" contains a colon and must be quoted inside the array + expect(body).toMatch(/tags: \[.*"role: senior-backend".*\]/); + }); + + it("places the YAML block before the warning banner", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + const yamlEnd = body.indexOf("---\n", 4); // second --- + const bannerStart = body.indexOf("THIS AUDIT IS ADVISORY"); + expect(yamlEnd).toBeGreaterThan(0); + expect(bannerStart).toBeGreaterThan(yamlEnd); + }); + + it("includes recommendation in frontmatter only when signed_off is true", () => { + const signed = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter({ + signed_off: true, + recommendation: "Hire with notes", + }), + outputDir: "/tmp/x", + }); + const unsigned = renderSummary({ + result: emptyResult(), + 
frontmatter: frontmatter({ signed_off: false }), + outputDir: "/tmp/x", + }); + expect(signed).toContain("recommendation:"); + expect(unsigned).not.toContain("recommendation:"); + }); + + it("omits optional session fields when not provided", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).not.toContain("session_recording_url:"); + expect(body).not.toContain("session_platform:"); + expect(body).not.toContain("session_date:"); + }); +}); + +describe("renderSummary — evidence excerpt truncation at 200 chars", () => { + it("truncates evidence content to 200 chars in summary tier", () => { + const long = "x".repeat(300); + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "upfront-design", + observation: "The candidate planned carefully.", + reasoning: "Clear planning before coding.", + evidence_excerpts: [{ source: "interview.log", content: long }], + }, + ], + }; + const body = renderSummary({ + result, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + // Summary truncates at 200; the 201st+ characters should not appear + expect(body).not.toContain(long); + // But the first 200 chars should be present + expect(body).toContain("x".repeat(200)); + // Ellipsis appended after truncation + expect(body).toContain("…"); + }); +}); + +describe("renderAudit — evidence excerpt NOT truncated", () => { + it("does not truncate evidence content in the audit tier", () => { + const long = "x".repeat(400); + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "upfront-design", + observation: "Planned carefully.", + reasoning: "Detailed reasoning.", + evidence_excerpts: [{ source: "interview.log", content: long }], + }, + ], + }; + const body = renderAudit({ + result, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + // Full content preserved in audit tier + expect(body).toContain(long); + // No 
ellipsis for full content + expect(body).not.toContain(long.slice(0, 200) + "…"); + }); + + it("renders evidence with optional timestamp when provided", () => { + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "upfront-design", + observation: "Planned.", + reasoning: "Reasoning.", + evidence_excerpts: [ + { + timestamp: "2026-05-10T10:05:00Z", + source: "terminal.cast", + content: "git commit -m 'initial'", + }, + ], + }, + ], + }; + const body = renderAudit({ + result, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain("[2026-05-10T10:05:00Z]"); + expect(body).toContain("terminal.cast"); + }); +}); + +describe("renderSummary / renderAudit — dimensions with no evidence", () => { + it("shows a fallback message for dimensions with no observation or measurement", () => { + const body = renderSummary({ + result: emptyResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + // With empty obs/meas, all dimensions should show the fallback + expect(body).toContain("No evidence captured"); + }); +}); + +describe("renderSummary — caveats field", () => { + it("renders caveats when present on an observation", () => { + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "critical-evaluation", + observation: "Could not determine clearly.", + reasoning: "Limited evidence.", + evidence_excerpts: [], + caveats: "The terminal recording was missing for the first 20 minutes.", + }, + ], + }; + const body = renderSummary({ + result, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain("The terminal recording was missing"); + expect(body).toContain("Caveats:"); + }); + + it("does NOT render a Caveats line when the caveats field is absent", () => { + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "upfront-design", + observation: "Planned well.", + reasoning: "Reasoning.", + 
evidence_excerpts: [], + }, + ], + }; + const body = renderSummary({ + result, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).not.toContain("**Caveats:**"); + }); +}); + +describe("validateSignOff — boundary cases", () => { + it("accepts reasoning of exactly 20 characters", () => { + const r = validateSignOff({ + recommendation: "Hire", + reasoning: "x".repeat(20), + }); + expect(r.ok).toBe(true); + }); + + it("rejects reasoning of exactly 19 characters (one below minimum)", () => { + const r = validateSignOff({ + recommendation: "Hire", + reasoning: "x".repeat(19), + }); + expect(r.ok).toBe(false); + }); + + it("trims whitespace before checking reasoning length", () => { + // Spaces-only reasoning is effectively blank + const r = validateSignOff({ + recommendation: "Hire", + reasoning: " ".repeat(25), + }); + expect(r.ok).toBe(false); + }); + + it("returns multiple failures when both recommendation and reasoning are invalid", () => { + const r = validateSignOff({ recommendation: "Unsure", reasoning: "" }); + expect(r.ok).toBe(false); + expect(r.failures.length).toBeGreaterThanOrEqual(2); + }); +}); + +describe("writeAudit — output structure", () => { + it("audit.json round-trips the frontmatter and result faithfully", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-audit-edge-")); + try { + const fm = frontmatter({ + signed_off: true, + recommendation: "No hire", + session_platform: "zoom", + session_date: "2026-05-10", + }); + const result: ReviewResult = { + ...emptyResult(), + observations: [ + { + dimension_id: "upfront-design", + observation: "No planning observed.", + reasoning: "Jumped straight to prompting.", + evidence_excerpts: [], + }, + ], + }; + const outputs = writeAudit({ result, frontmatter: fm, outputDir: dir }); + const json = JSON.parse(readFileSync(outputs.auditJsonPath, "utf8")); + expect(json.frontmatter.recommendation).toBe("No hire"); + expect(json.frontmatter.session_platform).toBe("zoom"); + 
expect(json.result.observations[0].dimension_id).toBe("upfront-design"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("produces an evidence/ directory even when no evidence files are provided", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-audit-edge-")); + try { + const outputs = writeAudit({ + result: emptyResult(), + frontmatter: frontmatter(), + outputDir: dir, + }); + expect(existsSync(outputs.evidenceDir)).toBe(true); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/unit/services/interview/review/audit-writer.spec.ts b/tests/unit/services/interview/review/audit-writer.spec.ts new file mode 100644 index 0000000..d1ae8b9 --- /dev/null +++ b/tests/unit/services/interview/review/audit-writer.spec.ts @@ -0,0 +1,255 @@ +import { describe, expect, it } from "bun:test"; +import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { + type AuditFrontmatter, + renderAudit, + renderSummary, + validateSignOff, + writeAudit, +} from "../../../../../src/services/interview/review/audit-writer.js"; +import type { + ReviewResult, + Observation, +} from "../../../../../src/services/interview/review/types.js"; + +function frontmatter( + overrides: Partial = {}, +): AuditFrontmatter { + return { + tags: ["hiring", "candidate", "senior-backend"], + candidate: "Jane Doe", + role: "senior-backend", + date: "2026-05-10", + rubric_version: "1.0.0", + rubric_mode: "default", + signed_off: false, + ...overrides, + }; +} + +function sampleObservation(): Observation { + return { + dimension_id: "upfront-design", + observation: + "The candidate sketched the data model and aligned with the prompt before generating code.", + reasoning: + "At 10:02 they wrote a paragraph describing the API and only at 10:05 did they prompt the agent for code.", + evidence_excerpts: [ + { + timestamp: "2026-05-10T10:02:00Z", + 
source: "interview.log", + content: "Let me sketch the data model first…", + }, + ], + }; +} + +function sampleResult(): ReviewResult { + return { + rubric_version: "1.0.0", + candidate_id: "jane-doe-2026-05-10", + role_slug: "senior-backend", + observed_at: "2026-05-10T11:00:00Z", + observations: [sampleObservation()], + measurements: [ + { + dimension_id: "verification", + facts: [ + { label: "Total test runs", value: 8 }, + { label: "Test runs immediately after a prompt", value: 5 }, + ], + }, + { + dimension_id: "test-pass", + facts: [ + { label: "Passing tests", value: 12 }, + { label: "Failing tests", value: 0 }, + ], + }, + ], + }; +} + +describe("renderSummary", () => { + it("starts with the YAML frontmatter", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body.startsWith("---\n")).toBe(true); + expect(body).toContain("candidate: Jane Doe"); + expect(body).toContain("rubric_version: 1.0.0"); + }); + + it("renders the mandatory warning banner", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toMatch(/THIS AUDIT IS ADVISORY/); + expect(body).toMatch(/not a score/); + }); + + it("preserves the AI's reasoning chain (not just the observation)", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain("paragraph describing the API"); + }); + + it("renders measurements for deterministic dimensions", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain("Total test runs: 8"); + expect(body).toContain("Passing tests: 12"); + }); + + it("includes the sign-off section with categorical recommendation choices", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter(), + 
outputDir: "/tmp/x", + }); + expect(body).toContain("Sign-off (MANDATORY)"); + expect(body).toContain("Hire | Hire with notes | No hire"); + }); + + it("includes session_recording_url in frontmatter when provided", () => { + const body = renderSummary({ + result: sampleResult(), + frontmatter: frontmatter({ + session_recording_url: "https://zoom.us/rec/xyz", + session_platform: "zoom", + session_date: "2026-05-10", + }), + outputDir: "/tmp/x", + }); + // URL is quoted because it contains characters (`:`, `/`) that YAML + // parsers can mishandle in bare scalars. Quoting makes the audit.json + // round-trip safe even when the URL has colons, hashes, etc. + expect(body).toContain( + `session_recording_url: "https://zoom.us/rec/xyz"`, + ); + expect(body).toContain("session_platform: zoom"); + }); +}); + +describe("renderAudit", () => { + it("includes the warning banner just like summary.md", () => { + const body = renderAudit({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toMatch(/THIS AUDIT IS ADVISORY/); + }); + + it("preserves the reasoning chain (transparency across tiers)", () => { + const body = renderAudit({ + result: sampleResult(), + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain("paragraph describing the API"); + }); + + it("renders evidence excerpts without truncation", () => { + const long = "x".repeat(400); + const res: ReviewResult = { + ...sampleResult(), + observations: [ + { + ...sampleObservation(), + evidence_excerpts: [ + { source: "interview.log", content: long }, + ], + }, + ], + }; + const body = renderAudit({ + result: res, + frontmatter: frontmatter(), + outputDir: "/tmp/x", + }); + expect(body).toContain(long); + }); +}); + +describe("writeAudit", () => { + it("produces summary.md, audit.md, audit.json, and evidence/", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-audit-")); + try { + const outputs = writeAudit({ + result: sampleResult(), + 
frontmatter: frontmatter(), + outputDir: dir, + }); + expect(existsSync(outputs.summaryPath)).toBe(true); + expect(existsSync(outputs.auditPath)).toBe(true); + expect(existsSync(outputs.auditJsonPath)).toBe(true); + expect(existsSync(outputs.evidenceDir)).toBe(true); + const json = JSON.parse(readFileSync(outputs.auditJsonPath, "utf8")); + expect(json.frontmatter.candidate).toBe("Jane Doe"); + expect(json.result.observations).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("validateSignOff", () => { + it("requires a categorical recommendation", () => { + const r = validateSignOff({ + recommendation: "Maybe", + reasoning: "x".repeat(50), + }); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /recommendation/.test(f))).toBe(true); + }); + + it("rejects empty reasoning", () => { + const r = validateSignOff({ recommendation: "Hire", reasoning: "" }); + expect(r.ok).toBe(false); + expect(r.failures.some((f) => /reasoning/.test(f))).toBe(true); + }); + + it("rejects too-short reasoning", () => { + const r = validateSignOff({ recommendation: "Hire", reasoning: "yes." 
}); + expect(r.ok).toBe(false); + }); + + it("accepts Hire with substantive reasoning", () => { + const r = validateSignOff({ + recommendation: "Hire", + reasoning: + "They showed solid context engineering and clean architecture, with appropriate caution on the destructive operations.", + }); + expect(r.ok).toBe(true); + }); + + it("accepts Hire with notes and No hire as valid categorical choices", () => { + expect( + validateSignOff({ + recommendation: "Hire with notes", + reasoning: + "Strong overall, but I want a check-in on the verification habits in week 2.", + }).ok, + ).toBe(true); + expect( + validateSignOff({ + recommendation: "No hire", + reasoning: + "The architectural choices and lack of test discipline indicate a mismatch with the role's needs.", + }).ok, + ).toBe(true); + }); +}); diff --git a/tests/unit/services/interview/review/collectors.spec.ts b/tests/unit/services/interview/review/collectors.spec.ts new file mode 100644 index 0000000..ed87e34 --- /dev/null +++ b/tests/unit/services/interview/review/collectors.spec.ts @@ -0,0 +1,206 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { parseAsciinemaCast } from "../../../../../src/services/interview/review/collectors/asciinema.js"; +import { parseGitHistory } from "../../../../../src/services/interview/review/collectors/git-history.js"; +import { parseInterviewLog } from "../../../../../src/services/interview/review/collectors/jsonl-log.js"; +import { parseTranscript } from "../../../../../src/services/interview/review/collectors/transcript.js"; + +function tmp(): string { + return mkdtempSync(join(tmpdir(), "iv-coll-")); +} + +describe("asciinema parser", () => { + it("extracts the header and reconstructs commands from input events", () => { + const dir = tmp(); + try { + const path = join(dir, "terminal.cast"); + const header = JSON.stringify({ + version: 
2, + width: 80, + height: 24, + timestamp: 1700000000, + }); + const events = [ + [0.1, "i", "l"], + [0.2, "i", "s"], + [0.3, "i", "\r"], + [1.5, "i", "p"], + [1.6, "i", "w"], + [1.7, "i", "d"], + [3.0, "i", "\r"], + ] + .map((e) => JSON.stringify(e)) + .join("\n"); + writeFileSync(path, `${header}\n${events}\n`); + const result = parseAsciinemaCast(path); + expect(result.header.version).toBe(2); + expect(result.commands.length).toBe(2); + expect(result.commands[0].command).toBe("ls"); + expect(result.commands[1].command).toBe("pwd"); + // pwd took 3.0-1.7 = 1.3s pause before Enter + expect(result.commands[1].pauseSecondsBeforeEnter).toBeCloseTo(1.3, 1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("throws on empty cast files", () => { + const dir = tmp(); + try { + const path = join(dir, "empty.cast"); + writeFileSync(path, ""); + expect(() => parseAsciinemaCast(path)).toThrow(); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("handles backspace correctly", () => { + const dir = tmp(); + try { + const path = join(dir, "bs.cast"); + const header = JSON.stringify({ + version: 2, + width: 80, + height: 24, + timestamp: 1700000000, + }); + const events = [ + [0.1, "i", "l"], + [0.2, "i", "x"], + [0.3, "i", ""], + [0.4, "i", "s"], + [0.5, "i", "\r"], + ] + .map((e) => JSON.stringify(e)) + .join("\n"); + writeFileSync(path, `${header}\n${events}\n`); + const result = parseAsciinemaCast(path); + expect(result.commands[0].command).toBe("ls"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("interview.log parser", () => { + it("separates user-prompt-submit and pre-tool-use events", () => { + const dir = tmp(); + try { + const path = join(dir, "interview.log"); + const lines = [ + JSON.stringify({ + event: "user-prompt-submit", + timestamp: "2026-05-10T10:00:00Z", + prompt: "add a test", + }), + JSON.stringify({ + event: "pre-tool-use", + timestamp: 
"2026-05-10T10:00:05Z", + tool_name: "Bash", + tool_input: { command: "bun test" }, + }), + JSON.stringify({ + event: "user-prompt-submit", + timestamp: "2026-05-10T10:01:00Z", + prompt: "now fix it", + }), + ].join("\n"); + writeFileSync(path, `${lines}\n`); + const result = parseInterviewLog(path); + expect(result.prompts).toHaveLength(2); + expect(result.toolUses).toHaveLength(1); + expect(result.toolUses[0].tool).toBe("Bash"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("silently skips malformed lines", () => { + const dir = tmp(); + try { + const path = join(dir, "interview.log"); + writeFileSync( + path, + `not-json\n${JSON.stringify({ event: "user-prompt-submit", timestamp: "2026-05-10T10:00:00Z", prompt: "hi" })}\n{broken\n`, + ); + const result = parseInterviewLog(path); + expect(result.prompts).toHaveLength(1); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("transcript parser", () => { + it("parses [HH:MM:SS] Speaker: text format", () => { + const dir = tmp(); + try { + const path = join(dir, "t.txt"); + writeFileSync( + path, + `[00:01:23] Alice: I'll start with the data model.\n[00:02:01] Bob: That's the right call.\n`, + ); + const result = parseTranscript(path, { + sessionStartIso: "2026-05-10T10:00:00.000Z", + }); + expect(result).toHaveLength(2); + expect(result[0].speaker).toBe("Alice"); + expect(result[0].text).toContain("data model"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("parses bare Speaker: text without timestamps", () => { + const dir = tmp(); + try { + const path = join(dir, "t.txt"); + writeFileSync(path, `Alice: hello\nBob: hi\n`); + const result = parseTranscript(path); + expect(result).toHaveLength(2); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + it("skips VTT timing lines and comment lines", () => { + const dir = tmp(); + try { + const path = join(dir, "t.vtt"); + writeFileSync( + path, 
+ `# header\n00:00:00.000 --> 00:00:05.000\nAlice: hi\n`, + ); + const result = parseTranscript(path); + expect(result).toHaveLength(1); + expect(result[0].speaker).toBe("Alice"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("git history parser", () => { + it("parses output from a stub git runner", () => { + const stub = (args: string[]) => { + expect(args[0]).toBe("log"); + return `abc123${"a".repeat(34)}\t2026-05-10T10:00:00Z\tFirst commit\n5\t3\tsrc/a.ts\n2\t1\tsrc/b.ts\nfff999${"f".repeat(34)}\t2026-05-10T10:30:00Z\tSecond commit\n10\t0\tsrc/c.ts\n`; + }; + const result = parseGitHistory("/tmp/repo", stub); + expect(result).toHaveLength(2); + expect(result[0].message).toBe("First commit"); + expect(result[0].insertions).toBe(7); + expect(result[0].deletions).toBe(4); + expect(result[1].insertions).toBe(10); + }); + + it("returns an empty list when git fails", () => { + const stub = () => { + throw new Error("not a repo"); + }; + expect(parseGitHistory("/tmp/missing", stub)).toEqual([]); + }); +}); diff --git a/tests/unit/services/interview/review/extractors.spec.ts b/tests/unit/services/interview/review/extractors.spec.ts new file mode 100644 index 0000000..8c69223 --- /dev/null +++ b/tests/unit/services/interview/review/extractors.spec.ts @@ -0,0 +1,155 @@ +import { describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { extractRiskAwareness } from "../../../../../src/services/interview/review/extractors/risk-awareness.js"; +import { extractTestPass } from "../../../../../src/services/interview/review/extractors/test-pass.js"; +import { extractThroughput } from "../../../../../src/services/interview/review/extractors/throughput.js"; +import { extractVerification } from "../../../../../src/services/interview/review/extractors/verification.js"; +import type { EvidenceEvent } from 
"../../../../../src/services/interview/review/types.js"; + +const prompt = (ts: string, text: string): EvidenceEvent => ({ + type: "prompt", + timestamp: ts, + source: "interview.log", + text, +}); + +const cmd = ( + ts: string, + command: string, + pause?: number, +): EvidenceEvent => ({ + type: "command", + timestamp: ts, + source: "terminal.cast", + command, + pauseSecondsBeforeEnter: pause, +}); + +const commit = (ts: string, sha: string, msg: string): EvidenceEvent => ({ + type: "commit", + timestamp: ts, + source: "git", + sha, + message: msg, + insertions: 5, + deletions: 1, +}); + +describe("verification extractor", () => { + it("counts test runs and typechecks across runners", () => { + const events: EvidenceEvent[] = [ + cmd("2026-05-10T10:00:00Z", "bun test"), + cmd("2026-05-10T10:01:00Z", "go test ./..."), + cmd("2026-05-10T10:02:00Z", "tsc --noEmit"), + cmd("2026-05-10T10:03:00Z", "git diff"), + cmd("2026-05-10T10:04:00Z", "ls"), + ]; + const m = extractVerification(events); + expect(m.dimension_id).toBe("verification"); + const find = (label: RegExp) => m.facts.find((f) => label.test(f.label)); + expect(find(/Total test runs/)?.value).toBe(2); + expect(find(/typecheck/)?.value).toBe(1); + expect(find(/Diff\/grep/)?.value).toBe(1); + }); + + it("tracks test-runs-after-prompt interleaving", () => { + const events: EvidenceEvent[] = [ + prompt("2026-05-10T10:00:00Z", "add a test"), + cmd("2026-05-10T10:00:30Z", "bun test"), + prompt("2026-05-10T10:01:00Z", "now fix it"), + cmd("2026-05-10T10:01:30Z", "bun test"), + ]; + const m = extractVerification(events); + const interleaved = m.facts.find((f) => /after a prompt/.test(f.label)); + expect(interleaved?.value).toBe(2); + }); +}); + +describe("risk-awareness extractor", () => { + it("reports zero detections on a clean session", () => { + const m = extractRiskAwareness([ + cmd("2026-05-10T10:00:00Z", "ls"), + cmd("2026-05-10T10:01:00Z", "bun test"), + ]); + expect(m.facts[0].label).toMatch(/Destructive 
commands detected/); + expect(m.facts[0].value).toBe(0); + }); + + it("detects rm -rf and reports the pause time", () => { + const m = extractRiskAwareness([ + cmd("2026-05-10T10:00:00Z", "rm -rf ./build", 3.2), + ]); + expect(m.facts).toHaveLength(1); + expect(m.facts[0].label).toBe("rm -rf"); + expect(m.facts[0].context).toMatch(/3\.20s/); + }); + + it("detects force pushes and resets", () => { + const m = extractRiskAwareness([ + cmd("2026-05-10T10:00:00Z", "git push origin main --force"), + cmd("2026-05-10T10:01:00Z", "git reset --hard HEAD~3"), + ]); + expect(m.facts).toHaveLength(2); + }); +}); + +describe("test-pass extractor", () => { + it("reports passing/failing counts from the injected runner", () => { + const m = extractTestPass("/tmp/fake", () => ({ + passed: 12, + failed: 3, + output: "...", + })); + expect(m.dimension_id).toBe("test-pass"); + expect(m.facts.find((f) => f.label === "Passing tests")?.value).toBe(12); + expect(m.facts.find((f) => f.label === "Failing tests")?.value).toBe(3); + expect(m.facts.find((f) => f.label === "Pass rate")?.value).toBe("12/15"); + }); + + it("reports n/a when no tests ran", () => { + const m = extractTestPass("/tmp/fake", () => ({ + passed: 0, + failed: 0, + output: "", + })); + expect(m.facts.find((f) => f.label === "Pass rate")?.value).toBe("n/a"); + }); + + it("falls through to the default real runner on a directory without go.mod or package.json", () => { + const dir = mkdtempSync(join(tmpdir(), "iv-tp-")); + try { + const m = extractTestPass(dir); + expect(m.dimension_id).toBe("test-pass"); + expect(m.facts.find((f) => f.label === "Pass rate")?.value).toBe("n/a"); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); +}); + +describe("throughput extractor", () => { + it("reports elapsed, commit count, and time-to-first-test", () => { + const events: EvidenceEvent[] = [ + cmd("2026-05-10T10:00:00Z", "ls"), + cmd("2026-05-10T10:05:00Z", "bun test"), + commit("2026-05-10T10:10:00Z", "abc", 
"first commit"), + commit("2026-05-10T10:20:00Z", "def", "second commit"), + ]; + const m = extractThroughput(events); + const elapsed = m.facts.find((f) => f.label === "Elapsed"); + expect(elapsed?.value).toBe("20m00s"); + const commits = m.facts.find((f) => f.label === "Total commits"); + expect(commits?.value).toBe(2); + const ttft = m.facts.find((f) => f.label === "Time to first test run"); + expect(ttft?.value).toBe("5m00s"); + }); + + it("returns 'unknown' boundaries when given empty events", () => { + const m = extractThroughput([]); + expect(m.facts.find((f) => f.label === "Session start")?.value).toBe( + "unknown", + ); + }); +}); diff --git a/tests/unit/services/interview/review/review-orchestrator.spec.ts b/tests/unit/services/interview/review/review-orchestrator.spec.ts new file mode 100644 index 0000000..69e2621 --- /dev/null +++ b/tests/unit/services/interview/review/review-orchestrator.spec.ts @@ -0,0 +1,211 @@ +import { describe, expect, it } from "bun:test"; +import { + existsSync, + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { reviewCandidate } from "../../../../../src/services/interview/review/review-orchestrator.js"; +import { writeRoleConfig } from "../../../../../src/services/interview/bootstrap/role-config.js"; + +function stageRepo(opts: { analysisMode?: "ai-assisted" | "human-only" } = {}) { + const dir = mkdtempSync(join(tmpdir(), "iv-review-")); + writeRoleConfig(dir, { + roleSlug: "senior-backend", + roleTitle: "Senior Backend Engineer", + stack: "TypeScript", + domain: "Payments", + featureDescription: "Add idempotency", + timeBoxMinutes: 90, + projectMode: "A", + analysisMode: opts.analysisMode ?? 
"ai-assisted", + rubricMode: "default", + outputDir: dir, + }); + writeFileSync( + join(dir, "interview.log"), + `${JSON.stringify({ + event: "user-prompt-submit", + timestamp: "2026-05-10T10:00:00Z", + prompt: "let me sketch the data model", + })}\n${JSON.stringify({ + event: "pre-tool-use", + timestamp: "2026-05-10T10:00:30Z", + tool_name: "Bash", + tool_input: { command: "bun test" }, + })}\n`, + ); + const castHeader = JSON.stringify({ + version: 2, + width: 80, + height: 24, + timestamp: 1747876800, // 2026-05-22T00:00:00Z-ish + }); + const castEvents = [ + [0.1, "i", "b"], + [0.2, "i", "u"], + [0.3, "i", "n"], + [0.4, "i", " "], + [0.5, "i", "t"], + [0.6, "i", "e"], + [0.7, "i", "s"], + [0.8, "i", "t"], + [1.0, "i", "\r"], + ] + .map((e) => JSON.stringify(e)) + .join("\n"); + writeFileSync(join(dir, "terminal.cast"), `${castHeader}\n${castEvents}\n`); + writeFileSync( + join(dir, "PRIVACY_RELEASE.md"), + "# Privacy Release\n## Signed\nJane Doe\n## Date\n2026-05-10\n", + ); + mkdirSync(join(dir, "src")); + writeFileSync(join(dir, "package.json"), '{"name":"x","scripts":{"test":"echo nothing"}}'); + return dir; +} + +const stubObserver = { + async observe() { + return { + observations: [ + { + dimension_id: "upfront-design" as const, + observation: "Sketched the data model before code.", + reasoning: "Prompt timing shows design-first.", + evidence_excerpts: [ + { + source: "interview.log" as const, + content: "let me sketch the data model", + }, + ], + }, + ], + }; + }, +}; + +describe("reviewCandidate orchestrator", () => { + it("produces summary.md, audit.md, audit.json, and evidence/ for a complete repo", async () => { + const repo = stageRepo(); + const out = mkdtempSync(join(tmpdir(), "iv-out-")); + try { + const outcome = await reviewCandidate( + { + repoUrl: "stub", + candidateName: "Jane Doe", + localRepoPath: repo, + outputDir: out, + }, + { + observer: stubObserver, + testRunner: () => ({ passed: 1, failed: 0, output: "" }), + }, + ); + 
expect(outcome.ok).toBe(true); + expect(outcome.outputs).toBeDefined(); + if (!outcome.outputs) throw new Error("no outputs"); + expect(existsSync(outcome.outputs.summaryPath)).toBe(true); + expect(existsSync(outcome.outputs.auditPath)).toBe(true); + expect(existsSync(outcome.outputs.auditJsonPath)).toBe(true); + // Privacy release is copied to evidence/ + expect(existsSync(join(outcome.outputs.evidenceDir, "PRIVACY_RELEASE.md"))).toBe(true); + } finally { + rmSync(repo, { recursive: true, force: true }); + rmSync(out, { recursive: true, force: true }); + } + }); + + it("fails clearly when the candidate repo has no role-config.json", async () => { + const repo = mkdtempSync(join(tmpdir(), "iv-bad-")); + const out = mkdtempSync(join(tmpdir(), "iv-out-")); + try { + const outcome = await reviewCandidate( + { + repoUrl: "stub", + candidateName: "Jane", + localRepoPath: repo, + outputDir: out, + }, + { observer: stubObserver }, + ); + expect(outcome.ok).toBe(false); + expect(outcome.failures.join(" ")).toMatch(/role-config\.json/); + } finally { + rmSync(repo, { recursive: true, force: true }); + rmSync(out, { recursive: true, force: true }); + } + }); + + it("uses human-only blank templates when role config requests human-only mode", async () => { + const repo = stageRepo({ analysisMode: "human-only" }); + const out = mkdtempSync(join(tmpdir(), "iv-out-")); + // Observer should NOT be called. + let observerCalls = 0; + const observer = { + async observe() { + observerCalls += 1; + return { observations: [] }; + }, + }; + try { + const outcome = await reviewCandidate( + { + repoUrl: "stub", + candidateName: "Jane", + localRepoPath: repo, + outputDir: out, + }, + { + observer, + testRunner: () => ({ passed: 0, failed: 0, output: "" }), + }, + ); + expect(outcome.ok).toBe(true); + expect(observerCalls).toBe(0); + const summary = readFileSync(outcome.outputs?.summaryPath ?? 
"", "utf8"); + expect(summary).toContain("manager to write"); + } finally { + rmSync(repo, { recursive: true, force: true }); + rmSync(out, { recursive: true, force: true }); + } + }); + + it("session_recording_url ends up in frontmatter but not in the audit prose", async () => { + const repo = stageRepo(); + const out = mkdtempSync(join(tmpdir(), "iv-out-")); + try { + const outcome = await reviewCandidate( + { + repoUrl: "stub", + candidateName: "Jane", + localRepoPath: repo, + outputDir: out, + sessionRecordingUrl: "https://zoom.us/rec/secret-xyz", + sessionPlatform: "zoom", + sessionDate: "2026-05-10", + }, + { + observer: stubObserver, + testRunner: () => ({ passed: 1, failed: 0, output: "" }), + }, + ); + expect(outcome.ok).toBe(true); + const summary = readFileSync(outcome.outputs?.summaryPath ?? "", "utf8"); + expect(summary).toContain( + `session_recording_url: "https://zoom.us/rec/secret-xyz"`, + ); + // Below the frontmatter, the URL must not appear (verifies it didn't leak + // into the LLM observer's narrative prose). 
+ const belowFrontmatter = summary.split(/^---$/m).slice(2).join(""); + expect(belowFrontmatter).not.toContain("zoom.us"); + expect(belowFrontmatter).not.toContain("secret-xyz"); + } finally { + rmSync(repo, { recursive: true, force: true }); + rmSync(out, { recursive: true, force: true }); + } + }); +}); diff --git a/tests/unit/services/interview/shared/events.spec.ts b/tests/unit/services/interview/shared/events.spec.ts new file mode 100644 index 0000000..8832ba8 --- /dev/null +++ b/tests/unit/services/interview/shared/events.spec.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from "bun:test"; +import { + type InterviewEvent, + parseInterviewEvent, + serializeInterviewEvent, +} from "../../../../../src/services/interview/shared/events.js"; + +describe("interview events protocol", () => { + it("round-trips a progress event through serialize/parse", () => { + const event: InterviewEvent = { + type: "progress", + step: "collect-evidence", + status: "start", + }; + const line = serializeInterviewEvent(event); + expect(parseInterviewEvent(line)).toEqual(event); + }); + + it("serializes one event per line (no embedded newlines)", () => { + const line = serializeInterviewEvent({ + type: "progress", + step: "x", + status: "done", + message: "ok", + }); + expect(line.endsWith("\n")).toBe(true); + expect(line.slice(0, -1).includes("\n")).toBe(false); + }); + + it("parseInterviewEvent returns null on invalid JSON", () => { + expect(parseInterviewEvent("not-json")).toBeNull(); + }); + + it("parseInterviewEvent returns null on JSON missing a known event type", () => { + expect(parseInterviewEvent(JSON.stringify({ foo: "bar" }))).toBeNull(); + }); + + it("parseInterviewEvent returns null on unknown event type", () => { + expect( + parseInterviewEvent(JSON.stringify({ type: "no-such-event" })), + ).toBeNull(); + }); +}); diff --git a/tests/unit/services/interview/shared/rubric.spec.ts b/tests/unit/services/interview/shared/rubric.spec.ts new file mode 100644 index 
0000000..67e0db2 --- /dev/null +++ b/tests/unit/services/interview/shared/rubric.spec.ts @@ -0,0 +1,107 @@ +import { describe, expect, it } from "bun:test"; +import { + getDimension, + getDimensions, + getEvidenceMode, + getRubricVersion, + RUBRIC_VERSION, +} from "../../../../../src/services/interview/shared/rubric.js"; + +describe("interview rubric", () => { + it("exports a non-empty RUBRIC_VERSION string", () => { + expect(typeof RUBRIC_VERSION).toBe("string"); + expect(RUBRIC_VERSION.length).toBeGreaterThan(0); + }); + + it("getRubricVersion() returns the same value as RUBRIC_VERSION", () => { + expect(getRubricVersion()).toBe(RUBRIC_VERSION); + }); + + it("getDimensions() returns exactly 9 dimensions", () => { + expect(getDimensions()).toHaveLength(9); + }); + + it("getDimension(id) returns the dimension for each known id", () => { + const knownIds = [ + "upfront-design", + "context-engineering", + "critical-evaluation", + "verification", + "course-correction", + "risk-awareness", + "architectural-quality", + "test-pass", + "throughput", + ] as const; + for (const id of knownIds) { + const dim = getDimension(id); + expect(dim).toBeDefined(); + expect(dim?.id).toBe(id); + } + }); + + it("getEvidenceMode classifies the 4 deterministic dimensions", () => { + const deterministicIds = [ + "verification", + "risk-awareness", + "test-pass", + "throughput", + ] as const; + for (const id of deterministicIds) { + expect(getEvidenceMode(id)).toBe("deterministic"); + } + }); + + it("getEvidenceMode classifies the 2 hybrid dimensions", () => { + const hybridIds = ["context-engineering", "course-correction"] as const; + for (const id of hybridIds) { + expect(getEvidenceMode(id)).toBe("hybrid"); + } + }); + + it("getEvidenceMode classifies the 3 llm-judge dimensions", () => { + const llmJudgeIds = [ + "upfront-design", + "critical-evaluation", + "architectural-quality", + ] as const; + for (const id of llmJudgeIds) { + expect(getEvidenceMode(id)).toBe("llm-judge"); + } + }); 
+ + it("every dimension has all required fields populated", () => { + for (const dim of getDimensions()) { + expect(typeof dim.id).toBe("string"); + expect(dim.id.length).toBeGreaterThan(0); + expect(typeof dim.title).toBe("string"); + expect(dim.title.length).toBeGreaterThan(0); + expect(typeof dim.description).toBe("string"); + expect(dim.description.length).toBeGreaterThan(0); + expect(["deterministic", "hybrid", "llm-judge"]).toContain( + dim.evidenceMode, + ); + expect(["process", "outcome"]).toContain(dim.group); + expect(Array.isArray(dim.maturityLineage)).toBe(true); + expect(dim.maturityLineage.length).toBeGreaterThan(0); + } + }); + + it("dimensions are grouped as 6 process and 3 outcome", () => { + const dims = getDimensions(); + const process = dims.filter((d) => d.group === "process"); + const outcome = dims.filter((d) => d.group === "outcome"); + expect(process).toHaveLength(6); + expect(outcome).toHaveLength(3); + // Outcome dims must be the three "what they produced" dims + expect(outcome.map((d) => d.id).sort()).toEqual( + ["architectural-quality", "test-pass", "throughput"].sort(), + ); + }); + + it("getDimension(unknownId) returns undefined", () => { + // Cast through unknown — the call site is allowed to pass arbitrary strings + // (e.g. user input, deserialized JSON) and must safely return undefined. 
+ expect(getDimension("not-a-real-dimension" as unknown as never)).toBeUndefined(); + }); +}); diff --git a/tui/go.mod b/tui/go.mod index d6c32b9..73828b1 100644 --- a/tui/go.mod +++ b/tui/go.mod @@ -8,7 +8,9 @@ require ( github.com/charmbracelet/glamour v1.0.0 github.com/charmbracelet/huh v1.0.0 github.com/charmbracelet/lipgloss v1.1.1-0.20250404203927-76690c660834 + github.com/charmbracelet/x/exp/teatest v0.0.0-20260511003329-c066bcf2349a github.com/charmbracelet/x/term v0.2.2 + github.com/muesli/reflow v0.3.0 ) require ( @@ -23,7 +25,7 @@ require ( github.com/charmbracelet/x/ansi v0.11.6 // indirect github.com/charmbracelet/x/cellbuf v0.0.15 // indirect github.com/charmbracelet/x/conpty v0.2.0 // indirect - github.com/charmbracelet/x/exp/golden v0.0.0-20260330094520-2dce04b6f8a4 // indirect + github.com/charmbracelet/x/exp/golden v0.0.0-20260511003329-c066bcf2349a // indirect github.com/charmbracelet/x/exp/slice v0.0.0-20260330094520-2dce04b6f8a4 // indirect github.com/charmbracelet/x/exp/strings v0.1.0 // indirect github.com/charmbracelet/x/xpty v0.1.3 // indirect @@ -41,7 +43,6 @@ require ( github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/cancelreader v0.2.2 // indirect - github.com/muesli/reflow v0.3.0 // indirect github.com/muesli/termenv v0.16.0 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect diff --git a/tui/go.sum b/tui/go.sum index 66e6cf0..049d2cf 100644 --- a/tui/go.sum +++ b/tui/go.sum @@ -36,12 +36,14 @@ github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMx github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q= github.com/charmbracelet/x/conpty v0.2.0 h1:eKtA2hm34qNfgJCDp/M6Dc0gLy7e07YEK4qAdNGOvVY= github.com/charmbracelet/x/conpty v0.2.0/go.mod h1:fexgUnVrZgw8scD49f6VSi0Ggj9GWYIrpedRthAwW/8= 
-github.com/charmbracelet/x/exp/golden v0.0.0-20260330094520-2dce04b6f8a4 h1:KUnGB9CyCFWFXb3DysCYnQknFYa+AzdrR37VehWYI8U= -github.com/charmbracelet/x/exp/golden v0.0.0-20260330094520-2dce04b6f8a4/go.mod h1:6fMpcW6iwN/kX+xJ52eqVWsDiBTe0UJD24JLoHFe+P0= +github.com/charmbracelet/x/exp/golden v0.0.0-20260511003329-c066bcf2349a h1:L06IdBUJRwDjS2ja7e8HFCnAyH4OktJHRyQiZ9FBRYg= +github.com/charmbracelet/x/exp/golden v0.0.0-20260511003329-c066bcf2349a/go.mod h1:6fMpcW6iwN/kX+xJ52eqVWsDiBTe0UJD24JLoHFe+P0= github.com/charmbracelet/x/exp/slice v0.0.0-20260330094520-2dce04b6f8a4 h1:VSd4zShIAf/4FgEDFJpapEcAPrc7h3dyyN7V9JlJpQw= github.com/charmbracelet/x/exp/slice v0.0.0-20260330094520-2dce04b6f8a4/go.mod h1:vqEfX6xzqW1pKKZUUiFOKg0OQ7bCh54Q2vR/tserrRA= github.com/charmbracelet/x/exp/strings v0.1.0 h1:i69S2XI7uG1u4NLGeJPSYU++Nmjvpo9nwd6aoEm7gkA= github.com/charmbracelet/x/exp/strings v0.1.0/go.mod h1:/ehtMPNh9K4odGFkqYJKpIYyePhdp1hLBRvyY4bWkH8= +github.com/charmbracelet/x/exp/teatest v0.0.0-20260511003329-c066bcf2349a h1:pHBGVvQvgh8uQE63XQ+fDYwiVg447N5qMA+wHGUeMGk= +github.com/charmbracelet/x/exp/teatest v0.0.0-20260511003329-c066bcf2349a/go.mod h1:aPVjFrBwbJgj5Qz1F0IXsnbcOVJcMKgu1ySUfTAxh7k= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= github.com/charmbracelet/x/termios v0.1.1 h1:o3Q2bT8eqzGnGPOYheoYS8eEleT5ZVNYNy8JawjaNZY= diff --git a/tui/interview.go b/tui/interview.go new file mode 100644 index 0000000..e9b246d --- /dev/null +++ b/tui/interview.go @@ -0,0 +1,118 @@ +package main + +import ( + "fmt" + "io" + + "github.com/charmbracelet/huh" + "github.com/charmbracelet/lipgloss" +) + +func printInterviewUsage(out io.Writer) { + fmt.Fprint(out, `Usage: teamhero interview [flags] + +Review candidate AI-collaboration interviews. 
+ +Verbs: + bootstrap Configure a role and generate the candidate coding project + review Review a single candidate's interview artifacts + cohort Review the cohort across all candidates for a role + +Run 'teamhero interview --help' for verb-specific help. +`) +} + +// interviewVerbOptions returns the picker choices. Each Value must be a +// non-empty string distinct from the zero value of `string` — otherwise huh +// treats Cancel (whose value used to be "") as the bound `verb`'s current +// value, places the cursor on Cancel (the LAST row), and the viewport +// scrolls to keep that cursor visible, clipping every option above it. The +// user only sees "> Cancel" on first paint until they press the up arrow. +// Regression test: TestInterviewVerbOptions_NoValueMatchesZeroDefault. +func interviewVerbOptions() []huh.Option[string] { + return []huh.Option[string]{ + huh.NewOption("Bootstrap — generate a candidate coding project", "bootstrap"), + huh.NewOption("Review — review a single candidate's interview", "review"), + huh.NewOption("Cohort — review all candidates for a role", "cohort"), + huh.NewOption("Cancel", "cancel"), + } +} + +// interviewVerbPicker returns the verb the user chose ("bootstrap" / "review" +// / "cohort"), "" if they cancelled, or an error. Tests override this so the +// dispatcher logic can be exercised without a TTY. +// +// The picker is wrapped with the shared shell-header + hints-footer so the +// `teamhero interview` no-args screen lands inside the same contextual +// frame as every other top-level command. Without the frame the user +// drops out of the app's visual layout and the picker looks like an +// unrelated tool. +var interviewVerbPicker = func() (string, error) { + w := termWidth() + fmt.Println(renderShellHeader(w)) + fmt.Println() + + var verb string + form := huh.NewForm( + huh.NewGroup( + huh.NewSelect[string](). + Title("teamhero interview"). + Description("What would you like to do?"). 
+ Options(interviewVerbOptions()...). + Value(&verb), + ), + ).WithTheme(huh.ThemeCharm()).WithWidth(w * 3 / 5) + if err := form.Run(); err != nil { + if err == huh.ErrUserAborted { + return "", nil + } + return "", err + } + + hintStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + fmt.Println() + fmt.Println(hintStyle.Render("enter continue • ctrl+c quit")) + + // Map the "cancel" sentinel back to the caller's "" no-op convention so + // the dispatcher's existing `if verb == ""` check covers both abort + // (ctrl-c) and explicit Cancel selection. + if verb == "cancel" { + return "", nil + } + return verb, nil +} + +func runInterview(args []string, stdout, stderr io.Writer) int { + if len(args) == 0 { + // Non-TTY callers (CI, piped stdin, `go test`) cannot drive the picker; + // keep the legacy usage-and-exit-1 behavior so scripts stay deterministic. + if !isStdinTTY() { + printInterviewUsage(stderr) + return 1 + } + verb, err := interviewVerbPicker() + if err != nil { + fmt.Fprintf(stderr, "interview menu failed: %v\n", err) + return 1 + } + if verb == "" { + return 0 + } + args = []string{verb} + } + verb := args[0] + rest := args[1:] + switch verb { + case "bootstrap": + launcher := newHuhBootstrapWizardLauncher(BootstrapWizardDefaults{}) + return runInterviewBootstrapWithWizard(rest, bunBootstrapRunner{}, launcher, stdout, stderr) + case "review": + return runInterviewReview(rest, bunReviewRunner{}, stdout, stderr) + case "cohort": + return runInterviewCohort(rest, bunCohortRunner{}, stdout, stderr) + default: + fmt.Fprintf(stderr, "teamhero interview: unknown verb %q\n", verb) + printInterviewUsage(stderr) + return 1 + } +} diff --git a/tui/interview_bootstrap.go b/tui/interview_bootstrap.go new file mode 100644 index 0000000..f2314d1 --- /dev/null +++ b/tui/interview_bootstrap.go @@ -0,0 +1,538 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "strings" +) + +// 
bootstrapPayload is the agent-handoff schema emitted to stdout when +// --json is passed. Versioned via the Schema field so downstream +// consumers can fail loud on breaking changes. Pointer types for +// optional nested objects (jd, github) so absent fields serialize as +// `null` rather than empty structs — clearer for the consumer. +type bootstrapPayload struct { + Schema string `json:"schema"` + Role bootstrapRolePayload `json:"role"` + Project bootstrapProjectPayload `json:"project"` + AI bootstrapAIPayload `json:"ai"` + Github *bootstrapGithubPayload `json:"github"` +} + +type bootstrapAIPayload struct { + // Model is the OpenAI model used for project generation. Echoes + // whatever the bun subprocess saw (AI_MODEL env override or the + // gpt-5-mini default). Useful for an orchestrating agent that + // wants to attribute costs by model in HR notifications. + Model string `json:"model"` +} + +type bootstrapRolePayload struct { + Slug string `json:"slug"` + Title string `json:"title"` + Stack string `json:"stack"` + Domain string `json:"domain"` +} + +type bootstrapProjectPayload struct { + Mode string `json:"mode"` + StackByCandidate bool `json:"stackByCandidate"` + OutputDir string `json:"outputDir"` + TimeBoxMinutes int `json:"timeBoxMinutes"` + Feature string `json:"feature"` + AnalysisMode string `json:"analysisMode"` + RubricMode string `json:"rubricMode"` + JD *bootstrapJDPayload `json:"jd"` +} + +type bootstrapJDPayload struct { + Path string `json:"path"` + InfluencesProject bool `json:"influencesProject"` +} + +type bootstrapGithubPayload struct { + URL string `json:"url"` +} + +// buildBootstrapPayload assembles the agent payload from the run's +// validated options plus an optional GitHub URL captured from a +// --publish run. Kept pure so it's trivially unit-testable; the +// dispatcher composes it with the io.Writer side-effect. 
+func buildBootstrapPayload(opts *BootstrapOptions, githubURL string) bootstrapPayload { + tb, _ := strconv.Atoi(strings.TrimSpace(opts.TimeBox)) + outAbs, _ := filepath.Abs(opts.OutputDir) + if outAbs == "" { + outAbs = opts.OutputDir + } + var jd *bootstrapJDPayload + if strings.TrimSpace(opts.JDPath) != "" { + jd = &bootstrapJDPayload{ + Path: opts.JDPath, + InfluencesProject: opts.JDInfluencesProject, + } + } + var gh *bootstrapGithubPayload + if strings.TrimSpace(githubURL) != "" { + gh = &bootstrapGithubPayload{URL: githubURL} + } + model := strings.TrimSpace(os.Getenv("AI_MODEL")) + if model == "" { + model = "gpt-5-mini" + } + return bootstrapPayload{ + Schema: "teamhero.interview.bootstrap/v1", + Role: bootstrapRolePayload{ + Slug: opts.Role, + Title: opts.RoleTitle, + Stack: opts.Stack, + Domain: opts.Domain, + }, + Project: bootstrapProjectPayload{ + Mode: opts.ModeProject, + StackByCandidate: opts.StackByCandidate, + OutputDir: outAbs, + TimeBoxMinutes: tb, + Feature: opts.Feature, + AnalysisMode: opts.ModeAnalysis, + RubricMode: opts.ModeRubric, + JD: jd, + }, + AI: bootstrapAIPayload{Model: model}, + Github: gh, + } +} + +func writeBootstrapPayload(w io.Writer, payload bootstrapPayload) error { + enc := json.NewEncoder(w) + enc.SetIndent("", " ") + return enc.Encode(payload) +} + +// BootstrapOptions are the headless flags accepted by `teamhero interview bootstrap`. +type BootstrapOptions struct { + Role string + RoleTitle string + Stack string + Domain string + Feature string + TimeBox string + ModeProject string + ModeAnalysis string + ModeRubric string + JDPath string + CustomPrompt string + OutputDir string + KitDir string + Headless bool + NoConfirm bool + Foreground bool + // StackByCandidate flips Mode B's brief from "use Stack" to + // "candidate picks their own stack". Only meaningful when + // ModeProject == "B"; the validator rejects the combination + // otherwise so the headless protocol stays explicit. 
Set by the + // wizard's "Greenfield (candidate picks stack)" option or the + // --stack-by-candidate headless flag. + StackByCandidate bool + // JDInfluencesProject tells the project-generation prompt to read + // the JD at JDPath and tailor the generated repo to its seniority + // and domain (e.g., junior healthtech → EHR-flavoured feature). + // Requires JDPath; the validator rejects the combination otherwise. + // Independent of ModeRubric — the JD is now a standalone input + // rather than being smuggled in via a rubric value. + JDInfluencesProject bool + // Debug toggles verbose run-context logs in the bun subprocess (the + // generator client) and the Go dispatcher. Off by default — light + // run logs print regardless so failure triage doesn't require a rerun. + Debug bool + // EmitJSON switches the dispatcher into agent-payload mode. On + // success the dispatcher prints a single bootstrapPayload JSON + // object to stdout; the regular human-readable "Project: " + // output and the publish prompt are routed to stderr (or + // suppressed) so stdout stays parseable. Designed for callers + // where another agent reads stdout to schedule the interview, + // notify HR, etc. + EmitJSON bool + // Publish auto-publishes the generated repo to GitHub when set. + // No interactive prompt — the dispatcher calls the same publish + // path the TTY prompt would have called and surfaces the URL. + // Orthogonal to EmitJSON: --publish alone pushes silently; + // --publish --json includes the URL in the emitted payload; + // --json alone leaves github.url null in the payload. + Publish bool +} + +// ParseBootstrapFlags parses headless flags from the args following `bootstrap`. +// Returns nil and an error message if a flag value is missing. 
+func ParseBootstrapFlags(args []string) (*BootstrapOptions, string) { + opts := &BootstrapOptions{} + i := 0 + for i < len(args) { + a := args[i] + switch a { + case "--headless": + opts.Headless = true + case "--no-confirm": + opts.NoConfirm = true + case "--foreground": + opts.Foreground = true + case "--debug", "-d": + opts.Debug = true + case "--json": + opts.EmitJSON = true + case "--publish": + opts.Publish = true + case "--stack-by-candidate": + opts.StackByCandidate = true + case "--jd-influences-project": + opts.JDInfluencesProject = true + case "--role", "--role-title", "--stack", "--domain", "--feature", + "--time-box", "--mode-project", "--mode-analysis", "--mode-rubric", + "--jd-path", "--custom-prompt", + "--output-dir", "--kit-dir": + if i+1 >= len(args) { + return nil, fmt.Sprintf("flag %s requires a value", a) + } + val := args[i+1] + switch a { + case "--role": + opts.Role = val + case "--role-title": + opts.RoleTitle = val + case "--stack": + opts.Stack = val + case "--domain": + opts.Domain = val + case "--feature": + opts.Feature = val + case "--time-box": + opts.TimeBox = val + case "--mode-project": + opts.ModeProject = val + case "--mode-analysis": + opts.ModeAnalysis = val + case "--mode-rubric": + opts.ModeRubric = val + case "--jd-path": + opts.JDPath = val + case "--custom-prompt": + opts.CustomPrompt = val + case "--output-dir": + opts.OutputDir = val + case "--kit-dir": + opts.KitDir = val + } + i++ + default: + return nil, fmt.Sprintf("unknown flag: %s", a) + } + i++ + } + return opts, "" +} + +// ValidateBootstrapOptions returns a non-empty string describing why the +// options are invalid, or "" when they are complete. 
+func ValidateBootstrapOptions(opts *BootstrapOptions) string { + required := map[string]string{ + "--role": opts.Role, + "--stack": opts.Stack, + "--feature": opts.Feature, + "--mode-project": opts.ModeProject, + "--mode-analysis": opts.ModeAnalysis, + "--mode-rubric": opts.ModeRubric, + "--output-dir": opts.OutputDir, + } + // --domain is required UNLESS a --jd-path is supplied. The job + // description, when attached, describes the business domain; + // forcing the proctor to also type it as a separate flag is + // redundant and error-prone. + if strings.TrimSpace(opts.JDPath) == "" && strings.TrimSpace(opts.Domain) == "" { + required["--domain"] = "" + } + missing := []string{} + for flag, val := range required { + if strings.TrimSpace(val) == "" { + missing = append(missing, flag) + } + } + if len(missing) > 0 { + return "missing required flags: " + strings.Join(missing, ", ") + } + if opts.ModeProject != "A" && opts.ModeProject != "B" { + return "--mode-project must be 'A' or 'B'" + } + if opts.StackByCandidate && opts.ModeProject != "B" { + // stack-by-candidate is a Mode B variant. Combining it with Mode A + // is incoherent — Mode A generates a starter codebase IN a stack, + // so "candidate picks the stack" can't apply. Reject explicitly so + // callers don't get a brownfield project with a mismatched brief. + return "--stack-by-candidate requires --mode-project B" + } + if opts.ModeAnalysis != "ai-assisted" && opts.ModeAnalysis != "human-only" { + return "--mode-analysis must be 'ai-assisted' or 'human-only'" + } + switch opts.ModeRubric { + case "default", "custom": + default: + return "--mode-rubric must be 'default' or 'custom'" + } + if opts.ModeRubric == "custom" && strings.TrimSpace(opts.CustomPrompt) == "" { + return "--mode-rubric 'custom' requires --custom-prompt" + } + // jd-path is now optional regardless of rubric mode. When supplied, + // the file must exist. 
--jd-influences-project requires a path + // (the generator has nothing to read otherwise). + if strings.TrimSpace(opts.JDPath) != "" { + if _, err := os.Stat(opts.JDPath); err != nil { + return fmt.Sprintf("--jd-path does not exist: %s", opts.JDPath) + } + } + if opts.JDInfluencesProject && strings.TrimSpace(opts.JDPath) == "" { + return "--jd-influences-project requires --jd-path" + } + return "" +} + +// BootstrapRunner spawns the TS bootstrap process. Tests substitute a stub. +type BootstrapRunner interface { + Run(opts *BootstrapOptions, stdout, stderr io.Writer) int +} + +// bunBootstrapRunner is the production runner that spawns the TS script via bun. +type bunBootstrapRunner struct{} + +func (bunBootstrapRunner) Run(opts *BootstrapOptions, stdout, stderr io.Writer) int { + args := []string{"run", findBootstrapScript()} + args = append(args, + "--role", opts.Role, + "--stack", opts.Stack, + "--domain", opts.Domain, + "--feature", opts.Feature, + "--mode-project", opts.ModeProject, + "--mode-analysis", opts.ModeAnalysis, + "--mode-rubric", opts.ModeRubric, + "--output-dir", opts.OutputDir, + ) + if opts.RoleTitle != "" { + args = append(args, "--role-title", opts.RoleTitle) + } + if opts.TimeBox != "" { + args = append(args, "--time-box", opts.TimeBox) + } + if opts.JDPath != "" { + args = append(args, "--jd-path", opts.JDPath) + } + if opts.CustomPrompt != "" { + args = append(args, "--custom-prompt", opts.CustomPrompt) + } + if opts.KitDir != "" { + args = append(args, "--kit-dir", opts.KitDir) + } + if opts.StackByCandidate { + args = append(args, "--stack-by-candidate") + } + if opts.JDInfluencesProject { + args = append(args, "--jd-influences-project") + } + if opts.Debug { + args = append(args, "--debug") + } + + bunPath := resolveBunBinary() + cmd := exec.Command(bunPath, args...) 
+ cmd.Stdout = stdout + cmd.Stderr = stderr + cmd.Env = os.Environ() + if err := cmd.Run(); err != nil { + if exit, ok := err.(*exec.ExitError); ok { + return exit.ExitCode() + } + fmt.Fprintf(stderr, "Failed to run bootstrap subprocess: %v\n", err) + return 1 + } + return 0 +} + +// findBootstrapScript locates scripts/run-interview-bootstrap.ts relative to the +// installed teamhero.cli source tree. Falls back to a best-effort path next to +// the TUI binary's working directory. +func findBootstrapScript() string { + candidates := []string{ + "scripts/run-interview-bootstrap.ts", + "../scripts/run-interview-bootstrap.ts", + } + if runtime.GOOS != "windows" { + exe, err := os.Executable() + if err == nil { + candidates = append(candidates, + filepath.Join(filepath.Dir(exe), "..", "scripts", "run-interview-bootstrap.ts"), + ) + } + } + for _, c := range candidates { + if _, err := os.Stat(c); err == nil { + return c + } + } + return "scripts/run-interview-bootstrap.ts" +} + +// applyBootstrapDefaults fills in any optional flag whose value is derivable +// from the rest of the config so the proctor doesn't have to repeat the +// obvious defaults every run. +// +// - --output-dir defaults to `./interviews/`. The repo's +// .gitignore covers `interviews/`, so generated candidate material +// never accidentally lands in a commit. +// - --time-box defaults to "60" minutes — the recommended length for a +// candidate interview project. Override with --time-box per the +// original PRD when the role needs more or less runway. +// - --kit-dir defaults to `teamhero-interview-kit` (resolved relative +// to the current working directory) so the bootstrap scripts, +// INTERVIEW_RULES.md, AGENTS.md, PRIVACY_RELEASE.md, .claude/CLAUDE.md, +// and other scaffolding files are ALWAYS copied into the generated +// repo — regardless of whether the proctor picked a generated +// starter project (Mode A) or a brief-only flow (Mode B). 
Without +// this default a proctor who forgot to pass --kit-dir got the AI +// output but none of the proctor/candidate guidance, which broke +// the recording workflow. +// +// Defaults are applied in-place; an explicit user flag always wins. +func applyBootstrapDefaults(opts *BootstrapOptions) { + if opts == nil { + return + } + role := strings.TrimSpace(opts.Role) + if strings.TrimSpace(opts.OutputDir) == "" && role != "" { + opts.OutputDir = filepath.Join("interviews", role) + } + if strings.TrimSpace(opts.TimeBox) == "" { + opts.TimeBox = "60" + } + if strings.TrimSpace(opts.KitDir) == "" { + opts.KitDir = "teamhero-interview-kit" + } +} + +// runInterviewBootstrap dispatches the bootstrap verb. Parses flags, validates, +// invokes the runner. On success it prints a clickable output-dir link and, +// when running interactively, offers to publish the generated repo to GitHub. +// Returns the exit code. +func runInterviewBootstrap(args []string, runner BootstrapRunner, stdout, stderr io.Writer) int { + opts, parseErr := ParseBootstrapFlags(args) + if parseErr != "" { + fmt.Fprintln(stderr, parseErr) + return 1 + } + if !opts.Headless { + fmt.Fprintln(stderr, "teamhero interview bootstrap: only --headless mode is implemented in this slice; pass --headless and all required flags.") + return 1 + } + applyBootstrapDefaults(opts) + if msg := ValidateBootstrapOptions(opts); msg != "" { + fmt.Fprintln(stderr, msg) + return 1 + } + logBootstrapRunContext(opts, stderr) + // In --json mode, the bun subprocess's progress chatter must not + // pollute stdout. Route its stdout to stderr so the calling agent + // sees only our final JSON payload on stdout. 
+ runnerStdout := stdout + if opts.EmitJSON { + runnerStdout = stderr + } + exit := runner.Run(opts, runnerStdout, stderr) + if exit != 0 { + return exit + } + githubURL := "" + if opts.EmitJSON { + // Agent-payload mode: human-readable link goes to stderr (so + // it's still visible to a human watching the terminal), then + // we emit the structured JSON to stdout. Publish behavior in + // this mode is gated on --publish, NOT on the TTY/no-confirm + // dance — agent callers want explicit, predictable behavior. + printBootstrapSuccessLink(opts.OutputDir, stderr) + if opts.Publish { + githubURL = autoPublishToGitHub(opts, stderr) + } + payload := buildBootstrapPayload(opts, githubURL) + if err := writeBootstrapPayload(stdout, payload); err != nil { + fmt.Fprintf(stderr, "failed to emit JSON payload: %v\n", err) + return 1 + } + return 0 + } + // Human-interactive default path. + printBootstrapSuccessLink(opts.OutputDir, stdout) + if opts.Publish { + autoPublishToGitHub(opts, stderr) + } else if isStdinTTY() && !opts.NoConfirm { + // Suppress the publish prompt on non-interactive runs (CI, piped + // stdin) and when --no-confirm explicitly opts out, so scripted + // callers never block on a huh form. + offerPublishToGitHub(opts, stdout, stderr) + } + return 0 +} + +// autoPublishToGitHub is the non-interactive publish path. Returns the +// resulting repo URL on success, or "" when publish couldn't run +// (no token configured, push failed, etc.). Real implementation +// lives in interview_bootstrap_publish.go; the var indirection keeps +// tests from spawning git/gh subprocesses. +var autoPublishToGitHub = func(opts *BootstrapOptions, stderr io.Writer) string { + fmt.Fprintln(stderr, "auto-publish: not yet wired to a real GitHub client; skipping") + return "" +} + +// logBootstrapRunContext emits a single-line summary of the validated +// options before the bun subprocess runs so a failure ticket can be +// triaged without rerunning. 
Always prints (light context); the verbose +// per-field dump is delegated to the bun subprocess via --debug. +// +// Goes to stderr because stdout is reserved for the user-facing success +// link / OSC 8 hyperlink, which the TUI consumes verbatim. +func logBootstrapRunContext(opts *BootstrapOptions, w io.Writer) { + if opts == nil { + return + } + jdShort := opts.JDPath + if jdShort == "" { + jdShort = "(none)" + } + fmt.Fprintf(w, + "[bootstrap] role=%s mode=%s stack=%s stack-by-candidate=%t domain=%s time-box=%sm rubric=%s jd=%s jd-influences-project=%t output=%s kit=%s debug=%t\n", + opts.Role, opts.ModeProject, opts.Stack, opts.StackByCandidate, opts.Domain, opts.TimeBox, + opts.ModeRubric, jdShort, opts.JDInfluencesProject, opts.OutputDir, opts.KitDir, opts.Debug, + ) +} + +// printBootstrapSuccessLink emits the generated project's path as an OSC 8 +// hyperlink so the proctor can ctrl-click to open it in their OS file +// browser. The display label prefers a path relative to the current +// working directory (so a project under ~/Documents/interviews shows as +// "interviews/" rather than "/home/.../Documents/interviews/") +// — but the underlying file:// URL is always absolute so the click +// actually opens. Falls back to absolute display if Rel fails or escapes +// upward via "..". 
+func printBootstrapSuccessLink(dir string, w io.Writer) { + abs, link := absPathLink(dir) + if link == "" { + fmt.Fprintf(w, "Project: %s\n", abs) + return + } + display := abs + if cwd, err := os.Getwd(); err == nil { + if rel, err := filepath.Rel(cwd, abs); err == nil && !strings.HasPrefix(rel, "..") { + display = rel + } + } + fmt.Fprintf(w, "Project: %s\n", osc8Link(link, display)) +} diff --git a/tui/interview_bootstrap_form.go b/tui/interview_bootstrap_form.go new file mode 100644 index 0000000..54fc217 --- /dev/null +++ b/tui/interview_bootstrap_form.go @@ -0,0 +1,44 @@ +package main + +import ( + "fmt" + "strings" +) + +// runHuhBootstrapWizard runs the bootstrap wizard as a single bubbletea +// program (interviewBootstrapTeaModel) so the wizard adopts the same +// shell-header + summary-panel layout as the report wizard. The data +// container (bootstrapWizardModel) and per-screen validators are +// unchanged; only the runner is. This function exists as the launcher +// entry point so callers don't need to know about the tea-program seam. +func runHuhBootstrapWizard(d BootstrapWizardDefaults) (*BootstrapWizardResult, error) { + return runBootstrapTeaWizard(d) +} + +// summarizeBootstrapModel renders a compact one-line summary of the +// wizard's collected values. Used by the confirm-step description and by +// callers that want a short config string. +func summarizeBootstrapModel(m bootstrapWizardModel) string { + jd := "none" + if m.jdProvided == "yes" && m.jdPath != "" { + jd = m.jdPath + if m.jdInfluencesProject == "yes" { + jd += " (shapes project)" + } + } + return fmt.Sprintf( + "role=%s · stack=%s · domain=%s · time-box=%s · project=%s · analysis=%s · rubric=%s · jd=%s · out=%s", + m.role, m.stack, m.domain, m.timeBox, m.modeProject, m.modeAnalysis, m.modeRubric, jd, m.outputDir, + ) +} + +// nonEmpty produces a huh.Input.Validate-compatible function that rejects +// whitespace-only input for the given field name. 
+func nonEmpty(field string) func(string) error { + return func(s string) error { + if strings.TrimSpace(s) == "" { + return fmt.Errorf("%s is required", field) + } + return nil + } +} diff --git a/tui/interview_bootstrap_generate.go b/tui/interview_bootstrap_generate.go new file mode 100644 index 0000000..6c52b25 --- /dev/null +++ b/tui/interview_bootstrap_generate.go @@ -0,0 +1,299 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/charmbracelet/bubbles/spinner" + tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/lipgloss" + "github.com/muesli/reflow/wordwrap" +) + +// runBootstrapGenerate wraps a synchronous BootstrapRunner.Run call in a +// bubbletea program so the user sees a spinner while the bun subprocess works +// and lands on a persistent result screen after — instead of having the TUI +// exit silently the moment the subprocess returns. The result screen shows +// the absolute output path as an OSC 8 hyperlink (ctrl-click opens it in the +// OS file browser) and waits for esc / ctrl+c / q before quitting. +// +// Returns the same int the underlying runner would have returned. Stderr from +// the subprocess is mirrored to the caller's stderr after the result screen +// dismisses so warnings aren't swallowed. +var runBootstrapGenerate = func( + runner BootstrapRunner, + opts *BootstrapOptions, + stdout, stderr io.Writer, +) int { + m := newBootstrapGenerateModel(runner, opts) + p := tea.NewProgram(m, tea.WithAltScreen()) + final, err := p.Run() + if err != nil { + fmt.Fprintf(stderr, "Result screen failed: %v\n", err) + return 1 + } + gm, ok := final.(*bootstrapGenerateModel) + if !ok { + return 1 + } + // Forward captured subprocess streams to the caller now that the alt-screen + // is torn down. Stdout first (consola success line) then stderr (warnings). 
+ if gm.stdoutBuf.Len() > 0 { + _, _ = io.Copy(stdout, &gm.stdoutBuf) + } + if gm.stderrBuf.Len() > 0 { + _, _ = io.Copy(stderr, &gm.stderrBuf) + } + return gm.exitCode +} + +// bootstrapGeneratePhase is the high-level state of the result screen. +type bootstrapGeneratePhase int + +const ( + bgPhaseRunning bootstrapGeneratePhase = iota + bgPhaseSuccess + bgPhaseFailure +) + +// bootstrapGenerateModel renders the generation spinner and, after the +// subprocess returns, the result screen. The model owns the captured +// stdout/stderr buffers so the parent can forward them once the TUI exits. +type bootstrapGenerateModel struct { + runner BootstrapRunner + opts *BootstrapOptions + + phase bootstrapGeneratePhase + exitCode int + stdoutBuf bytes.Buffer + stderrBuf bytes.Buffer + + spin spinner.Model + width, height int +} + +// subprocessDoneMsg is dispatched once the bun subprocess returns. exitCode +// is the runner's int return — non-zero means generation failed. +type subprocessDoneMsg struct { + exitCode int +} + +func newBootstrapGenerateModel(runner BootstrapRunner, opts *BootstrapOptions) *bootstrapGenerateModel { + sp := spinner.New() + sp.Spinner = spinner.Dot + sp.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + return &bootstrapGenerateModel{ + runner: runner, + opts: opts, + phase: bgPhaseRunning, + spin: sp, + } +} + +func (m *bootstrapGenerateModel) Init() tea.Cmd { + return tea.Batch(m.spin.Tick, m.runSubprocess()) +} + +// runSubprocess returns a tea.Cmd that drives the bun subprocess on a +// goroutine (Bubble Tea runs Cmd in a goroutine) and emits a subprocessDoneMsg +// when it finishes. 
+func (m *bootstrapGenerateModel) runSubprocess() tea.Cmd { + return func() tea.Msg { + code := m.runner.Run(m.opts, &m.stdoutBuf, &m.stderrBuf) + return subprocessDoneMsg{exitCode: code} + } +} + +func (m *bootstrapGenerateModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg := msg.(type) { + case tea.WindowSizeMsg: + m.width = msg.Width + m.height = msg.Height + return m, nil + + case tea.KeyMsg: + // In running phase we deliberately ignore most keys so a stray press + // doesn't kill the in-flight subprocess and leave a half-written + // scaffold. Ctrl+C still works as a hard abort. + if msg.String() == "ctrl+c" { + return m, tea.Quit + } + if m.phase == bgPhaseRunning { + return m, nil + } + switch msg.String() { + case "esc", "q", "enter": + return m, tea.Quit + } + return m, nil + + case subprocessDoneMsg: + m.exitCode = msg.exitCode + if msg.exitCode == 0 { + m.phase = bgPhaseSuccess + } else { + m.phase = bgPhaseFailure + } + return m, nil + + case spinner.TickMsg: + var cmd tea.Cmd + m.spin, cmd = m.spin.Update(msg) + return m, cmd + } + + return m, nil +} + +func (m *bootstrapGenerateModel) View() string { + w := m.width + if w <= 0 { + w = 80 + } + + header := renderShellHeader(w) + hintStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + + var body, hints string + switch m.phase { + case bgPhaseRunning: + body = m.renderRunning() + hints = hintStyle.Render("ctrl+c to abort") + case bgPhaseSuccess: + body = m.renderSuccess() + hints = hintStyle.Render("esc / ctrl+c to dismiss") + case bgPhaseFailure: + body = m.renderFailure() + hints = hintStyle.Render("esc / ctrl+c to dismiss") + } + + return lipgloss.JoinVertical(lipgloss.Left, header, "", body, "", hints) +} + +func (m *bootstrapGenerateModel) renderRunning() string { + label := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + title := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")) + model := 
lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("14")) + return fmt.Sprintf( + " %s %s\n\n %s\n %s %s\n", + m.spin.View(), + title.Render("Generating role scaffold…"), + label.Render("OpenAI is drafting your role files; this typically takes 30–90 seconds."), + label.Render("Model:"), + model.Render(bootstrapModelName()), + ) +} + +// bootstrapModelName returns the OpenAI model the generator is configured +// to use. Mirrors the precedence in OpenAIGeneratorClient: the +// AI_MODEL env var overrides the gpt-5-mini default. Surfaced in the +// TUI so the proctor sees which LLM is on the hook before a $1+ run. +func bootstrapModelName() string { + if v := strings.TrimSpace(os.Getenv("AI_MODEL")); v != "" { + return v + } + return "gpt-5-mini" +} + +func (m *bootstrapGenerateModel) renderSuccess() string { + titleStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("10")) // green + labelStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + pathStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + + abs, link := absPathLink(m.opts.OutputDir) + pathLine := osc8Link(link, pathStyle.Render(abs)) + + return strings.Join([]string{ + " " + titleStyle.Render("✓ Role scaffold ready"), + "", + " " + labelStyle.Render("Output: ") + pathLine, + " " + labelStyle.Render("Ctrl-click the path above to open it in your file manager."), + }, "\n") +} + +func (m *bootstrapGenerateModel) renderFailure() string { + w := m.width + if w <= 0 { + w = 80 + } + titleStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("9")) // red + labelStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + + // Wrap each captured stderr line to fit the terminal so long error + // messages (e.g. "ERROR - No failing/skipped tests found …") aren't + // truncated by the alt-screen. Body lines are indented 4 spaces, so the + // wrap budget is terminal width minus that indent. 
Below 20 cells the + // wrap becomes useless, so we clamp instead of rendering an empty column. + const indent = " " + wrapWidth := w - len(indent) + if wrapWidth < 20 { + wrapWidth = 20 + } + + lines := []string{ + " " + titleStyle.Render(fmt.Sprintf("✗ Generation failed (exit code %d)", m.exitCode)), + "", + } + if errMsg := strings.TrimSpace(m.stderrBuf.String()); errMsg != "" { + // Render the captured stderr tail (last few lines) so the user has + // context without dumping the entire buffer over the result screen. + tail := lastLines(errMsg, 6) + lines = append(lines, " "+labelStyle.Render("Last output:")) + for _, l := range strings.Split(tail, "\n") { + for _, wrapped := range strings.Split(wordwrap.String(l, wrapWidth), "\n") { + lines = append(lines, indent+wrapped) + } + } + } else { + lines = append(lines, " "+labelStyle.Render("No stderr was captured. See subprocess output after dismissing.")) + } + return strings.Join(lines, "\n") +} + +// absPathLink resolves a possibly-relative directory into an absolute path +// plus a `file://` URL suitable for OSC 8 hyperlinks. On error (path can't +// be resolved) it returns the input unchanged so the result screen still +// displays *something* useful — the link just won't open. +func absPathLink(p string) (abs, fileURL string) { + abs, err := filepath.Abs(p) + if err != nil || abs == "" { + return p, "" + } + // filepath.ToSlash converts Windows backslashes to forward slashes. + // On Windows abs starts with a drive letter (`C:\foo`) → after ToSlash + // `C:/foo`; the spec wants `file:///C:/foo` (three slashes). On Unix + // abs starts with `/` so `file://` + `/path` already gives three slashes. + slashed := filepath.ToSlash(abs) + if strings.HasPrefix(slashed, "/") { + fileURL = "file://" + slashed + } else { + fileURL = "file:///" + slashed + } + return abs, fileURL +} + +// osc8Link wraps label in an OSC 8 hyperlink escape sequence pointing to +// target. 
Modern terminals (iTerm2, Windows Terminal, WezTerm, Kitty, recent +// gnome-terminal) render it as a clickable link; older ones fall back to +// showing the label as plain text (the escape bytes are zero-width). +func osc8Link(target, label string) string { + if target == "" { + return label + } + const esc = "\x1b" + return esc + "]8;;" + target + esc + "\\" + label + esc + "]8;;" + esc + "\\" +} + +// lastLines returns the trailing n newline-separated lines of s. Used to +// keep the failure screen compact when a subprocess dumps a long stderr. +func lastLines(s string, n int) string { + lines := strings.Split(strings.TrimRight(s, "\n"), "\n") + if len(lines) <= n { + return strings.Join(lines, "\n") + } + return strings.Join(lines[len(lines)-n:], "\n") +} diff --git a/tui/interview_bootstrap_generate_test.go b/tui/interview_bootstrap_generate_test.go new file mode 100644 index 0000000..f47419b --- /dev/null +++ b/tui/interview_bootstrap_generate_test.go @@ -0,0 +1,373 @@ +package main + +import ( + "bytes" + "io" + "path/filepath" + "runtime" + "strings" + "testing" + + "github.com/charmbracelet/bubbles/spinner" + tea "github.com/charmbracelet/bubbletea" +) + +// TestOsc8Link_WrapsLabelInEscapeSequence asserts that the OSC 8 hyperlink +// helper produces the expected escape envelope. Ctrl-click in modern +// terminals depends on the exact byte sequence — start (ESC ]8;; ESC \), +// label, end (ESC ]8;; ESC \). 
+func TestOsc8Link_WrapsLabelInEscapeSequence(t *testing.T) { + out := osc8Link("file:///tmp/roles/x", "/tmp/roles/x") + if !strings.HasPrefix(out, "\x1b]8;;file:///tmp/roles/x\x1b\\") { + t.Errorf("link should start with OSC 8 open + URL + ST, got %q", out) + } + if !strings.HasSuffix(out, "\x1b]8;;\x1b\\") { + t.Errorf("link should end with OSC 8 close, got %q", out) + } + if !strings.Contains(out, "/tmp/roles/x") { + t.Errorf("link should contain visible label, got %q", out) + } +} + +// TestOsc8Link_EmptyTargetReturnsBareLabel ensures a missing target degrades +// to plain text rather than emitting a broken link. +func TestOsc8Link_EmptyTargetReturnsBareLabel(t *testing.T) { + if got := osc8Link("", "/tmp/roles/x"); got != "/tmp/roles/x" { + t.Errorf("empty target should return label unwrapped, got %q", got) + } +} + +// TestAbsPathLink_ProducesThreeSlashFileURL pins the URL shape — file:///abs +// on Unix and file:///:/... on Windows. Three leading slashes are +// required by RFC 8089. +func TestAbsPathLink_ProducesThreeSlashFileURL(t *testing.T) { + dir := t.TempDir() // absolute, exists + abs, link := absPathLink(dir) + if abs == "" { + t.Fatal("abs should be non-empty for an existing temp dir") + } + if !strings.HasPrefix(link, "file:///") { + t.Errorf("link should start with file:/// (three slashes), got %q", link) + } + // The slash-converted absolute path should appear in the URL on both + // platforms. + if !strings.Contains(link, filepath.ToSlash(abs)) { + t.Errorf("link should embed the slash-converted abs path %q, got %q", filepath.ToSlash(abs), link) + } +} + +// TestAbsPathLink_RelativePathIsResolvedToAbsolute covers the wizard's +// default output dir form (./roles/) — it must be expanded so the +// file:// URL is openable. 
+func TestAbsPathLink_RelativePathIsResolvedToAbsolute(t *testing.T) { + abs, link := absPathLink("./roles/test-role") + if !filepath.IsAbs(abs) { + t.Errorf("abs should be an absolute path, got %q", abs) + } + if link == "" || !strings.HasPrefix(link, "file:///") { + t.Errorf("link should be a file:/// URL, got %q", link) + } +} + +// TestLastLines_TrimsLongStderr keeps the failure screen compact when a +// subprocess dumps a long stack trace. +func TestLastLines_TrimsLongStderr(t *testing.T) { + in := "a\nb\nc\nd\ne\nf\ng" + got := lastLines(in, 3) + if got != "e\nf\ng" { + t.Errorf("expected last 3 lines, got %q", got) + } +} + +func TestLastLines_ShorterThanLimitReturnsAll(t *testing.T) { + in := "a\nb" + if got := lastLines(in, 5); got != "a\nb" { + t.Errorf("expected full input, got %q", got) + } +} + +// fakeRunner records the options it received and returns a configurable +// exit code + stderr payload so we can exercise both success and failure +// paths of the generate model without a real subprocess. +type fakeRunner struct { + code int + stderr string + stdout string + called bool +} + +func (f *fakeRunner) Run(_ *BootstrapOptions, stdout, stderr io.Writer) int { + f.called = true + if f.stdout != "" { + _, _ = stdout.Write([]byte(f.stdout)) + } + if f.stderr != "" { + _, _ = stderr.Write([]byte(f.stderr)) + } + return f.code +} + +// TestGenerateModel_SubprocessSuccessTransitionsToSuccessPhase drives the +// model directly via Update so we can assert phase transitions without a TTY. 
+func TestGenerateModel_SubprocessSuccessTransitionsToSuccessPhase(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./roles/x"}) + if m.phase != bgPhaseRunning { + t.Fatalf("initial phase should be Running, got %v", m.phase) + } + model, _ := m.Update(subprocessDoneMsg{exitCode: 0}) + gm := model.(*bootstrapGenerateModel) + if gm.phase != bgPhaseSuccess { + t.Errorf("expected Success phase after zero-exit, got %v", gm.phase) + } +} + +func TestGenerateModel_SubprocessFailureTransitionsToFailurePhase(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./roles/x"}) + model, _ := m.Update(subprocessDoneMsg{exitCode: 5}) + gm := model.(*bootstrapGenerateModel) + if gm.phase != bgPhaseFailure { + t.Errorf("expected Failure phase after non-zero exit, got %v", gm.phase) + } + if gm.exitCode != 5 { + t.Errorf("expected exitCode 5 to be retained, got %d", gm.exitCode) + } +} + +// TestGenerateModel_RunningPhase_IgnoresOrdinaryKeys protects the user from +// accidentally aborting mid-generation. The half-written scaffold would be +// unrecoverable, so we only honor Ctrl+C while running. +func TestGenerateModel_RunningPhase_IgnoresOrdinaryKeys(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + for _, key := range []tea.KeyMsg{ + {Type: tea.KeyEsc}, + {Type: tea.KeyRunes, Runes: []rune{'q'}}, + {Type: tea.KeyEnter}, + } { + _, cmd := m.Update(key) + if cmd != nil { + t.Errorf("key %v during running phase should not produce a Cmd, got %v", key, cmd) + } + if m.phase != bgPhaseRunning { + t.Errorf("key %v should not change phase from Running, got %v", key, m.phase) + } + } +} + +// TestGenerateModel_RunningPhase_CtrlCQuits — Ctrl+C is the hard-abort that +// always works, even mid-generation. 
+func TestGenerateModel_RunningPhase_CtrlCQuits(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + _, cmd := m.Update(tea.KeyMsg{Type: tea.KeyCtrlC}) + if cmd == nil { + t.Fatal("Ctrl+C should produce a Quit Cmd") + } +} + +// TestGenerateModel_DonePhase_DismissKeys verifies the documented dismiss +// affordance: esc / q / enter / ctrl+c all close the result screen. +func TestGenerateModel_DonePhase_DismissKeys(t *testing.T) { + for _, key := range []tea.KeyMsg{ + {Type: tea.KeyEsc}, + {Type: tea.KeyRunes, Runes: []rune{'q'}}, + {Type: tea.KeyEnter}, + {Type: tea.KeyCtrlC}, + } { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + // Move to success phase first. + m.Update(subprocessDoneMsg{exitCode: 0}) + _, cmd := m.Update(key) + if cmd == nil { + t.Errorf("dismiss key %v should produce a Quit Cmd in Success phase", key) + } + } +} + +// TestGenerateModel_View_RunningPhase asserts the visible elements while the +// subprocess is in flight: shell header, spinner, progress label, abort hint. +func TestGenerateModel_View_RunningPhase(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./roles/test"}) + m.width = 100 + view := stripANSI(m.View()) + for _, want := range []string{"TEAM HERO", "Generating role scaffold", "ctrl+c to abort"} { + if !strings.Contains(view, want) { + t.Errorf("running-phase view missing %q\n--- view ---\n%s", want, view) + } + } +} + +// TestGenerateModel_View_SuccessPhase asserts the result screen contains the +// success tick, an absolute path (not the input relative form), and dismiss +// hints — and that the path is wrapped in an OSC 8 hyperlink so terminals +// render it as ctrl-clickable. 
+func TestGenerateModel_View_SuccessPhase(t *testing.T) { + tmp := t.TempDir() + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: tmp}) + m.width = 100 + m.Update(subprocessDoneMsg{exitCode: 0}) + raw := m.View() + view := stripANSI(raw) + + for _, want := range []string{"✓ Role scaffold ready", tmp, "esc / ctrl+c to dismiss"} { + if !strings.Contains(view, want) { + t.Errorf("success-phase view missing %q\n--- view ---\n%s", want, view) + } + } + // OSC 8 hyperlink envelope should be in the raw (pre-strip) output so + // terminals capable of rendering it can pick it up. + if !strings.Contains(raw, "\x1b]8;;file://") { + t.Errorf("success-phase view should embed an OSC 8 file:// hyperlink, got:\n%s", raw) + } +} + +func TestGenerateModel_View_SuccessPhase_ExpandsRelativePath(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./roles/relative-test"}) + m.width = 100 + m.Update(subprocessDoneMsg{exitCode: 0}) + view := stripANSI(m.View()) + + if strings.Contains(view, "./roles/relative-test") && !strings.Contains(view, "/relative-test") { + t.Errorf("path should be displayed as absolute, not relative\n--- view ---\n%s", view) + } +} + +// TestGenerateModel_View_FailurePhase asserts the user sees the exit code +// and a tail of the captured stderr. +func TestGenerateModel_View_FailurePhase(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + m.width = 100 + // Pre-populate stderr to simulate a captured subprocess failure. 
+ m.stderrBuf = *bytes.NewBufferString("line1\nline2\nfatal: out of quota\n") + m.Update(subprocessDoneMsg{exitCode: 2}) + view := stripANSI(m.View()) + + for _, want := range []string{"Generation failed", "exit code 2", "fatal: out of quota"} { + if !strings.Contains(view, want) { + t.Errorf("failure-phase view missing %q\n--- view ---\n%s", want, view) + } + } +} + +// TestGenerateModel_View_FailurePhase_WrapsLongLines protects against the +// alt-screen truncating long subprocess errors (the symptom that bit us: +// "Mode A projects must include at least one failing/sk[truncated]"). On a +// narrow terminal the wrapped output must contain the *complete* message and +// span more lines than the unwrapped form. +func TestGenerateModel_View_FailurePhase_WrapsLongLines(t *testing.T) { + longLine := "ERROR - No failing or skipped tests found. Mode A projects must include at least one failing/skipped test in tests/something.test.ts so the rubric can grade test-driven recovery." + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + m.width = 60 + m.stderrBuf = *bytes.NewBufferString(longLine + "\n") + m.Update(subprocessDoneMsg{exitCode: 1}) + view := stripANSI(m.View()) + + // Full message must survive — assert distinct fragments because wordwrap + // reflows the line and may split across line breaks (a string search + // across "\n " won't find a span the user can read just fine). + for _, fragment := range []string{ + "No failing or skipped tests found", + "failing/skipped test", + "tests/something.test.ts", + "driven recovery", + } { + // Collapse whitespace so wrap breaks don't defeat the assertion. + flat := strings.Join(strings.Fields(view), " ") + if !strings.Contains(flat, fragment) { + t.Errorf("wrapped view should contain %q somewhere, got flattened:\n%s", fragment, flat) + } + } + // Wrap must produce multiple visible lines that each fit the budget + // (width 60 minus 4-space indent = 56 cells). 
We allow a small overshoot + // because wordwrap won't break long unbroken tokens. + for _, line := range strings.Split(view, "\n") { + trimmed := strings.TrimRight(line, " ") + if len(trimmed) > 80 { + t.Errorf("wrapped line exceeded reasonable width (%d cells): %q", len(trimmed), trimmed) + } + } + // At least one wrap break must have happened. + bodyLines := 0 + for _, line := range strings.Split(view, "\n") { + if strings.Contains(line, "failing") || strings.Contains(line, "rubric") || strings.Contains(line, "Mode A") { + bodyLines++ + } + } + if bodyLines < 2 { + t.Errorf("expected the long error to wrap across multiple lines, got %d body lines:\n%s", bodyLines, view) + } +} + +// TestGenerateModel_View_FailurePhase_TinyWidthDoesNotPanic clamps the wrap +// budget when the terminal is absurdly narrow (e.g. fresh tea program before +// the first WindowSizeMsg arrives, width=0). Should render *something* rather +// than panicking on a non-positive wrap width. +func TestGenerateModel_View_FailurePhase_TinyWidthDoesNotPanic(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + m.width = 0 // no WindowSizeMsg yet + m.stderrBuf = *bytes.NewBufferString("a long line with several words to consume the wrap budget\n") + m.Update(subprocessDoneMsg{exitCode: 1}) + view := stripANSI(m.View()) + if !strings.Contains(view, "Generation failed") { + t.Errorf("expected header even at width=0, got:\n%s", view) + } +} + +// TestGenerateModel_Init_DispatchesSpinnerAndSubprocess confirms Init +// schedules both the spinner tick (so the user gets visible motion) and the +// subprocess Cmd (so generation actually starts). 
+func TestGenerateModel_Init_DispatchesSpinnerAndSubprocess(t *testing.T) { + fake := &fakeRunner{code: 0} + m := newBootstrapGenerateModel(fake, &BootstrapOptions{OutputDir: "/tmp/x"}) + cmd := m.Init() + if cmd == nil { + t.Fatal("Init should return a non-nil Cmd") + } +} + +// TestGenerateModel_SpinnerTickAdvancesSpinner makes sure the spinner is +// hooked up so the user gets visible motion during long runs. +func TestGenerateModel_SpinnerTickAdvancesSpinner(t *testing.T) { + m := newBootstrapGenerateModel(nil, &BootstrapOptions{OutputDir: "./x"}) + before := m.spin.View() + model, cmd := m.Update(spinner.TickMsg{}) + if cmd == nil { + t.Error("spinner tick should re-issue its own Tick command") + } + if _, ok := model.(*bootstrapGenerateModel); !ok { + t.Fatalf("Update should return *bootstrapGenerateModel, got %T", model) + } + // Spinner frame is allowed to be identical between consecutive ticks + // (slow update cadence). We're just confirming the chain is wired. + _ = before +} + +// TestGenerateModel_StreamForwarding ensures subprocess stdout/stderr are +// captured into the model's buffers (not the caller's) so the alt-screen +// isn't clobbered mid-render. The top-level runBootstrapGenerate forwards +// them after the tea program exits — this test stops before that point. +func TestGenerateModel_StreamForwarding(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("io.Writer-based runner.Run is platform-agnostic but path assertions below assume Unix") + } + fake := &fakeRunner{code: 0, stdout: "consola: success\n", stderr: "warning: deprecated flag\n"} + m := newBootstrapGenerateModel(fake, &BootstrapOptions{OutputDir: "/tmp/x"}) + // Directly invoke the Cmd that Init schedules instead of running the + // program — this gives us full control over message ordering. 
+ cmd := m.runSubprocess() + msg := cmd() + done, ok := msg.(subprocessDoneMsg) + if !ok { + t.Fatalf("expected subprocessDoneMsg, got %T", msg) + } + if done.exitCode != 0 { + t.Errorf("expected exit 0, got %d", done.exitCode) + } + if !fake.called { + t.Error("fake runner should have been called") + } + if !strings.Contains(m.stdoutBuf.String(), "consola: success") { + t.Errorf("subprocess stdout should be captured to model buffer, got %q", m.stdoutBuf.String()) + } + if !strings.Contains(m.stderrBuf.String(), "deprecated flag") { + t.Errorf("subprocess stderr should be captured to model buffer, got %q", m.stderrBuf.String()) + } +} diff --git a/tui/interview_bootstrap_ideas.go b/tui/interview_bootstrap_ideas.go new file mode 100644 index 0000000..c2df283 --- /dev/null +++ b/tui/interview_bootstrap_ideas.go @@ -0,0 +1,217 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + "path/filepath" + "strings" + "time" +) + +// ideaFetcherHTTPTimeout is sized for the OpenAI Responses API generating +// structured content, not the cheap auth-probe calls the shared +// defaultHTTPClient (5s) is tuned for. gpt-5-mini routinely takes 15-45s +// to return 5 ideas under strict json_schema. +const ideaFetcherHTTPTimeout = 90 * time.Second + +// ProjectIdea is one of N candidate project ideas the AI returns when the +// proctor picks "Suggest ideas" instead of writing a custom prompt. +type ProjectIdea struct { + Title string `json:"title"` + Blurb string `json:"blurb"` +} + +// IdeaProfile is the subset of the wizard's role-config that conditions the +// idea-generation prompt. Kept narrow so the fetcher contract doesn't have to +// move every time the wizard grows a new field. +type IdeaProfile struct { + Role string + RoleTitle string + Stack string + Domain string + Feature string + TimeBoxMinutes int + ProjectMode string +} + +// IdeaFetcher returns a list of project ideas tailored to the role profile. 
+// Tests substitute a stub so no real OpenAI traffic happens in CI. +type IdeaFetcher interface { + Fetch(p IdeaProfile) ([]ProjectIdea, error) +} + +// openAIIdeaFetcher hits api.openai.com/v1/responses with a structured-output +// schema that returns 3-5 ideas. Bills against the same OPENAI_API_KEY that +// `teamhero setup` writes to ~/.config/teamhero/.env, so no separate auth +// step is needed. +type openAIIdeaFetcher struct { + apiKey string + model string + client HTTPDoer +} + +// newOpenAIIdeaFetcher loads the API key from the persisted credentials file +// (same lookup `populateAIFields` uses for the report wizard). Returns a +// descriptive error when the key is absent so the wizard can surface +// "configure setup first" instead of dropping the user into a confusing +// 401 from OpenAI. +func newOpenAIIdeaFetcher() (*openAIIdeaFetcher, error) { + creds := loadExistingCredentials(filepath.Join(configDir(), ".env")) + key := strings.TrimSpace(creds["OPENAI_API_KEY"]) + if key == "" { + return nil, fmt.Errorf("OPENAI_API_KEY not configured — run `teamhero setup` to add one") + } + model := firstNonEmptyStr(creds["AI_MODEL"], "gpt-5-mini") + return &openAIIdeaFetcher{ + apiKey: key, + model: model, + client: &http.Client{Timeout: ideaFetcherHTTPTimeout}, + }, nil +} + +// buildIdeaPrompt is exported (lowercase but referenced by tests in the same +// package) so the prompt text is verifiable without hitting the network. +func buildIdeaPrompt(p IdeaProfile) string { + roleLabel := p.RoleTitle + if strings.TrimSpace(roleLabel) == "" { + roleLabel = p.Role + } + return fmt.Sprintf(`Generate 5 distinct project ideas suitable for a candidate coding interview. + +Role context (this is the candidate's profile as captured by the hiring manager): +- Role: %s +- Stack: %s +- Domain: %s +- Feature focus: %s +- Time-box: %d minutes +- Project mode: %s + +Each idea must be completable within the time-box by a single engineer working with an AI assistant. 
Vary the ideas — different sub-problems within the same domain, not minor reframings of one idea. + +Return JSON with an "ideas" array. Each entry has: +- title: short headline (4-8 words) +- blurb: 2-3 sentence description of what the candidate will build and why it tests the role profile above.`, roleLabel, p.Stack, p.Domain, p.Feature, p.TimeBoxMinutes, p.ProjectMode) +} + +// ideasResponseSchema is the JSON Schema body we hand to OpenAI's +// Responses API. `strict: true` forces the model to comply or fail loudly, +// rather than returning malformed output that we'd have to parse defensively. +var ideasResponseSchema = map[string]any{ + "type": "object", + "additionalProperties": false, + "required": []string{"ideas"}, + "properties": map[string]any{ + "ideas": map[string]any{ + "type": "array", + "minItems": 3, + "maxItems": 5, + "items": map[string]any{ + "type": "object", + "additionalProperties": false, + "required": []string{"title", "blurb"}, + "properties": map[string]any{ + "title": map[string]any{"type": "string"}, + "blurb": map[string]any{"type": "string"}, + }, + }, + }, + }, +} + +func (f *openAIIdeaFetcher) Fetch(p IdeaProfile) ([]ProjectIdea, error) { + prompt := buildIdeaPrompt(p) + payload := map[string]any{ + "model": f.model, + "input": prompt, + "text": map[string]any{ + "format": map[string]any{ + "type": "json_schema", + "name": "interview_project_ideas", + "strict": true, + "schema": ideasResponseSchema, + }, + }, + } + body, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("marshal request: %w", err) + } + req, _ := http.NewRequest("POST", openAIAPIBaseURL+"/v1/responses", strings.NewReader(string(body))) + req.Header.Set("Authorization", "Bearer "+f.apiKey) + req.Header.Set("Content-Type", "application/json") + + resp, err := f.client.Do(req) + if err != nil { + return nil, fmt.Errorf("OpenAI request failed: %w", err) + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode != 
http.StatusOK { + return nil, fmt.Errorf("OpenAI returned HTTP %d: %s", resp.StatusCode, truncateForError(string(respBody))) + } + return parseIdeasResponse(respBody) +} + +// parseIdeasResponse extracts the ideas list from the Responses-API envelope. +// The API may surface the JSON either as `output_text` (top-level +// convenience field) or as the first content block of `output[0]`; handle +// both so a future API revision that drops the convenience field doesn't +// silently break us. +func parseIdeasResponse(raw []byte) ([]ProjectIdea, error) { + var envelope struct { + OutputText string `json:"output_text"` + Output []struct { + Content []struct { + Text string `json:"text"` + } `json:"content"` + } `json:"output"` + } + if err := json.Unmarshal(raw, &envelope); err != nil { + return nil, fmt.Errorf("decode OpenAI envelope: %w", err) + } + text := envelope.OutputText + if text == "" && len(envelope.Output) > 0 && len(envelope.Output[0].Content) > 0 { + text = envelope.Output[0].Content[0].Text + } + if strings.TrimSpace(text) == "" { + return nil, fmt.Errorf("OpenAI returned no text payload (envelope: %s)", truncateForError(string(raw))) + } + var parsed struct { + Ideas []ProjectIdea `json:"ideas"` + } + if err := json.Unmarshal([]byte(text), &parsed); err != nil { + return nil, fmt.Errorf("parse ideas JSON: %w", err) + } + if len(parsed.Ideas) == 0 { + return nil, fmt.Errorf("OpenAI returned an empty ideas array") + } + return parsed.Ideas, nil +} + +// truncateForError trims an HTTP body for inclusion in an error message — +// long bodies (especially 5xx HTML pages) make the wizard's error screen +// unreadable. 200 chars is enough to identify the failure shape. +func truncateForError(s string) string { + const max = 200 + if len(s) <= max { + return s + } + return s[:max] + "…" +} + +// stubIdeaFetcher is exposed so tea-level tests can drive the wizard without +// real network IO. Production code never references it. 
+type stubIdeaFetcher struct { + Ideas []ProjectIdea + Err error +} + +func (s stubIdeaFetcher) Fetch(_ IdeaProfile) ([]ProjectIdea, error) { + if s.Err != nil { + return nil, s.Err + } + return s.Ideas, nil +} diff --git a/tui/interview_bootstrap_ideas_test.go b/tui/interview_bootstrap_ideas_test.go new file mode 100644 index 0000000..6152507 --- /dev/null +++ b/tui/interview_bootstrap_ideas_test.go @@ -0,0 +1,154 @@ +package main + +import ( + "bytes" + "fmt" + "io" + "net/http" + "strings" + "testing" +) + +// recordingDoer captures every request the fetcher sends and returns the +// canned responses in order. Tests use it to assert prompt content and +// drive both success and error paths through the JSON-schema validator. +type recordingDoer struct { + requests []*http.Request + responses []*http.Response + errs []error +} + +func (r *recordingDoer) Do(req *http.Request) (*http.Response, error) { + idx := len(r.requests) + r.requests = append(r.requests, req) + if idx >= len(r.responses) { + return nil, fmt.Errorf("no response staged for call %d", idx) + } + resp := r.responses[idx] + var err error + if idx < len(r.errs) { + err = r.errs[idx] + } + return resp, err +} + +func mkRespBody(body string) *http.Response { + return &http.Response{ + StatusCode: 200, + Body: io.NopCloser(strings.NewReader(body)), + } +} + +func TestBuildIdeaPrompt_IncludesRoleProfile(t *testing.T) { + p := IdeaProfile{ + Role: "senior-backend", + RoleTitle: "Senior Backend Engineer", + Stack: "Go", + Domain: "Payments", + Feature: "Refund idempotency", + TimeBoxMinutes: 90, + ProjectMode: "A", + } + prompt := buildIdeaPrompt(p) + for _, want := range []string{ + "Senior Backend Engineer", "Go", "Payments", + "Refund idempotency", "90", "Project mode: A", + } { + if !strings.Contains(prompt, want) { + t.Errorf("prompt missing %q\nprompt:\n%s", want, prompt) + } + } +} + +func TestBuildIdeaPrompt_FallsBackToRoleSlugWhenTitleMissing(t *testing.T) { + p := IdeaProfile{Role: "junior-fe", 
Stack: "TS", Domain: "Storefront", Feature: "x"} + prompt := buildIdeaPrompt(p) + if !strings.Contains(prompt, "junior-fe") { + t.Errorf("expected role slug fallback in prompt: %s", prompt) + } +} + +func TestParseIdeasResponse_ParsesOutputText(t *testing.T) { + // Responses API returns JSON-schema-validated content as output_text. + body := `{"output_text":"{\"ideas\":[{\"title\":\"Ledger CRUD\",\"blurb\":\"Build a ledger.\"},{\"title\":\"Refund API\",\"blurb\":\"Add refund.\"} ]}"}` + ideas, err := parseIdeasResponse([]byte(body)) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ideas) != 2 { + t.Fatalf("expected 2 ideas, got %d", len(ideas)) + } + if ideas[0].Title != "Ledger CRUD" || ideas[1].Blurb != "Add refund." { + t.Errorf("ideas not parsed correctly: %+v", ideas) + } +} + +func TestParseIdeasResponse_FallsBackToOutputArray(t *testing.T) { + // Some Responses API revisions return the JSON in output[0].content[0].text + // rather than the top-level output_text. The parser must handle both + // shapes so a future API change doesn't silently break us. 
+ body := `{"output":[{"content":[{"text":"{\"ideas\":[{\"title\":\"A\",\"blurb\":\"B\"}]}"}]}]}` + ideas, err := parseIdeasResponse([]byte(body)) + if err != nil { + t.Fatalf("parse: %v", err) + } + if len(ideas) != 1 || ideas[0].Title != "A" { + t.Errorf("unexpected ideas: %+v", ideas) + } +} + +func TestParseIdeasResponse_RejectsEmptyIdeas(t *testing.T) { + body := `{"output_text":"{\"ideas\":[]}"}` + if _, err := parseIdeasResponse([]byte(body)); err == nil { + t.Errorf("expected error on empty ideas array") + } +} + +func TestParseIdeasResponse_RejectsMissingText(t *testing.T) { + if _, err := parseIdeasResponse([]byte(`{}`)); err == nil { + t.Errorf("expected error when payload has neither output_text nor output[]") + } +} + +func TestOpenAIIdeaFetcher_Fetch_SendsAuthorizationHeader(t *testing.T) { + doer := &recordingDoer{ + responses: []*http.Response{ + mkRespBody(`{"output_text":"{\"ideas\":[{\"title\":\"T\",\"blurb\":\"B\"}]}"}`), + }, + } + f := &openAIIdeaFetcher{apiKey: "sk-test", model: "gpt-test", client: doer} + _, err := f.Fetch(IdeaProfile{Role: "x", Stack: "y", Domain: "z", Feature: "w", TimeBoxMinutes: 60, ProjectMode: "A"}) + if err != nil { + t.Fatalf("fetch: %v", err) + } + if len(doer.requests) != 1 { + t.Fatalf("expected 1 request, got %d", len(doer.requests)) + } + if got := doer.requests[0].Header.Get("Authorization"); got != "Bearer sk-test" { + t.Errorf("authorization header: got %q", got) + } + // Verify the request body actually contains the model and prompt — guards + // against a future refactor that builds the payload without including + // them. 
+ var captured bytes.Buffer + _, _ = captured.ReadFrom(doer.requests[0].Body) + if !strings.Contains(captured.String(), "gpt-test") { + t.Errorf("request body missing model: %s", captured.String()) + } +} + +func TestOpenAIIdeaFetcher_Fetch_SurfacesHTTPErrors(t *testing.T) { + doer := &recordingDoer{ + responses: []*http.Response{ + {StatusCode: 401, Body: io.NopCloser(strings.NewReader(`{"error":"bad key"}`))}, + }, + } + f := &openAIIdeaFetcher{apiKey: "sk-bad", model: "gpt", client: doer} + _, err := f.Fetch(IdeaProfile{}) + if err == nil { + t.Fatalf("expected error on 401") + } + if !strings.Contains(err.Error(), "401") { + t.Errorf("error should mention HTTP code: %v", err) + } +} diff --git a/tui/interview_bootstrap_publish.go b/tui/interview_bootstrap_publish.go new file mode 100644 index 0000000..774c4aa --- /dev/null +++ b/tui/interview_bootstrap_publish.go @@ -0,0 +1,383 @@ +package main + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "os/exec" + "path/filepath" + "regexp" + "strings" + + "github.com/charmbracelet/huh" +) + +// PublishOptions captures the user's answers from the publish form. The +// flow is fully optional — when the user declines or no GitHub token is +// configured, this struct stays zero. +type PublishOptions struct { + // Owner is either the user's GitHub username (personal repo) or an + // organization login. Empty defaults to the authenticated user. + Owner string + // Repo is the repository name on GitHub. + Repo string + // Private toggles repo visibility. Default true — interview material + // shouldn't be browsable by random GitHub users. + Private bool +} + +// PublishResult is what a successful publish returns. URL is what the +// success screen displays as an OSC 8 hyperlink. +type PublishResult struct { + URL string +} + +// repoNameRe is the GitHub repo-name pattern. 
GitHub itself is more +// permissive (allows up to 100 chars, dot-prefix forbidden) but this +// pattern catches the common mistakes (spaces, slashes, weird unicode) +// before the API rejects them. +var repoNameRe = regexp.MustCompile(`^[A-Za-z0-9][A-Za-z0-9._-]{0,99}$`) + +// validateRepoName returns nil for an acceptable GitHub repo name, an +// error otherwise. Kept as a plain function so the wizard's huh.Input +// validator can wire it directly. +func validateRepoName(s string) error { + t := strings.TrimSpace(s) + if t == "" { + return fmt.Errorf("repository name is required") + } + if !repoNameRe.MatchString(t) { + return fmt.Errorf("repo name must start with a letter/digit and contain only letters, digits, '.', '_', '-'") + } + return nil +} + +// GitHubClient is the minimal surface we need for publish. Tests inject +// a fake doer; production uses defaultHTTPClient. +type GitHubClient struct { + Token string + Client HTTPDoer +} + +// CreateRepo POSTs to either /user/repos (personal) or /orgs/{owner}/repos +// (organization) and returns the created repo's URL. The owner-vs-org +// distinction is decided by checking whether the authenticated user's +// login matches `owner`; we call /user once to find that out, then route. 
+func (g *GitHubClient) CreateRepo(opts PublishOptions) (PublishResult, error) { + if strings.TrimSpace(g.Token) == "" { + return PublishResult{}, fmt.Errorf("GITHUB_PERSONAL_ACCESS_TOKEN is empty") + } + if err := validateRepoName(opts.Repo); err != nil { + return PublishResult{}, err + } + + user, err := g.authenticatedLogin() + if err != nil { + return PublishResult{}, err + } + + endpoint := githubAPIBaseURL + "/user/repos" + owner := strings.TrimSpace(opts.Owner) + if owner == "" { + owner = user + } + if owner != "" && !strings.EqualFold(owner, user) { + endpoint = fmt.Sprintf("%s/orgs/%s/repos", githubAPIBaseURL, owner) + } + + body, _ := json.Marshal(map[string]any{ + "name": opts.Repo, + "private": opts.Private, + "auto_init": false, + }) + req, _ := http.NewRequest("POST", endpoint, bytes.NewReader(body)) + req.Header.Set("Authorization", "Bearer "+g.Token) + req.Header.Set("Accept", "application/vnd.github+json") + req.Header.Set("Content-Type", "application/json") + + resp, err := g.Client.Do(req) + if err != nil { + return PublishResult{}, fmt.Errorf("GitHub request failed: %w", err) + } + defer resp.Body.Close() + rawBody, _ := io.ReadAll(resp.Body) + if resp.StatusCode/100 != 2 { + return PublishResult{}, fmt.Errorf("GitHub returned HTTP %d: %s", resp.StatusCode, truncateForError(string(rawBody))) + } + + var parsed struct { + HTMLURL string `json:"html_url"` + CloneURL string `json:"clone_url"` + } + if err := json.Unmarshal(rawBody, &parsed); err != nil { + return PublishResult{}, fmt.Errorf("decode GitHub response: %w", err) + } + if parsed.HTMLURL == "" { + return PublishResult{}, fmt.Errorf("GitHub response missing html_url: %s", truncateForError(string(rawBody))) + } + return PublishResult{URL: parsed.HTMLURL}, nil +} + +// authenticatedLogin asks /user for the token's owner login so we can +// pick the right create-repo endpoint. Cached only within one call — +// publish is a single-shot operation. 
+func (g *GitHubClient) authenticatedLogin() (string, error) { + req, _ := http.NewRequest("GET", githubAPIBaseURL+"/user", nil) + req.Header.Set("Authorization", "Bearer "+g.Token) + req.Header.Set("Accept", "application/vnd.github+json") + resp, err := g.Client.Do(req) + if err != nil { + return "", fmt.Errorf("GitHub /user failed: %w", err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode == 401 { + return "", fmt.Errorf("GitHub token unauthorized — run `teamhero setup` to refresh") + } + if resp.StatusCode/100 != 2 { + return "", fmt.Errorf("GitHub /user returned HTTP %d", resp.StatusCode) + } + var u struct { + Login string `json:"login"` + } + if err := json.Unmarshal(body, &u); err != nil { + return "", fmt.Errorf("decode /user: %w", err) + } + return u.Login, nil +} + +// gitRunner abstracts the subprocess invocations for testability. The +// production runner shells out to `git`; tests substitute a recorder. +type gitRunner interface { + Run(dir string, args ...string) (stdout, stderr string, err error) +} + +type execGitRunner struct{} + +func (execGitRunner) Run(dir string, args ...string) (string, string, error) { + cmd := exec.Command("git", args...) + cmd.Dir = dir + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + err := cmd.Run() + return stdout.String(), stderr.String(), err +} + +// InitAndPushParams bundles what we need to populate a fresh GitHub repo +// from a local directory. Token is embedded in the remote URL so git +// pushes without an interactive credential prompt. +type InitAndPushParams struct { + Dir string + RemoteHTTPS string // https://github.com/owner/repo.git — auth injected before use + Token string + CommitMsg string +} + +// initAndPush stages the generated project as a fresh git repo and pushes +// to the freshly-created GitHub remote. Refuses to clobber an existing +// .git/ directory — that would mean the user already has their own git +// state in there. 
+func initAndPush(g gitRunner, p InitAndPushParams) error { + if p.Dir == "" { + return fmt.Errorf("output directory is empty") + } + gitDir := filepath.Join(p.Dir, ".git") + if st, err := os.Stat(gitDir); err == nil && st.IsDir() { + return fmt.Errorf("refusing to clobber existing .git/ in %s — push it yourself with `git push`", p.Dir) + } + steps := [][]string{ + {"init", "-b", "main"}, + {"add", "."}, + {"commit", "-m", p.CommitMsg}, + } + for _, args := range steps { + if _, stderr, err := g.Run(p.Dir, args...); err != nil { + return fmt.Errorf("git %s failed: %v\n%s", strings.Join(args, " "), err, stderr) + } + } + // Inject the token into the remote URL so push doesn't prompt. Strip + // any existing credentials first to avoid double-auth segments. + remote := injectToken(p.RemoteHTTPS, p.Token) + if _, stderr, err := g.Run(p.Dir, "remote", "add", "origin", remote); err != nil { + return fmt.Errorf("git remote add failed: %v\n%s", err, stderr) + } + if _, stderr, err := g.Run(p.Dir, "push", "-u", "origin", "main"); err != nil { + return fmt.Errorf("git push failed: %v\n%s", err, stderr) + } + return nil +} + +// injectToken rewrites `https://github.com/...` into +// `https://oauth2:@github.com/...` for one-shot authenticated pushes. +// The token is short-lived in memory and never persisted to git config. +func injectToken(rawURL, token string) string { + if token == "" { + return rawURL + } + const prefix = "https://" + if !strings.HasPrefix(rawURL, prefix) { + return rawURL + } + rest := strings.TrimPrefix(rawURL, prefix) + // If the URL already contains credentials (user:pass@host) strip them. + if i := strings.Index(rest, "@"); i != -1 { + rest = rest[i+1:] + } + return prefix + "oauth2:" + token + "@" + rest +} + +// loadGitHubToken reads the persisted token from the same credentials +// file the report wizard and doctor use. Returns "" when no token is set. 
+func loadGitHubToken() string { + creds := loadExistingCredentials(filepath.Join(configDir(), ".env")) + return strings.TrimSpace(creds["GITHUB_PERSONAL_ACCESS_TOKEN"]) +} + +// promptForPublish renders the post-generation publish form. Returns +// (opts, true) on confirm, (zero, false) on cancel/abort. +func promptForPublish(defaultRepo, defaultOwner string) (PublishOptions, bool) { + opts := PublishOptions{ + Owner: defaultOwner, + Repo: defaultRepo, + Private: true, + } + var publish bool + confirm := huh.NewForm(huh.NewGroup( + huh.NewConfirm(). + Title("Publish this interview project to GitHub?"). + Description("The generated repository can be pushed to a new private GitHub repo. You can always do this later by hand."). + Affirmative("Yes, publish"). + Negative("Skip"). + Value(&publish), + )).WithTheme(huh.ThemeCharm()) + if err := confirm.Run(); err != nil || !publish { + return PublishOptions{}, false + } + details := huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Repository name"). + Description("e.g. interview-senior-backend"). + Value(&opts.Repo). + Validate(validateRepoName), + huh.NewInput(). + Title("Owner"). + Description("Your GitHub username for a personal repo, or an organization login."). + Value(&opts.Owner), + huh.NewSelect[bool](). + Title("Visibility"). + Options( + huh.NewOption("Private (recommended)", true), + huh.NewOption("Public", false), + ). + Value(&opts.Private), + )).WithTheme(huh.ThemeCharm()) + if err := details.Run(); err != nil { + return PublishOptions{}, false + } + return opts, true +} + +// defaultRepoName builds the suggested repo name from the role slug. +// Falls back to "interview-project" when the slug is missing. +func defaultRepoName(roleSlug string) string { + s := strings.TrimSpace(roleSlug) + if s == "" { + return "interview-project" + } + return "interview-" + s +} + +// offerPublishToGitHub is the entry point called by the wizard after +// generation succeeds. 
Silent no-op when no token is configured — +// callers who haven't run `teamhero setup` for GitHub shouldn't see +// an offer they can't use. +// +// Test seam: tests substitute `publishFlow` to avoid network IO. The +// production implementation drives a confirm form, hits the GitHub API, +// and runs git locally to push the generated tree. +var offerPublishToGitHub = func(opts *BootstrapOptions, stdout, stderr io.Writer) { + token := loadGitHubToken() + if token == "" { + fmt.Fprintln(stderr, "(Tip: run `teamhero setup` to configure GitHub for one-click publishing.)") + return + } + repoName := defaultRepoName(opts.Role) + pubOpts, ok := promptForPublish(repoName, "") + if !ok { + return + } + url, err := publishToGitHub(opts, pubOpts, token, stderr) + if err != nil { + // publishToGitHub already wrote a contextual error. + _ = err + return + } + fmt.Fprintf(stdout, "✓ Published to %s\n", url) +} + +// publishToGitHub is the non-interactive publish path shared between +// the prompt-driven offer (offerPublishToGitHub) and the agent-driven +// --publish flag (autoPublishToGitHub). Returns the html_url of the +// created repo on success, empty string with a written stderr message +// on any failure. Splitting this out lets the auto-publish flag reuse +// the exact API + git plumbing without re-running the confirm prompt. +func publishToGitHub( + opts *BootstrapOptions, + pubOpts PublishOptions, + token string, + stderr io.Writer, +) (string, error) { + client := &GitHubClient{Token: token, Client: defaultHTTPClient} + result, err := client.CreateRepo(pubOpts) + if err != nil { + fmt.Fprintf(stderr, "GitHub repo creation failed: %v\n", err) + return "", err + } + // Build https URL for git push. GitHub returns html_url like + // "https://github.com/owner/repo"; the clone URL we want is the + // same path with ".git" appended. 
+ remote := strings.TrimRight(result.URL, "/") + ".git" + err = initAndPush(execGitRunner{}, InitAndPushParams{ + Dir: opts.OutputDir, + RemoteHTTPS: remote, + Token: token, + CommitMsg: "Initial commit: teamhero interview scaffold", + }) + if err != nil { + fmt.Fprintf( + stderr, + "git push failed: %v\nThe GitHub repository was created at %s but no commits were pushed. You can push manually from %s.\n", + err, result.URL, opts.OutputDir, + ) + return "", err + } + return result.URL, nil +} + +// init wires the production autoPublishToGitHub indirection (declared +// as a `var` in interview_bootstrap.go) to the real GitHub plumbing. +// Tests override the var to dodge network IO. The function returns the +// published URL or empty string on any failure (token absence, API +// rejection, push error) — failures already wrote contextual stderr. +func init() { + autoPublishToGitHub = func(opts *BootstrapOptions, stderr io.Writer) string { + token := loadGitHubToken() + if token == "" { + fmt.Fprintln(stderr, "auto-publish skipped: no GitHub token configured (run `teamhero setup` to fix).") + return "" + } + pubOpts := PublishOptions{ + Repo: defaultRepoName(opts.Role), + Private: true, + } + url, err := publishToGitHub(opts, pubOpts, token, stderr) + if err != nil { + return "" + } + return url + } +} diff --git a/tui/interview_bootstrap_publish_test.go b/tui/interview_bootstrap_publish_test.go new file mode 100644 index 0000000..e96b37f --- /dev/null +++ b/tui/interview_bootstrap_publish_test.go @@ -0,0 +1,268 @@ +package main + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + "os" + "strings" + "testing" +) + +func TestValidateRepoName(t *testing.T) { + cases := []struct { + name string + valid bool + }{ + {"interview-senior-backend", true}, + {"my.project_v2", true}, + {"a", true}, + {" trimmed-leading-space", true}, // trim() makes leading whitespace acceptable + {"", false}, + {".dotstart", false}, + {"slash/inside", false}, + {"has space", false}, + 
{strings.Repeat("a", 101), false}, // > 100 + } + for _, tc := range cases { + err := validateRepoName(tc.name) + if (err == nil) != tc.valid { + t.Errorf("validateRepoName(%q): valid=%v but got err=%v", tc.name, tc.valid, err) + } + } +} + +func TestDefaultRepoName(t *testing.T) { + if got := defaultRepoName("senior-backend"); got != "interview-senior-backend" { + t.Errorf("default: got %q", got) + } + if got := defaultRepoName(""); got != "interview-project" { + t.Errorf("fallback: got %q", got) + } +} + +func TestInjectToken_BasicURL(t *testing.T) { + got := injectToken("https://github.com/asa/foo.git", "ghp_abc") + want := "https://oauth2:ghp_abc@github.com/asa/foo.git" + if got != want { + t.Errorf("got %q, want %q", got, want) + } +} + +func TestInjectToken_StripsExistingCredentials(t *testing.T) { + got := injectToken("https://olduser:oldpass@github.com/asa/foo.git", "ghp_new") + if !strings.Contains(got, "oauth2:ghp_new@") { + t.Errorf("expected fresh token to replace old credentials: %s", got) + } + if strings.Contains(got, "oldpass") { + t.Errorf("old password leaked into rewritten URL: %s", got) + } +} + +func TestInjectToken_NoTokenReturnsInputUnchanged(t *testing.T) { + in := "https://github.com/asa/foo.git" + if got := injectToken(in, ""); got != in { + t.Errorf("empty token should not rewrite URL") + } +} + +// stagedDoer is a tiny http.RoundTripper-like recorder. Each Do() pops +// the first request expectation; if exhausted it errors. Keeps test +// assertions terse and explicit about call order. 
// stagedDoer replays a fixed queue of canned responders, one per Do
// call, in order. `called` counts the responders consumed so far.
type stagedDoer struct {
	t      *testing.T
	queue  []func(*http.Request) (*http.Response, error)
	called int
}

// Do hands req to the next queued responder and returns its result.
// Once the queue is exhausted it returns an error naming the unexpected
// request's method and path; the call counter is not advanced then.
func (s *stagedDoer) Do(req *http.Request) (*http.Response, error) {
	idx := s.called
	if idx >= len(s.queue) {
		return nil, fmt.Errorf("unexpected call %d to GitHub API: %s %s", idx, req.Method, req.URL.Path)
	}
	respond := s.queue[idx]
	resp, err := respond(req)
	s.called++
	return resp, err
}
+ var body map[string]any + _ = json.NewDecoder(r.Body).Decode(&body) + if body["name"] != "interview-x" || body["private"] != true { + t.Errorf("payload not as expected: %+v", body) + } + return &http.Response{ + StatusCode: 201, + Body: io.NopCloser(strings.NewReader(`{"html_url":"https://github.com/asa/interview-x","clone_url":"https://github.com/asa/interview-x.git"}`)), + }, nil + }, + }, + } + gh := &GitHubClient{Token: "ghp_test", Client: doer} + res, err := gh.CreateRepo(PublishOptions{Owner: "asa", Repo: "interview-x", Private: true}) + if err != nil { + t.Fatalf("create: %v", err) + } + if res.URL != "https://github.com/asa/interview-x" { + t.Errorf("URL: got %q", res.URL) + } +} + +func TestGitHubClient_CreateRepo_OrgOwner(t *testing.T) { + doer := &stagedDoer{ + t: t, + queue: []func(*http.Request) (*http.Response, error){ + func(r *http.Request) (*http.Response, error) { + return &http.Response{StatusCode: 200, Body: io.NopCloser(strings.NewReader(`{"login":"asa"}`))}, nil + }, + func(r *http.Request) (*http.Response, error) { + want := "/orgs/teamhero/repos" + if r.URL.Path != want { + t.Errorf("org repo create path: got %s want %s", r.URL.Path, want) + } + return &http.Response{ + StatusCode: 201, + Body: io.NopCloser(strings.NewReader(`{"html_url":"https://github.com/teamhero/repo"}`)), + }, nil + }, + }, + } + gh := &GitHubClient{Token: "ghp_test", Client: doer} + _, err := gh.CreateRepo(PublishOptions{Owner: "teamhero", Repo: "repo", Private: true}) + if err != nil { + t.Fatalf("create: %v", err) + } +} + +func TestGitHubClient_CreateRepo_SurfacesUnauthorized(t *testing.T) { + doer := &stagedDoer{ + t: t, + queue: []func(*http.Request) (*http.Response, error){ + func(r *http.Request) (*http.Response, error) { + return &http.Response{StatusCode: 401, Body: io.NopCloser(strings.NewReader(`{"message":"Bad credentials"}`))}, nil + }, + }, + } + gh := &GitHubClient{Token: "ghp_bad", Client: doer} + _, err := gh.CreateRepo(PublishOptions{Repo: 
"anything"}) + if err == nil { + t.Fatalf("expected error on 401") + } + if !strings.Contains(err.Error(), "unauthorized") && !strings.Contains(err.Error(), "401") { + t.Errorf("expected unauthorized error: %v", err) + } +} + +func TestGitHubClient_CreateRepo_ValidatesRepoName(t *testing.T) { + gh := &GitHubClient{Token: "ghp_test", Client: nil} + _, err := gh.CreateRepo(PublishOptions{Repo: ""}) + if err == nil { + t.Errorf("expected validation error on empty repo") + } +} + +// fakeGit records the commands that initAndPush issues so we can assert +// the order without actually running git. +type fakeGit struct { + calls [][]string + failOn string // substring match against args[0]; "" never fails + stderr string +} + +func (g *fakeGit) Run(_ string, args ...string) (string, string, error) { + g.calls = append(g.calls, args) + if g.failOn != "" && len(args) > 0 && strings.Contains(args[0], g.failOn) { + return "", g.stderr, fmt.Errorf("simulated failure on %v", args) + } + return "", "", nil +} + +func TestInitAndPush_HappyPath(t *testing.T) { + dir := t.TempDir() + g := &fakeGit{} + err := initAndPush(g, InitAndPushParams{ + Dir: dir, + RemoteHTTPS: "https://github.com/asa/foo.git", + Token: "ghp_test", + CommitMsg: "x", + }) + if err != nil { + t.Fatalf("push: %v", err) + } + // Expect: init, add, commit, remote add (with token injected), push + if len(g.calls) != 5 { + t.Fatalf("expected 5 git calls, got %d: %v", len(g.calls), g.calls) + } + if g.calls[0][0] != "init" || g.calls[1][0] != "add" || g.calls[2][0] != "commit" { + t.Errorf("git command order wrong: %v", g.calls) + } + if g.calls[3][0] != "remote" || g.calls[3][1] != "add" { + t.Errorf("remote add not at position 4: %v", g.calls[3]) + } + remoteURL := g.calls[3][3] + if !strings.Contains(remoteURL, "oauth2:ghp_test@github.com") { + t.Errorf("remote URL missing injected token: %s", remoteURL) + } + if g.calls[4][0] != "push" { + t.Errorf("push not at last position: %v", g.calls) + } +} + +func 
TestInitAndPush_RefusesExistingGitDir(t *testing.T) { + dir := t.TempDir() + if err := makeEmptyDir(dir + "/.git"); err != nil { + t.Fatalf("setup .git: %v", err) + } + g := &fakeGit{} + err := initAndPush(g, InitAndPushParams{ + Dir: dir, + RemoteHTTPS: "https://github.com/x/y.git", + Token: "t", + CommitMsg: "m", + }) + if err == nil { + t.Fatalf("expected refusal when .git/ already exists") + } + if !strings.Contains(err.Error(), ".git") { + t.Errorf("error should mention .git: %v", err) + } + if len(g.calls) > 0 { + t.Errorf("should not have invoked git when refusing: %v", g.calls) + } +} + +func TestInitAndPush_SurfacesPushFailure(t *testing.T) { + dir := t.TempDir() + g := &fakeGit{failOn: "push", stderr: "remote unreachable"} + err := initAndPush(g, InitAndPushParams{ + Dir: dir, + RemoteHTTPS: "https://github.com/x/y.git", + Token: "t", + CommitMsg: "m", + }) + if err == nil { + t.Fatalf("expected push failure") + } + if !strings.Contains(err.Error(), "remote unreachable") { + t.Errorf("error should include captured stderr: %v", err) + } +} + +func makeEmptyDir(p string) error { + return os.MkdirAll(p, 0o755) +} diff --git a/tui/interview_bootstrap_summary.go b/tui/interview_bootstrap_summary.go new file mode 100644 index 0000000..0d7a0ec --- /dev/null +++ b/tui/interview_bootstrap_summary.go @@ -0,0 +1,157 @@ +package main + +import ( + "strings" + + "github.com/charmbracelet/lipgloss" +) + +// renderInterviewBootstrapSummary builds the right-side configuration summary +// panel for the interview bootstrap wizard. It mirrors renderSummary() for +// the report wizard: a bordered box with one labelled row per field, the +// current field highlighted, and fields not yet reached shown as "—". 
+func renderInterviewBootstrapSummary( + m *bootstrapWizardModel, + currentStep interviewBootstrapStep, + highWater interviewBootstrapStep, + width int, +) string { + if width < 20 { + width = 20 + } + + headerStyle := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")) + labelStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + valueStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("15")) + dimStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("239")) + activeLabel := lipgloss.NewStyle().Foreground(lipgloss.Color("14")).Bold(true) + + boxStyle := lipgloss.NewStyle(). + Border(lipgloss.RoundedBorder()). + BorderForeground(lipgloss.Color("240")). + Padding(0, 1) + + innerWidth := width - boxStyle.GetHorizontalBorderSize() + + type entry struct { + label string + value string + step interviewBootstrapStep + } + + rubricValue := m.modeRubric + if m.modeRubric == "custom" && m.customPrompt != "" { + rubricValue = "custom (" + truncate(m.customPrompt, 24) + ")" + } + + jdValue := "—" + if m.jdProvided == "yes" { + if m.jdPath != "" { + jdValue = truncate(m.jdPath, 28) + } else { + jdValue = "(path pending)" + } + } else if m.jdProvided == "no" { + jdValue = "none" + } + + jdInfluenceValue := "—" + if m.jdProvided == "yes" { + if m.jdInfluencesProject == "yes" { + jdInfluenceValue = "shapes project" + } else if m.jdInfluencesProject == "no" { + jdInfluenceValue = "review only" + } + } + + // Entry order mirrors the wizard's step order: JD comes BEFORE + // Domain because the JD describes the business domain. When a JD + // is attached, the wizard skips Domain entirely — the row will + // render with a "—" placeholder. 
// fmtProjectMode maps a project-mode value to its summary-panel label.
// Both the wizard values ("brownfield", "greenfield-stack",
// "greenfield-open") and the short codes "A"/"B" are recognised; any
// other value is returned unchanged.
func fmtProjectMode(s string) string {
	if s == "brownfield" || s == "A" {
		return "Brownfield — AI scaffolds"
	}
	if s == "greenfield-stack" || s == "B" {
		return "Greenfield (your stack)"
	}
	if s == "greenfield-open" {
		return "Greenfield (candidate picks)"
	}
	return s
}

// fmtTimeBox appends " min" to a non-empty time-box value and leaves an
// empty value empty.
func fmtTimeBox(s string) string {
	if s != "" {
		return s + " min"
	}
	return ""
}

// fmtFeatureSource maps the feature-source value to its summary-panel
// label; unrecognised values are returned unchanged.
func fmtFeatureSource(s string) string {
	if s == "custom" {
		return "typed by proctor"
	}
	if s == "suggest" {
		return "AI-suggested"
	}
	return s
}
// truncate clips s to at most max runes, replacing the tail with "…"
// when clipping occurs. It counts runes rather than bytes, so multi-byte
// characters (accented Latin, CJK) are never split into invalid UTF-8.
//
// A non-positive max yields "". A string that already fits is returned
// unchanged. Otherwise the result is the first max-1 runes followed by
// the ellipsis, so it is exactly max runes long; for max == 1 that is
// the ellipsis alone.
func truncate(s string, max int) string {
	if max <= 0 {
		return ""
	}
	runes := []rune(s)
	if len(runes) <= max {
		return s
	}
	// max >= 1 here, so max-1 is a valid (possibly zero) prefix length.
	return string(runes[:max-1]) + "…"
}
The "Feature" + // description is the single source of truth for what the candidate + // builds — the old PromptSource + ProjectPrompt addendum pair were + // redundant and have been removed. + ibStepFeatureSource + ibStepFeature + // ibStepIdeaFetching is a transient spinner state shown while the + // idea-fetcher runs. Reached only when featureSource == "suggest". + ibStepIdeaFetching + // ibStepIdeaSelect presents the fetched ideas as a single-select. + // The chosen idea's title+blurb populates data.feature before the + // wizard advances to time-box. + ibStepIdeaSelect + ibStepTimeBox + // ibStepTimeBoxCustom is a sub-step shown only when the user chooses + // "Custom" on the time-box select. It runs the validated minutes-input + // form before the wizard advances to project-mode. + ibStepTimeBoxCustom + ibStepProjectMode + ibStepAnalysisMode + // ibStepRubricMode is now just default/custom — the "default+jd" + // value was retired in favour of the standalone JD branch. + ibStepRubricMode + ibStepCustomPrompt + ibStepOutputDir + ibStepConfirm + ibStepDone +) + +// interviewBootstrapTeaModel is a bubbletea Model that drives the bootstrap +// wizard. It wraps the existing bootstrapWizardModel data container and +// embeds a *huh.Form for the current screen, so the View() composition +// produces the same shell-header + summary-panel layout as the report +// wizard. +type interviewBootstrapTeaModel struct { + data bootstrapWizardModel + step interviewBootstrapStep + highWater interviewBootstrapStep + form *huh.Form + + // ideaFetcher is the strategy used when featureSource == "suggest". + // Production callers leave it nil; the tea model lazily constructs an + // openAIIdeaFetcher when first needed. Tests inject a stub via the + // constructor to avoid real HTTP traffic. 
+ ideaFetcher IdeaFetcher + spin spinner.Model + + width, height int +} + +func newInterviewBootstrapTeaModel(d BootstrapWizardDefaults) *interviewBootstrapTeaModel { + return newInterviewBootstrapTeaModelWithFetcher(d, nil) +} + +// newInterviewBootstrapTeaModelWithFetcher is the test seam — supply a +// stubIdeaFetcher in tests so the "suggest ideas" branch can be exercised +// without HTTP. Pass nil to use the production OpenAI fetcher (constructed +// lazily on first need). +func newInterviewBootstrapTeaModelWithFetcher(d BootstrapWizardDefaults, fetcher IdeaFetcher) *interviewBootstrapTeaModel { + sp := spinner.New() + sp.Spinner = spinner.Dot + sp.Style = lipgloss.NewStyle().Foreground(lipgloss.Color("14")) + m := &interviewBootstrapTeaModel{ + data: newBootstrapWizardModel(d), + step: ibStepRole, + highWater: ibStepRole, + ideaFetcher: fetcher, + spin: sp, + } + m.form = m.buildForm() + return m +} + +func (m *interviewBootstrapTeaModel) Init() tea.Cmd { + if m.form != nil { + return m.form.Init() + } + return nil +} + +// ideasFetchedMsg is dispatched by the async idea-fetch tea.Cmd. Carries +// either the populated ideas slice or a human-readable error string. +type ideasFetchedMsg struct { + ideas []ProjectIdea + err string +} + +func (m *interviewBootstrapTeaModel) Update(msg tea.Msg) (tea.Model, tea.Cmd) { + switch msg := msg.(type) { + case tea.WindowSizeMsg: + m.width = msg.Width + m.height = msg.Height + if m.form != nil { + m.form = m.form.WithWidth(m.formWidth()) + // Forward the resize message so internal viewport/scroll state + // inside the form's fields re-layouts immediately rather than + // waiting for the next keystroke. + form, cmd := m.form.Update(msg) + if f, ok := form.(*huh.Form); ok { + m.form = f + } + return m, cmd + } + return m, nil + + case tea.KeyMsg: + if msg.String() == "ctrl+c" { + m.data.aborted = true + return m, tea.Quit + } + + case ideasFetchedMsg: + // The async fetch completed. 
Land on the idea-select screen + // regardless of success/failure — buildForm() renders an error + // note when err != "" so the user can dismiss it and fall through. + m.data.ideas = msg.ideas + m.data.ideaFetchErr = msg.err + return m.advance() + + case spinner.TickMsg: + // Only progress the spinner while we're in a transient + // async-work state, to keep redraws cheap when forms are active. + if m.step == ibStepIdeaFetching { + var cmd tea.Cmd + m.spin, cmd = m.spin.Update(msg) + return m, cmd + } + } + + // While fetching, we have no form to drive — return early so the + // View() path renders the spinner without forwarding the message + // into a nil form. + if m.step == ibStepIdeaFetching { + return m, nil + } + + if m.form == nil { + return m, tea.Quit + } + + form, cmd := m.form.Update(msg) + if f, ok := form.(*huh.Form); ok { + m.form = f + } + + if m.form.State == huh.StateCompleted { + return m.advance() + } + if m.form.State == huh.StateAborted { + m.data.aborted = true + return m, tea.Quit + } + return m, cmd +} + +func (m *interviewBootstrapTeaModel) View() string { + if m.step == ibStepDone { + return "" + } + + w := m.width + if w <= 0 { + w = 80 + } + + title := renderShellHeader(w) + + formWidth := m.formWidth() + summaryWidth := w - formWidth - 2 + + leftFrame := lipgloss.NewStyle(). + Border(lipgloss.HiddenBorder()). 
+ Padding(0, 1) + leftInnerWidth := max(20, formWidth-leftFrame.GetHorizontalFrameSize()) + + leftPanel := "" + if m.step == ibStepIdeaFetching { + label := lipgloss.NewStyle().Foreground(lipgloss.Color("245")) + title := lipgloss.NewStyle().Bold(true).Foreground(lipgloss.Color("212")) + leftPanel = fmt.Sprintf( + " %s %s\n\n %s\n", + m.spin.View(), + title.Render("Fetching project ideas…"), + label.Render("OpenAI is drafting a handful of ideas scoped to your role profile."), + ) + } else if m.form != nil { + leftPanel = m.form.View() + } + leftPanel = leftFrame.Width(leftInnerWidth).Render(leftPanel) + + rightPanel := renderInterviewBootstrapSummary(&m.data, m.step, m.highWater, summaryWidth) + + left := lipgloss.NewStyle().Width(formWidth).Render(leftPanel) + right := lipgloss.NewStyle().Width(summaryWidth).Render(rightPanel) + + hintStyle := lipgloss.NewStyle().Foreground(lipgloss.Color("241")) + hints := hintStyle.Render("enter continue • ctrl+c quit") + + body := lipgloss.JoinHorizontal(lipgloss.Top, left, " ", right) + + return lipgloss.JoinVertical(lipgloss.Left, title, "", body, "", hints) +} + +func (m *interviewBootstrapTeaModel) formWidth() int { + w := m.width + if w <= 0 { + w = 80 + } + return w * 3 / 5 +} + +// advance moves to the next step, accounting for the rubric-mode branch, +// the time-box "custom" sub-step, and the suggest-ideas async fetch. +// Returns (model, tea.Quit) when the wizard reaches its final state +// (Confirm answered). +func (m *interviewBootstrapTeaModel) advance() (tea.Model, tea.Cmd) { + // Persist the selected idea into the feature description as the user + // leaves the idea-select step — that becomes the candidate-facing + // project description AND the AI prompt's feature focus. 
+ if m.step == ibStepIdeaSelect { + m.commitSelectedIdea() + } + + next := m.nextStep(m.step) + if next == ibStepDone { + m.step = ibStepDone + return m, tea.Quit + } + m.step = next + if next > m.highWater { + m.highWater = next + } + + // Entering the async fetch state: form is nil, spinner runs, and we + // dispatch the actual OpenAI call as a tea.Cmd. The corresponding + // ideasFetchedMsg lands back in Update() and re-enters advance(). + if next == ibStepIdeaFetching { + m.form = nil + return m, tea.Batch(m.spin.Tick, m.fetchIdeasCmd()) + } + + m.form = m.buildForm() + if m.form == nil { + return m, tea.Quit + } + return m, m.form.Init() +} + +// commitSelectedIdea copies the chosen idea's "title — blurb" into +// data.feature so the downstream generator and the candidate-facing +// role-config both see the AI-suggested project as the single feature +// description. No-op when no idea is selected (e.g. when the fetch +// failed and the user pressed enter on the error note). +func (m *interviewBootstrapTeaModel) commitSelectedIdea() { + if len(m.data.ideas) == 0 { + return + } + idx := m.data.ideaSelected + if idx < 0 || idx >= len(m.data.ideas) { + idx = 0 + } + chosen := m.data.ideas[idx] + m.data.feature = strings.TrimSpace(chosen.Title + "\n\n" + chosen.Blurb) +} + +// fetchIdeasCmd returns a tea.Cmd that runs the OpenAI idea-fetch on a +// goroutine (Bubble Tea schedules Cmd in goroutines) and emits an +// ideasFetchedMsg when it finishes. The fetcher is lazily constructed +// the first time it's needed in production; tests inject a stub via the +// constructor and skip this lazy path entirely. 
+func (m *interviewBootstrapTeaModel) fetchIdeasCmd() tea.Cmd { + return func() tea.Msg { + fetcher := m.ideaFetcher + if fetcher == nil { + f, err := newOpenAIIdeaFetcher() + if err != nil { + return ideasFetchedMsg{err: err.Error()} + } + fetcher = f + } + tbMin := 0 + if n, err := strconv.Atoi(strings.TrimSpace(m.data.timeBox)); err == nil { + tbMin = n + } + profile := IdeaProfile{ + Role: m.data.role, + RoleTitle: m.data.roleTitle, + Stack: m.data.stack, + Domain: m.data.domain, + Feature: m.data.feature, + TimeBoxMinutes: tbMin, + ProjectMode: m.data.modeProject, + } + ideas, err := fetcher.Fetch(profile) + if err != nil { + return ideasFetchedMsg{err: err.Error()} + } + return ideasFetchedMsg{ideas: ideas} + } +} + + +func (m *interviewBootstrapTeaModel) nextStep(cur interviewBootstrapStep) interviewBootstrapStep { + switch cur { + case ibStepRole: + return ibStepRoleTitle + case ibStepRoleTitle: + return ibStepStack + case ibStepStack: + return ibStepJDProvided + case ibStepJDProvided: + if m.data.jdProvided == "yes" { + return ibStepJDPath + } + // No JD → ask Domain explicitly. With a JD, skip Domain; + // the AI infers it from the JD context block. + return ibStepDomain + case ibStepJDPath: + return ibStepJDInfluencesProject + case ibStepJDInfluencesProject: + return ibStepFeatureSource + case ibStepDomain: + return ibStepFeatureSource + case ibStepFeatureSource: + if m.data.featureSource == "suggest" { + return ibStepIdeaFetching + } + return ibStepFeature + case ibStepFeature: + return ibStepTimeBox + case ibStepIdeaFetching: + return ibStepIdeaSelect + case ibStepIdeaSelect: + return ibStepTimeBox + case ibStepTimeBox: + // Branch into the custom sub-step only when the user picked + // "Custom" on the select. Otherwise skip straight to project mode. 
+ if m.data.timeBox == "custom" { + return ibStepTimeBoxCustom + } + return ibStepProjectMode + case ibStepTimeBoxCustom: + return ibStepProjectMode + case ibStepProjectMode: + return ibStepAnalysisMode + case ibStepAnalysisMode: + return ibStepRubricMode + case ibStepRubricMode: + if m.data.modeRubric == "custom" { + return ibStepCustomPrompt + } + return ibStepOutputDir + case ibStepCustomPrompt: + return ibStepOutputDir + case ibStepOutputDir: + return ibStepConfirm + case ibStepConfirm: + return ibStepDone + default: + return ibStepDone + } +} + +// buildForm constructs the huh.Form for the current step. Each form binds +// to a field on m.data so the data container stays the single source of +// truth for the validated final result. +func (m *interviewBootstrapTeaModel) buildForm() *huh.Form { + d := &m.data + switch m.step { + case ibStepRole: + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Role slug (URL-safe identifier)"). + Description("Used in paths and role-config.json; not shown to the candidate."). + Value(&d.role). + Validate(validateRoleSlug), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepRoleTitle: + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Role title (human-readable, optional)"). + Description("Appears in the candidate-facing README/BRIEF header."). + Value(&d.roleTitle), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepStack: + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Primary tech stack"). + Description("Sets the language the AI uses for source files and tests."). + Value(&d.stack). + Validate(nonEmpty("stack")), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepDomain: + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Business domain"). + Description("Shapes the GLOSSARY vocabulary and naming in generated code."). + Value(&d.domain). 
+ Validate(nonEmpty("domain")), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepFeatureSource: + // The single either/or step that replaces the old PromptSource + + // ProjectPrompt redundancy. The feature description IS the project + // prompt; the proctor either writes it themselves or picks from a + // few AI-drafted ideas. Description kept under one line at the + // default formWidth to dodge huh.ThemeCharm's left-bar break on + // wrapped Description lines. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("How should we describe the project?"). + Description("Drives what the candidate is asked to build."). + Options( + huh.NewOption("I'll write the description myself", "custom"), + huh.NewOption("Suggest project ideas for me", "suggest"), + ). + Value(&d.featureSource), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepFeature: + return huh.NewForm(huh.NewGroup( + huh.NewText(). + Title("Feature description"). + Description("Becomes the project's central focus and shapes every generated file."). + Value(&d.feature). + Validate(nonEmpty("feature")), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepTimeBox: + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("Time-box (minutes)"). + Description("Bounds the acceptance criteria; appears in the candidate's brief."). + Options( + huh.NewOption("60 minutes (recommended)", "60"), + huh.NewOption("90 minutes", "90"), + huh.NewOption("120 minutes", "120"), + huh.NewOption("Custom", "custom"), + ). + Value(&d.timeBox), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepTimeBoxCustom: + // The select binds to d.timeBox; arriving here means it's the + // literal "custom". Replace it with the empty string so the input + // field starts blank rather than showing "custom" as the value. 
+ if d.timeBox == "custom" { + d.timeBox = "" + } + return m.buildTimeBoxCustomForm() + + case ibStepProjectMode: + // Three options drive what the AI scaffolds: + // brownfield — generate a starter codebase in your stack + // greenfield-stack — written brief; candidate codes from scratch using your stack + // greenfield-open — written brief; candidate also picks the tech stack + // Internally the first two map to projectMode "A"/"B" so the + // downstream validator and OpenAI client only need to know about + // scaffolding-vs-brief. The third is "B" + stackByCandidate=true, + // resolved in bootstrapWizardOptionsFromModel. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("Project type"). + Description("Picks whether the AI scaffolds starter code or only a written brief."). + Options( + huh.NewOption("Brownfield — AI scaffolds a starter codebase", "brownfield"), + huh.NewOption("Greenfield (use the stack above) — brief only", "greenfield-stack"), + huh.NewOption("Greenfield (candidate picks stack) — brief only", "greenfield-open"), + ). + Value(&d.modeProject), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepAnalysisMode: + // Description kept short enough to fit on one line at the + // default formWidth (3/5 of an 80-col terminal = 48 chars). + // huh.ThemeCharm's left vertical bar fails to extend onto + // wrapped Description lines, producing a visual break, so we + // pre-shorten static descriptions to dodge huh's wrap path. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("Analysis mode"). + Description("AI-assisted drafts post-interview observations; human-only doesn't."). + Options( + huh.NewOption("AI-assisted (recommended)", "ai-assisted"), + huh.NewOption("Human-only", "human-only"), + ). 
+ Value(&d.modeAnalysis), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepRubricMode: + // JD attachment moved to its own earlier step so the rubric + // question is now a clean default-vs-custom. The default option + // label and description signal what the rubric actually + // observes — traditional engineering discipline (domain-driven + // design, deep modules, verification, etc.) surfaced through + // how the candidate works with AI — rather than the opaque + // "9 built-in dimensions" phrasing it used to carry. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("How should AI share observations?"). + Description("Default rubric looks for sound engineering practices applied to AI-assisted work."). + Options( + huh.NewOption("Default — DDD, deep modules, verification (recommended)", "default"), + huh.NewOption("Custom — write your own prompt", "custom"), + ). + Value(&d.modeRubric), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepCustomPrompt: + return huh.NewForm(huh.NewGroup( + huh.NewText(). + Title("Custom rubric prompt"). + Description("Free-form prompt the AI uses in place of the built-in dimensions."). + Value(&d.customPrompt). + Validate(nonEmpty("custom prompt")), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepJDProvided: + // Standalone JD branch. Defaults to "no" so a hiring manager + // who pressed enter through everything ends up without a JD + // rather than with a broken path. Description kept short to + // dodge huh.ThemeCharm's left-bar wrap bug. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("Will you provide a job description?"). + Description("Optional. Feeds the AI observer when set."). + Options( + huh.NewOption("No", "no"), + huh.NewOption("Yes — I have a JD file", "yes"), + ). 
+ Value(&d.jdProvided), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepJDPath: + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Path to job description file"). + Description("Markdown or text JD; AI reads it as evaluation context."). + Value(&d.jdPath). + Validate(func(s string) error { + if err := validateJDPath(s); err != nil { + return err + } + if strings.TrimSpace(s) == "" { + return fmt.Errorf("JD path is required (or pick No on the previous step)") + } + return nil + }), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepJDInfluencesProject: + // When "yes", the project-generation prompt reads the JD and + // tailors the project's complexity and domain to match — e.g., + // a junior healthtech JD nudges the AI toward an EHR-flavoured + // feature. When "no", the JD is still used by the post-interview + // observer; it just doesn't shape what the candidate sees. + return huh.NewForm(huh.NewGroup( + huh.NewSelect[string](). + Title("Should the JD influence the project?"). + Description("Tailors the generated project to the JD's seniority and domain."). + Options( + huh.NewOption("No — JD informs review only", "no"), + huh.NewOption("Yes — JD shapes what the candidate sees", "yes"), + ). + Value(&d.jdInfluencesProject), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepOutputDir: + if d.outputDir == "./interviews/role" && d.role != "" { + d.outputDir = "./interviews/" + d.role + } + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Output directory"). + Description("Where the generated repo (and kit overlay) will be written."). + Value(&d.outputDir). + Validate(nonEmpty("output directory")), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepIdeaFetching: + // No huh form — the spinner is rendered in View(). The fetch was + // kicked off by advance() as a tea.Cmd; we just wait for the + // resulting ideasFetchedMsg. 
+ return nil + + case ibStepIdeaSelect: + if d.ideaFetchErr != "" || len(d.ideas) == 0 { + // Surface the fetch error so the user can press enter to + // continue with an empty feature description (the next screen + // is the validator-protected time-box, so the wizard will + // still reject an empty feature at confirm time — better than + // hanging here). + return huh.NewForm(huh.NewGroup( + huh.NewNote(). + Title("Idea generation failed"). + Description(d.ideaFetchErr + "\n\nPress enter to continue; you'll need to back up and type a feature description manually."), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + } + opts := make([]huh.Option[int], 0, len(d.ideas)) + for i, idea := range d.ideas { + label := fmt.Sprintf("%s — %s", idea.Title, truncate(idea.Blurb, 60)) + opts = append(opts, huh.NewOption(label, i)) + } + if d.ideaSelected < 0 { + d.ideaSelected = 0 + } + return huh.NewForm(huh.NewGroup( + huh.NewSelect[int](). + Title("Pick a project idea"). + Description("The full title + blurb becomes the feature description."). + Options(opts...). + Value(&d.ideaSelected), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + + case ibStepConfirm: + // Default to affirmative — after 13 screens of input the user almost + // always wants to commit. huh.Confirm uses the initial value of the + // bound variable to pick which button has focus, so without this + // pre-set the user lands on "Cancel" and a stray Enter cancels the + // whole wizard. Reported by a user who completed all steps, hit + // Enter on confirm, and got "Wizard cancelled at confirm" instead + // of a generated project. + d.confirmed = true + // No description on this confirm form — the right-hand summary + // panel already lists every collected field. Repeating them in + // the form's Description was reported as visual clutter that + // hid the only thing the user has to act on (Yes / Cancel). + return huh.NewForm(huh.NewGroup( + huh.NewConfirm(). 
+ Title("Ready to bootstrap?"). + Affirmative("Yes, generate the role"). + Negative("Cancel"). + Value(&d.confirmed), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) + } + return nil +} + +func (m *interviewBootstrapTeaModel) buildTimeBoxCustomForm() *huh.Form { + return huh.NewForm(huh.NewGroup( + huh.NewInput(). + Title("Custom time-box (30-240 minutes)"). + Value(&m.data.timeBox). + Validate(validateTimeBox), + )).WithTheme(huh.ThemeCharm()).WithWidth(m.formWidth()) +} + +// runBootstrapTeaWizard launches the bubbletea program for the bootstrap +// wizard. Production callers get stdin/stdout/alt-screen; tests replace +// runBootstrapTeaProgram with a stub that drives the model in-process. +func runBootstrapTeaWizard(d BootstrapWizardDefaults) (*BootstrapWizardResult, error) { + model := newInterviewBootstrapTeaModel(d) + // No WithInput/WithOutput overrides — bubbletea uses the inherited + // stdin/stdout. Passing nil here previously left the program with no + // I/O at all and the wizard hung the moment the user pressed a key. + p := tea.NewProgram(model, tea.WithAltScreen()) + return runBootstrapTeaProgram(p, model) +} + +// runBootstrapTeaProgram is the indirection seam for tests. The default +// implementation runs the real bubbletea event loop; smoke tests in +// interview_bootstrap_wizard_test.go replace it with a driver that walks +// the model through advance() transitions in-process. 
+var runBootstrapTeaProgram = func(p *tea.Program, _ *interviewBootstrapTeaModel) (*BootstrapWizardResult, error) { + finalModel, err := p.Run() + if err != nil { + return nil, err + } + tm, ok := finalModel.(*interviewBootstrapTeaModel) + if !ok { + return nil, fmt.Errorf( + "bootstrap tea program returned unexpected model type %T", finalModel, + ) + } + return &BootstrapWizardResult{ + Options: bootstrapWizardOptionsFromModel(tm.data), + Confirmed: tm.data.confirmed, + Aborted: tm.data.aborted, + }, nil +} diff --git a/tui/interview_bootstrap_tea_test.go b/tui/interview_bootstrap_tea_test.go new file mode 100644 index 0000000..5fd7aeb --- /dev/null +++ b/tui/interview_bootstrap_tea_test.go @@ -0,0 +1,413 @@ +package main + +import ( + "io" + "os" + "path/filepath" + "strings" + "testing" + "time" + + tea "github.com/charmbracelet/bubbletea" + "github.com/charmbracelet/x/exp/teatest" +) + +// --------------------------------------------------------------------------- +// Golden-style layout tests for the interview bootstrap wizard. +// +// These tests drive interviewBootstrapTeaModel through teatest so the huh +// form gets a real bubbletea runtime (initial WindowSizeMsg, Init() Cmds, +// etc.), then capture the rendered output and assert that the layout +// invariants are present. This catches the regression we shipped where the +// wizard rendered as a bare huh.Form with no shell header or summary panel. +// +// Per-field cursor/animation state is suppressed by stripping ANSI before +// assertions. We do NOT assert exact byte-for-byte snapshots because huh's +// internal rendering depends on terminal capability detection; the +// invariants we care about are: +// 1. The shared "//// TEAM HERO" shell header is present. +// 2. The right-side "Interview Bootstrap" summary panel renders with +// the correct label for the active step and bracketed values for +// every step already reached. +// 3. The navigation hints footer is present. 
+// +// Together those three invariants tell us the wizard now wears the same +// frame as the report wizard. +// --------------------------------------------------------------------------- + +const ( + testTermWidth = 100 + testTermHeight = 32 +) + +// driveWizardOutput drives interviewBootstrapTeaModel through teatest until +// the layout marker appears, then quits and reliably tears down the +// program. Returns both the raw rendered output (including ANSI styling) +// and a stripped plaintext version for assertions. +func driveWizardOutput(t *testing.T, m *interviewBootstrapTeaModel) (raw, stripped string) { + t.Helper() + + tm := teatest.NewTestModel(t, m, teatest.WithInitialTermSize(testTermWidth, testTermHeight)) + tm.Send(tea.WindowSizeMsg{Width: testTermWidth, Height: testTermHeight}) + + // Always tear down the program before returning, regardless of how we + // exit this function. tm.Quit() shuts the program down via its + // internal channel — more reliable than sending Ctrl+C, which races + // with huh's cursor blink goroutines and leaves them parked on + // channel reads. 
+ defer func() { + _ = tm.Quit() + tm.WaitFinished(t, teatest.WithFinalTimeout(3*time.Second)) + }() + + var buf strings.Builder + r := tm.Output() + deadline := time.Now().Add(3 * time.Second) + for time.Now().Before(deadline) { + chunk := make([]byte, 8192) + n, _ := r.Read(chunk) + if n > 0 { + buf.Write(chunk[:n]) + s := stripANSI(buf.String()) + if strings.Contains(s, "//// TEAM HERO") && strings.Contains(s, "Interview Bootstrap") { + return buf.String(), s + } + } + time.Sleep(20 * time.Millisecond) + } + s := stripANSI(buf.String()) + t.Logf("captured output (no settled frame):\n%s", s) + return buf.String(), s +} + +var _ = io.ReadAll // reserved for future variants that need full readback + +func TestInterviewBootstrap_RoleStep_HasSharedLayout(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + _, view := driveWizardOutput(t, m) + + mustContain(t, view, "//// TEAM HERO ", "shell header prefix") + mustContain(t, view, "Interview Bootstrap", "summary panel header") + mustContain(t, view, "Role slug:", "active step label in summary") + mustContain(t, view, "Role slug (URL-safe identifier)", "form title") + mustContain(t, view, "ctrl+c quit", "navigation hints footer") +} + +func TestInterviewBootstrap_SummaryShowsValuesAsStepsAdvance(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{ + Role: "senior-backend", + Stack: "Go", + Domain: "Payments", + Feature: "build a ledger entry-point", + TimeBox: "90", + ModeProject: "A", + ModeAnalysis: "ai-assisted", + ModeRubric: "default", + }) + // Jump to the output-dir step so the summary shows everything before it. 
+ m.step = ibStepOutputDir + m.highWater = ibStepOutputDir + m.form = m.buildForm() + + _, view := driveWizardOutput(t, m) + + mustContain(t, view, "Role slug: senior-backend", "filled role slug") + mustContain(t, view, "Stack: Go", "filled stack") + mustContain(t, view, "Domain: Payments", "filled domain") + mustContain(t, view, "Time-box: 90 min", "filled time-box") + // Long values wrap across lines in the narrow summary column; assert + // only the unwrappable prefix. + mustContain(t, view, "Project type: Brownfield", "filled project type (prefix)") + mustContain(t, view, "Rubric: default", "filled rubric") + // Active step's form title should appear in the left panel. + mustContain(t, view, "Output directory", "current step form title") +} + +func TestInterviewBootstrap_View_AdvancesPastRubricStep_ShowsCustomTruncatedInSummary(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.role = "x" + m.data.stack = "Go" + m.data.domain = "Payments" + m.data.modeRubric = "custom" + m.data.customPrompt = "Score primarily on architectural decisions and verification discipline" + m.step = ibStepOutputDir + m.highWater = ibStepOutputDir + m.form = m.buildForm() + + _, view := driveWizardOutput(t, m) + mustContain(t, view, "Rubric: custom (", "rubric label shows custom prefix") + mustContain(t, view, "…", "long custom prompt is truncated with ellipsis") +} + +// --------------------------------------------------------------------------- +// Branching transitions for rubric mode are pure state-machine tests; they +// don't need the bubbletea runtime, so we test them directly for speed. +// --------------------------------------------------------------------------- + +func TestInterviewBootstrap_JDProvidedYes_RoutesToJDPath(t *testing.T) { + // JD attachment is now its own branch sitting between Stack and + // Domain. jdProvided=yes routes through the path + influence pair + // AND skips Domain entirely (the JD describes the domain). 
+ m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.jdProvided = "yes" + m.step = ibStepJDProvided + if next := m.nextStep(m.step); next != ibStepJDPath { + t.Fatalf("jdProvided=yes should advance to JD-path, got %v", next) + } +} + +func TestInterviewBootstrap_JDProvidedNo_RoutesToDomain(t *testing.T) { + // jdProvided=no flows into Domain so the proctor can type the + // business context explicitly. With a JD, Domain is skipped. + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.jdProvided = "no" + m.step = ibStepJDProvided + if next := m.nextStep(m.step); next != ibStepDomain { + t.Fatalf("jdProvided=no should advance to Domain, got %v", next) + } +} + +func TestInterviewBootstrap_JDPath_RoutesToInfluencesProject(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.step = ibStepJDPath + if next := m.nextStep(m.step); next != ibStepJDInfluencesProject { + t.Fatalf("jd-path should advance to influences-project, got %v", next) + } +} + +func TestInterviewBootstrap_JDInfluencesProject_SkipsDomainGoesToFeatureSource(t *testing.T) { + // With a JD attached, the wizard skips the Domain question. + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.step = ibStepJDInfluencesProject + if next := m.nextStep(m.step); next != ibStepFeatureSource { + t.Fatalf("influences-project should skip Domain and advance to feature-source, got %v", next) + } +} + +func TestInterviewBootstrap_Domain_RoutesToFeatureSource(t *testing.T) { + // When the JD branch was declined, Domain rejoins the main flow + // at feature-source. 
+ m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.step = ibStepDomain + if next := m.nextStep(m.step); next != ibStepFeatureSource { + t.Fatalf("domain (no-JD branch) should advance to feature-source, got %v", next) + } +} + +func TestInterviewBootstrap_RubricCustomBranch_RoutesToCustomPrompt(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.modeRubric = "custom" + m.step = ibStepRubricMode + if next := m.nextStep(m.step); next != ibStepCustomPrompt { + t.Fatalf("rubric=custom should advance to custom prompt step, got %v", next) + } +} + +func TestInterviewBootstrap_RubricDefaultBranch_SkipsConditionalSteps(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.modeRubric = "default" + m.step = ibStepRubricMode + if next := m.nextStep(m.step); next != ibStepOutputDir { + t.Fatalf("rubric=default should jump to output dir, got %v", next) + } +} + +// TestInterviewBootstrap_Screenshot_WritesGolden renders three +// representative wizard states and writes both the raw ANSI capture and a +// plaintext-stripped version to tui/testdata/interview_bootstrap/. When +// TEAMHERO_UPDATE_SCREENSHOTS=1 is set, the files are overwritten; +// otherwise the test compares against the existing golden files. This +// gives us a human-reviewable artifact under version control so layout +// regressions show up as diffs in PRs. 
+func TestInterviewBootstrap_Screenshot_WritesGolden(t *testing.T) { + cases := []struct { + name string + seed func() *interviewBootstrapTeaModel + }{ + { + name: "01-role-step-empty", + seed: func() *interviewBootstrapTeaModel { + return newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + }, + }, + { + name: "02-output-dir-step-filled", + seed: func() *interviewBootstrapTeaModel { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{ + Role: "senior-backend", RoleTitle: "Senior Backend Engineer", + Stack: "Go", Domain: "Payments", + Feature: "build a ledger entry-point", TimeBox: "60", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default", + }) + m.step = ibStepOutputDir + m.highWater = ibStepOutputDir + m.form = m.buildForm() + return m + }, + }, + { + name: "03-confirm-step", + seed: func() *interviewBootstrapTeaModel { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{ + Role: "senior-backend", Stack: "Go", Domain: "Payments", + Feature: "ledger entry-point", TimeBox: "60", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default", + OutputDir: "./interviews/senior-backend", + }) + m.step = ibStepConfirm + m.highWater = ibStepConfirm + m.form = m.buildForm() + return m + }, + }, + } + + outDir := filepath.Join("testdata", "interview_bootstrap") + if err := os.MkdirAll(outDir, 0o755); err != nil { + t.Fatalf("mkdir testdata: %v", err) + } + update := os.Getenv("TEAMHERO_UPDATE_SCREENSHOTS") == "1" + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + raw, stripped := driveWizardOutput(t, tc.seed()) + rawPath := filepath.Join(outDir, tc.name+".ansi.txt") + strippedPath := filepath.Join(outDir, tc.name+".plain.txt") + + if update { + if err := os.WriteFile(rawPath, []byte(raw), 0o644); err != nil { + t.Fatalf("write raw: %v", err) + } + if err := os.WriteFile(strippedPath, []byte(stripped), 0o644); err != nil { + t.Fatalf("write stripped: %v", err) + } + return + } + + want, err := 
os.ReadFile(strippedPath) + if err != nil { + t.Fatalf("read golden (run with TEAMHERO_UPDATE_SCREENSHOTS=1 to create): %v", err) + } + // Compare stripped plaintext only — the raw file includes + // cursor-blink and cursor-position sequences that vary between + // runs and aren't load-bearing for layout regressions. + if got := normalizeForGolden(stripped); got != normalizeForGolden(string(want)) { + t.Errorf("layout regression in %s. Got:\n%s\n\nWant:\n%s", tc.name, stripped, string(want)) + } + }) + } +} + +// TestInterviewBootstrap_ConfirmStep_OmitsVerboseSummary pins the fix for +// a reported clutter bug: the confirm step used to repeat every collected +// field (role=… · stack=… · domain=… · time-box=…) as the huh.Confirm +// form's Description, which duplicated the right-hand summary panel and +// hid the only choice the user has to make. The summary panel is the +// source of truth — the left-hand form should only show the prompt and +// the two buttons. Any regression that pipes summarizeBootstrapModel back +// into the form will reintroduce the "role=" / "stack=" markers and fail +// this assertion. +func TestInterviewBootstrap_ConfirmStep_OmitsVerboseSummary(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{ + Role: "senior-backend", Stack: "Go", Domain: "Payments", + Feature: "ledger entry-point", TimeBox: "60", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default", + OutputDir: "./interviews/senior-backend", + }) + m.step = ibStepConfirm + m.highWater = ibStepConfirm + m.form = m.buildForm() + _, stripped := driveWizardOutput(t, m) + + // The verbose summary's signature tokens — if any of these leak back + // into the confirm-step view, the description was reattached. 
+ for _, banned := range []string{"role=", "stack=", "time-box=", "out="} { + if strings.Contains(stripped, banned) { + t.Errorf("confirm step should NOT contain summary token %q (it duplicates the side panel); got:\n%s", banned, stripped) + } + } + // Sanity: the title and the affirmative button MUST still be visible + // — without these the user has nothing to act on. + mustContain(t, stripped, "Ready to bootstrap?", "confirm title") + mustContain(t, stripped, "Yes, generate the role", "affirmative button") +} + +// TestInterviewBootstrap_CommitSelectedIdea_WritesToFeature pins the +// either/or contract: when the proctor picks an AI-suggested idea, the +// chosen title+blurb MUST land in data.feature (the single source of +// truth for what the candidate builds). Earlier code wrote to +// data.projectPrompt and left feature blank, which left the OpenAI +// generator with an empty "Feature focus:" field and the candidate-facing +// role-config without a description. Both fields are gone now — this +// test exists so a future refactor can't reintroduce the split. +func TestInterviewBootstrap_CommitSelectedIdea_WritesToFeature(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.ideas = []ProjectIdea{ + {Title: "Refund retries", Blurb: "Idempotent retries with exponential backoff."}, + {Title: "Audit log", Blurb: "Append-only ledger of refund state transitions."}, + } + m.data.ideaSelected = 1 + m.commitSelectedIdea() + if !strings.Contains(m.data.feature, "Audit log") { + t.Errorf("commitSelectedIdea must populate data.feature with the chosen idea; got %q", m.data.feature) + } + if !strings.Contains(m.data.feature, "Append-only ledger") { + t.Errorf("commitSelectedIdea must include the blurb; got %q", m.data.feature) + } +} + +// TestInterviewBootstrap_NextStep_StackRoutesToJDProvided ensures the +// JD-provided gate sits between Stack and Domain (Domain is then asked +// only when no JD was attached). 
+func TestInterviewBootstrap_NextStep_StackRoutesToJDProvided(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.step = ibStepStack + if next := m.nextStep(m.step); next != ibStepJDProvided { + t.Fatalf("stack should advance to jd-provided, got %v", next) + } +} + +// TestInterviewBootstrap_NextStep_FeatureSourceSuggestRoutesToFetch +// pins the suggest branch — picking "Suggest ideas for me" triggers +// the spinner state, then idea-select, then time-box (rejoining the +// main flow). +func TestInterviewBootstrap_NextStep_FeatureSourceSuggestRoutesToFetch(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.data.featureSource = "suggest" + m.step = ibStepFeatureSource + if next := m.nextStep(m.step); next != ibStepIdeaFetching { + t.Fatalf("featureSource=suggest should advance to idea-fetching, got %v", next) + } +} + +// TestInterviewBootstrap_NextStep_OutputDirRoutesToConfirm ensures the +// late-stage PromptSource/ProjectPrompt redundancy is gone: output-dir +// now flows straight into the confirm screen. +func TestInterviewBootstrap_NextStep_OutputDirRoutesToConfirm(t *testing.T) { + m := newInterviewBootstrapTeaModel(BootstrapWizardDefaults{}) + m.step = ibStepOutputDir + if next := m.nextStep(m.step); next != ibStepConfirm { + t.Fatalf("output-dir should advance to confirm, got %v", next) + } +} + +// normalizeForGolden collapses trailing whitespace on each line so minor +// width changes don't churn the golden file. 
+func normalizeForGolden(s string) string { + lines := strings.Split(s, "\n") + for i, l := range lines { + lines[i] = strings.TrimRight(l, " \t\r") + } + return strings.Join(lines, "\n") +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +func mustContain(t *testing.T, haystack, needle, what string) { + t.Helper() + if !strings.Contains(haystack, needle) { + t.Errorf("expected view to contain %s (%q); got:\n%s", what, needle, haystack) + } +} diff --git a/tui/interview_bootstrap_test.go b/tui/interview_bootstrap_test.go new file mode 100644 index 0000000..2be1e93 --- /dev/null +++ b/tui/interview_bootstrap_test.go @@ -0,0 +1,1027 @@ +package main + +import ( + "bytes" + "encoding/json" + "io" + "os" + "strings" + "testing" +) + +func TestParseBootstrapFlags_AllFlags(t *testing.T) { + args := []string{ + "--headless", + "--no-confirm", + "--foreground", + "--role", "senior-backend", + "--role-title", "Senior Backend Engineer", + "--stack", "TypeScript", + "--domain", "Payments", + "--feature", "Add idempotency keys", + "--time-box", "90", + "--mode-project", "A", + "--mode-analysis", "ai-assisted", + "--mode-rubric", "default", + "--output-dir", "./roles/senior-backend", + } + opts, parseErr := ParseBootstrapFlags(args) + if parseErr != "" { + t.Fatalf("unexpected parse error: %s", parseErr) + } + if opts.Role != "senior-backend" { + t.Errorf("role: got %q", opts.Role) + } + if !opts.Headless || !opts.NoConfirm || !opts.Foreground { + t.Errorf("boolean flags not parsed: %+v", opts) + } + if opts.TimeBox != "90" { + t.Errorf("time-box: got %q", opts.TimeBox) + } +} + +func TestParseBootstrapFlags_MissingValueErrors(t *testing.T) { + _, parseErr := ParseBootstrapFlags([]string{"--role"}) + if parseErr == "" { + t.Fatal("expected parse error on dangling --role") + } +} + +func TestParseBootstrapFlags_EmitJSONFlag(t *testing.T) { + // --json 
switches the bootstrap into agent-payload mode: a single + // JSON object goes to stdout describing the run, and human-readable + // chatter (Project: link, publish prompt) is routed to stderr. + // The flag exists so an orchestrating agent (HR notifier, scheduler, + // etc.) can `read` stdout and act on the payload without parsing + // our human formatting. + opts, parseErr := ParseBootstrapFlags([]string{"--json"}) + if parseErr != "" { + t.Fatalf("unexpected parse error: %s", parseErr) + } + if !opts.EmitJSON { + t.Error("--json should set EmitJSON=true") + } +} + +func TestParseBootstrapFlags_PublishFlag(t *testing.T) { + // --publish is orthogonal to --json. When set, the dispatcher + // auto-publishes to GitHub on success (no prompt), so a downstream + // agent caller can pass --publish --json and get a payload with a + // real github.url to put in an HR email. + opts, parseErr := ParseBootstrapFlags([]string{"--publish"}) + if parseErr != "" { + t.Fatalf("unexpected parse error: %s", parseErr) + } + if !opts.Publish { + t.Error("--publish should set Publish=true") + } +} + +func TestParseBootstrapFlags_DebugFlag(t *testing.T) { + // --debug toggles verbose run-context logging in both the dispatcher + // (Go side) and the bun subprocess. Off by default. + opts, parseErr := ParseBootstrapFlags([]string{"--debug"}) + if parseErr != "" { + t.Fatalf("unexpected parse error: %s", parseErr) + } + if !opts.Debug { + t.Error("--debug should set Debug=true") + } + // Short form -d works the same way. 
+ opts2, parseErr2 := ParseBootstrapFlags([]string{"-d"}) + if parseErr2 != "" { + t.Fatalf("unexpected parse error for -d: %s", parseErr2) + } + if !opts2.Debug { + t.Error("-d should set Debug=true") + } +} + +func TestParseBootstrapFlags_UnknownFlagErrors(t *testing.T) { + _, parseErr := ParseBootstrapFlags([]string{"--what-is-this"}) + if parseErr == "" { + t.Fatal("expected parse error on unknown flag") + } +} + +func TestValidateBootstrapOptions_RejectsMissingRequired(t *testing.T) { + opts := &BootstrapOptions{ + Role: "x", + Stack: "x", + ModeProject: "A", + ModeAnalysis: "ai-assisted", + ModeRubric: "default", + OutputDir: "x", + // missing Domain and Feature + } + if msg := ValidateBootstrapOptions(opts); msg == "" { + t.Fatal("expected validation error on missing fields") + } +} + +func TestValidateBootstrapOptions_RejectsBadModeProject(t *testing.T) { + opts := &BootstrapOptions{ + Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x", + ModeProject: "C", ModeAnalysis: "ai-assisted", ModeRubric: "default", + } + if msg := ValidateBootstrapOptions(opts); msg == "" { + t.Fatal("expected validation error on bad mode-project") + } +} + +func TestValidateBootstrapOptions_CustomRubricRequiresPrompt(t *testing.T) { + opts := &BootstrapOptions{ + Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "custom", + } + if msg := ValidateBootstrapOptions(opts); msg == "" { + t.Fatal("expected validation error on missing custom prompt") + } +} + +func TestValidateBootstrapOptions_JDInfluencesProjectRequiresPath(t *testing.T) { + // --jd-influences-project tells the project-generation prompt to + // read the JD; without a path there's nothing to read. The + // validator rejects the combination so the misconfiguration is + // caught before the bun subprocess starts. 
+ opts := &BootstrapOptions{ + Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default", + JDInfluencesProject: true, + } + msg := ValidateBootstrapOptions(opts) + if msg == "" { + t.Fatal("expected validation error when --jd-influences-project is set without --jd-path") + } + if !strings.Contains(msg, "jd-influences-project") { + t.Errorf("validation error should mention jd-influences-project; got %q", msg) + } +} + +func TestValidateBootstrapOptions_RejectsDefaultPlusJD(t *testing.T) { + // "default+jd" is retired. JD attachment is its own field. A caller + // still passing the old value should get a clear validation error + // rather than the bun subprocess receiving an unsupported rubric. + opts := &BootstrapOptions{ + Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x", + ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default+jd", + } + if msg := ValidateBootstrapOptions(opts); msg == "" { + t.Fatal("expected validation error on retired 'default+jd' rubric value") + } +} + +func TestValidateBootstrapOptions_AcceptsStandaloneJDPath(t *testing.T) { + // JD path is now optional regardless of rubric mode. A caller can + // supply --jd-path with --mode-rubric default and it should pass + // validation (the JD will be used by the AI observer). 
+	jd := t.TempDir() + "/jd.md"
+	if err := os.WriteFile(jd, []byte("# JD"), 0o644); err != nil {
+		t.Fatalf("setup: %v", err)
+	}
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+		JDPath: jd,
+	}
+	if msg := ValidateBootstrapOptions(opts); msg != "" {
+		t.Fatalf("expected validation pass for default rubric + jd-path, got %q", msg)
+	}
+}
+
+func TestParseBootstrapFlags_StackByCandidateFlag(t *testing.T) {
+	// --stack-by-candidate is the headless equivalent of the wizard's
+	// "Greenfield (candidate picks stack)" option. Boolean flag; off by
+	// default. Combined with --mode-project A it should fail validation
+	// (covered by TestValidateBootstrapOptions_StackByCandidateRequiresModeB).
+	opts, parseErr := ParseBootstrapFlags([]string{"--stack-by-candidate"})
+	if parseErr != "" {
+		t.Fatalf("unexpected parse error: %s", parseErr)
+	}
+	if !opts.StackByCandidate {
+		t.Error("--stack-by-candidate should set StackByCandidate=true")
+	}
+}
+
+func TestValidateBootstrapOptions_StackByCandidateRequiresModeB(t *testing.T) {
+	// Stack-by-candidate is incoherent with Mode A — Mode A scaffolds
+	// code IN a stack, so "candidate picks the stack" makes no sense
+	// there. The validator rejects the combination so headless callers
+	// don't get a brownfield project with a confused brief.
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+		StackByCandidate: true,
+	}
+	msg := ValidateBootstrapOptions(opts)
+	if msg == "" {
+		t.Fatal("expected validation error when --stack-by-candidate is combined with --mode-project A")
+	}
+	if !strings.Contains(msg, "stack-by-candidate") {
+		t.Errorf("validation error should mention stack-by-candidate; got %q", msg)
+	}
+}
+
+func TestValidateBootstrapOptions_StackByCandidateAllowedWithModeB(t *testing.T) {
+	// Stack-by-candidate IS valid in combination with Mode B — that's
+	// the only mode where "no starter code, candidate picks the stack"
+	// makes sense.
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "B", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+		StackByCandidate: true,
+	}
+	if msg := ValidateBootstrapOptions(opts); msg != "" {
+		t.Fatalf("expected validation pass for B + StackByCandidate, got %q", msg)
+	}
+}
+
+func TestValidateBootstrapOptions_DomainOptionalWhenJDAttached(t *testing.T) {
+	// A JD describes the business domain, so requiring --domain on
+	// top of --jd-path is redundant. The validator drops the domain
+	// requirement when a JD is supplied — the OpenAI prompt falls back
+	// to the JD's body for domain context, and the wizard skips the
+	// Domain question entirely on the JD-yes branch.
+	jd := t.TempDir() + "/jd.md"
+	if err := os.WriteFile(jd, []byte("# JD"), 0o644); err != nil {
+		t.Fatalf("setup: %v", err)
+	}
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+		JDPath: jd,
+		// Domain intentionally omitted.
+	}
+	if msg := ValidateBootstrapOptions(opts); msg != "" {
+		t.Fatalf("expected validation pass with JD-but-no-domain, got %q", msg)
+	}
+}
+
+func TestValidateBootstrapOptions_DomainRequiredWhenNoJD(t *testing.T) {
+	// Without a JD attached, the proctor must name the domain
+	// explicitly — otherwise the AI has no business context at all.
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+		// Domain and JDPath both omitted.
+	}
+	msg := ValidateBootstrapOptions(opts)
+	if msg == "" {
+		t.Fatal("expected validation error: domain required when no JD")
+	}
+	if !strings.Contains(msg, "--domain") {
+		t.Errorf("error should mention --domain; got %q", msg)
+	}
+}
+
+func TestValidateBootstrapOptions_HappyPath(t *testing.T) {
+	// Baseline case: Role, Stack, Domain, Feature, OutputDir and all
+	// three modes are set (Mode A, ai-assisted, default rubric), with
+	// no JD and no stack-by-candidate. ValidateBootstrapOptions must
+	// return the empty string, which these tests treat as "valid".
+	opts := &BootstrapOptions{
+		Role: "x", Stack: "x", Domain: "x", Feature: "x", OutputDir: "x",
+		ModeProject: "A", ModeAnalysis: "ai-assisted", ModeRubric: "default",
+	}
+	if msg := ValidateBootstrapOptions(opts); msg != "" {
+		t.Fatalf("expected validation pass, got: %s", msg)
+	}
+}
+
+// stubRunner is a test double handed to runInterviewBootstrap in place
+// of a real runner. It records the options it was invoked with and
+// returns a preconfigured exit code, so tests can assert on both.
+type stubRunner struct {
+	// gotOpts holds the options from the most recent Run call; it stays
+	// nil until Run has been called.
+	gotOpts *BootstrapOptions
+	// code is the value Run returns.
+	code    int
+}
+
+// Run stores opts on the stub and returns s.code. Both writer
+// arguments are ignored.
+func (s *stubRunner) Run(opts *BootstrapOptions, _, _ io.Writer) int {
+	s.gotOpts = opts
+	return s.code
+}
+
+func TestRunInterviewBootstrap_RequiresHeadlessForNow(t *testing.T) {
+	// The argument list below carries every flag the other
+	// runInterviewBootstrap tests pass except --headless. The call must
+	// return a non-zero code and write a message containing "headless"
+	// to the error buffer.
+	var out, errBuf bytes.Buffer
+	stub := &stubRunner{code: 0}
+	code := runInterviewBootstrap([]string{
+		"--role", "x", "--stack", "x", "--domain", "x", "--feature", "x",
+		"--mode-project", "A", "--mode-analysis", "ai-assisted",
+		"--mode-rubric", "default", "--output-dir", "x",
+	}, stub, &out, &errBuf)
+	if code == 0 {
+		t.Error("expected non-zero exit without --headless")
+	}
+	if !strings.Contains(errBuf.String(), "headless") {
+		t.Errorf("expected message about --headless, got: %s", errBuf.String())
+	}
+}
+
+func TestRunInterviewBootstrap_DelegatesToRunner(t *testing.T) {
+	// With --headless and the full flag set, runInterviewBootstrap must
+	// invoke the runner (gotOpts becomes non-nil), pass it the parsed
+	// options (Role == "x"), and return the stub's exit code of 0.
+	var out, errBuf bytes.Buffer
+	stub := &stubRunner{code: 0}
+	code := runInterviewBootstrap([]string{
+		"--headless",
+		"--role", "x", "--stack", "x", "--domain", "x", "--feature", "x",
+		"--mode-project", "A", "--mode-analysis", "ai-assisted",
+		"--mode-rubric", "default", "--output-dir", "x",
+	}, stub, &out, &errBuf)
+	if code != 0 {
+		t.Errorf("expected exit 0 from stub, got %d (stderr: %s)", code, errBuf.String())
+	}
+	if stub.gotOpts == nil {
+		t.Fatal("runner not called")
+	}
+	if stub.gotOpts.Role != "x" {
+		t.Errorf("runner saw role=%q", stub.gotOpts.Role)
+	}
+}
+
+func TestRunInterviewBootstrap_ForwardsRunnerExitCode(t *testing.T) {
+	// Same arguments as the delegation test, but the stub returns 7.
+	// runInterviewBootstrap must hand that value back unchanged.
+	var out, errBuf bytes.Buffer
+	stub := &stubRunner{code: 7}
+	code := runInterviewBootstrap([]string{
+		"--headless",
+		"--role", "x", "--stack", "x", "--domain", "x", "--feature", "x",
+		"--mode-project", "A", "--mode-analysis", "ai-assisted",
+		"--mode-rubric", "default", "--output-dir", "x",
+	}, stub, &out, &errBuf)
+	if code != 7 {
+		t.Errorf("expected exit 7 forwarded from runner, got %d", code)
+	}
+}
+
+// withPublishHooks swaps the offerPublishToGitHub and isStdinTTY hooks
+// for test doubles for the duration of a test. The publish double
+// ignores both writers and only calls onPublish (when it is non-nil)
+// with the options it receives; the TTY double always reports tty.
+// t.Cleanup restores the original hook values on teardown so later
+// tests see the production behavior.
+func withPublishHooks(t *testing.T, tty bool, onPublish func(opts *BootstrapOptions)) {
+	t.Helper()
+	origPublish := offerPublishToGitHub
+	origTTY := isStdinTTY
+	t.Cleanup(func() {
+		offerPublishToGitHub = origPublish
+		isStdinTTY = origTTY
+	})
+	offerPublishToGitHub = func(opts *BootstrapOptions, _, _ io.Writer) {
+		if onPublish != nil {
+			onPublish(opts)
+		}
+	}
+	isStdinTTY = func() bool { return tty }
+}
+
+func TestPrintBootstrapSuccessLink_DisplaysRelativePath(t *testing.T) {
+	// Display label should be cwd-relative — running `teamhero` from
+	// ~/Documents and writing into ~/Documents/interviews/foo should
+	// surface as "interviews/foo", not "/home//Documents/...".
+ // The underlying OSC 8 file:// URL is still absolute (so the link + // actually works on click), but the human-readable label is what + // the proctor reads. + tmp := t.TempDir() + prevCwd, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(tmp); err != nil { + t.Fatalf("chdir: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(prevCwd) }) + + subDir := tmp + "/interviews/foo" + if err := os.MkdirAll(subDir, 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + var buf bytes.Buffer + printBootstrapSuccessLink(subDir, &buf) + got := buf.String() + // The OSC 8 envelope wraps the absolute file:// URL around a + // human-readable label: ESC]8;;ESC\