Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
a376644
feat: add reusable thread lifecycle response pipeline
rianjs Jun 23, 2026
57959a7
fix: preserve unknown field names in structured retry prompts
rianjs Jun 23, 2026
e000c98
fix: satisfy lint for thread lifecycle pipeline
rianjs Jun 23, 2026
bbb4e15
fix: harden thread response lifecycle
rianjs Jun 24, 2026
010b195
fix: make llm lifecycle tasks resumable
rianjs Jun 26, 2026
c63e75b
fix: resume thread response attempts
rianjs Jun 26, 2026
cd78dd8
test: cover thread response resume boundaries
rianjs Jun 26, 2026
b822314
test: assert resume persistence artifacts
rianjs Jun 26, 2026
3fb8254
fix: keep failed thread analysis resumable
rianjs Jun 26, 2026
e6399bf
test: cover stale thread resume inputs
rianjs Jun 26, 2026
bac48c1
test: cover dry-run rerun recovery
rianjs Jun 26, 2026
b006629
test: cover real lifecycle resume boundaries
rianjs Jun 26, 2026
66e98b3
fix: route review thread lifecycle through analysis
rianjs Jun 26, 2026
994639f
test: pin thread response planning path
rianjs Jun 26, 2026
331c2c5
fix: preserve checkout-readonly adapter capability
rianjs Jun 26, 2026
34d7a6d
fix: bundle github inline findings into submit review
rianjs Jun 26, 2026
432cb9a
fix: fail fast for non-opinionated github reviews
rianjs Jun 26, 2026
4d73a26
fix: wire respond progress breadcrumbs
rianjs Jun 26, 2026
8da289a
Restrict reviewer identities to PAT auth
rianjs Jun 26, 2026
05ba883
Split review runtime read and post providers
rianjs Jun 26, 2026
3826a3b
Recover init from legacy reviewer config
rianjs Jun 26, 2026
5d8f3e2
fix: restore github app reviewer identities
rianjs Jun 27, 2026
853bab4
fix: preserve app reviewer identity refresh
rianjs Jun 27, 2026
36e7acc
docs: clarify github app reviewer contract
rianjs Jun 27, 2026
54c74b0
test: strengthen github app reviewer coverage
rianjs Jun 27, 2026
7e1a922
fix: isolate review dry-run resume state
rianjs Jun 27, 2026
82bb27b
fix: share run artifact lifecycle boundaries
rianjs Jun 27, 2026
ad443c6
fix: harden response lifecycle resume boundaries
rianjs Jun 27, 2026
5c22e83
fix: harden thread response resume validation
rianjs Jun 27, 2026
43ea71b
fix: validate cached thread response actions
rianjs Jun 27, 2026
3f1a284
fix: scope dry-run resume lookup
rianjs Jun 27, 2026
a68aabe
fix: harden thread response empirical path
rianjs Jun 27, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion cmd/cr/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import (
"github.com/open-cli-collective/codereview-cli/internal/cmd/datacmd"
"github.com/open-cli-collective/codereview-cli/internal/cmd/exitcode"
"github.com/open-cli-collective/codereview-cli/internal/cmd/mecmd"
"github.com/open-cli-collective/codereview-cli/internal/cmd/respondcmd"
"github.com/open-cli-collective/codereview-cli/internal/cmd/reviewcmd"
"github.com/open-cli-collective/codereview-cli/internal/cmd/root"
"github.com/open-cli-collective/codereview-cli/internal/cmd/sessionscmd"
Expand Down Expand Up @@ -47,6 +48,16 @@ func buildRootCommand(stdin io.Reader, stdout, stderr io.Writer) (*cobra.Command
Stdout: stdout,
Stderr: stderr,
})
root.RegisterAll(cmd, opts, configcmd.Register, credentialcmd.Register, mecmd.Register, agentscmd.Register, reviewcmd.Register, sessionscmd.Register, datacmd.Register, benchmarkcmd.Register)
root.RegisterAll(cmd, opts,
configcmd.Register,
credentialcmd.Register,
mecmd.Register,
agentscmd.Register,
reviewcmd.Register,
respondcmd.Register,
sessionscmd.Register,
datacmd.Register,
benchmarkcmd.Register,
)
return cmd, opts
}
118 changes: 118 additions & 0 deletions docs/architecture.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# Architecture Decisions

This document records review-pipeline boundaries that are intended to stay
stable as the implementation evolves.

## Durable LLM Execution Boundary

All production structured LLM actions must flow through
`internal/llmlifecycle`. Callers describe the task ID, phase, prompt input,
structured output contract, model/effort, artifact paths, and run/session
scope. The lifecycle runner owns provider invocation, structured-output retry,
provider-session resume, pre-flight reuse, task metadata, accepted-output
artifacts, session persistence, progress breadcrumbs, and failure
classification.

The final commit marker for a task is
`llm-tasks/<encoded-task-id>/metadata.json`. Writers must publish validated
output or failed-attempt payloads first, persist the ledger session row when
the task is run-owned, and write metadata last. Resume code must trust only the
final metadata path, never temporary files or partial payloads.

New LLM-backed components must not call `internal/llm` structured helpers
directly. They should call `llmlifecycle` through explicit, fakeable
dependencies in unit tests and should return domain results rather than
posting comments or mutating provider state.
`internal/architecture/llm_lifecycle_test.go` enforces that direct structured
helper calls stay inside `internal/llm` and `internal/llmlifecycle`. Direct
provider-adapter calls should also stay behind `llmlifecycle` for production
structured tasks; code review owns that broader boundary until a stronger
static guardrail exists.

Most lifecycle tasks are run-owned and must have a matching ledger session row
when a provider session is available. Caller-owned no-run tasks are allowed only
where no review run exists yet, such as `SelectionOnly` and the pre-run approval
override classifier. Those tasks may reuse artifact metadata without a ledger
session row, but they still use the same metadata schema and lifecycle runner.

## Stage Model Resolution

Runtime model choice must be resolved through `internal/stagemodel`. Code that
executes an LLM stage must not hard-code model IDs and must not call
`config.ResolveModelTier` directly.

`stagemodel.ResolveStageModel` is the single runtime path from profile
preferences and command overrides to a concrete model and effort. The request
must include the named stage, requested tier, default effort, and any explicit
operator override. The resolver applies user profile `llm.model_map` values,
built-in provider defaults, and configured tier floors before returning the
concrete runtime choice.

This boundary exists so model catalog data, provider capabilities, token costs,
and profile-level tier floors can be added without touching individual review
stages. Runtime hard-coding bypasses user preference and is a bug.

Reviewer `agent.model_id` is an exact provider-specific model override. It must
still enter runtime execution through `stagemodel.ResolveStageModel` as a model
override rather than bypassing the resolver, but it intentionally bypasses the
tier map because the agent author selected a concrete model.

The only code allowed to call `config.ResolveModelTier` directly is config
inspection and the resolver implementation itself.
`internal/architecture/model_resolution_test.go` enforces that direct
`config.ResolveModelTier` calls stay inside approved packages. Hard-coded
runtime model IDs remain a code-review concern until model-catalog guardrails
exist.

## Git Provider Writes

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR documents Git Provider Writes and Inline Thread Lifecycle as stable architecture boundaries, but unlike the LLM lifecycle and stage-model sections there is no corresponding enforceable guardrail in the diff. Right now those two invariants live only in prose, so future commands can quietly reintroduce direct provider writes or bypass threadcontext/threadanalysis without any test catching it. Add architecture tests similar to internal/architecture/llm_lifecycle_test.go and internal/architecture/model_resolution_test.go that restrict provider-write calls to the posting path and verify thread-lifecycle packages stay on the intended domain/lifecycle seams.

Reply inline to this comment.


Provider writes have one durable path: planned actions in the ledger followed

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section states provider writes have one durable path and that commands should not mutate provider state directly, but the new architecture guardrail test does not actually enforce that repo-wide rule: internal/architecture/thread_lifecycle_test.go explicitly exempts all of internal/cmd/reviewcmd from the write-method scan. That means future direct PostIssueComment/ReplyToThread/SubmitReview calls added in command glue would still pass the advertised guardrail. Narrow the allowlist to the specific boundary shim/outbox code, or soften this prose so it reads as an intended design boundary rather than an enforced convention.

Reply inline to this comment.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section states provider writes have one durable path and that commands should not mutate provider state directly, but the new architecture guardrail test does not actually enforce that repo-wide rule: internal/architecture/thread_lifecycle_test.go explicitly exempts all of internal/cmd/reviewcmd from the write-method scan. That means future direct PostIssueComment/ReplyToThread/SubmitReview calls added in command glue would still pass the advertised guardrail. Narrow the allowlist to the specific boundary shim/outbox code, or soften this prose so it reads as an intended design boundary rather than an enforced convention.

Reply inline to this comment.

by outbox execution. Commands and domain analyzers should not post comments,
reply to review threads, resolve threads, submit reviews, or mutate provider
state directly.

This keeps markers, retries, reconciliation, idempotency, and resume behavior in
one place. New commands such as `cr respond` should produce planned thread
actions and let the reviewplan/ledger/outbox flow perform provider writes.

## Inline Thread Lifecycle

Inline PR discussion threads are domain input, not provider-specific prompt
data. The intended decomposition is:

- `internal/threadcontext` normalizes `gitprovider.InlineThread`, detects
codereview-authored finding threads, detects latest human replies, strips
shared markers, collapses resolved threads to the latest sanitized comment,
and produces file-scoped reviewer context.
- `internal/threadanalysis` accepts normalized thread context and returns
reusable domain decisions: thread ID, decision, reply body, summary, resolve
flag, and rationale.

Resolved inline threads should not be reprocessed as full conversations on
every review. Their durable context is the latest sanitized comment on the
resolved thread, with marker metadata retained when the comment contains a
codereview thread-summary marker. Reviewer prompts should receive compact
file-scoped summaries so agents avoid re-raising issues that have already been
discussed and resolved.

`cr review` and `cr respond` should share the same normalization, filtering,
model resolution, LLM execution, and action-planning components. `cr respond`
is a command-shaped reuse of the thread-response portion of the review
pipeline, not a separate posting system.

## Retention And Cleanup

Durable run-owned LLM tasks, thread-analysis results, and artifacts must be
owned by a run and must be safe to delete through the existing data lifecycle
commands. Database rows should reference `runs(run_id)` with cascade delete
semantics, and large artifacts should live under the run artifact directory.

When the retention window elapses, normal prune/GC commands should remove these
results along with the rest of the run. If a user deletes retained state, a
future review may need to spend time and tokens recreating it; that is the
expected tradeoff for user-controlled local data retention.

Caller-owned no-run task artifacts must live under the configured data root or
an explicit caller artifact root. If no run is eventually allocated for that
artifact root, the directory is treated as orphaned local data and must be safe
for `cr data purge` to remove.
17 changes: 12 additions & 5 deletions docs/development.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,20 @@ Collective standards and automation remain canonical in their own repositories.

codereview-cli is the Open CLI Collective code-review CLI and ships the `cr`
binary. It provides configuration and credential commands, trusted-agent
inspection, dry-run and live pull-request review orchestration, named LLM
session management, and local data lifecycle commands.
inspection, dry-run and live pull-request review orchestration, inline thread
response handling through `cr respond`, named LLM session management, and local
data lifecycle commands.

The current Go code is a Cobra command tree in `internal/cmd/*` with a thin

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File-level note: docs/development.md

The repo-local project overview still summarizes the current CLI surface without mentioning the new cr respond command, even though this PR adds it as a durable root command and the rest of the docs now refer to it. Update the overview so the development guide reflects the public command surface future contributors are expected to preserve.

Reply inline to this comment.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

File-level note: docs/development.md

The repo-local project overview still summarizes the current CLI surface without mentioning the new cr respond command, even though this PR adds it as a durable root command and the rest of the docs now refer to it. Update the overview so the development guide reflects the public command surface future contributors are expected to preserve.

Reply inline to this comment.

`cmd/cr` entrypoint, shared exit-code mapping in `internal/cmd/exitcode`, and
version plumbing in `internal/version`. Review orchestration is split across
`internal/pipeline`, `internal/reviewrun`, `internal/reviewplan`,
`internal/outbox`, `internal/gate`, and `internal/gateio`.
`internal/pipeline`, `internal/reviewrun`, `internal/threadrespond`,
`internal/reviewplan`, `internal/outbox`, `internal/gate`, and
`internal/gateio`.

Architecture guardrails for LLM execution, model resolution, Git provider
writes, inline thread lifecycle, and retention live in
[`docs/architecture.md`](architecture.md).

Within `internal/pipeline`, the public entry points are `DryRun`, `Live`, and
`SelectionOnly`. `DryRun` and `Live` execute the full review pipeline, while
Expand Down Expand Up @@ -80,7 +86,8 @@ make clean # remove build artifacts
state/config adapters in `internal/config`, `internal/ledger`, and
`internal/statepaths`, provider/LLM adapters in their owning packages, and
review posting/gating in `internal/outbox`, `internal/gate`, and
`internal/gateio`.
`internal/gateio`, plus response-only inline discussion handling in
`internal/threadrespond`.

## Interactive Init Notes

Expand Down
13 changes: 10 additions & 3 deletions docs/init-ux-contract.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ Interactive `init` should use these primary user-facing terms:
and GitHub Enterprise hosts such as `github.mycompany.com`.
- **Reviewer entity**: the actor that posts `COMMENT`, `APPROVE`, or
`REQUEST_CHANGES` on pull requests.
On GitHub, a reviewer entity must resolve to a repository-authorized
identity for `APPROVE` or `REQUEST_CHANGES` to count toward PR state. Live
review warns and continues when the selected reviewer can write a review
object but GitHub may not treat it as an opinionated review.
- **LLM runtime**: the way reviewer agents run and authenticate, such as Claude
CLI subscription auth, Codex CLI subscription auth, Pi local runtime, or a
direct API-key-backed provider path.
Expand Down Expand Up @@ -198,8 +202,8 @@ follow-up credential work without leaking values.

All credential-bearing init flows should show equivalent non-secret destination
context before collecting secret values. This includes repository-access Git
credentials, reviewer PAT/GitHub App credentials, and LLM API keys handled by
the shared credential collector.
credentials, reviewer PAT credentials, reviewer GitHub App private keys, and
LLM API keys handled by the shared credential collector.

Destination summaries should include:

Expand Down Expand Up @@ -251,7 +255,7 @@ contextual variants of the same fallback choice, such as:

- **Post as rianjs (GitHub PAT)**
- **Post as acme-review-bot (GitHub App)**
- **Post using this profile's Git account (GitHub PAT)**
- **Post using this profile's Git account (GitHub PAT or GitHub App)**

This means:

Expand Down Expand Up @@ -310,6 +314,9 @@ and saved config must stay stable:
profile selects it with `profiles.<profile>.reviewer.kind: entity` and
`profiles.<profile>.reviewer.entity`. The Git-account fallback maps to
`profiles.<profile>.reviewer.kind: git_identity`.
A GitHub reviewer entity is not just posting credentials: the resolved
identity must also have repository authority for GitHub to count blocking or
approving reviews toward the PR decision.
- **Review profile** maps to one saved entry under `profiles.<name>`.

This section is intentionally high level. The detailed field inventory and
Expand Down
41 changes: 36 additions & 5 deletions docs/llm-task-artifacts.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ reviewer, and rollup calls must be isolated from each other so one failed task
does not erase successful upstream work or force unrelated LLM sessions to run
again.

Task artifacts live under a run artifact directory:
Task artifacts usually live under a run artifact directory:

```text
llm-tasks/<encoded-task-id>/
Expand All @@ -18,6 +18,8 @@ llm-tasks/<encoded-task-id>/
Raw failed-attempt files are named `<label>.json` in the task directory. The
current structured adapter labels are `initial` and `retry`, which produce
`initial.json` and `retry.json` when raw invalid output is available.
`validated-output.json` stores the accepted structured JSON after prose
recovery, not necessarily the provider's raw structured-output bytes.

`metadata.json` is the commit marker. Writers must publish it last, after any
validated output or raw failed-attempt payloads are written and after the ledger
Expand All @@ -37,8 +39,9 @@ to resume.
Load-bearing metadata fields are:

- `task_id`: stable task identity. Current values are `orchestrator-selection`,
`reviewer-<encoded-agent-id>`, `orchestrator-rollup`, and
`dossier-discussion-summary`.
`reviewer-<encoded-agent-id>`, `orchestrator-rollup`,
`dossier-discussion-summary`, `thread-analysis-<thread-id>`, and
`approval-override`.
- `phase`: task phase, such as `selection`, `reviewer`, `rollup`, or
`dossier`.
- `dependency_task_ids`: task IDs whose completed state was included in this
Expand All @@ -49,8 +52,8 @@ Load-bearing metadata fields are:
- `status`: one of `succeeded`, `failed_isolated`, or `failed_blocking`.
- `session_row_id` and `provider_session_id`: ledger/provider session handles
used for run summaries and provider-level resume. `session_row_id` may be
empty only for caller-owned `SelectionOnly` artifact roots that reuse a
cached task without allocating a review run.
empty only for caller-owned no-run artifact roots such as `SelectionOnly` or
the pre-run approval override classifier.
- `adapter`, `model`, `effort`, and `log_path`: execution context.
- `validated_output_path`: structured output to decode when reusing a succeeded
task.
Expand Down Expand Up @@ -126,3 +129,31 @@ allocated. In that scoped mode:

This no-run behavior is intentionally limited to caller-owned artifact roots;
the normal run-backed durable task model remains unchanged for full reviews.

## Thread Analysis Tasks

`thread-analysis-<thread-id>` tasks classify one normalized inline discussion
thread and return a reusable decision, reply body, summary, resolve flag, and
rationale.

For `cr respond`, these tasks are run-owned. Successful analyses persist normal
ledger-backed sessions and are reused on retry. A normal `cr respond` invocation
resumes the latest incomplete response run for the same PR head, base, profile,
posting identity, and post mode. If analysis completed but planning or posting
was interrupted, rerun loads the persisted thread-analysis task instead of
calling the LLM again; if planned actions already exist, rerun continues through
the ledger/outbox post phase instead of replanning. Use `cr respond --rerun` to
start a fresh response attempt and leave the incomplete attempt untouched. If
the normalized thread input changes under the same task directory, the lifecycle
runner fails closed with rerun guidance instead of overwriting the prior task.

## Approval Override Task

`approval-override` is a pre-run classifier that detects explicit author
requests to approve without another full review pass.

The gate runs this before a review run may exist, so it uses the caller-owned
no-run lifecycle mode. Classifier failures are non-blocking: the gate warns and
continues with normal review. Successful and failed classifier task metadata
still lives under the prospective run artifact root so provider-session resume
and local artifact inspection use the same lifecycle shape.
44 changes: 38 additions & 6 deletions internal/approvaloverride/approvaloverride.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,12 @@ import (
"strings"
"time"

"github.com/google/uuid"

"github.com/open-cli-collective/codereview-cli/internal/gitprovider"
"github.com/open-cli-collective/codereview-cli/internal/llm"
"github.com/open-cli-collective/codereview-cli/internal/llmlifecycle"
"github.com/open-cli-collective/codereview-cli/internal/stagemodel"
)

const schemaVersion = 1
Expand All @@ -39,6 +43,9 @@ type Request struct {
LatestMarkerAt time.Time
Candidates []Candidate
LogPath string
LLMTasksDir string
Now func() time.Time
NewSessionRowID func() string
}

// Result is the classifier's decision.
Expand Down Expand Up @@ -78,16 +85,27 @@ func (c *LLMClassifier) ClassifyApprovalOverride(ctx context.Context, req Reques
if err := ensureLogDir(req.LogPath); err != nil {
return Result{}, err
}
value, _, err := llm.RunStructured(ctx, c.adapter, llm.Request{
Model: c.model,
Effort: c.effort,
Prompt: BuildPrompt(req),
LogPath: req.LogPath,
if strings.TrimSpace(req.LLMTasksDir) == "" {
return Result{}, fmt.Errorf("approvaloverride: llm task artifact directory is required")
}
result, err := llmlifecycle.RunStructured(ctx, llmlifecycle.Request{
Adapter: c.adapter,
TaskID: "approval-override",
Phase: string(stagemodel.StageApprovalOverride),
AllowNoRunCache: true,
Paths: llmlifecycle.Paths{LLMTasksDir: req.LLMTasksDir},
Model: c.model,
Effort: c.effort,
LogPath: req.LogPath,
Prompt: BuildPrompt(req),
FailureStatus: llmlifecycle.StatusFailedIsolated,
Now: requestClock(req),
NewSessionRowID: requestSessionRowID(req),
}, DecodeResponse)
if err != nil {
return Result{}, err
}
return Result{Approve: value.ApprovalOverrideRequested}, nil
return Result{Approve: result.Value.ApprovalOverrideRequested}, nil
}

// Response is the strict classifier schema.
Expand Down Expand Up @@ -176,3 +194,17 @@ func ensureLogDir(path string) error {
}
return os.MkdirAll(dir, 0o750)
}

// requestClock builds the clock function used for a request. It reads
// req.Now when the caller supplied one and falls back to time.Now otherwise;
// in both cases the returned function normalizes the result to UTC.
func requestClock(req Request) func() time.Time {
	source := time.Now
	if req.Now != nil {
		source = req.Now
	}
	return func() time.Time {
		return source().UTC()
	}
}

// requestSessionRowID picks the session row ID generator for a request:
// the caller-provided req.NewSessionRowID when it is set, and
// uuid.NewString when it is nil.
func requestSessionRowID(req Request) func() string {
	generator := req.NewSessionRowID
	if generator == nil {
		generator = uuid.NewString
	}
	return generator
}
Loading
Loading