diff --git a/README.md b/README.md index f06f55f..937fe53 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ --- -![korg demo — rewind, fork, and replay AI agent decisions in real time](demo.gif) +![korg demo — record, verify, and rewind an AI agent session as a hash-chained ledger](demo.gif) --- @@ -45,7 +45,7 @@ Every agent action is: - **Appended** to an immutable, cryptographically-signed ledger - **Ordered** with Hybrid Logical Clocks (causal, deterministic, globally consistent) - **Replayable** — rebuild exact state at any point in history -- **Reversible** — rewind, fork, or branch any decision +- **Reversible** — rewind the ledger to any prior sequence point --- @@ -151,7 +151,7 @@ The crate is not yet published to crates.io; install from source: git clone https://github.com/New1Direction/korg cd korg cargo build --release -./target/release/korg-tui --help +./target/release/korg --help ``` ### Python bridge (for korgex / korgchat) @@ -171,26 +171,30 @@ korg campaign --tui --prompt "Refactor the auth layer to use JWTs" # Web cockpit at localhost:8080 korg campaign --web --prompt "Optimize the database connection pool" -# Pure autonomous goal mode -korg goal "Write and validate a full test suite for src/parser.rs" +# Pure autonomous goal mode (--goal is a top-level flag) +korg --goal "Write and validate a full test suite for src/parser.rs" -# Preview without committing (speculative sandbox) -korg run --preview "Refactor the main event loop" +# Preview without committing (dry-run; --preview is a top-level flag) +korg --preview "Refactor the main event loop" ``` -### Rewind & Fork +### Rewind & Verify ```bash -# Rewind to a specific ledger sequence point +# Rewind the capability journal to a specific ledger sequence point korg rewind --seq 4 -# List all checkpoints in the current session -korg checkpoints list +# Drive the honest pipeline on a fixture and emit a verifiable ledger +korg run-once "Fix the add function in src/lib.rs so it adds" -# Restore from 
a specific checkpoint -korg checkpoints restore --id +# Independently verify any korg-ledger@v1 journal (no trust in the producer) +korg-verify <path-to-ledger.jsonl> ``` +> Speculative branch/fork and named checkpoints (`korg fork`, `korg checkpoints +> list|restore`) are planned, not yet shipped. The reversibility surface today is +> `korg rewind`. + --- ## Cognition Modes @@ -208,8 +212,8 @@ Korg adapts its intelligence tier based on task complexity. Modes are governed e | `heavy-consciousness` | Maximum depth. Full HeavyConsciousness context injection. | ```bash -korg run --mode research "Explore alternative approaches to the rate limiter" -korg run --mode recovery "Carefully migrate the database schema" +korg --mode research "Explore alternative approaches to the rate limiter" +korg --mode recovery "Carefully migrate the database schema" ``` --- @@ -235,8 +239,8 @@ Korg treats AI cognition the same way a hypervisor treats compute and Git treats | Deterministic replay | ✅ | ❌ | ❌ | ❌ | | Causal HLC ordering | ✅ | ❌ | ❌ | ❌ | | Rewind execution | ✅ | ❌ | ❌ | ❌ | -| Speculative branches | ✅ | ❌ | ❌ | ❌ | -| Execution checkpoints | ✅ | ❌ | ❌ | ❌ | +| Speculative branches | 🚧 planned | ❌ | ❌ | ❌ | +| Execution checkpoints | 🚧 planned | ❌ | ❌ | ❌ | | Cryptographic audit trail | ✅ | ❌ | ❌ | ❌ | | Micro-healing | ✅ | ❌ | ❌ | ❌ | | Model-agnostic | ✅ | ✅ | ✅ | ✅ | @@ -284,8 +288,9 @@ Korg is in active development. 
Current test coverage: **175 tests, 0 failures** - [x] Append-only cognitive ledger with HLC ordering - [x] Deterministic replay and projection rebuilds -- [x] Speculative execution + preview mode -- [x] Execution checkpoints (O(1) restore) +- [x] Preview / dry-run mode (`--preview`) +- [ ] Speculative warm-boot execution (in progress) +- [ ] Execution checkpoints / restore CLI (primitive exists; CLI planned) - [x] Micro-healing effect layer - [x] Multi-agent swarm orchestration (Captain, Harper, Benjamin, Lucas) - [x] TUI dashboard + Web cockpit diff --git a/crates/korg-runtime/src/lib.rs b/crates/korg-runtime/src/lib.rs index ef96b7c..88e7d9e 100644 --- a/crates/korg-runtime/src/lib.rs +++ b/crates/korg-runtime/src/lib.rs @@ -28,6 +28,7 @@ pub mod observation; pub mod personas; pub mod provenance; pub mod recovery; +pub mod run_once; pub mod runtime; pub mod session; pub mod skills; diff --git a/crates/korg-runtime/src/run_once.rs b/crates/korg-runtime/src/run_once.rs new file mode 100644 index 0000000..6fa7bf4 --- /dev/null +++ b/crates/korg-runtime/src/run_once.rs @@ -0,0 +1,248 @@ +//! `run_once_honest` — the smallest user-facing entrypoint that drives the SP1 +//! honest pipeline visibly, below the (separately-broken) campaign orchestration. +//! +//! It runs the exact chain the keystone test proves: build Benjamin's +//! system+user messages, ask the hermetic [`DeterministicProvider`] for a patch, +//! parse the mutations the worker way, APPLY them to a real git worktree, then +//! MEASURE reality (`numstat`, `cargo_check`, `honest_metrics`). The attested +//! mutation count is `numstat.files` — the real git-diff file count — never a +//! fabricated number. An unrelated task yields an honest null (zero changes, +//! zero attested), so this command can never lie about what the agent did. +//! +//! It then writes a verifiable `korg-ledger@v1` JSONL journal of the run's +//! events (hash-chained via the conformance-tested `korg-ledger` primitives, +//! 
re-exported through `korg_registry::ledger_chain`), so `korg-verify` and the
+//! in-browser verifier accept it.
+
+use crate::observation::{apply_mutations, cargo_check, honest_metrics, numstat, CargoCheck};
+use crate::personas::{load_prompt_for_persona, parse_structured_response, Persona};
+use korg_llm::{DeterministicProvider, LlmProvider, LlmRequest, Message, Role};
+use korg_registry::ledger_chain::{chain_hash, GENESIS_HASH};
+use serde_json::{json, Value};
+use std::path::{Path, PathBuf};
+
+/// The honest report of a single run. Every field is an observed fact:
+/// `attested_count == numstat_files` is the SP1 invariant made visible.
+#[derive(Debug, Clone)]
+pub struct HonestRunReport {
+    /// Real number of files changed in the worktree (== `numstat_files`).
+    pub files_changed: usize,
+    /// `"Passed"`, `"Failed"`, or `"Unavailable"` — never fabricated.
+    pub cargo_check: String,
+    /// The mutation count we attest. Equals `numstat_files` by construction —
+    /// we attest only what really changed on disk.
+    pub attested_count: usize,
+    /// The real git-diff file count the worktree reports.
+    pub numstat_files: usize,
+    /// Path to the verifiable korg-ledger@v1 journal written for this run.
+    /// `None` when no journal was written (the provider failed, or writing
+    /// the journal returned an I/O error).
+    pub ledger_path: Option<PathBuf>,
+}
+
+/// Classify a `CargoCheck` into the stable string the report exposes.
+///
+/// The match is exhaustive on purpose: adding a `CargoCheck` variant must
+/// force a decision about its label here rather than fall into a catch-all.
+fn cargo_check_label(check: &CargoCheck) -> &'static str {
+    match check {
+        CargoCheck::Passed => "Passed",
+        CargoCheck::Failed(_) => "Failed",
+        CargoCheck::Unavailable => "Unavailable",
+    }
+}
+
+/// Build the two messages the hermetic provider routes on: Benjamin's system
+/// prompt (so `role_marker` resolves to "benjamin") and the task as the user
+/// message. Reuses `load_prompt_for_persona` — the same loader the worker uses.
+fn benjamin_request(task: &str) -> LlmRequest {
+    let system = load_prompt_for_persona(Persona::Benjamin);
+    LlmRequest {
+        messages: vec![
+            Message {
+                role: Role::System,
+                content: system,
+                name: None,
+                tool_calls: None,
+            },
+            Message {
+                role: Role::User,
+                content: task.to_string(),
+                name: None,
+                tool_calls: None,
+            },
+        ],
+        temperature: 0.3,
+        max_tokens: None,
+        tools: None,
+        stop_sequences: None,
+        multimodal: None,
+        tx_id: None,
+        session_id: None,
+        policy_hash: None,
+        top_p: None,
+        presence_penalty: None,
+        frequency_penalty: None,
+    }
+}
+
+/// Drive the honest pipeline once for Benjamin on `task` against `repo_path`,
+/// returning a report whose `attested_count` equals the real diff file count.
+///
+/// Never returns an error: a provider failure yields an all-zero report with
+/// `ledger_path: None`, and a failure to write the journal only clears
+/// `ledger_path` while the measured counts are still reported.
+pub async fn run_once_honest(task: &str, repo_path: &Path) -> HonestRunReport {
+    // 1. Ask the hermetic default provider (as Benjamin) for the patch.
+    let provider = DeterministicProvider::new();
+    let resp = match provider.complete(benjamin_request(task)).await {
+        Ok(r) => r,
+        Err(_) => {
+            // The hermetic provider is infallible, but fail honest if it ever isn't:
+            // no patch → no change → attested 0. The label goes through
+            // `cargo_check_label` so it cannot drift from the success path.
+            return HonestRunReport {
+                files_changed: 0,
+                cargo_check: cargo_check_label(&CargoCheck::Unavailable).to_string(),
+                attested_count: 0,
+                numstat_files: 0,
+                ledger_path: None,
+            };
+        }
+    };
+
+    // 2. Parse mutations the way the worker does, then APPLY them to the worktree.
+    let (output, _confidence, _frontmatter) = parse_structured_response(&resp.content);
+    let muts = output
+        .get("mutations")
+        .and_then(|m| m.as_array())
+        .cloned()
+        .unwrap_or_default();
+    let apply = apply_mutations(repo_path, &muts).await;
+
+    // 3. Measure reality — the real diff and whether the result compiles.
+    let n = numstat(repo_path).await;
+    let check = cargo_check(repo_path).await;
+    // NOTE(review): the metrics value is computed but not surfaced in the
+    // report; the call is kept as-is in case `honest_metrics` has effects
+    // beyond its return value — confirm before removing.
+    let _metrics = honest_metrics(
+        &apply,
+        &check,
+        &n,
+        resp.usage.total_tokens,
+        1.0,
+        0.0,
+        "korg run-once",
+    );
+
+    // The attested mutation count is the REAL diff file count — nothing invented.
+    let attested = n.files;
+
+    // 4. Write a verifiable korg-ledger@v1 journal of the run's events.
+    //    A write failure is deliberately non-fatal: the measurements above are
+    //    still true, so they are reported with `ledger_path: None`.
+    let ledger_path = write_ledger(repo_path, task, &resp, attested, &check).ok();
+
+    HonestRunReport {
+        files_changed: n.files,
+        cargo_check: cargo_check_label(&check).to_string(),
+        attested_count: attested,
+        numstat_files: n.files,
+        ledger_path,
+    }
+}
+
+/// Append one hash-chained event to `events`, computing its `entry_hash` from
+/// the previous tip via the conformance-tested `chain_hash` primitive.
+///
+/// `prev` is the current chain tip; it is stamped into the event as
+/// `prev_hash` and then advanced to the new event's `entry_hash`.
+fn push_event(
+    events: &mut Vec<Value>,
+    prev: &mut String,
+    mut event: serde_json::Map<String, Value>,
+) {
+    event.insert("prev_hash".into(), Value::String(prev.clone()));
+    // The hash is taken over the event *with* `prev_hash` but *before*
+    // `entry_hash` is inserted, so the stored hash never covers itself.
+    let mut value = Value::Object(event);
+    let hash = chain_hash(&value, None);
+    // Insert in place instead of cloning the whole map back out of `value`.
+    if let Value::Object(obj) = &mut value {
+        obj.insert("entry_hash".into(), Value::String(hash.clone()));
+    }
+    *prev = hash;
+    events.push(value);
+}
+
+/// One korg-ledger@v1 event in the flat on-disk shape the verifier accepts
+/// (see `spec/korg-ledger-v1/vectors/basic-intact.jsonl`).
+///
+/// `triggered_by`, when present, is the `seq_id` of the causing event; the
+/// key is omitted entirely for a root event. `success` and `duration_ms` are
+/// fixed placeholders (`true`, `0`) — callers needing real values must
+/// overwrite them on the returned map.
+fn event(
+    seq: u64,
+    tool: &str,
+    args: Value,
+    result: Value,
+    triggered_by: Option<u64>,
+) -> serde_json::Map<String, Value> {
+    let mut m = serde_json::Map::new();
+    m.insert("schema_version".into(), json!("1.0"));
+    m.insert("seq_id".into(), json!(seq));
+    m.insert("source_agent".into(), json!("agent:korg-run-once"));
+    m.insert("tool_name".into(), json!(tool));
+    m.insert("args".into(), args);
+    m.insert("result".into(), result);
+    m.insert("success".into(), json!(true));
+    m.insert("duration_ms".into(), json!(0));
+    if let Some(tb) = triggered_by {
+        m.insert("triggered_by".into(), json!(tb));
+    }
+    m
+}
+
+/// Build and persist the run's hash-chained journal to
+/// `<repo_path>/.korg/run-once.jsonl`, returning its path. The events form a
+/// well-formed causal DAG (each `triggered_by` references a strictly-earlier
+/// `seq_id`) so both `verify_chain` and `verify_dag` pass.
+///
+/// The `apply_mutations` event is emitted only when `attested > 0`; a run
+/// that changed nothing records prompt → inference → cargo_check, so the
+/// journal never claims a mutation that did not happen.
+///
+/// # Errors
+/// Returns an `std::io::Error` if an event cannot be serialized, the `.korg`
+/// directory cannot be created, or the journal file cannot be written.
+fn write_ledger(
+    repo_path: &Path,
+    task: &str,
+    resp: &korg_llm::LlmResponse,
+    attested: usize,
+    check: &CargoCheck,
+) -> std::io::Result<PathBuf> {
+    let mut events: Vec<Value> = Vec::new();
+    let mut prev = GENESIS_HASH.to_string();
+
+    // 1. The operator's prompt.
+    push_event(
+        &mut events,
+        &mut prev,
+        event(1, "user_prompt", json!({ "prompt": task }), json!({}), None),
+    );
+    // 2. The (hermetic) model inference, with its real token usage.
+    push_event(
+        &mut events,
+        &mut prev,
+        event(
+            2,
+            "llm_inference",
+            json!({ "model": resp.model, "prompt_tokens": resp.usage.prompt_tokens }),
+            json!({ "completion_tokens": resp.usage.completion_tokens }),
+            Some(1),
+        ),
+    );
+    // `last` is the seq_id of the most recent event, so the events that follow
+    // stay contiguous and causally linked whether or not step 3 is recorded.
+    let mut last: u64 = 2;
+    // 3. The applied mutation(s) — recorded only when something really changed.
+    if attested > 0 {
+        push_event(
+            &mut events,
+            &mut prev,
+            event(
+                last + 1,
+                "apply_mutations",
+                // NOTE(review): the path is hard-coded to the fixture's file;
+                // thread the real changed paths through if `--repo` runs can
+                // ever touch anything else.
+                json!({ "path": "src/lib.rs" }),
+                json!({ "files_changed": attested }),
+                Some(last),
+            ),
+        );
+        last += 1;
+    }
+    // 4. The honest compile observation.
+    push_event(
+        &mut events,
+        &mut prev,
+        event(
+            last + 1,
+            "cargo_check",
+            json!({}),
+            json!({ "result": cargo_check_label(check) }),
+            Some(last),
+        ),
+    );
+
+    let dir = repo_path.join(".korg");
+    std::fs::create_dir_all(&dir)?;
+    let path = dir.join("run-once.jsonl");
+    let mut body = String::new();
+    for e in &events {
+        // Propagate serialization failures: writing an empty line instead
+        // would silently corrupt the hash chain.
+        let line = serde_json::to_string(e).map_err(std::io::Error::from)?;
+        body.push_str(&line);
+        body.push('\n');
+    }
+    std::fs::write(&path, body)?;
+    Ok(path)
+}
diff --git a/crates/korg-runtime/tests/run_once.rs b/crates/korg-runtime/tests/run_once.rs new file mode 100644 index 0000000..d5ed647 --- /dev/null +++ b/crates/korg-runtime/tests/run_once.rs @@ -0,0 +1,89 @@ +//! Integration test for `run_once_honest` — the user-facing honest pipeline. +//! +//! Mirrors the keystone (`honest_pipeline.rs`) setup: copy the committed fixture +//! crate into a temp git repo, then drive the *whole* pipeline through one call. +//! Two facts must hold: +//! 
- the fixture task produces exactly one REAL file change that compiles, and
+//!   the attested mutation count equals the real git-diff file count (==1);
+//! - an unrelated task produces an honest null — zero changes, zero attested,
+//!   never a fabricated success.
+
+use korg_runtime::run_once::run_once_honest;
+
+/// Run `git <args>` in `dir` and fail the test loudly if it cannot be spawned
+/// or exits non-zero. Without the status check, a failed `init`/`commit` would
+/// only surface later as a confusing assertion on the report.
+async fn git(dir: &std::path::Path, args: &[&str]) {
+    let out = tokio::process::Command::new("git")
+        .args(args)
+        .current_dir(dir)
+        .output()
+        .await
+        .expect("failed to spawn `git` (is it installed and on PATH?)");
+    assert!(
+        out.status.success(),
+        "`git {}` failed in {}: {}",
+        args.join(" "),
+        dir.display(),
+        String::from_utf8_lossy(&out.stderr)
+    );
+}
+
+/// Copy the committed fixture crate into a fresh temp git repo (the "before"
+/// state) — the exact dance the keystone test uses.
+///
+/// Identity is passed with `-c` on the commit so the test does not depend on
+/// the machine's global git config.
+async fn fixture_repo() -> std::path::PathBuf {
+    let src = concat!(
+        env!("CARGO_MANIFEST_DIR"),
+        "/../../fixtures/honest-demo-repo"
+    );
+    let dir = std::env::temp_dir().join(format!("korg-run-once-{}", uuid::Uuid::new_v4()));
+    std::fs::create_dir_all(dir.join("src")).unwrap();
+    std::fs::copy(format!("{src}/Cargo.toml"), dir.join("Cargo.toml")).unwrap();
+    std::fs::copy(format!("{src}/src/lib.rs"), dir.join("src/lib.rs")).unwrap();
+    git(&dir, &["init", "-q"]).await;
+    git(&dir, &["add", "-A"]).await;
+    git(
+        &dir,
+        &[
+            "-c",
+            "user.email=t@t",
+            "-c",
+            "user.name=t",
+            "commit",
+            "-qm",
+            "base",
+        ],
+    )
+    .await;
+    dir
+}
+
+#[tokio::test]
+async fn fixture_task_attests_one_real_change_that_compiles() {
+    let dir = fixture_repo().await;
+
+    let report = run_once_honest("Fix the add function in src/lib.rs so it adds", &dir).await;
+
+    assert_eq!(report.files_changed, 1, "exactly one real file changed");
+    assert_eq!(report.cargo_check, "Passed", "the applied fix compiles");
+    assert_eq!(report.attested_count, 1, "attested mutation count is 1");
+    assert_eq!(
+        report.attested_count, report.numstat_files,
+        "the attested count equals the real git-diff file count (the SP1 invariant)"
+    );
+    assert!(
+        report.ledger_path.is_some(),
+        "a verifiable ledger was written"
+    );
+
+    // Best-effort cleanup; only reached when every assertion above passed.
+    let _ = std::fs::remove_dir_all(&dir);
+}
+
+#[tokio::test]
+async fn unrelated_task_attests_zero_no_fabrication() {
+    let dir = fixture_repo().await;
+
+    let report = run_once_honest("Write a haiku about the ocean", &dir).await;
+
+    assert_eq!(
+        report.files_changed, 0,
+        "honest null: an unrelated task changes nothing"
+    );
+    assert_eq!(report.attested_count, 0, "honest null: nothing is attested");
+    assert_eq!(
+        report.attested_count, report.numstat_files,
+        "attested count still equals the real diff (both zero) — no fabrication"
+    );
+
+    // Best-effort cleanup; only reached when every assertion above passed.
+    let _ = std::fs::remove_dir_all(&dir);
+}
diff --git a/demo-sim.sh b/demo-sim.sh deleted file mode 100755 index 4550495..0000000 --- a/demo-sim.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash -# korg demo simulation script — called by VHS tape -# Each function prints pre-scripted output with realistic timing - -BOLD="\033[1m" -DIM="\033[2m" -CYAN="\033[36m" -GREEN="\033[32m" -YELLOW="\033[33m" -RED="\033[31m" -BLUE="\033[34m" -RESET="\033[0m" -GRAY="\033[90m" - -korg_version() { - echo -e "${BOLD}korg${RESET} 0.1.0" -} - -korg_run() { - local GOAL="$1" - echo -e "${GRAY}2026-05-23T17:01:00Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg${RESET}: session_id=${GREEN}019e5333-efc9${RESET} mode=${YELLOW}balanced${RESET}" - sleep 0.3 - echo -e "${GRAY}2026-05-23T17:01:00Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::leader${RESET}: spawning_swarm workers=${GREEN}[captain, harper, benjamin, lucas]${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:01Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::registry${RESET}: transition capability=${YELLOW}cognition_mode${RESET} state=${GREEN}balanced${RESET}" - sleep 0.3 - echo -e "${GRAY}2026-05-23T17:01:01Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}1${RESET} event=${YELLOW}TransitionStarted${RESET} actor=coordinator" - sleep 0.35 - echo -e "${GRAY}2026-05-23T17:01:02Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}2${RESET} event=${YELLOW}LeaseAcquired${RESET} 
capability=${CYAN}src/auth.rs${RESET} actor=benjamin" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:03Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::leader${RESET}: captain planning goal=\"${BOLD}${GOAL}${RESET}\"" - sleep 0.5 - echo -e "${GRAY}2026-05-23T17:01:04Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}3${RESET} event=${YELLOW}EffectStarted${RESET} actor=benjamin target=${CYAN}src/auth.rs${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:05Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::arena${RESET}: evaluating trajectory_score=${YELLOW}0.61${RESET} epistemic_score=${YELLOW}0.58${RESET}" - sleep 0.35 - echo -e "${GRAY}2026-05-23T17:01:06Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}4${RESET} event=${YELLOW}EffectCompleted${RESET} actor=benjamin mutations=${CYAN}[src/auth.rs]${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:07Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::evaluator${RESET}: verdict=${RED}REVISE${RESET} semantic_entropy=${RED}0.72${RESET} reason=high_churn_detected" - sleep 0.5 - echo -e "${GRAY}2026-05-23T17:01:08Z${RESET} ${BOLD}${YELLOW}WARN${RESET} ${CYAN}korg::leader${RESET}: revision_requested doom_loop_risk=${YELLOW}moderate${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:09Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}5${RESET} event=${YELLOW}EffectRetrying${RESET} actor=benjamin retry_count=${RED}1${RESET}" - sleep 0.6 -} - -korg_rewind() { - local SEQ="$1" - echo "" - echo -e " ${DIM}Rewinding capability journal to seq=${BOLD}${SEQ}${RESET}${DIM}...${RESET}" - sleep 0.4 - echo -e " ${DIM}Restoring workspace snapshot via git read-tree ${BOLD}(O(1))${RESET}${DIM}...${RESET}" - sleep 0.3 - echo -e " ${DIM}Rebuilding 3 projection read-models...${RESET}" - sleep 0.4 - echo -e " ${DIM}Resetting HLC clock: physical=${BOLD}17:01:03${RESET}${DIM} logical=${BOLD}${SEQ}${RESET}" - sleep 0.5 - echo "" - echo 
-e " ${GREEN}✓${RESET} Rewound to ${BOLD}seq=${SEQ}${RESET} workspace restored clock aligned projections rebuilt" -} - -korg_fork_run() { - local GOAL="$1" - echo -e "${GRAY}2026-05-23T17:01:12Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::leader${RESET}: fork_branch branch_id=${GREEN}b91a4c2e${RESET} from_seq=${GREEN}3${RESET}" - sleep 0.35 - echo -e "${GRAY}2026-05-23T17:01:12Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}4${RESET} event=${YELLOW}EffectStarted${RESET} actor=benjamin target=${CYAN}src/auth.rs${RESET} branch=${GREEN}b91a4c2e${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:14Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::log${RESET}: append seq=${GREEN}5${RESET} event=${YELLOW}EffectCompleted${RESET} actor=benjamin mutations=${CYAN}[src/auth.rs, src/middleware.rs]${RESET}" - sleep 0.4 - echo -e "${GRAY}2026-05-23T17:01:15Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::arena${RESET}: trajectory_score=${GREEN}0.91${RESET} epistemic_score=${GREEN}0.89${RESET} verdict=${GREEN}${BOLD}ACCEPT${RESET}" - sleep 0.5 - echo -e "${GRAY}2026-05-23T17:01:16Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::leader${RESET}: campaign_complete outcome=${GREEN}success${RESET} tx_id=${DIM}019e5334-3ebd${RESET}" - sleep 0.6 - echo -e "${GRAY}2026-05-23T17:01:16Z${RESET} ${BOLD}${BLUE}INFO${RESET} ${CYAN}korg::runtime${RESET}: cleanup session_id=${DIM}019e5333-efc9${RESET} destroyed=0" -} - -case "$1" in - version) korg_version ;; - run) korg_run "$2" ;; - rewind) korg_rewind "$2" ;; - fork) korg_fork_run "$2" ;; - *) echo "usage: korg-demo.sh [version|run|rewind|fork] [arg]" ;; -esac diff --git a/demo.gif b/demo.gif index 5d28630..e9eb03a 100644 Binary files a/demo.gif and b/demo.gif differ diff --git a/demo.mp4 b/demo.mp4 index 0240065..0ce3078 100644 Binary files a/demo.mp4 and b/demo.mp4 differ diff --git a/demo.tape b/demo.tape index 0b1a327..d880a7d 100644 --- a/demo.tape +++ b/demo.tape @@ -1,5 +1,14 @@ -# korg 
demo tape — scripted simulation via demo-sim.sh -# Run with: docker run --rm -v $PWD:/vhs ghcr.io/charmbracelet/vhs demo.tape +# korg demo tape — runs the REAL binary, zero simulation. +# +# Every command below is the actual `korg` / `korg-verify` binary doing real +# work: a real patch applied to a real git repo, a real `cargo check`, a real +# hash-chained korg-ledger@v1 journal independently verified, and a real rewind. +# There is no simulation script — nothing here is pre-scripted output. +# +# Regenerate: cargo build --release && \ +# cargo build --release -p korg-verify && \ +# vhs demo.tape +# (assumes ./target/release/korg and ./target/release/korg-verify exist) Output demo.gif Output demo.mp4 @@ -9,74 +18,72 @@ Set Shell "bash" Set FontSize 14 Set FontFamily "JetBrains Mono" Set Width 1200 -Set Height 600 +Set Height 700 Set Theme "Dracula" Set Padding 20 -Set TypingSpeed 55ms +Set TypingSpeed 50ms Set PlaybackSpeed 1 -# ── Setup ───────────────────────────────────────────────────────────────────── +# ── Setup: put the real binaries on PATH (built ahead of time) ───────────────── Hide -Type "chmod +x /vhs/demo-sim.sh && alias korg-sim='/vhs/demo-sim.sh'" +Type "export PATH=$PWD/target/release:$PATH" +Enter +Type "clear" Enter Show -# ── 1. Version ──────────────────────────────────────────────────────────────── +# ── 1. Version (real binary) ────────────────────────────────────────────────── Sleep 500ms Type "korg --version" Enter -Sleep 200ms -Type "/vhs/demo-sim.sh version" -Enter Sleep 1.5s Ctrl+L -# ── 2. Campaign run ─────────────────────────────────────────────────────────── -Sleep 400ms -Type 'korg "Fix the authentication bug in src/auth.rs" --headless' -Enter +# ── 2. Drive the honest pipeline on a fixture (REAL patch + REAL cargo check) ── Sleep 400ms -Type '/vhs/demo-sim.sh run "Fix the authentication bug in src/auth.rs"' +Type 'korg run-once "Fix the add function in src/lib.rs so it adds"' Enter -Sleep 8s +Sleep 6s Ctrl+L -# ── 3. 
Rewind ───────────────────────────────────────────────────────────────── +# ── 3. Independently verify the emitted korg-ledger@v1 journal (REAL) ────────── +# The run-once output prints the ledger path; verify it with the standalone +# zero-trust verifier — a green verdict proves the hash-chain + DAG are intact. Sleep 400ms -Type "korg rewind --seq 3" +Type 'LEDGER=$(korg run-once "Fix the add function in src/lib.rs so it adds" | grep -o "/[^ ]*run-once.jsonl" | head -1)' Enter -Sleep 400ms -Type "/vhs/demo-sim.sh rewind 3" +Sleep 4s +Type 'korg-verify "$LEDGER"' Enter Sleep 3s +Ctrl+L -# ── 4. Git log shows snapped-back state ─────────────────────────────────────── +# ── 4. Show the real applied change in the fixture worktree (REAL git) ───────── Sleep 400ms -Type "git log --oneline -4" +Type 'cat "$(dirname "$(dirname "$LEDGER")")/src/lib.rs" | head -4' Enter -Sleep 2s +Sleep 3s Ctrl+L -# ── 5. Fork with new strategy ───────────────────────────────────────────────── -Sleep 400ms -Type 'korg "Fix auth using stateless JWT tokens" --headless' -Enter +# ── 5. Real reversible rewind of the capability journal ─────────────────────── +# Operates on this repo's real .korg/execution_journal.jsonl — a real +# hash-chained truncation + re-seal, not a scripted message. Sleep 400ms -Type '/vhs/demo-sim.sh fork "Fix auth using stateless JWT tokens"' +Type "korg rewind --seq 1" Enter -Sleep 6s +Sleep 4s Ctrl+L # ── 6. Final card ───────────────────────────────────────────────────────────── Sleep 400ms Type "echo ''" Enter -Type "echo 'Every AI decision. Logged. Causally ordered. Reversible.'" +Type "echo 'Every AI decision. Recorded. Hash-chained. 
Independently verifiable.'" Enter Sleep 600ms Type "echo ''" Enter -Type "echo 'The first deterministic cognitive runtime.'" +Type "echo 'A tamper-evident ledger for AI agent cognition — no trust in the tool.'" Enter Sleep 600ms Type "echo ''" diff --git a/demo.webp b/demo.webp index 50dc795..3a0c242 100644 Binary files a/demo.webp and b/demo.webp differ diff --git a/docs/superpowers/specs/2026-06-14-korg-swarm-sp4-honest-demo-design.md b/docs/superpowers/specs/2026-06-14-korg-swarm-sp4-honest-demo-design.md new file mode 100644 index 0000000..0dafff1 --- /dev/null +++ b/docs/superpowers/specs/2026-06-14-korg-swarm-sp4-honest-demo-design.md @@ -0,0 +1,66 @@ +# Korg Swarm — SP4: Honest Demo + visible honest pipeline (Track B) + +**Status:** Design+plan / approved-by-delegation ("do it all now"), grounded against real code 2026-06-14 +**Branch:** `feat/swarm-honest-demo` (stacked on `feat/swarm-honest-pipeline`) +**Sub-project:** SP4 of Track B. + +> **Pivotal grounding finding:** the orchestrated `korg campaign` does NOT do real work — its 4 worker children idle 20s, get killed (`exit -1`), get a *faked* recovery, and attest `total_mutations_so_far: 0`. Benjamin's DAG package is literally `"Implement (simulate-crash): …"`. The SP1 honest pipeline is real but only fires *below* this broken orchestration. So an honest demo **cannot record the campaign** — it must expose the honest pipeline through a **real, working entrypoint**. Fixing the campaign itself is SP2 territory. + +## 1. Goal +Make SP1's honest pipeline **user-visible and runnable**, and replace the faked hero demo with one that records the **real binary** doing real, verifiable work. + +## 2. Deliverables (3 honest beats, zero simulation) + +### D1 — `korg run-once` (the real entrypoint) — the headline +A new `Commands::RunOnce { task, repo }` subcommand that drives the SP1 honest pipeline visibly: +1. 
Resolve a target repo (default: a temp copy of `fixtures/honest-demo-repo`, `git init`+commit — the exact dance `crates/korg-runtime/tests/honest_pipeline.rs:50-73` already does; reuse it). +2. Run the honest pipeline for Benjamin on `task` using the default `DeterministicProvider`: build the Benjamin system prompt (so `role_marker` → "benjamin"), `provider.complete` → parse → `observation::apply_mutations` → `numstat` → `cargo_check` → `honest_metrics`. +3. Print an honest attestation block: `files_changed`, `cargo check` result (PASSED/FAILED/UNAVAILABLE), and **"attested mutation count = N (== real git diff)"**. +4. Write a verifiable **korg-ledger@v1** ledger of the events (reuse the existing producer path so `korg-verify` / the in-browser verifier accept it), printing its path. + +**Honesty guardrails:** if the task isn't the fixture task, `DeterministicProvider` returns an honest-null → run-once prints `files_changed=0, attested 0` truthfully (never fabricates). The output must equal what the keystone test asserts. + +### D2 — README honesty fixes +- `README.md:12` hero alt text: drop the false **"fork"** claim → e.g. "record, verify, and rewind an AI agent session as a hash-chained ledger". +- Soften/remove **"fork"** in README:48 and the phantom `korg fork` / `korg checkpoints list|restore` commands (README:181-192) that **do not exist** in `enum Commands`. Either delete those lines or mark them "planned". (No `korg fork` is built in SP4 — see §4.) + +### D3 — Honest demo recording +- Rewrite `demo.tape`: type AND run the **real** binary — `korg run-once` (real attestation) then real `korg rewind --seq N` on a real journal, then `git log`/verify showing the snap-back. Remove every `Type "/vhs/demo-sim.sh …"` line. +- Delete `demo-sim.sh` (the fabrication source) so the tape can't be re-pointed at it. +- Regenerate `demo.gif`/`demo.mp4`/`demo.webp` with VHS+ffmpeg (installed locally). Retune `Sleep` to real command durations. 
*(If GIF regeneration is environment-blocked in this run, ship D1+D2 + the rewritten `.tape` + deleted sim, and note the GIF as a manual re-record step — never ship a regenerated GIF that still embeds sim output.)* + +## 3. Plan (TDD, bite-sized) + +**Build order:** D1 (run-once, the substance) → D2 (README) → D3 (tape/sim/GIF). + +### Task 1 — `korg run-once` subcommand (TDD via an integration test) +- **Files:** `src/main.rs` (add `Commands::RunOnce { task, repo }` + handler); a new `korg-runtime` helper `pub async fn run_once_honest(task, repo_path) -> HonestRunReport` in a new `crates/korg-runtime/src/run_once.rs` (so the logic is testable without the CLI); `crates/korg-runtime/tests/run_once.rs` (integration test). +- **RED:** test asserts `run_once_honest("Fix the add function in src/lib.rs so it adds", )` returns a report with `files_changed == 1`, `cargo_check == Passed`, `attested_count == 1`, `attested_count == numstat_files`. (Mirror `honest_pipeline.rs` setup.) +- **GREEN:** implement `run_once.rs` reusing `korg_llm::DeterministicProvider` + `korg_runtime::observation::*` + `korg_runtime::personas::parse_structured_response`; build the Benjamin system prompt via `load_prompt_for_persona`/the persona's marker so role resolves to "benjamin". Write the korg-ledger@v1 ledger via the existing writer. +- Wire `Commands::RunOnce` in `main.rs` to call it and pretty-print the report. +- Commit `feat(cli): korg run-once — drive the honest pipeline visibly on a fixture`. + +### Task 2 — README honesty +- Edit `README.md:12`, `:48`, `:181-192` per D2. No code. Commit `docs: drop phantom fork/checkpoints claims; honest hero alt text`. + +### Task 3 — honest tape + delete sim +- Rewrite `demo.tape` (real commands only), `git rm demo-sim.sh`. Commit `chore(demo): honest tape runs the real binary; delete demo-sim.sh`. +- Attempt GIF regen (`vhs demo.tape`); if it produces clean real output, commit the new assets. 
If blocked, commit the tape+sim-deletion and leave a `demo/RERECORD.md` note. + +## 4. Scope / decisions (autonomous, per "do it all") +- **Cut the fork beat.** A real `korg fork` is a separate M-L effort (checkpoint primitives exist `pub(crate)` but nothing wires `branch_id`); building it is out of SP4. README softened to "rewind" only. +- **run-once, not campaign.** The campaign attests 0 (broken workers); fixing it is **SP2**. run-once is the smallest honest spine that shows real attested work. +- **Reuse the keystone setup** (temp fixture + git init) verbatim — it's proven. +- Leave the scripted `korg demo` (math_utils.py time-travel, main.rs:1783) as-is for now; note it as a separate fabrication to address. + +## 5. Verification +- `cargo test -p korg-runtime --test run_once` passes (attested == real diff == 1). +- `korg run-once "Fix the add function in src/lib.rs so it adds"` prints `files_changed=1 · cargo check=PASSED · attested mutation count=1` and a ledger path; `korg-verify ` accepts it. +- `korg run-once "something unrelated"` prints `files_changed=0 · attested 0` (honest-null), no fabrication. +- `grep -rn "fork" README.md` shows no remaining false capability claim; `demo-sim.sh` gone; `demo.tape` contains no `demo-sim.sh` reference. +- Full `cargo test --workspace` green; fmt+clippy clean. + +## 6. Out of scope (later) +- Fix the campaign worker path so the orchestrated swarm does real work (**SP2** — its hidden prerequisite). +- Real `korg fork` / `korg checkpoints` CLI. +- Making `korg demo` (math_utils) real. diff --git a/src/main.rs b/src/main.rs index eabd5d6..30f6547 100644 --- a/src/main.rs +++ b/src/main.rs @@ -238,6 +238,19 @@ enum Commands { seq: u64, }, + /// Drive the SP1 honest pipeline visibly on a fixture: real patch → real + /// `cargo check` → attested mutation count that equals the real git diff. + /// Never fabricates: an unrelated task yields an honest null (attested 0). 
+ RunOnce { + /// The task to run (the fixture task "Fix the add function in src/lib.rs + /// so it adds" produces a real, compiling patch; anything else → honest null). + task: String, + + /// Target repo. Defaults to a temp git-inited copy of the bundled fixture. + #[arg(long)] + repo: Option, + }, + /// Run the premium Claude Code cooperative session replay and speculative rewind demo Demo, @@ -797,6 +810,10 @@ async fn main() -> Result<()> { } } + Commands::RunOnce { task, repo } => { + run_once_command(task, repo).await?; + } + Commands::Demo => { if let Err(e) = run_demo_internal(None).await { eprintln!("\x1b[38;2;255;0;180m❌ Demo failed: {}\x1b[0m", e); @@ -997,6 +1014,113 @@ async fn main() -> Result<()> { Ok(()) } +/// Run the SP1 honest pipeline once and pretty-print the attestation. With no +/// `--repo`, a temp git-inited copy of the bundled fixture is used so the demo +/// is self-contained and reproducible. The printed "attested mutation count" +/// equals the real git-diff file count by construction — the SP1 invariant made +/// visible. An unrelated task prints `files_changed=0 · attested 0` (honest null). 
+///
+/// # Errors
+///
+/// Returns an error only when no `--repo` was given and the bundled fixture
+/// repo could not be prepared. A diverging attestation is reported on stdout
+/// but still returns `Ok(())`.
+async fn run_once_command(task: String, repo: Option<std::path::PathBuf>) -> Result<()> {
+    let cyan = "\x1b[38;2;0;240;255m";
+    let green = "\x1b[38;2;0;255;128m";
+    let pink = "\x1b[38;2;255;0;180m";
+    let slate = "\x1b[38;2;120;125;140m";
+    let bold = "\x1b[1m";
+    let reset = "\x1b[0m";
+
+    // Resolve the target repo. A plain `PathBuf` has no cleanup-on-drop, so there
+    // is no guard to keep alive here: the temp fixture simply stays on disk.
+    let repo_path = match repo {
+        Some(path) => path,
+        None => {
+            let dir = prepare_fixture_repo().await?;
+            println!(
+                "{slate}├──{reset} Using temp fixture repo: {cyan}{}{reset}",
+                dir.display()
+            );
+            dir
+        }
+    };
+
+    println!("{slate}└──{reset} Task: {bold}{cyan}{}{reset}\n", task);
+
+    let report = korg_runtime::run_once::run_once_honest(&task, &repo_path).await;
+
+    let check_color = if report.cargo_check == "Passed" {
+        green
+    } else {
+        pink
+    };
+    let check_upper = report.cargo_check.to_uppercase();
+
+    // Print the comparator that is actually true, so a diverging run is never
+    // rendered as "== real git diff".
+    let invariant_holds = report.attested_count == report.numstat_files;
+    let relation = if invariant_holds { "==" } else { "!=" };
+
+    println!("{bold}{cyan}=== HONEST ATTESTATION ==={reset}");
+    println!(
+        " files_changed={bold}{}{reset} · cargo check={check_color}{check_upper}{reset} · attested mutation count={bold}{}{reset} ({relation} real git diff: {})",
+        report.files_changed, report.attested_count, report.numstat_files
+    );
+    if invariant_holds {
+        println!(
+            " {green}✓{reset} attested count == real git-diff file count (SP1 invariant holds)"
+        );
+    } else {
+        println!(" {pink}✗ attested count diverges from the real diff{reset}");
+    }
+    if report.files_changed == 0 {
+        println!(" {slate}honest null: no fabricated mutations for this task{reset}");
+    }
+    // The ledger path is optional on the report; only advertise verification
+    // when a ledger was actually produced.
+    if let Some(path) = &report.ledger_path {
+        println!(
+            "\n{slate}├──{reset} Verifiable ledger (korg-ledger@v1): {cyan}{}{reset}",
+            path.display()
+        );
+        println!(
+            "{slate}└──{reset} Verify with: {bold}korg-verify {}{reset}",
+            path.display()
+        );
+    }
+
+    Ok(())
+}
+
+/// Copy the bundled `fixtures/honest-demo-repo` into a fresh temp git repo (the
+/// "before" state) — the exact dance the keystone test and the run_once
+/// integration test use, so the demo and the tests agree byte-for-byte.
+///
+/// # Errors
+///
+/// Returns an error when the bundled fixture is missing, when creating the temp
+/// directory or copying a fixture file fails, or when any `git` step cannot be
+/// spawned or exits with a non-zero status.
+async fn prepare_fixture_repo() -> Result<std::path::PathBuf> {
+    let src = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("fixtures/honest-demo-repo");
+    if !src.join("src/lib.rs").exists() {
+        return Err(anyhow::anyhow!(
+            "bundled fixture not found at {} — pass --repo to run against your own repo",
+            src.display()
+        ));
+    }
+    let dir = std::env::temp_dir().join(format!("korg-run-once-{}", uuid::Uuid::new_v4()));
+    std::fs::create_dir_all(dir.join("src"))?;
+    std::fs::copy(src.join("Cargo.toml"), dir.join("Cargo.toml"))?;
+    std::fs::copy(src.join("src/lib.rs"), dir.join("src/lib.rs"))?;
+
+    /// Run one `git` command in `dir`, failing if it cannot be spawned or exits
+    /// non-zero. `output()` only reports spawn/IO errors, so the exit status has
+    /// to be checked explicitly; otherwise a failed `commit` would leave a repo
+    /// with no base commit and nobody would notice.
+    async fn git(dir: &std::path::Path, args: &[&str]) -> Result<()> {
+        let output = tokio::process::Command::new("git")
+            .args(args)
+            .current_dir(dir)
+            .output()
+            .await?;
+        if !output.status.success() {
+            return Err(anyhow::anyhow!(
+                "`git {}` failed in {} ({}): {}",
+                args.join(" "),
+                dir.display(),
+                output.status,
+                String::from_utf8_lossy(&output.stderr).trim()
+            ));
+        }
+        Ok(())
+    }
+    git(&dir, &["init", "-q"]).await?;
+    git(&dir, &["add", "-A"]).await?;
+    // Identity is passed inline so the commit does not depend on the user's
+    // global git config having user.name / user.email set.
+    git(
+        &dir,
+        &[
+            "-c",
+            "user.email=korg@korg",
+            "-c",
+            "user.name=korg",
+            "commit",
+            "-qm",
+            "base",
+        ],
+    )
+    .await?;
+    Ok(dir)
+}
 
 pub async fn run_developer_shell(_mode: String) -> Result<()> {
     let cyan = "\x1b[38;2;0;240;255m";
     let pink = "\x1b[38;2;255;0;180m";