From 997e5b057ff6fe2b327b38c11f153372eb74429c Mon Sep 17 00:00:00 2001 From: Rui Figueira Date: Thu, 21 May 2026 19:57:30 +0100 Subject: [PATCH 1/6] Added screenshot() method and Workers AI integration for vision-based bot navigation --- doom-player/src/bot/context.ts | 55 +++ doom-player/src/bot/runner.ts | 30 +- doom-player/src/client/app.tsx | 610 ++++++++++++++++++++++++++ doom-player/src/index.ts | 52 +++ doom-player/worker-configuration.d.ts | 6 +- doom-player/wrangler.jsonc | 3 + 6 files changed, 753 insertions(+), 3 deletions(-) diff --git a/doom-player/src/bot/context.ts b/doom-player/src/bot/context.ts index 605553a..96972c8 100644 --- a/doom-player/src/bot/context.ts +++ b/doom-player/src/bot/context.ts @@ -245,6 +245,61 @@ export class BotContext { this.#onLog(line); } + /** + * Capture a pixel-perfect 320x200 PNG of the current frame, as + * base64-encoded bytes plus mime type. Useful for piping into a + * vision-capable LLM (see `ai.run(...)` in the sandbox). + */ + async screenshot(): Promise<{ data: string; mimeType: string }> { + const res = await this.#webmcp.invoke("get_screenshot", {}); + if (res.status !== "Completed") { + throw new Error( + `get_screenshot failed: status=${res.status} ${res.errorText ?? res.exception?.description ?? ""}`, + ); + } + // get_screenshot returns `{ content: [{ type: "image", data, + // mimeType }, { type: "text", text }] }`. The MCP envelope can + // be wrapped one extra layer by the CDP transport, mirroring + // peelTextEnvelope in mcpPayload.ts; we walk at most a couple + // of layers looking for the image item. 
+ let current: unknown = res.output; + for (let depth = 0; depth < 4; depth++) { + if (!current || typeof current !== "object") break; + const content = (current as { content?: unknown }).content; + if (Array.isArray(content)) { + const img = content.find( + (c): c is { type: "image"; data: string; mimeType: string } => + !!c && + typeof c === "object" && + (c as { type?: unknown }).type === "image" && + typeof (c as { data?: unknown }).data === "string" && + typeof (c as { mimeType?: unknown }).mimeType === "string", + ); + if (img) return { data: img.data, mimeType: img.mimeType }; + // Descend through a nested text-encoded envelope, if any. + const text = content.find( + (c): c is { type: "text"; text: string } => + !!c && + typeof c === "object" && + (c as { type?: unknown }).type === "text" && + typeof (c as { text?: unknown }).text === "string", + ); + if (text) { + try { + current = JSON.parse(text.text); + continue; + } catch { + break; + } + } + } + break; + } + throw new Error( + `get_screenshot returned no image content. Raw: ${safeStringify(res.output).slice(0, 200)}`, + ); + } + /** Read-only snapshot of how the bot has used the context so far. */ stats(): Readonly<{ stateReads: number; diff --git a/doom-player/src/bot/runner.ts b/doom-player/src/bot/runner.ts index 47e123b..ecdab88 100644 --- a/doom-player/src/bot/runner.ts +++ b/doom-player/src/bot/runner.ts @@ -41,6 +41,15 @@ export interface RunBotOptions extends BotContextOptions { loader: WorkerLoader; /** Live WebMCP client. Must have completed its preroll. */ webmcp: WebMCPClient; + /** + * Optional Workers AI binding. When supplied, the sandbox gets an + * extra namespace `ai` with a single `run(model, input, options?)` + * method that proxies straight to `env.AI.run`. When omitted (e.g. + * the `ai` block is removed from wrangler.jsonc) the namespace is + * not registered, so calls like `await ai.run(...)` simply throw a + * "ai is not defined" reference error in the sandbox. 
+ */ + ai?: Ai; /** * Hard timeout for the bot, in milliseconds. Defaults to 60_000. * Note: this is enforced *inside* the sandboxed worker by codemode, @@ -91,10 +100,29 @@ export async function runBot(opts: RunBotOptions): Promise { sleep: async (ms: unknown) => ctx.sleep(typeof ms === "number" ? ms : Number(ms) || 0), log: async (...args: unknown[]) => ctx.log(...args), + screenshot: async () => ctx.screenshot(), }, positionalArgs: true, }; + const providers: ResolvedProvider[] = [provider]; + + if (opts.ai) { + const ai = opts.ai; + providers.push({ + name: "ai", + fns: { + run: async (model: unknown, input: unknown, options?: unknown) => + (ai.run as (m: string, i: unknown, o?: unknown) => Promise)( + String(model), + input, + options, + ), + }, + positionalArgs: true, + }); + } + const executor = new DynamicWorkerExecutor({ loader: opts.loader, timeout: opts.timeoutMs ?? 60_000, @@ -102,7 +130,7 @@ export async function runBot(opts: RunBotOptions): Promise { globalOutbound: null, }); - const exec = await executor.execute(opts.code, [provider]); + const exec = await executor.execute(opts.code, providers); return { ok: !exec.error, diff --git a/doom-player/src/client/app.tsx b/doom-player/src/client/app.tsx index 8d97a58..acf2545 100644 --- a/doom-player/src/client/app.tsx +++ b/doom-player/src/client/app.tsx @@ -466,6 +466,615 @@ const INSPECT_BOT = `// Dump a single state snapshot and quit. await bot.log(JSON.stringify(await bot.getState(), null, 2)); `; +// Vision-LLM bot: opens the automap, screenshots it, asks Workers AI +// where to go, and moves in that direction. Requires the optional +// `ai` namespace (host worker must have the AI binding configured). +// +// API used: +// await bot.screenshot() -> { data: base64-png, mimeType } +// await ai.run(model, input) +const AI_NAV_BOT = `// Hybrid AI navigation with closed-loop steering. 
+// +// One macro consult: open the automap, screenshot it, ask the vision +// LLM for a *player-relative* turn (AHEAD / SOFT_LEFT / HARD_LEFT / +// SOFT_RIGHT / HARD_RIGHT / BACK). Combine it with the player's current +// facing to compute a target world bearing. +// +// Many micro ticks: read get_state every tick, compute the angular +// error between the current player.angle_deg and the target bearing, +// and correct it (turn left/right) before walking forward. Combat / +// door / pinned / stuck / wall overrides preempt steering. +// +// Idea: the LLM is slow and expensive, so we only use it to answer the +// hard question — "given the whole map, which way should I be heading?" +// Everything else (shooting enemies in the FOV, opening doors, not +// walking into walls) is done locally from the raycast + thing data +// in get_state, tick-by-tick. +// +// Requires the host worker to have the optional Workers AI binding +// configured (see doom-player wrangler.jsonc \`ai\` block). Without +// it, \`ai\` is undefined in this sandbox and the call below throws. + +const MODEL = "@cf/meta/llama-3.2-11b-vision-instruct"; +const STEPS = 5; // macro consults of the vision LLM +const TICKS_PER_MACRO = 24; // micro ticks between consults +const TICK_MS = 200; +const TURN_TOLERANCE_DEG = 18; // dead-band: don't bother correcting <18° +const TURN_HOLD_MIN_MS = 140; // shortest turn tap +const TURN_HOLD_MAX_MS = 500; // longest single turn tap (big errors) +// Doom's player.momx / player.momy are LAGGY (see the \`get_state\` tool +// description in src/app/lib/webmcp.tsx): they trail keydown -> ticcmd +// -> thrust by 1-2 engine tics, so the first state read after a +// press_key('up') routinely returns mom=(0,0) even though the player +// is already moving. We instead track the change in player.x / player.y +// between consecutive get_state calls — position is current the tic +// it's sampled, so a non-zero pos delta is ground truth that the player +// actually moved. 
+const STUCK_POS_EPS = 4; // units of pos delta below this = stuck +const STUCK_TICKS = 3; // consecutive stuck ticks before unsticking +// Kept for the rare "I want a real-speed estimate" path; not used for +// wedge detection any more. +const STUCK_MOM_EPS = 0.5; + +// @cf/meta/llama-3.2-11b-vision-instruct requires that you agree with their terms +await ai.run(MODEL, { prompt: "agree" }).catch(() => {}); + +// ── Heading helpers ────────────────────────────────────────────────── +// +// Earlier versions of this bot treated the automap as world-axis- +// aligned and asked the LLM for a screen-space 3x3 cell, then converted +// that to an absolute world bearing. That was wrong in practice: the +// player ARROW on the automap rotates with the player's facing, so the +// LLM naturally reads the map relative to the arrow ("the open corridor +// is ahead and to the right of the player"). Converting its answer as +// an absolute world direction made the bot turn the wrong way whenever +// the player wasn't already facing north. +// +// We now ask the LLM for a *player-relative* turn (AHEAD / SOFT_LEFT / +// HARD_LEFT / SOFT_RIGHT / HARD_RIGHT / BACK) and compute the target +// world bearing as \`player.angle + relative_offset\`. No screen-to-world +// conversion needed. +const RELATIVE_TURNS = { + AHEAD: 0, + SOFT_LEFT: 45, + HARD_LEFT: 90, + BACK: 180, + HARD_RIGHT: -90, + SOFT_RIGHT: -45, +}; + +// Signed angular delta in degrees, result in (-180, 180]. +function angleDelta(targetDeg, currentDeg) { + let d = (targetDeg - currentDeg) % 360; + if (d > 180) d -= 360; + if (d <= -180) d += 360; + return d; +} + +function normalizeAngle(deg) { + return ((deg % 360) + 360) % 360; +} + +// Cardinal label for a world bearing (Doom convention: 0°=E, 90°=N). +// Lets us log human-readable directions alongside raw degrees so traces +// stay readable without doing degree arithmetic in your head. 
+function compassLabel(deg) { + const d = normalizeAngle(deg); + const labels = [ + ["E", 0], ["NE", 45], ["N", 90], ["NW", 135], + ["W", 180], ["SW", 225], ["S", 270], ["SE", 315], + ]; + let best = labels[0]; + let bestDelta = 360; + for (const [name, ref] of labels) { + const delta = Math.min( + Math.abs(d - ref), + 360 - Math.abs(d - ref), + ); + if (delta < bestDelta) { + bestDelta = delta; + best = [name, ref]; + } + } + return best[0]; +} + +// Return the engine raycast whose bearing is closest to an arbitrary +// player-relative bearing (nearest neighbour, no interpolation). Used +// by the veto check after the LLM picks a turn — if that ray is a close +// wall the pick is overridden rather than walking into geometry. +function predictRayAt(rays, relBearingDeg) { + if (!rays || rays.length === 0) return null; + // Find the single nearest ray by bearing. 
\` (\${r.thing_type})\` : ""; + return \` \${sign}\${r.bearing_deg.toFixed(0).padStart(3, " ")}°: \${r.hit} @ \${r.distance.toFixed(0)}\${extra}\`; + }).join("\\n"); + + const things = (state.things_visible || []).slice(0, 8).map((t) => { + const sign = t.bearing_deg >= 0 ? "+" : ""; + return \`\${t.type} @ \${sign}\${t.bearing_deg.toFixed(0)}°/\${t.distance.toFixed(0)}\`; + }).join(", "); + + const enemies = (state.enemies_visible || []).map( + (e) => \`\${e.type} @ \${e.bearing}/\${e.distance}\`, + ).join(", "); + + const hud = state.hud; + const hudStr = \`hp=\${hud.health} armor=\${hud.armor} ammo=\${hud.ammo}(\${hud.ammo_type}) weapon=\${hud.weapon} keys=[\${(hud.keys || []).join(",")}]\`; + + const historyLines = []; + if (history && history.recent && history.recent.length > 0) { + for (const h of history.recent) { + historyLines.push( + \` macro \${h.macro}: picked \${h.turn}, moved \${h.dist.toFixed(0)} units\${h.pinned ? " (pinned)" : ""}\`, + ); + } + } + + return [ + "ENGINE STATE (all bearings are relative to player facing; -=left, +=right):", + \` facing: \${facingStr}\`, + \` pose: \${poseStr}\`, + \` hud: \${hudStr}\`, + " forward-cone raycasts:", + rayLines || " (none)", + \` things visible: \${things || "(none)"}\`, + \` enemies visible: \${enemies || "(none)"}\`, + historyLines.length > 0 + ? "RECENT MACRO HISTORY:\\n" + historyLines.join("\\n") + : "RECENT MACRO HISTORY: (this is the first macro)", + ].join("\\n"); +} + +// Convert base64 PNG -> number[] (Workers AI vision input shape). +function base64ToBytes(b64) { + const bin = atob(b64); + const out = new Array(bin.length); + for (let i = 0; i < bin.length; i++) out[i] = bin.charCodeAt(i); + return out; +} + +// Capture the automap as a screenshot. The automap is a toggle (Tab), +// so we open it, wait a tic for redraw, snap, then close it again so +// regular movement keys go back to controlling the player. 
+async function snapAutomap(currentScreen) { + if (currentScreen !== "automap") { + await bot.press("tab"); + await bot.sleep(150); + } + const shot = await bot.screenshot(); + await bot.press("tab"); + await bot.sleep(100); + return shot; +} + +// Ask the vision LLM which way to turn next, relative to the player +// arrow on the automap. The prompt fuses the rendered automap (image) +// with a structured digest of the engine state (text), so the model +// can cross-reference what it "sees" with what raycasts actually show. +async function askMacroTurn(state, shot, history) { + const stateBlock = buildStatePrompt(state, history); + const instructions = + "You are guiding a DOOM bot. The IMAGE is the in-game automap: " + + "white lines are explored walls; the small white triangle at the " + + "centre is the PLAYER and its tip points the way the player faces. " + + "Black space adjacent to white walls is unexplored territory.\\n\\n" + + stateBlock + + "\\n\\nDecide where the player should head next, *relative to the " + + "arrow's current facing*. Use BOTH the map (for big-picture " + + "exploration) AND the raycasts (for what's physically reachable " + + "this second). Prefer a direction where the raycasts are open " + + "(distance > 100, hit != wall). Avoid picking a direction the " + + "raycasts show as a close wall. 
If the recent history shows the " + + "bot was pinned moving the same way, pick a DIFFERENT direction " + + "this time.\\n\\n" + + "Reply with EXACTLY one token from this set, nothing else: " + + "AHEAD, SOFT_LEFT, HARD_LEFT, SOFT_RIGHT, HARD_RIGHT, BACK.\\n" + + " AHEAD = keep current facing\\n" + + " SOFT_LEFT = rotate ~45° counter-clockwise\\n" + + " HARD_LEFT = rotate ~90° counter-clockwise\\n" + + " SOFT_RIGHT = rotate ~45° clockwise\\n" + + " HARD_RIGHT = rotate ~90° clockwise\\n" + + " BACK = turn around (~180°)"; + + const t0 = Date.now(); + const resp = await ai.run(MODEL, { + image: base64ToBytes(shot.data), + prompt: instructions, + max_tokens: 12, + }); + const elapsedMs = Date.now() - t0; + const text = (resp && typeof resp === "object" && typeof resp.response === "string") + ? resp.response.trim() + : String(resp); + const upper = text.toUpperCase(); + // Longest tokens first so "HARD_LEFT" doesn't match the "LEFT" branch. + let pick = "AHEAD"; + for (const k of ["HARD_LEFT", "HARD_RIGHT", "SOFT_LEFT", "SOFT_RIGHT", "BACK", "AHEAD"]) { + if (upper.includes(k)) { pick = k; break; } + } + // Log the full prompt + raw response so the trace alone is enough + // to reconstruct what the LLM saw and replied. + await bot.log(\` ai prompt (\${instructions.length} chars):\`); + for (const line of instructions.split("\\n")) await bot.log(\` | \${line}\`); + await bot.log(\` ai (\${elapsedMs}ms) raw=\${JSON.stringify(text)} -> \${pick}\`); + return { pick, elapsedMs, rawText: text, prompt: instructions }; +} + +// Safety net: if the LLM's chosen direction lands on a close wall +// according to the engine's raycasts, override it. The veto returns +// either the original pick (no change) or a replacement that points +// at the deepest open ray; logging shows which case fired. +function vetoTurn(pick, state) { + const offset = RELATIVE_TURNS[pick] ?? 
0; + const rays = state.raycasts || []; + if (rays.length === 0) return { pick, vetoed: false }; + // Forward-cone rays only; BACK is never vetoed (we trust the LLM + // on "turn around" because the engine's forward rays say nothing + // about what's behind the player). + if (pick === "BACK") return { pick, vetoed: false }; + // Ray bearings are -=left/+=right; RELATIVE_TURNS is left-positive. + const sample = predictRayAt(rays, -offset); + if (!sample) return { pick, vetoed: false }; + // The "blocked" threshold is generous on purpose — we only veto + // when the engine is very confident the chosen lane is unwalkable. + if (sample.hit !== "wall" || sample.distance >= 40) { + return { pick, vetoed: false, sample }; + } + + // Pick the farthest-reaching ray and translate its bearing (-=left, + // +=right, as in buildStatePrompt / microTick) back into a label. + let best = rays[0]; + for (const r of rays) if (r.distance > best.distance) best = r; + let replacement = "AHEAD"; + const b = best.bearing_deg; + if (b >= 67) replacement = "HARD_RIGHT"; + else if (b >= 22) replacement = "SOFT_RIGHT"; + else if (b <= -67) replacement = "HARD_LEFT"; + else if (b <= -22) replacement = "SOFT_LEFT"; + // If the deepest open ray is *also* close, fall back to BACK. + if (best.hit === "wall" && best.distance < 40) replacement = "BACK"; + return { + pick: replacement, + vetoed: true, + sample, + reason: \`ray at \${-offset}° is \${sample.hit}@\${sample.distance.toFixed(0)}; deepest ray = \${b.toFixed(0)}°@\${best.distance.toFixed(0)} (\${best.hit})\`, + }; +} + +// Convert a player-relative turn label into an absolute target world +// bearing using the player's current facing. +function turnToTargetBearing(turn, player) { + if (!player) return null; + const offset = RELATIVE_TURNS[turn] ?? 0; + // \`offset\` is in Doom's CCW-positive convention (LEFT = +45°). + return normalizeAngle(player.angle_deg + offset); +} + +// Scale turn-tap duration with the absolute heading error so big +// macro errors don't take 20 ticks to close. 
+function turnHoldFor(errDeg) { + const abs = Math.min(180, Math.abs(errDeg)); + // Linear: TURN_HOLD_MIN_MS at 18° (tolerance), TURN_HOLD_MAX_MS at 90°+. + const t = Math.min(1, Math.max(0, (abs - TURN_TOLERANCE_DEG) / (90 - TURN_TOLERANCE_DEG))); + return Math.round(TURN_HOLD_MIN_MS + t * (TURN_HOLD_MAX_MS - TURN_HOLD_MIN_MS)); +} + +// Doom's "use" is a toggle on doors — every press flips the door's +// open/close state. Hammering use@30ms every tick means we keep +// closing the door we just opened. Block consecutive use presses +// for this many ticks so the engine has time to animate the door. +const USE_COOLDOWN_TICKS = 8; +let _useCooldown = 0; + +// Deterministic one-tick policy. Combat / doors preempt steering. +// Wall avoidance is biased toward the macro target side. The caller +// hands us a stuck counter (derived from position delta — see the +// STUCK_POS_EPS note above) and an \`actuallyMoving\` flag so we can +// distinguish "engine says mom=0 but we just teleported 60 units" from +// "engine says mom=0 and we genuinely haven't moved". +// Returns a short string describing which branch fired, so the caller +// can log it for offline analysis ("why did the bot do X on tick Y?"). +async function microTick(state, targetBearing, stuckTicks, actuallyMoving) { + if (_useCooldown > 0) _useCooldown -= 1; + if (state.screen !== "playing") { + await bot.press("enter"); + return \`menu(enter) screen=\${state.screen}\`; + } + + // 1. Combat: shoot centred enemies, turn toward off-centre ones. + const enemies = state.enemies_visible || []; + const centred = enemies.find((e) => e.bearing === "center"); + if (centred) { + await bot.press("fire", 200); + return \`fire @\${centred.type}\`; + } + const turnTowardEnemy = enemies.find((e) => e.bearing === "left" || e.bearing === "far_left") + ? "left" + : enemies.find((e) => e.bearing === "right" || e.bearing === "far_right") + ? 
"right" + : null; + if (turnTowardEnemy) { + await bot.press(turnTowardEnemy, 120); + return \`face-enemy \${turnTowardEnemy}\`; + } + + // 2. Doors / switches close ahead -> activate and step through. + // Two guards before pressing \`use\`: + // a. the door must be near-centred in the FOV (otherwise we're + // not actually facing it; let steering align us first). + // b. respect a cooldown — \`use\` toggles the door, so spamming + // it every tick keeps re-closing what we just opened. + const rays = state.raycasts || []; + const fwd = rays.find((r) => Math.abs(r.bearing_deg) < 10); + if ( + fwd && + (fwd.hit === "door" || fwd.hit === "switch") && + fwd.distance < 80 && + Math.abs(fwd.bearing_deg) < 8 && + _useCooldown === 0 + ) { + // Single short tap; door animation runs even while we walk + // forward, so don't burn ticks holding the key. + await bot.press("use", 30); + await bot.press("up", 200); + _useCooldown = USE_COOLDOWN_TICKS; + return \`use \${fwd.hit}@\${fwd.distance.toFixed(0)} (cooldown=\${USE_COOLDOWN_TICKS})\`; + } + + // 3. Pinned: every nearby forward ray is a close wall. Pure rotation + // won't help — the player needs to physically retreat first. This + // catches the "wedged in a corner" case where the unstick turn + // below would just spin in place. + const fwdRays = (rays || []).filter((r) => Math.abs(r.bearing_deg) < 45); + const pinned = + fwdRays.length > 0 && + fwdRays.every((r) => r.hit === "wall" && r.distance < 32); + if (pinned) { + await bot.press("down", 350); + return \`pinned: back up (rays=\${fwdRays.map((r) => r.distance.toFixed(0)).join(",")})\`; + } + + // 4. Stuck (no momentum for several ticks) -> escape: back up + // *then* turn. Backing up reliably breaks contact with whatever + // geometry we wedged into; the turn happens on the next tick. + if (stuckTicks >= STUCK_TICKS) { + await bot.press("down", 250); + await bot.press("right", 250); + return \`unstick (stuck=\${stuckTicks})\`; + } + + // 5. 
Wall in our face -> turn toward the deepest open ray, but + // *prefer* the side closer to the macro target when both sides + // are roughly equal. + if (fwd && fwd.hit === "wall" && fwd.distance < 48) { + const targetErr = + targetBearing !== null && state.player + ? angleDelta(targetBearing, state.player.angle_deg) + : 0; + let best = rays[0]; + for (const r of rays) if (r.distance > best.distance) best = r; + // If the macro target is more than 45° off, override the + // deepest-ray pick with the target side; it's better to grind a + // tic and turn correctly than walk away from where we want to go. + const targetBiased = Math.abs(targetErr) > 45; + const dir = targetBiased + ? targetErr > 0 + ? "left" + : "right" + : best.bearing_deg < 0 + ? "left" + : "right"; + await bot.press(dir, 220); + return \`wall@\${fwd.distance.toFixed(0)} turn \${dir} (\${targetBiased ? "target-bias" : \`deep-ray@\${best.bearing_deg.toFixed(0)}\`})\`; + } + + // 6. Closed-loop steering toward the macro target bearing. + if (targetBearing !== null && state.player) { + const err = angleDelta(targetBearing, state.player.angle_deg); + if (Math.abs(err) > TURN_TOLERANCE_DEG) { + // Doom's angles: +y is 90°, -y is 270°. A positive \`err\` means + // we need to rotate counter-clockwise, which is the \`left\` key. + const hold = turnHoldFor(err); + const dir = err > 0 ? "left" : "right"; + await bot.press(dir, hold); + return \`steer \${dir} \${hold}ms (err=\${err.toFixed(0)}°)\`; + } + } + + // 7. Heading is good (or no target): walk forward — unless we're + // *already* stationary with a wall in range. The default wall- + // avoid branch above only fires at distance<48, but a player + // facing wall@80 with no actual position progress will sit there + // pressing "up" against geometry forever. Escalate to a sideways + // turn. We use \`actuallyMoving\` (position-delta based) here + // rather than mom, because mom lags behind the engine. 
+ const fwdRay = fwd; // alias for clarity + if ( + !actuallyMoving && + stuckTicks >= 1 && + fwdRay && + fwdRay.hit === "wall" && + fwdRay.distance < 120 + ) { + // Pick the deepest open ray and turn that way; same as branch 5 + // but triggered earlier because spd=0 + close-ish wall is a + // strong "the engine won't let me through" signal. + let best = rays[0]; + for (const r of rays) if (r.distance > best.distance) best = r; + const dir = best.bearing_deg < 0 ? "left" : "right"; + await bot.press(dir, 220); + return \`stalled@wall\${fwdRay.distance.toFixed(0)} turn \${dir} (deep-ray@\${best.bearing_deg.toFixed(0)}@\${best.distance.toFixed(0)})\`; + } + + await bot.press("up", 250); + return targetBearing === null ? "fwd (no target)" : "fwd (on-bearing)"; +} + +// History of recent macro outcomes; fed back to the LLM so it can +// recognise it was pinned and pick a different direction next time. +// Keep the last 3 entries to avoid bloating the prompt. +const history = { recent: [] }; + +for (let macro = 0; macro < STEPS; macro++) { + let s = await bot.getState(); + await bot.log("macro", macro, "screen:", s.screen, "hp:", s.hud.health, + "pose:", s.player ? \`(\${s.player.x.toFixed(0)},\${s.player.y.toFixed(0)})@\${s.player.angle_deg.toFixed(0)}° (\${compassLabel(s.player.angle_deg)})\` : "?"); + + if (s.screen !== "playing" && s.screen !== "automap") { + await bot.press("enter"); + await bot.sleep(200); + continue; + } + + // --- ONE LLM consult per macro step: image + state digest. --- + const shot = await snapAutomap(s.screen); + const llmResult = await askMacroTurn(s, shot, history); + let turn = llmResult.pick; + + // Safety net: if the LLM picked a direction the raycasts say is a + // close wall, override. Logs both versions so we can tell whether + // the veto fires too aggressively. 
+ const veto = vetoTurn(turn, s); + if (veto.vetoed) { + await bot.log(\` veto: \${llmResult.pick} -> \${veto.pick} (\${veto.reason})\`); + turn = veto.pick; + } else if (veto.sample) { + await bot.log( + \` veto: kept \${llmResult.pick} (ray@\${(RELATIVE_TURNS[llmResult.pick] ?? 0)}° = \${veto.sample.hit}@\${veto.sample.distance.toFixed(0)})\`, + ); + } + + const targetBearing = turnToTargetBearing(turn, s.player); + await bot.log( + "macro turn:", turn, + targetBearing === null + ? "(no player pose)" + : \`-> target bearing \${targetBearing.toFixed(0)}° (\${compassLabel(targetBearing)}) from facing \${s.player ? s.player.angle_deg.toFixed(0) : "?"}° (\${s.player ? compassLabel(s.player.angle_deg) : "?"})\`, + ); + + // --- Closed-loop micro ticks: check state, correct heading. --- + let stuckTicks = 0; + const startPose = s.player ? { x: s.player.x, y: s.player.y } : null; + // Last observed pose, used to compute position delta tick-to-tick. + // Position is the ground-truth movement signal (mom is laggy). + let lastPos = s.player ? { x: s.player.x, y: s.player.y } : null; + const branchCounts = {}; + for (let t = 0; t < TICKS_PER_MACRO; t++) { + s = await bot.getState(); + if (s.screen === "dead" || s.screen === "finale") { + await bot.log("ending screen reached:", s.screen); + return \`ended on \${s.screen} after \${macro} macro steps\`; + } + // Position delta since last get_state. This is what we actually + // trust for "are we moving?" — mom lags 1-2 engine tics behind + // press_key, but x/y are sampled the same tic they're read. + const posDelta = + s.player && lastPos + ? Math.hypot(s.player.x - lastPos.x, s.player.y - lastPos.y) + : 0; + const actuallyMoving = posDelta >= STUCK_POS_EPS; + stuckTicks = actuallyMoving ? 0 : stuckTicks + 1; + if (s.player) lastPos = { x: s.player.x, y: s.player.y }; + // Engine-reported speed kept for the log (and the LLM digest) + // even though we no longer decide stuck-ness from it. + const speed = s.player + ? 
Math.abs(s.player.momx) + Math.abs(s.player.momy) + : 1; + + const err = + targetBearing !== null && s.player + ? angleDelta(targetBearing, s.player.angle_deg) + : 0; + + // Forward raycast distance / hit kind: lets us see *why* the + // wall-avoid branch fires when reading the trace afterwards. + const rays = s.raycasts || []; + const fwd = rays.find((r) => Math.abs(r.bearing_deg) < 10); + const fwdStr = fwd ? \`\${fwd.hit}@\${fwd.distance.toFixed(0)}\` : "(none)"; + const enemiesStr = + (s.enemies_visible || []).length === 0 + ? "0" + : (s.enemies_visible || []) + .map((e) => \`\${e.type}/\${e.bearing}\`) + .join(","); + + const branch = await microTick(s, targetBearing, stuckTicks, actuallyMoving); + branchCounts[branch.split(" ")[0]] = (branchCounts[branch.split(" ")[0]] || 0) + 1; + + // Log both posDelta (truth) and mom (laggy) so the trace makes + // the lag visible: you'll often see early ticks with mom=0 + // but pdΔ>0 right after a press. + await bot.log( + \` t=\${String(t).padStart(2, "0")} \` + + \`pos=(\${s.player ? s.player.x.toFixed(0) : "?"},\${s.player ? s.player.y.toFixed(0) : "?"}) \` + + \`pdΔ=\${posDelta.toFixed(1)} \` + + \`ang=\${s.player ? s.player.angle_deg.toFixed(0) : "?"}° err=\${err.toFixed(0)}° \` + + \`mom=(\${s.player ? s.player.momx.toFixed(1) : "?"},\${s.player ? s.player.momy.toFixed(1) : "?"}) \` + + \`spd=\${speed.toFixed(1)} stuck=\${stuckTicks} \` + + \`fwd=\${fwdStr} enemies=\${enemiesStr} hp=\${s.hud.health} \` + + \`-> \${branch}\`, + ); + + // Reset the counter right after we kicked an unstick so we don't + // chain unstick branches forever. + if (stuckTicks >= STUCK_TICKS) stuckTicks = 0; + await bot.sleep(TICK_MS - 80); + } + + // Per-macro summary: how far did we actually move, and which + // branches dominated? Also fed back into \`history\` so the next + // LLM consult knows whether we got stuck. 
+ if (s.player && startPose) { + const dx = s.player.x - startPose.x; + const dy = s.player.y - startPose.y; + const dist = Math.sqrt(dx * dx + dy * dy); + const branchStr = Object.entries(branchCounts) + .sort((a, b) => b[1] - a[1]) + .map(([k, v]) => \`\${k}=\${v}\`) + .join(" "); + const pinned = dist < 32 || + (branchCounts["pinned:"] || 0) + (branchCounts["unstick"] || 0) > 6; + await bot.log( + \` macro \${macro} summary: moved \${dist.toFixed(0)} units, branches: \${branchStr}\${pinned ? " [PINNED]" : ""}\`, + ); + history.recent.push({ macro, turn, dist, pinned }); + if (history.recent.length > 3) history.recent.shift(); + } +} + +return \`finished \${STEPS} macro steps\`; +`; + interface Example { id: string; label: string; @@ -477,6 +1086,7 @@ const EXAMPLES: Example[] = [ { id: "simple", label: "Simple: walk forward", code: SIMPLE_BOT }, { id: "combat", label: "Combat: shoot + advance", code: COMBAT_BOT }, { id: "inspect", label: "Inspect: dump state", code: INSPECT_BOT }, + { id: "ai-nav", label: "AI: vision-guided navigation", code: AI_NAV_BOT }, ]; const STARTER_CODE = AUTOPLAY_BOT; diff --git a/doom-player/src/index.ts b/doom-player/src/index.ts index d59eb05..cfbe27e 100644 --- a/doom-player/src/index.ts +++ b/doom-player/src/index.ts @@ -1,3 +1,4 @@ +import { WorkerEntrypoint } from "cloudflare:workers"; import { CDPConnection, CDPSession } from "./cdp/client"; import { WebMCPClient } from "./cdp/webmcp"; import { runBot } from "./bot/runner"; @@ -573,10 +574,16 @@ async function handleRun( } await sink.write(`# bot: starting`); + // Only forward the AI binding when it's actually configured + // on this deployment. When the `ai` block is removed from + // wrangler.jsonc, `env.AI` is undefined and we skip injecting + // the `ai.*` namespace into the bot sandbox. 
/**
 * Optional RPC entrypoint that wraps the Workers AI binding.
 *
 * The `ai` binding in wrangler.jsonc is optional: when it is removed,
 * every method here throws a clean "AI binding not configured" error so
 * callers can detect it and fall back.
 *
 * The point of wrapping `env.AI` in a WorkerEntrypoint (rather than
 * exposing it directly) is that other workers — including dynamic
 * workers loaded via `LOADER` — can be given an RPC stub to this class.
 * That stub is a capability: the dynamic worker can call
 * `AI.run(model, input)` without seeing the underlying account / API
 * token, and there is a single chokepoint to add auth, logging, model
 * allow-lists, etc. later. See:
 *   https://developers.cloudflare.com/dynamic-workers/usage/bindings/
 *
 * Usage from another worker (service binding):
 *
 *   // wrangler.jsonc of the *caller*:
 *   "services": [{ "binding": "DOOM_AI", "service": "doom-player",
 *                  "entrypoint": "AIEntrypoint" }]
 *
 *   // in the caller's code:
 *   const out = await env.DOOM_AI.run("@cf/meta/llama-3.1-8b-instruct",
 *                                     { prompt: "hello" });
 */
export class AIEntrypoint extends WorkerEntrypoint<Env> {
  /**
   * Run a Workers AI model. Mirrors `env.AI.run(model, input, options?)`
   * one-to-one so callers don't have to learn a new shape.
   *
   * @param model - Workers AI model identifier.
   * @param input - Model input, forwarded untouched.
   * @param options - Optional run options, forwarded untouched.
   * @returns Whatever `env.AI.run` resolves to for the given model.
   * @throws Error("AI binding not configured") when this worker was
   *   deployed without the `ai` block in wrangler.jsonc.
   */
  async run(model: string, input: unknown, options?: unknown): Promise<unknown> {
    // The generated Env type declares `AI` as always present, but the
    // binding is optional at deploy time. A service-binding caller gets
    // this class instantiated by the runtime regardless of whether the
    // binding exists, so the guard has to live here.
    const ai = (this.env as Env & { AI?: Ai }).AI;
    if (!ai) {
      throw new Error("AI binding not configured");
    }
    // `Ai.run` is heavily overloaded per model family; this wrapper is
    // deliberately generic, so cast to one loose signature. The call is
    // still a member call on `ai`, so `this` stays bound.
    return (ai.run as (m: string, i: unknown, o?: unknown) => Promise<unknown>)(
      model,
      input,
      options,
    );
  }
}
EnvType[Binding] : string; }; declare namespace NodeJS { - interface ProcessEnv extends StringifyValues> {} + interface ProcessEnv extends StringifyValues> {} } // Begin runtime types diff --git a/doom-player/wrangler.jsonc b/doom-player/wrangler.jsonc index e801ca3..bb97353 100644 --- a/doom-player/wrangler.jsonc +++ b/doom-player/wrangler.jsonc @@ -23,5 +23,8 @@ }, "vars": { "DOOM_URL": "https://dev.silentspacemarine.com/" + }, + "ai": { + "binding": "AI" } } From cf77b16d96f5fdae7e13ae99919cb18d5bcb7a0f Mon Sep 17 00:00:00 2001 From: Rui Figueira Date: Thu, 21 May 2026 20:00:25 +0100 Subject: [PATCH 2/6] Updated get_state tool description to clarify that all fields are snapshots of the last completed engine tic, not real-time --- doom-player/src/client/app.tsx | 25 +++++++++++++++---------- src/app/lib/webmcp.tsx | 2 +- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/doom-player/src/client/app.tsx b/doom-player/src/client/app.tsx index acf2545..c998bf2 100644 --- a/doom-player/src/client/app.tsx +++ b/doom-player/src/client/app.tsx @@ -502,18 +502,23 @@ const TICK_MS = 200; const TURN_TOLERANCE_DEG = 18; // dead-band: don't bother correcting <18° const TURN_HOLD_MIN_MS = 140; // shortest turn tap const TURN_HOLD_MAX_MS = 500; // longest single turn tap (big errors) -// Doom's player.momx / player.momy are LAGGY (see the \`get_state\` tool -// description in src/app/lib/webmcp.tsx): they trail keydown -> ticcmd -// -> thrust by 1-2 engine tics, so the first state read after a -// press_key('up') routinely returns mom=(0,0) even though the player -// is already moving. We instead track the change in player.x / player.y -// between consecutive get_state calls — position is current the tic -// it's sampled, so a non-zero pos delta is ground truth that the player -// actually moved. +// Every field returned by get_state is a snapshot of the last completed +// 35Hz engine tic (see the \`get_state\` tool description in +// src/app/lib/webmcp.tsx). 
After a press_key our keydown -> ticcmd -> +// thrust pipeline can leave the next get_state still showing the +// previous tic's pose, raycasts, momx/momy, things, etc. +// +// momx/momy are the most visibly laggy because they require the thrust +// step to have run, but x/y/angle/raycasts can also briefly trail. +// Position eventually becomes ground truth because the displacement +// from a press accumulates over multiple tics, so deltas between two +// consecutive reads are a reliable "did anything happen?" signal even +// when one read is stale. We use position-delta-over-multiple-ticks +// for wedge detection; momentum is just logged for context. const STUCK_POS_EPS = 4; // units of pos delta below this = stuck const STUCK_TICKS = 3; // consecutive stuck ticks before unsticking -// Kept for the rare "I want a real-speed estimate" path; not used for -// wedge detection any more. +// Kept for the per-tick log and steady-state speed estimates only; +// not used for wedge detection any more. const STUCK_MOM_EPS = 0.5; // @cf/meta/llama-3.2-11b-vision-instruct requires that you agree with their terms diff --git a/src/app/lib/webmcp.tsx b/src/app/lib/webmcp.tsx index f093b45..c08ca9c 100644 --- a/src/app/lib/webmcp.tsx +++ b/src/app/lib/webmcp.tsx @@ -497,7 +497,7 @@ const useGetStateTool = () => { useWebMCP({ name: "get_state", description: - "Read the current Doom game state directly from the engine. 
Returns structured JSON: screen kind (title/menu/playing/demo/automap/intermission/dead/finale), HUD (health, armor, ammo, weapon, face, keys), player pose (x/y/z map units, angle_deg in [0,360), momx/momy per-tic velocity), 8 raycasts evenly spread across the forward 90-degree FOV (each with bearing_deg in [-45,+45] using screen convention where + = right of facing, distance in map units, hit kind wall/door/switch/exit/thing/open, plus inline thing_type/thing_category when hit='thing'), things_visible array of all pickups/decor/blockers crossed by any ray (deduped, each with type+category like 'green_armor'/armor, 'stimpack'/health, 'shotgun'/weapon, 'clip'/ammo, 'blue_keycard'/key, 'exploding_barrel'/barrel, 'decoration'/decor, plus enemies), enemies in the FOV (with bearing and distance bins), and high-level booleans (in_combat, low_health). Use player.momx/momy after a movement input to detect 'wedged' states (both zero = blocked). Use raycasts to navigate without screenshots: small distance with hit='wall' means turn; hit='door' near distance ~64 means press use; hit='open' with large distance means clear corridor; check things_visible for pickups to grab and barrels to shoot. This is accurate and instant — prefer it over get_screenshot for state-driven decisions. Note: screen='demo' means Doom is playing back a built-in attract-mode demo and ignoring input — the agent should call start_new_game or press_key('escape') to break out, not try to play.", + "Read the current Doom game state directly from the engine. 
Returns structured JSON: screen kind (title/menu/playing/demo/automap/intermission/dead/finale), HUD (health, armor, ammo, weapon, face, keys), player pose (x/y/z map units, angle_deg in [0,360), momx/momy per-tic velocity), 8 raycasts evenly spread across the forward 90-degree FOV (each with bearing_deg in [-45,+45] using screen convention where + = right of facing, distance in map units, hit kind wall/door/switch/exit/thing/open, plus inline thing_type/thing_category when hit='thing'), things_visible array of all pickups/decor/blockers crossed by any ray (deduped, each with type+category like 'green_armor'/armor, 'stimpack'/health, 'shotgun'/weapon, 'clip'/ammo, 'blue_keycard'/key, 'exploding_barrel'/barrel, 'decoration'/decor, plus enemies), enemies in the FOV (with bearing and distance bins), and high-level booleans (in_combat, low_health). Use raycasts to navigate without screenshots: small distance with hit='wall' means turn; hit='door' near distance ~64 means press use; hit='open' with large distance means clear corridor; check things_visible for pickups to grab and barrels to shoot. Prefer it over get_screenshot for state-driven decisions. IMPORTANT — EVERY FIELD IS A SNAPSHOT OF THE LAST COMPLETED ENGINE TIC, NOT REAL-TIME: Doom runs its simulation in discrete 35Hz tics (~28.5ms each). All fields here (player.x/y/angle_deg, momx/momy, raycasts, things_visible, enemies_visible, hud) are sampled from C globals that only update on tic boundaries, and our keydown -> ticcmd -> thrust pipeline can put a press_key 1-2 tics ahead of what get_state will show. After a press_key('up') the first get_state may still report momx=momy=0 with the previous pose and raycasts even though the engine has already accepted the input — wait one or two more get_state / sleep cycles for the snapshot to reflect reality. 
momx/momy are the most visibly laggy (they require the thrust step to have run), but pose, raycasts, things, and enemies can also briefly trail the player's actual situation, especially immediately after a key press, after the engine spawns/kills something, or after a door starts opening. Practical implications: (1) the bot should not treat a single 'didn't move' read as a wedge — wait at least one more tick, ideally use the DELTA between consecutive player.x/y reads (position becomes ground truth as soon as it ticks); (2) don't bias-correct a turn based on one frame of angle_deg, sample two; (3) raycasts at the moment of pressing 'use' on a door will still show hit='door' for a tic or two after the door starts opening, then transition. Reserve momx/momy for steady-state speed estimates. Note: screen='demo' means Doom is playing back a built-in attract-mode demo and ignoring input — the agent should call start_new_game or press_key('escape') to break out, not try to play.", inputSchema: EMPTY_OBJECT_SCHEMA, execute: async () => { const mod = window.Module; From c4a8af192a7b73bea17c5b7a16f42ece52e15b79 Mon Sep 17 00:00:00 2001 From: Rui Figueira Date: Thu, 21 May 2026 20:23:59 +0100 Subject: [PATCH 3/6] Added bot.logImage() method for streaming debug images to UI sidebar with caption support --- doom-player/src/bot/context.ts | 102 ++++++++++++++++ doom-player/src/bot/runner.ts | 5 + doom-player/src/client/app.tsx | 196 ++++++++++++++++++++++++------ doom-player/src/client/styles.css | 101 +++++++++++++++ 4 files changed, 365 insertions(+), 39 deletions(-) diff --git a/doom-player/src/bot/context.ts b/doom-player/src/bot/context.ts index 96972c8..8a9d85e 100644 --- a/doom-player/src/bot/context.ts +++ b/doom-player/src/bot/context.ts @@ -156,6 +156,7 @@ export class BotContext { keyPresses: 0, sleeps: 0, logs: 0, + imageLogs: 0, }; constructor(webmcp: WebMCPClient, opts: BotContextOptions = {}) { @@ -300,12 +301,78 @@ export class BotContext { ); } + /** + * Stream 
an image to the host log pane. Only the most recent + * image is kept by the UI -- this is a debug affordance, not a + * gallery. Pass either the result of `bot.screenshot()` directly, + * or any `{ data: base64, mimeType }` pair, plus an optional + * caption. + * + * The image flows as a single sentinel-prefixed log line: + * + * \u0001img: + * + * The host React app peels the sentinel off and renders an ; + * any other consumer of the stream just sees one weird line and + * can ignore it. + */ + async logImage( + shot: { data: string; mimeType: string }, + caption?: string, + ): Promise { + if ( + !shot || + typeof shot.data !== "string" || + typeof shot.mimeType !== "string" + ) { + throw new Error( + "logImage: expected { data: base64-string, mimeType: string }", + ); + } + this.#stats.imageLogs += 1; + // Hard cap so a bot can't blow up the streaming response. 512 KB + // of base64 is ~384 KB of binary -- way more than any reasonable + // debug screenshot at 320x200. + if (shot.data.length > 512 * 1024) { + throw new Error( + `logImage: image too large (${shot.data.length} base64 chars; cap is 524288)`, + ); + } + // We only support PNG today (that's what get_screenshot returns). + // The UI bounds the rendered size and preserves the real aspect + // ratio via \`object-fit: contain\`, so any sensible dimensions are + // fine — but reject pathologically large frames up front so a + // typo'd bot can't push a 4K screenshot through the stream. 
/**
 * Decode a PNG's IHDR width/height from its base64 payload. PNG layout:
 *   bytes  0..7   signature (89 50 4E 47 0D 0A 1A 0A)
 *   bytes  8..11  IHDR chunk length (always 13 for a valid PNG)
 *   bytes 12..15  "IHDR"
 *   bytes 16..19  width  (big-endian u32)
 *   bytes 20..23  height (big-endian u32)
 *
 * Only the first 24 bytes are needed, so decoding the leading 32 base64
 * chars is enough.
 *
 * @param base64 - Base64-encoded image bytes (no data-URL prefix).
 * @returns The declared dimensions, or null when the input is too
 *   short, is not valid base64, lacks the PNG signature, does not start
 *   with a 13-byte IHDR chunk, or declares a zero width/height.
 */
function decodePngDimensions(
  base64: string,
): { width: number; height: number } | null {
  if (base64.length < 32) return null;
  let head: string;
  try {
    head = atob(base64.slice(0, 32));
  } catch {
    return null;
  }
  if (head.length < 24) return null;

  // Bytes 0..7 are the PNG signature; bytes 8..15 must be the IHDR
  // chunk header: a big-endian length of 13 followed by ASCII "IHDR".
  // Without the second check, any payload that merely begins with the
  // signature would have its next bytes read as dimensions.
  const expectedPrefix = [
    137, 80, 78, 71, 13, 10, 26, 10, // signature
    0, 0, 0, 13, // IHDR data length
    73, 72, 68, 82, // "IHDR"
  ];
  for (let i = 0; i < expectedPrefix.length; i++) {
    if (head.charCodeAt(i) !== expectedPrefix[i]) return null;
  }

  // `<< 24` can set the sign bit, so coerce back to unsigned with `>>> 0`.
  const u32 = (off: number): number =>
    ((head.charCodeAt(off) << 24) |
      (head.charCodeAt(off + 1) << 16) |
      (head.charCodeAt(off + 2) << 8) |
      head.charCodeAt(off + 3)) >>>
    0;
  const width = u32(16);
  const height = u32(20);
  // The PNG spec declares zero an invalid value for either dimension.
  if (width === 0 || height === 0) return null;
  return { width, height };
}
+const shot = await bot.screenshot(); +await bot.logImage(shot, "inspect: current frame"); `; // Vision-LLM bot: opens the automap, screenshots it, asks Workers AI @@ -963,6 +967,9 @@ for (let macro = 0; macro < STEPS; macro++) { // --- ONE LLM consult per macro step: image + state digest. --- const shot = await snapAutomap(s.screen); + // Surface the automap to the UI's debug-image panel so a human + // watching the run can see exactly what the LLM saw. + await bot.logImage(shot, \`macro \${macro} automap (pose \${s.player ? \`(\${s.player.x.toFixed(0)},\${s.player.y.toFixed(0)})@\${s.player.angle_deg.toFixed(0)}°\` : "?"})\`); const llmResult = await askMacroTurn(s, shot, history); let turn = llmResult.pick; @@ -1101,6 +1108,8 @@ const STORAGE_KEY = "doom-player-bot-code"; // across tabs / restarts would just dump us on a dead session every // time. Cleared on tab close, restored on reload. const SESSION_KEY = "doom-player-br-session-id"; +// Max number of bot.logImage entries kept in the side panel. +const IMAGE_HISTORY = 4; type Mode = "idle" | "running"; @@ -1134,6 +1143,15 @@ export function App() { } }); const [log, setLog] = useState([]); + // Debug images surfaced by \`bot.logImage(...)\`. We keep the most + // recent IMAGE_HISTORY entries in display order (oldest first) so + // the user can scroll back a few frames; older ones drop off. + // Cleared on each new run. + const [images, setImages] = useState< + Array<{ mimeType: string; data: string; caption: string; receivedAt: number }> + >([]); + // User can collapse the image sidebar to give the log full width. + const [imagePanelCollapsed, setImagePanelCollapsed] = useState(false); const [mode, setMode] = useState("idle"); const [devtoolsUrl, setDevtoolsUrl] = useState(null); // Embed the DevTools pane by default; bot logs stay accessible via @@ -1191,9 +1209,51 @@ export function App() { }, []); // Some lines in the stream carry structured side-channel data - // (devtools URL, BR session id, ...). 
Recognise them here and - // forward to state so the UI can capture them. + // (devtools URL, BR session id, image dumps, ...). Recognise them + // here and forward to state so the UI can capture them. const handleLine = useCallback((line: string) => { + // \`bot.logImage(...)\` emits a single sentinel-prefixed line. We + // peel it off and stash the image in state rather than appending + // it as text; the JSON payload is base64-heavy and would just + // clutter the log pane. + if (line.startsWith("\u0001img:")) { + try { + const payload = JSON.parse(line.slice(5)) as { + mimeType?: unknown; + data?: unknown; + caption?: unknown; + }; + if ( + typeof payload.mimeType === "string" && + typeof payload.data === "string" + ) { + const entry = { + mimeType: payload.mimeType, + data: payload.data, + caption: + typeof payload.caption === "string" ? payload.caption : "", + receivedAt: Date.now(), + }; + setImages((prev) => { + // Cap at IMAGE_HISTORY entries; drop oldest first. + const next = [...prev, entry]; + return next.length > IMAGE_HISTORY + ? next.slice(next.length - IMAGE_HISTORY) + : next; + }); + append( + `# image: ${payload.mimeType}, ${payload.data.length} base64 chars${ + typeof payload.caption === "string" && payload.caption.length > 0 + ? ` — ${payload.caption}` + : "" + }`, + ); + return; + } + } catch { + // fall through and treat as a plain log line + } + } const dt = line.match(/^# devtools: (.+)$/); if (dt) { const raw = dt[1].trim(); @@ -1262,6 +1322,7 @@ export function App() { if (mode === "running") return; setMode("running"); setLog([]); + setImages([]); const reuseId = sessionIdRef.current; // Only blank the embedded DevTools iframe when we're starting // from scratch. 
If we're about to reuse the same BR session, @@ -1301,7 +1362,10 @@ export function App() { abortRef.current?.abort(); }, []); - const clearLog = useCallback(() => setLog([]), []); + const clearLog = useCallback(() => { + setLog([]); + setImages([]); + }, []); const resetCode = useCallback(() => setCode(STARTER_CODE), []); const resetSession = useCallback(() => { setSessionId(null); @@ -1441,6 +1505,17 @@ export function App() { ) : null} + {images.length > 0 ? ( + + ) : null}