Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
331 changes: 331 additions & 0 deletions doom-player/src/bot/context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ export class BotContext {
keyPresses: 0,
sleeps: 0,
logs: 0,
imageLogs: 0,
};

constructor(webmcp: WebMCPClient, opts: BotContextOptions = {}) {
Expand Down Expand Up @@ -245,12 +246,153 @@ export class BotContext {
this.#onLog(line);
}

/**
 * Ask the page for the current frame via the `get_screenshot` tool and
 * return the image as `{ data, mimeType }`, where `data` is the
 * base64 string found in the tool result. The result is shaped for
 * handing to a vision-capable LLM (see `ai.run(...)` in the sandbox)
 * or to `logImage`.
 *
 * NOTE(review): the tool is expected to produce a pixel-perfect
 * 320x200 PNG; nothing in this method checks that.
 *
 * @returns The first image item found in the tool output.
 * @throws Error if the tool call does not finish with status
 *   "Completed", or if no image item can be located in its output.
 */
async screenshot(): Promise<{ data: string; mimeType: string }> {
  type ImageItem = { type: "image"; data: string; mimeType: string };
  type TextItem = { type: "text"; text: string };

  // Structural checks for the two content-item shapes we care about.
  const isImageItem = (item: unknown): item is ImageItem => {
    if (!item || typeof item !== "object") return false;
    const probe = item as { type?: unknown; data?: unknown; mimeType?: unknown };
    return (
      probe.type === "image" &&
      typeof probe.data === "string" &&
      typeof probe.mimeType === "string"
    );
  };
  const isTextItem = (item: unknown): item is TextItem => {
    if (!item || typeof item !== "object") return false;
    const probe = item as { type?: unknown; text?: unknown };
    return probe.type === "text" && typeof probe.text === "string";
  };

  const res = await this.#webmcp.invoke("get_screenshot", {});
  if (res.status !== "Completed") {
    throw new Error(
      `get_screenshot failed: status=${res.status} ${res.errorText ?? res.exception?.description ?? ""}`,
    );
  }

  // The tool answers with
  //   { content: [{ type: "image", data, mimeType }, { type: "text", text }] }
  // but the transport may wrap that envelope again as JSON inside a
  // text item (the same situation peelTextEnvelope in mcpPayload.ts
  // deals with). Unwrap a bounded number of layers until an image
  // item shows up.
  let node: unknown = res.output;
  let layersLeft = 4;
  while (layersLeft-- > 0 && node && typeof node === "object") {
    const items = (node as { content?: unknown }).content;
    if (!Array.isArray(items)) break;

    const image = items.find(isImageItem);
    if (image) return { data: image.data, mimeType: image.mimeType };

    // No image at this level: try the first text item as a nested,
    // JSON-encoded envelope. Anything unparsable ends the search.
    const wrapped = items.find(isTextItem);
    if (!wrapped) break;
    try {
      node = JSON.parse(wrapped.text);
    } catch {
      break;
    }
  }

  throw new Error(
    `get_screenshot returned no image content. Raw: ${safeStringify(res.output).slice(0, 200)}`,
  );
}

/**
 * Send one image down the log stream so the host can display it.
 * Accepts the result of `bot.screenshot()` as-is, or any
 * `{ data: base64, mimeType }` pair, with an optional caption.
 *
 * The image travels as a single log line made of a sentinel prefix
 * followed by JSON:
 *
 *   \u0001img:{"mimeType":...,"data":...,"caption":...}
 *
 * Per the original design notes, the host React app strips the
 * sentinel and renders an <img>, keeping only the latest image (a
 * debug aid, not a gallery); other stream consumers just see one odd
 * line they can ignore.
 *
 * Checks run in this order, each throwing an Error on failure:
 *   1. `shot` has string `data` and `mimeType` fields;
 *   2. `data` is at most 512 * 1024 base64 characters;
 *   3. `mimeType` is exactly "image/png";
 *   4. the payload starts with a decodable PNG header;
 *   5. neither dimension exceeds 2048 pixels.
 * The `imageLogs` counter is bumped after check 1 and before the
 * others, so calls rejected by checks 2-5 are still counted.
 *
 * @param shot Base64 image data plus its mime type.
 * @param caption Optional caption; anything that is not a string is
 *   sent as "".
 */
async logImage(
  shot: { data: string; mimeType: string },
  caption?: string,
): Promise<void> {
  // 512 KB of base64 is roughly 384 KB of binary, which leaves ample
  // room for a 320x200 debug frame while protecting the stream.
  const MAX_BASE64_CHARS = 512 * 1024;
  const MAX_DIM = 2048;

  const wellFormed =
    !!shot &&
    typeof shot.data === "string" &&
    typeof shot.mimeType === "string";
  if (!wellFormed) {
    throw new Error(
      "logImage: expected { data: base64-string, mimeType: string }",
    );
  }
  const { data, mimeType } = shot;

  this.#stats.imageLogs += 1;

  if (data.length > MAX_BASE64_CHARS) {
    throw new Error(
      `logImage: image too large (${data.length} base64 chars; cap is 524288)`,
    );
  }

  // PNG is the only supported format today (it is what get_screenshot
  // returns).
  if (mimeType !== "image/png") {
    throw new Error(
      `logImage: expected mimeType "image/png", got "${mimeType}"`,
    );
  }

  // The UI scales whatever it is given, so any sensible size is fine;
  // this only stops an accidental huge frame from entering the stream.
  const size = decodePngDimensions(data);
  if (!size) {
    throw new Error("logImage: payload is not a valid PNG (missing IHDR)");
  }
  if (Math.max(size.width, size.height) > MAX_DIM) {
    throw new Error(
      `logImage: image too large (${size.width}x${size.height}; max dimension is ${MAX_DIM})`,
    );
  }

  const body = JSON.stringify({
    mimeType,
    data,
    caption: typeof caption === "string" ? caption : "",
  });
  this.#onLog("\u0001img:" + body);
}

/**
 * Turn an RGBA pixel buffer into a base64 PNG that can be passed
 * directly to `logImage`. This is a promise-returning wrapper around
 * `encodePngRgba`, so bot code can build debug visualisations without
 * bundling its own PNG encoder.
 *
 * `rgba` must hold exactly `width * height * 4` bytes: 8-bit R, G, B,
 * A per pixel, rows ordered top to bottom, alpha not premultiplied.
 * The encoder stores the pixels without compression, so the result is
 * about the size of the raw buffer; in exchange the code is simple
 * and has zero dependencies.
 *
 * @param width Image width in pixels.
 * @param height Image height in pixels.
 * @param rgba Pixel data as a `Uint8Array` or plain number array.
 * @returns `{ data, mimeType }` with `mimeType` set to "image/png".
 *   Validation errors from `encodePngRgba` surface as a rejection.
 */
async encodePng(
  width: number,
  height: number,
  rgba: Uint8Array | number[],
): Promise<{ data: string; mimeType: string }> {
  const encoded = encodePngRgba(width, height, rgba);
  return encoded;
}

/**
 * Usage counters accumulated by this context so far. The returned
 * object is a shallow copy, so changing it does not affect the
 * context's own counters.
 */
stats(): Readonly<{
  stateReads: number;
  keyPresses: number;
  sleeps: number;
  logs: number;
  imageLogs: number;
}> {
  const snapshot = { ...this.#stats };
  return snapshot;
}
Expand All @@ -266,6 +408,195 @@ function safeStringify(v: unknown): string {
}
}

/**
 * Read a PNG's width and height out of its base64 payload without
 * decoding the whole image. The start of every PNG is laid out as:
 *
 *   bytes 0..7    signature (89 50 4E 47 0D 0A 1A 0A)
 *   bytes 8..11   IHDR chunk length (always 13 for a valid PNG)
 *   bytes 12..15  "IHDR"
 *   bytes 16..19  width  (big-endian u32)
 *   bytes 20..23  height (big-endian u32)
 *
 * Only those 24 bytes are needed, and 24 bytes are exactly 32 base64
 * characters, so just the leading 32 characters are decoded.
 *
 * The signature, the chunk length and the "IHDR" chunk type are all
 * verified. Checking the signature alone would let a payload with a
 * valid signature but a different or corrupt first chunk report
 * arbitrary bytes as its dimensions.
 *
 * @param base64 Base64-encoded image bytes.
 * @returns The unsigned width and height, or null when the input is
 *   too short, is not valid base64, or does not start with a PNG
 *   signature followed by a 13-byte IHDR chunk header.
 */
function decodePngDimensions(
  base64: string,
): { width: number; height: number } | null {
  const HEAD_CHARS = 32; // 32 base64 chars <=> 24 bytes, no padding
  const HEAD_BYTES = 24;
  if (base64.length < HEAD_CHARS) return null;

  let head: string;
  try {
    head = atob(base64.slice(0, HEAD_CHARS));
  } catch {
    // Not base64 at all.
    return null;
  }
  if (head.length < HEAD_BYTES) return null;

  // Signature, then the IHDR chunk header (length 13, type "IHDR").
  const expectedPrefix = [
    137, 80, 78, 71, 13, 10, 26, 10, // signature
    0, 0, 0, 13, // IHDR data length
    73, 72, 68, 82, // "IHDR"
  ];
  for (let i = 0; i < expectedPrefix.length; i++) {
    if (head.charCodeAt(i) !== expectedPrefix[i]) return null;
  }

  // `<<` yields a signed 32-bit value; `>>> 0` reinterprets it as
  // unsigned so dimensions with the top bit set do not go negative.
  const u32 = (off: number): number =>
    ((head.charCodeAt(off) << 24) |
      (head.charCodeAt(off + 1) << 16) |
      (head.charCodeAt(off + 2) << 8) |
      head.charCodeAt(off + 3)) >>>
    0;
  return { width: u32(16), height: u32(20) };
}

// ── PNG encoder ─────────────────────────────────────────────────────
//
// A small, dependency-free PNG encoder used by `BotContext.encodePng`.
// We emit a single IDAT chunk whose deflate stream is made entirely
// of uncompressed ("stored") blocks — that's the simplest legal
// deflate form: a 5-byte header per <= 65535-byte block, then raw
// bytes. Nothing is compressed, so files are about the size of the raw
// RGBA data -- noticeably larger than a properly-compressed PNG -- but
// the encoder fits in ~60 lines and runs in any JS runtime (no
// CompressionStream / pako / canvas dependency).
//
// References:
// PNG spec https://www.w3.org/TR/png-3/
// deflate spec https://www.rfc-editor.org/rfc/rfc1951

// Byte-indexed lookup table for the reflected CRC-32 polynomial
// 0xEDB88320, computed once at module load. Entry n is the result of
// pushing the single byte n through eight shift/xor steps.
const CRC_TABLE = (() => {
  const POLY = 0xedb88320;
  return Uint32Array.from({ length: 256 }, (_unused, n) => {
    let rem = n;
    for (let bit = 0; bit < 8; bit++) {
      const shifted = rem >>> 1;
      rem = (rem & 1) !== 0 ? shifted ^ POLY : shifted;
    }
    // May be negative here (xor is signed); the Uint32Array stores it
    // as the equivalent unsigned value.
    return rem;
  });
})();

/**
 * Table-driven CRC-32 over `bytes[start]` up to, but not including,
 * `bytes[end]`.
 *
 * @returns The checksum as an unsigned 32-bit number.
 */
function crc32(bytes: Uint8Array, start: number, end: number): number {
  let crc = 0xffffffff;
  let idx = start;
  while (idx < end) {
    const slot = (crc ^ bytes[idx]) & 0xff;
    crc = CRC_TABLE[slot] ^ (crc >>> 8);
    idx += 1;
  }
  // Final bit inversion, then force the result to be unsigned.
  return ~crc >>> 0;
}

/**
 * Adler-32 checksum of `bytes`: two running sums modulo 65521, the low
 * one starting at 1 and the high one at 0, packed as (high << 16) | low.
 *
 * @returns The checksum as an unsigned 32-bit number.
 */
function adler32(bytes: Uint8Array): number {
  const MOD = 65521;
  let low = 1;
  let high = 0;
  for (const byte of bytes) {
    low = (low + byte) % MOD;
    high = (high + low) % MOD;
  }
  // `<<` can set the sign bit; `>>> 0` brings the value back to unsigned.
  return ((high << 16) | low) >>> 0;
}

/**
 * Store `val` into `buf` at `off` as four bytes, most significant
 * first. Only the low 32 bits of `val` are used.
 */
function writeU32BE(buf: Uint8Array, off: number, val: number): void {
  for (let i = 0; i < 4; i++) {
    const shift = 24 - 8 * i;
    buf[off + i] = (val >>> shift) & 0xff;
  }
}

/**
 * Build one PNG chunk: a 4-byte big-endian payload length, the 4-byte
 * chunk type, the payload, then a CRC-32 computed over type + payload.
 *
 * @param type Chunk type; its first four character codes are written.
 * @param data Chunk payload (may be empty).
 * @returns A new array of `data.length + 12` bytes.
 */
function makeChunk(type: string, data: Uint8Array): Uint8Array {
  const payloadLen = data.length;
  const crcOffset = 8 + payloadLen;
  const chunk = new Uint8Array(crcOffset + 4);

  writeU32BE(chunk, 0, payloadLen);
  for (let i = 0; i < 4; i++) {
    chunk[4 + i] = type.charCodeAt(i);
  }
  chunk.set(data, 8);
  // The CRC covers bytes 4..crcOffset, i.e. everything but the length.
  writeU32BE(chunk, crcOffset, crc32(chunk, 4, crcOffset));
  return chunk;
}

/**
 * Base64-encode a byte array by first mapping each byte to one
 * character and then calling `btoa`.
 */
function bytesToBase64(bytes: Uint8Array): string {
  // Convert 8 KB at a time: passing a whole large image to
  // String.fromCharCode in one call can exceed the engine's limit on
  // call arguments.
  const SLICE_BYTES = 0x2000;
  const pieces: string[] = [];
  for (let start = 0; start < bytes.length; start += SLICE_BYTES) {
    // subarray clamps the end index to the array length.
    const slice = bytes.subarray(start, start + SLICE_BYTES);
    pieces.push(String.fromCharCode(...slice));
  }
  return btoa(pieces.join(""));
}

/**
 * Encode an RGBA pixel buffer as an 8-bit, non-interlaced, colour-type-6
 * PNG and return it base64-encoded.
 *
 * The file consists of the PNG signature followed by three chunks:
 * IHDR, a single IDAT, and IEND. The IDAT payload is a zlib stream
 * whose deflate data is made only of stored (uncompressed) blocks, so
 * the output is roughly the size of the input pixels.
 *
 * @param width Image width in pixels; must be a positive integer.
 * @param height Image height in pixels; must be a positive integer.
 * @param rgba Exactly `width * height * 4` bytes, row by row. A plain
 *   number array is copied into a `Uint8Array` first.
 * @returns `{ data, mimeType }` with `mimeType` "image/png".
 * @throws Error on non-positive or non-integer dimensions, or when the
 *   buffer length does not match them.
 */
export function encodePngRgba(
  width: number,
  height: number,
  rgba: Uint8Array | number[],
): { data: string; mimeType: string } {
  const dimsOk =
    Number.isInteger(width) &&
    width > 0 &&
    Number.isInteger(height) &&
    height > 0;
  if (!dimsOk) {
    throw new Error(`encodePng: bad dimensions ${width}x${height}`);
  }

  const pixels = rgba instanceof Uint8Array ? rgba : new Uint8Array(rgba);
  const expected = width * height * 4;
  if (pixels.length !== expected) {
    throw new Error(
      `encodePng: expected ${expected} bytes for ${width}x${height} RGBA, got ${pixels.length}`,
    );
  }

  // Scanlines: each row is prefixed with one filter-type byte. The
  // array starts zero-filled, which already is filter 0 ("None"), so
  // only the pixel bytes need copying.
  const rowBytes = width * 4;
  const lineBytes = rowBytes + 1;
  const scanlines = new Uint8Array(lineBytes * height);
  for (let row = 0; row < height; row++) {
    scanlines.set(
      pixels.subarray(row * rowBytes, (row + 1) * rowBytes),
      row * lineBytes + 1,
    );
  }

  // zlib stream = 2-byte header + stored deflate blocks + Adler-32.
  // Every stored block carries a 5-byte header: the final-block flag,
  // then LEN and its one's complement NLEN, both little-endian.
  const MAX_STORED = 65535;
  const blockCount = Math.ceil(scanlines.length / MAX_STORED);
  const zlib = new Uint8Array(2 + blockCount * 5 + scanlines.length + 4);
  zlib[0] = 0x78; // CM=8, CINFO=7
  zlib[1] = 0x01; // FLEVEL=0; FCHECK makes 0x7801 divisible by 31
  let cursor = 2;
  for (let start = 0; start < scanlines.length; start += MAX_STORED) {
    const end = Math.min(start + MAX_STORED, scanlines.length);
    const len = end - start;
    zlib[cursor] = end === scanlines.length ? 1 : 0;
    zlib[cursor + 1] = len & 0xff;
    zlib[cursor + 2] = (len >>> 8) & 0xff;
    zlib[cursor + 3] = ~len & 0xff;
    zlib[cursor + 4] = (~len >>> 8) & 0xff;
    zlib.set(scanlines.subarray(start, end), cursor + 5);
    cursor += 5 + len;
  }
  writeU32BE(zlib, cursor, adler32(scanlines));

  // IHDR payload: width, height, then five single-byte fields.
  const ihdr = new Uint8Array(13);
  writeU32BE(ihdr, 0, width);
  writeU32BE(ihdr, 4, height);
  ihdr.set(
    [
      8, // bit depth
      6, // colour type: RGBA
      0, // compression method
      0, // filter method
      0, // interlace: none
    ],
    8,
  );

  // Concatenate signature and chunks into the final file.
  const parts: Uint8Array[] = [
    new Uint8Array([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]),
    makeChunk("IHDR", ihdr),
    makeChunk("IDAT", zlib),
    makeChunk("IEND", new Uint8Array(0)),
  ];
  const png = new Uint8Array(parts.reduce((sum, part) => sum + part.length, 0));
  let writeAt = 0;
  for (const part of parts) {
    png.set(part, writeAt);
    writeAt += part.length;
  }

  return { data: bytesToBase64(png), mimeType: "image/png" };
}

function isPlainState(v: unknown): v is BotState {
return (
!!v &&
Expand Down
Loading
Loading