From 5fbc36b14c80d85130c4e10a1cd5dea7028d272f Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 5 May 2026 12:36:50 -0400 Subject: [PATCH 01/14] docs: design for GeoTIFF exponential read-ahead cache (#500) Spec for replacing the fixed [SourceChunk, SourceCache] header pipeline with a sequential read-ahead cache that grows fetch sizes by a configurable multiplier. Ports async-tiff's ReadaheadMetadataCache to TypeScript. Co-Authored-By: Claude Opus 4.7 (1M context) --- ...26-05-05-geotiff-readahead-cache-design.md | 150 ++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md diff --git a/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md b/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md new file mode 100644 index 00000000..c213ac71 --- /dev/null +++ b/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md @@ -0,0 +1,150 @@ +# GeoTIFF exponential read-ahead cache + +**Date:** 2026-05-05 +**Issue:** [#500](https://github.com/developmentseed/deck.gl-raster/issues/500) +**Status:** Approved, ready for implementation plan + +## Problem + +When opening a GeoTIFF over HTTP, [`GeoTIFF.fromUrl`](../../packages/geotiff/src/geotiff.ts) wraps the source with chunkd's `[SourceChunk, SourceCache]` middleware pair. `SourceChunk` aligns each request to a fixed `chunkSize` (default 32 KiB) and is stateless: every fetch uses the same chunk size regardless of how many fetches preceded it. + +For TIFF metadata reads — IFD chains, tag values, GeoKeys, GDAL metadata — access is sequential from the start of the file, but the size of metadata varies widely between files (small COGs may fit in 16 KiB; files with many IFDs or large tag arrays may need 1+ MiB). A fixed chunk size is the wrong shape: too small means many round trips, too large means wasted bytes for small files. + +## Solution + +Replace the `[SourceChunk, SourceCache]` pair on the header source with a single new middleware that maintains a **sequential read-ahead cache** rooted at offset 0. Each underlying fetch grows by a configurable multiplier, so successive metadata reads use exponentially larger chunks. + +This is a direct port of [async-tiff's `ReadaheadMetadataCache`](https://github.com/developmentseed/async-tiff/blob/3dd77e3/src/metadata/cache.rs) ([PR #140](https://github.com/developmentseed/async-tiff/pull/140)) to TypeScript and to the chunkd `SourceMiddleware` interface. + +### Why sequential-from-zero? + +TIFF metadata is laid out near the start of the file: header → IFD → tag values → next IFD → … . Cogeotiff's reads land in this region. A cache that grows contiguously from offset 0 captures every metadata read with at most one underlying fetch beyond what the previous read already pulled in. Tile data reads, by contrast, are at arbitrary large offsets — those continue to use the raw `dataSource` with no caching, exactly as they do today. + +## Components + +All new code lives under `packages/geotiff/src/`. + +### `concurrency.ts` — `mutex()` + +A standalone helper that returns a function for running async tasks one at a time. Used by the read-ahead cache to serialize cache extension across concurrent fetches. + +```ts +/** + * Create a mutex: a function that runs async tasks one at a time. + * + * Tasks submitted while another is running are queued and executed in + * submission order — never concurrently with each other. + * + * Useful when an async operation must observe and mutate shared state + * across awaits without races. The TypeScript analogue of holding a + * `tokio::sync::Mutex` across an `await`. + * + * @example + * const lock = mutex(); + * const a = lock(async () => { ... }); // executes immediately + * const b = lock(async () => { ... }); // waits for `a` to settle, then runs + * + * @returns A function that schedules tasks on the queue. + */ +export function mutex(): (task: () => Promise) => Promise { + let tail: Promise = Promise.resolve(); + return (task: () => Promise): Promise => { + const result = tail.then(task, task); + tail = result.catch(() => {}); + return result; + }; +} +``` + +Notes: +- `tail.then(task, task)` ensures the next task runs whether the previous task resolved or rejected. +- `tail = result.catch(() => {})` swallows errors only on the queue chain, not on the returned promise — the caller still observes the original rejection. +- No timeouts, no cancellation. Keep it minimal. + +### `readahead-cache.ts` + +Two pieces in one file. Internal — not exported from `index.ts`. + +#### `SequentialBlockCache` (internal helper class) + +Stores contiguous buffers from offset 0. + +- Fields: `buffers: Uint8Array[]`, `len: number` (sum of buffer lengths). +- `contains(start, end)` → `boolean`. True iff `end <= len`. +- `slice(start, end)` → `ArrayBuffer`. Crosses block boundaries when needed; returns a zero-copy slice when the range fits in one block. +- `appendBuffer(buf: ArrayBuffer)` → mutates. + +#### `SourceReadaheadCache` (the middleware) + +Implements chunkd's [`SourceMiddleware`](../../packages/geotiff/node_modules/@chunkd/source/build/src/middleware.d.ts) interface (`{ name, fetch(req, next) }`). + +- Constructor options: `{ initial: number; multiplier: number }`. +- Fields: `cache: SequentialBlockCache`, `initial`, `multiplier`, `lock: ReturnType`. +- `fetch(req, next)`: + 1. If `req.offset < 0` or `req.length == null`, bypass: `return next(req)`. + 2. Inside `this.lock(...)`: + - While `!cache.contains(req.offset, req.offset + req.length)`: + - `needed = req.offset + req.length - cache.len`. + - `fetchSize = max(nextFetchSize(cache.len), needed)`, clamped against `req.source.metadata?.size - cache.len` if known. + - `buf = await next({ ...req, offset: cache.len, length: fetchSize })`. + - If `buf.byteLength === 0`, break (EOF). + - `cache.appendBuffer(buf)`. + - Return `cache.slice(req.offset, req.offset + req.length)`. +- `nextFetchSize(existingLen)`: `existingLen === 0 ? initial : round(existingLen * multiplier)`. + +### Wiring in `geotiff.ts` + +Update [`GeoTIFF.fromUrl`](../../packages/geotiff/src/geotiff.ts): + +- New options shape (breaking): + ```ts + { + prefetch?: number; // default 32 * 1024 + multiplier?: number; // default 2.0 + } + ``` +- Drop `chunkSize` and `cacheSize`. +- Replace `[new SourceChunk({ size: chunkSize }), new SourceCache({ size: cacheSize })]` with `[new SourceReadaheadCache({ initial: prefetch, multiplier })]`. +- Continue passing `prefetch` to `Tiff.create({ defaultReadSize: prefetch })` via `GeoTIFF.open`, so the very first read is correctly sized. +- Update JSDoc on `fromUrl` to describe the new behavior. + +`GeoTIFF.open` and `GeoTIFF.fromArrayBuffer` are unchanged. Memory sources don't need read-ahead, and `open` callers compose their own middleware. + +## Tests + +### `concurrency.test.ts` + +- Tasks run one at a time (use a "concurrent counter" to detect overlap). +- Submission order is preserved. +- A rejecting task does not block subsequent tasks. +- Each call's result/error is delivered to the right caller. + +### `readahead-cache.test.ts` + +Port the async-tiff unit tests: + +- Initial fetch returns the requested range; underlying fetch count = 1. +- Subsequent fetch within the cached range: count unchanged. +- Fetch exceeding cached range: count + 1; growth size matches `initial * multiplier^n` (use `initial=2, multiplier=3` like the upstream test). +- Fetch larger than `initial * multiplier^n` triggers a single fetch sized to `needed`. +- `SequentialBlockCache.contains`/`slice` works across multiple blocks, including empty buffers and EOF (port `test_sequential_block_cache_empty_buffers`). +- Concurrent test: fire N parallel `fetch` calls and assert the cache only grows by the expected number of underlying fetches (i.e. requests overlap correctly via the mutex). + +### Smoke test against `GeoTIFF.fromUrl` + +Mock a `Source.fetch` with a counter and assert that opening a real fixture takes fewer underlying calls than the previous `[SourceChunk, SourceCache]` pipeline. + +## Out of scope + +- No upstream PR to `@chunkd/middleware`. +- No public export of `SourceReadaheadCache`, `SequentialBlockCache`, or `mutex` (per "minimal public APIs" preference). Easy to expose later if a concrete external use case appears. +- No backwards-compatibility shim for `chunkSize` / `cacheSize` — this is a 0.x package and release-please will surface the breaking change. +- No timeouts or cancellation in `mutex()`. +- No per-source registry — one middleware instance is created per `fromUrl` call and tied to that source's lifetime, same as the existing `SourceChunk`/`SourceCache` lifecycle. + +## References + +- Issue: [developmentseed/deck.gl-raster#500](https://github.com/developmentseed/deck.gl-raster/issues/500) +- Reference implementation: [developmentseed/async-tiff PR #140](https://github.com/developmentseed/async-tiff/pull/140), file [`src/metadata/cache.rs`](https://github.com/developmentseed/async-tiff/blob/3dd77e3/src/metadata/cache.rs) +- Existing source pipeline: [`packages/geotiff/src/geotiff.ts:233-262`](../../packages/geotiff/src/geotiff.ts#L233-L262) +- chunkd `SourceMiddleware` interface: `@chunkd/source/build/src/middleware.d.ts` From e8e7d6c4a6f95d9681d85703909b575673e24749 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 10:27:24 -0400 Subject: [PATCH 02/14] feat(geotiff): add mutex() helper for serializing async tasks Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/concurrency.ts | 25 ++++++++ packages/geotiff/tests/concurrency.test.ts | 71 ++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 packages/geotiff/src/concurrency.ts create mode 100644 packages/geotiff/tests/concurrency.test.ts diff --git a/packages/geotiff/src/concurrency.ts b/packages/geotiff/src/concurrency.ts new file mode 100644 index 00000000..f2704ee4 --- /dev/null +++ b/packages/geotiff/src/concurrency.ts @@ -0,0 +1,25 @@ +/** + * Create a mutex: a function that runs async tasks one at a time. + * + * Tasks submitted while another is running are queued and executed in + * submission order — never concurrently with each other. + * + * Useful when an async operation must observe and mutate shared state + * across awaits without races. The TypeScript analogue of holding a + * `tokio::sync::Mutex` across an `await`. + * + * @example + * const lock = mutex(); + * const a = lock(async () => { ... }); // executes immediately + * const b = lock(async () => { ... }); // waits for `a` to settle, then runs + * + * @returns A function that schedules tasks on the queue. + */ +export function mutex(): (task: () => Promise) => Promise { + let tail: Promise = Promise.resolve(); + return (task: () => Promise): Promise => { + const result = tail.then(task, task); + tail = result.catch(() => {}); + return result; + }; +} diff --git a/packages/geotiff/tests/concurrency.test.ts b/packages/geotiff/tests/concurrency.test.ts new file mode 100644 index 00000000..ef9bd487 --- /dev/null +++ b/packages/geotiff/tests/concurrency.test.ts @@ -0,0 +1,71 @@ +import { describe, expect, it } from "vitest"; +import { mutex } from "../src/concurrency.js"; + +describe("mutex", () => { + it("runs tasks one at a time", async () => { + const lock = mutex(); + let active = 0; + let maxActive = 0; + + const task = async () => { + active++; + maxActive = Math.max(maxActive, active); + await new Promise((r) => setTimeout(r, 5)); + active--; + }; + + await Promise.all([lock(task), lock(task), lock(task), lock(task)]); + + expect(maxActive).toBe(1); + }); + + it("preserves submission order", async () => { + const lock = mutex(); + const order: number[] = []; + + const promises = [1, 2, 3, 4, 5].map((n) => + lock(async () => { + await new Promise((r) => setTimeout(r, Math.random() * 5)); + order.push(n); + }), + ); + + await Promise.all(promises); + expect(order).toEqual([1, 2, 3, 4, 5]); + }); + + it("delivers each task's result to its caller", async () => { + const lock = mutex(); + const a = lock(async () => 1); + const b = lock(async () => "two"); + const c = lock(async () => ({ three: 3 })); + + expect(await a).toBe(1); + expect(await b).toBe("two"); + expect(await c).toEqual({ three: 3 }); + }); + + it("does not block subsequent tasks when a task rejects", async () => { + const lock = mutex(); + const failure = lock(async () => { + throw new Error("boom"); + }); + const after = lock(async () => 42); + + await expect(failure).rejects.toThrow("boom"); + await expect(after).resolves.toBe(42); + }); + + it("propagates rejections only to the failing caller", async () => { + const lock = mutex(); + const ok1 = lock(async () => "a"); + const bad = lock(async () => { + throw new Error("nope"); + }); + const ok2 = lock(async () => "b"); + + expect(await ok1).toBe("a"); + await expect(bad).rejects.toThrow("nope"); + expect(await ok2).toBe("b"); + }); +}); From ab047fdecba34753f85cd28a0bed84618116ef1a Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 10:33:50 -0400 Subject: [PATCH 03/14] feat(geotiff): add SequentialBlockCache for sequential read-ahead Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/readahead-cache.ts | 79 ++++++++++++++++++ .../geotiff/tests/readahead-cache.test.ts | 83 +++++++++++++++++++ 2 files changed, 162 insertions(+) create mode 100644 packages/geotiff/src/readahead-cache.ts create mode 100644 packages/geotiff/tests/readahead-cache.test.ts diff --git a/packages/geotiff/src/readahead-cache.ts b/packages/geotiff/src/readahead-cache.ts new file mode 100644 index 00000000..26b62ad5 --- /dev/null +++ b/packages/geotiff/src/readahead-cache.ts @@ -0,0 +1,79 @@ +/** + * Contiguous-from-zero buffer cache. + * + * Stores a sequence of buffers logically concatenated from byte offset 0. + * Used by {@link SourceReadaheadCache} to retain previously fetched ranges. + * + * @internal + */ +export class SequentialBlockCache { + private readonly buffers: Uint8Array[] = []; + + /** Total cached length in bytes (sum of buffer lengths). */ + len = 0; + + /** Append a buffer to the end of the cache. */ + appendBuffer(buffer: ArrayBuffer): void { + const view = new Uint8Array(buffer); + this.len += view.byteLength; + this.buffers.push(view); + } + + /** True iff the byte range `[start, end)` is fully cached. */ + contains(start: number, end: number): boolean { + return end <= this.len; + } + + /** + * Slice the byte range `[start, end)` out of the cached buffers. + * + * Returns a zero-copy slice when the range fits in one block; copies + * into a fresh buffer when it spans multiple blocks. Caller must ensure + * the range is fully cached (see {@link contains}). + */ + slice(start: number, end: number): ArrayBuffer { + const outLen = end - start; + if (outLen === 0) { + return new ArrayBuffer(0); + } + + let remainingStart = start; + let remainingEnd = end; + const parts: Uint8Array[] = []; + + for (const block of this.buffers) { + const blockLen = block.byteLength; + if (remainingStart >= blockLen) { + remainingStart -= blockLen; + remainingEnd -= blockLen; + continue; + } + const sliceStart = remainingStart; + const sliceEnd = Math.min(remainingEnd, blockLen); + if (sliceEnd > sliceStart) { + parts.push(block.subarray(sliceStart, sliceEnd)); + } + remainingStart = 0; + if (remainingEnd <= blockLen) { + break; + } + remainingEnd -= blockLen; + } + + if (parts.length === 1) { + const part = parts[0]!; + return part.buffer.slice( + part.byteOffset, + part.byteOffset + part.byteLength, + ); + } + + const out = new Uint8Array(outLen); + let offset = 0; + for (const part of parts) { + out.set(part, offset); + offset += part.byteLength; + } + return out.buffer; + } +} diff --git a/packages/geotiff/tests/readahead-cache.test.ts b/packages/geotiff/tests/readahead-cache.test.ts new file mode 100644 index 00000000..5efdae8f --- /dev/null +++ b/packages/geotiff/tests/readahead-cache.test.ts @@ -0,0 +1,83 @@ +import { describe, expect, it } from "vitest"; +import { SequentialBlockCache } from "../src/readahead-cache.js"; + +const enc = new TextEncoder(); +const dec = new TextDecoder(); + +function asString(buf: ArrayBuffer): string { + return dec.decode(new Uint8Array(buf)); +} + +function buf(s: string): ArrayBuffer { + return enc.encode(s).buffer as ArrayBuffer; +} + +describe("SequentialBlockCache", () => { + it("starts empty", () => { + const cache = new SequentialBlockCache(); + expect(cache.len).toBe(0); + expect(cache.contains(0, 0)).toBe(true); + expect(cache.contains(0, 1)).toBe(false); + }); + + it("appendBuffer grows len", () => { + const cache = new SequentialBlockCache(); + cache.appendBuffer(buf("abc")); + expect(cache.len).toBe(3); + cache.appendBuffer(buf("def")); + expect(cache.len).toBe(6); + }); + + it("contains is true iff end <= len", () => { + const cache = new SequentialBlockCache(); + cache.appendBuffer(buf("abcd")); + expect(cache.contains(0, 4)).toBe(true); + expect(cache.contains(2, 4)).toBe(true); + expect(cache.contains(4, 4)).toBe(true); + expect(cache.contains(0, 5)).toBe(false); + expect(cache.contains(4, 5)).toBe(false); + }); + + it("slices a range that fits in one block", () => { + const cache = new SequentialBlockCache(); + cache.appendBuffer(buf("abcdef")); + expect(asString(cache.slice(0, 3))).toBe("abc"); + expect(asString(cache.slice(2, 5))).toBe("cde"); + expect(asString(cache.slice(0, 6))).toBe("abcdef"); + }); + + it("slices across multiple blocks", () => { + const cache = new SequentialBlockCache(); + cache.appendBuffer(buf("012")); + cache.appendBuffer(buf("345")); + cache.appendBuffer(buf("67")); + + expect(asString(cache.slice(0, 8))).toBe("01234567"); + expect(asString(cache.slice(2, 6))).toBe("2345"); + expect(asString(cache.slice(3, 8))).toBe("34567"); + expect(asString(cache.slice(5, 7))).toBe("56"); + }); + + it("handles empty buffers (port of async-tiff test_sequential_block_cache_empty_buffers)", () => { + const cache = new SequentialBlockCache(); + cache.appendBuffer(buf("012")); + cache.appendBuffer(buf("")); + cache.appendBuffer(buf("34")); + cache.appendBuffer(buf("")); + cache.appendBuffer(buf("5")); + cache.appendBuffer(buf("")); + cache.appendBuffer(buf("67")); + + expect(cache.contains(0, 3)).toBe(true); + expect(asString(cache.slice(0, 3))).toBe("012"); + expect(cache.contains(4, 7)).toBe(true); + expect(asString(cache.slice(4, 7))).toBe("456"); + expect(cache.contains(0, 8)).toBe(true); + expect(asString(cache.slice(0, 8))).toBe("01234567"); + expect(cache.contains(6, 6)).toBe(true); + expect(asString(cache.slice(6, 6))).toBe(""); + expect(cache.contains(6, 9)).toBe(false); + expect(cache.contains(9, 9)).toBe(false); + expect(cache.contains(8, 10)).toBe(false); + }); +}); From 8ffade8939eefcbb2eeb78f14990d15582aa7a16 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 10:36:33 -0400 Subject: [PATCH 04/14] feat(geotiff): add SourceReadaheadCache middleware Wraps a chunkd Source with a sequential read-ahead cache from offset 0. Initial fetches start at `initial` bytes and grow by `multiplier` on each subsequent underlying fetch. Bypasses negative-offset and full-file reads. Uses mutex() to serialize concurrent cache extension. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/readahead-cache.ts | 90 ++++++++- .../geotiff/tests/readahead-cache.test.ts | 175 +++++++++++++++++- 2 files changed, 263 insertions(+), 2 deletions(-) diff --git a/packages/geotiff/src/readahead-cache.ts b/packages/geotiff/src/readahead-cache.ts index 26b62ad5..98d32b47 100644 --- a/packages/geotiff/src/readahead-cache.ts +++ b/packages/geotiff/src/readahead-cache.ts @@ -1,3 +1,10 @@ +import type { + SourceCallback, + SourceMiddleware, + SourceRequest, +} from "@chunkd/source"; +import { mutex } from "./concurrency.js"; + /** * Contiguous-from-zero buffer cache. * @@ -20,7 +27,7 @@ export class SequentialBlockCache { } /** True iff the byte range `[start, end)` is fully cached. */ - contains(start: number, end: number): boolean { + contains(_start: number, end: number): boolean { return end <= this.len; } @@ -77,3 +84,84 @@ export class SequentialBlockCache { return out.buffer; } } + +/** + * Options for {@link SourceReadaheadCache}. + */ +export interface SourceReadaheadCacheOptions { + /** Bytes fetched on the first underlying read. */ + initial: number; + /** Multiplier applied to the previous fetch size on each subsequent read. */ + multiplier: number; +} + +/** + * A chunkd {@link SourceMiddleware} that caches sequential reads from offset 0 + * and grows underlying fetch sizes exponentially. + * + * Designed for TIFF metadata access, which is laid out near the start of the + * file: an initial small fetch covers most files, and subsequent fetches grow + * by `multiplier` to handle larger header structures with few round trips. + * + * Bypasses requests with negative offsets or undefined length (full-file + * reads) — those go directly to the next layer. + * + * Stateful per instance: pairs one-to-one with a single source's lifetime. + * + * @internal + */ +export class SourceReadaheadCache implements SourceMiddleware { + readonly name = "source:readahead-cache"; + + private readonly cache = new SequentialBlockCache(); + private readonly initial: number; + private readonly multiplier: number; + private readonly lock = mutex(); + + constructor(options: SourceReadaheadCacheOptions) { + this.initial = options.initial; + this.multiplier = options.multiplier; + } + + async fetch(req: SourceRequest, next: SourceCallback): Promise { + if (req.offset < 0 || req.length == null) { + return next(req); + } + const start = req.offset; + const end = req.offset + req.length; + const sourceSize = req.source.metadata?.size; + + return this.lock(async () => { + while (!this.cache.contains(start, end)) { + const cacheLen = this.cache.len; + const needed = end - cacheLen; + let fetchSize = Math.max(this.nextFetchSize(cacheLen), needed); + if (sourceSize != null) { + const remaining = sourceSize - cacheLen; + if (remaining <= 0) { + break; + } + fetchSize = Math.min(fetchSize, remaining); + } + const buf = await next({ + ...req, + offset: cacheLen, + length: fetchSize, + }); + if (buf.byteLength === 0) { + break; + } + this.cache.appendBuffer(buf); + } + const sliceEnd = Math.min(end, this.cache.len); + return this.cache.slice(start, sliceEnd); + }); + } + + private nextFetchSize(existingLen: number): number { + if (existingLen === 0) { + return this.initial; + } + return Math.round(existingLen * this.multiplier); + } +} diff --git a/packages/geotiff/tests/readahead-cache.test.ts b/packages/geotiff/tests/readahead-cache.test.ts index 5efdae8f..f59849c1 100644 --- a/packages/geotiff/tests/readahead-cache.test.ts +++ b/packages/geotiff/tests/readahead-cache.test.ts @@ -1,5 +1,9 @@ +import type { SourceCallback, SourceRequest } from "@chunkd/source"; import { describe, expect, it } from "vitest"; -import { SequentialBlockCache } from "../src/readahead-cache.js"; +import { + SequentialBlockCache, + SourceReadaheadCache, +} from "../src/readahead-cache.js"; const enc = new TextEncoder(); const dec = new TextDecoder(); @@ -81,3 +85,172 @@ describe("SequentialBlockCache", () => { expect(cache.contains(8, 10)).toBe(false); }); }); + +/** + * Build a fake `next` callback backed by a string. Counts the number of + * underlying fetches and serves bytes from the string. + */ +function makeNext(data: string): { + next: SourceCallback; + count: () => number; +} { + const bytes = enc.encode(data); + let count = 0; + const next: SourceCallback = async (req) => { + count++; + if (req.offset >= bytes.byteLength) { + return new ArrayBuffer(0); + } + const end = Math.min( + req.offset + (req.length ?? bytes.byteLength - req.offset), + bytes.byteLength, + ); + return bytes.buffer.slice(req.offset, end); + }; + return { next, count: () => count }; +} + +function makeReq(offset: number, length: number): SourceRequest { + // Source isn't inspected by SourceReadaheadCache except for `metadata?.size`, + // which we leave undefined here. + return { + source: { metadata: undefined } as never, + offset, + length, + }; +} + +describe("SourceReadaheadCache", () => { + it("name is set", () => { + const m = new SourceReadaheadCache({ initial: 32, multiplier: 2 }); + expect(m.name).toBe("source:readahead-cache"); + }); + + it("ports the async-tiff readahead test", async () => { + const { next, count } = makeNext("abcdefghijklmnopqrstuvwxyz"); + const m = new SourceReadaheadCache({ initial: 2, multiplier: 3 }); + + // Initial request — fetches 2 bytes. + let buf = await m.fetch(makeReq(0, 2), next); + expect(asString(buf)).toBe("ab"); + expect(count()).toBe(1); + + // Within cached range — no new fetch. + buf = await m.fetch(makeReq(1, 1), next); + expect(asString(buf)).toBe("b"); + expect(count()).toBe(1); + + // Exceeds cached range — second fetch of 6 bytes (2 * 3) added. + buf = await m.fetch(makeReq(2, 3), next); + expect(asString(buf)).toBe("cde"); + expect(count()).toBe(2); + + // Cache len now 8 (2 + 6); request fully inside — no new fetch. + buf = await m.fetch(makeReq(5, 3), next); + expect(asString(buf)).toBe("fgh"); + expect(count()).toBe(2); + + // Request exceeds the next growth size — single fetch sized to the need. + buf = await m.fetch(makeReq(8, 12), next); + expect(asString(buf)).toBe("ijklmnopqrst"); + expect(count()).toBe(3); + }); + + it("bypasses negative offset reads", async () => { + const { next, count } = makeNext("abcdefgh"); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + + const req: SourceRequest = { + source: { metadata: undefined } as never, + offset: -4, + length: 4, + }; + await m.fetch(req, next); + expect(count()).toBe(1); + }); + + it("bypasses reads with no length (full file)", async () => { + const { next, count } = makeNext("abcdefgh"); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + + const req: SourceRequest = { + source: { metadata: undefined } as never, + offset: 0, + length: undefined, + }; + await m.fetch(req, next); + expect(count()).toBe(1); + }); + + it("serializes concurrent fetches that grow the cache", async () => { + const { next, count } = makeNext("abcdefghijklmnop"); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + + const [a, b, c] = await Promise.all([ + m.fetch(makeReq(0, 4), next), + m.fetch(makeReq(4, 4), next), + m.fetch(makeReq(8, 4), next), + ]); + + expect(asString(a)).toBe("abcd"); + expect(asString(b)).toBe("efgh"); + expect(asString(c)).toBe("ijkl"); + + // First fetch: 4 bytes (initial). Second: 8 bytes (4*2). Cache then has + // 12 bytes, so the third concurrent request is satisfied without a third + // underlying fetch. + expect(count()).toBe(2); + }); + + it("clamps fetch size to file size when metadata.size is known", async () => { + const bytes = enc.encode("abcdef"); + let count = 0; + const next: SourceCallback = async (req) => { + count++; + // If our middleware sends an over-sized request, this would fail. + expect(req.offset + (req.length ?? 0)).toBeLessThanOrEqual( + bytes.byteLength, + ); + const end = Math.min(req.offset + (req.length ?? 0), bytes.byteLength); + return bytes.buffer.slice(req.offset, end); + }; + + const m = new SourceReadaheadCache({ initial: 100, multiplier: 2 }); + const req: SourceRequest = { + source: { metadata: { size: 6 } } as never, + offset: 0, + length: 6, + }; + + const buf = await m.fetch(req, next); + expect(asString(buf)).toBe("abcdef"); + expect(count).toBe(1); + }); + + it("breaks on EOF (zero-byte underlying fetch) instead of looping", async () => { + // Source claims size unset, returns empty buffer past offset 4. + const bytes = enc.encode("abcd"); + let count = 0; + const next: SourceCallback = async (req) => { + count++; + if (req.offset >= bytes.byteLength) { + return new ArrayBuffer(0); + } + const end = Math.min(req.offset + (req.length ?? 0), bytes.byteLength); + return bytes.buffer.slice(req.offset, end); + }; + + const m = new SourceReadaheadCache({ initial: 2, multiplier: 2 }); + const req: SourceRequest = { + source: { metadata: undefined } as never, + offset: 0, + length: 10, // more than the file has + }; + + // Should not hang; should return whatever's available without infinite loop. + const buf = await m.fetch(req, next); + // Cache stops growing at EOF; the slice copy is bounded by what's cached. + expect(count).toBeLessThan(20); + expect(buf.byteLength).toBeGreaterThanOrEqual(0); + }); +}); From 0863f70f94f3e56a17151d93fc86fcacabb61678 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 10:50:48 -0400 Subject: [PATCH 05/14] feat(geotiff)!: replace fixed chunk pipeline with read-ahead cache Drops the chunkSize and cacheSize options on GeoTIFF.fromUrl. Adds a multiplier option. The prefetch option now means initial fetch size for the new read-ahead cache. Closes #500 Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/geotiff.ts | 36 ++++++++++++------------- packages/geotiff/src/readahead-cache.ts | 7 +++-- 2 files changed, 20 insertions(+), 23 deletions(-) diff --git a/packages/geotiff/src/geotiff.ts b/packages/geotiff/src/geotiff.ts index 474e2c65..34771085 100644 --- a/packages/geotiff/src/geotiff.ts +++ b/packages/geotiff/src/geotiff.ts @@ -1,4 +1,3 @@ -import { SourceCache, SourceChunk } from "@chunkd/middleware"; import { SourceView } from "@chunkd/source"; import { SourceHttp } from "@chunkd/source-http"; import { SourceMemory } from "@chunkd/source-memory"; @@ -14,6 +13,7 @@ import type { CachedTags, GeoKeyDirectory } from "./ifd.js"; import { extractGeoKeyDirectory, prefetchTags } from "./ifd.js"; import { Overview } from "./overview.js"; import type { DecoderPool } from "./pool/pool.js"; +import { SourceReadaheadCache } from "./readahead-cache.js"; import type { Tile } from "./tile.js"; import { createTransform, index, xy } from "./transform.js"; @@ -223,38 +223,36 @@ export class GeoTIFF { /** * Create a new GeoTIFF from a URL. * + * Wraps the HTTP source with a sequential exponential read-ahead cache + * tuned for TIFF metadata: the first underlying fetch is `prefetch` bytes, + * and each subsequent fetch grows by `multiplier`. Tile data reads bypass + * the cache and use the raw HTTP source directly. + * * @param url The URL of the GeoTIFF to open. - * @param options Optional parameters for chunk size and cache size. - * @param options.chunkSize The minimum size for each request made to the source while reading header metadata. Defaults to 32KB. - * @param options.cacheSize The size of the cache for recently accessed header chunks. Currently no caching is applied to data fetches. Defaults to 1MB. - * @param options.prefetch Number of bytes to prefetch when reading TIFF tags and IFDs. Defaults to 32KB, which is enough for most tags and small IFDs. Increase if you have many tags or large IFDs. + * @param options Optional parameters for the read-ahead cache. + * @param options.prefetch Initial fetch size in bytes for header/metadata reads. Defaults to 32KB, which covers most COGs in a single round trip. + * @param options.multiplier Growth factor applied to the previous fetch size on each subsequent header read. Defaults to 2.0. * @returns A Promise that resolves to a GeoTIFF instance. */ static async fromUrl( url: string | URL, { - chunkSize = 1024 * 1024, - cacheSize = 10 * 1024 * 1024, prefetch = 32 * 1024, - }: { chunkSize?: number; cacheSize?: number; prefetch?: number } = {}, + multiplier = 2, + }: { prefetch?: number; multiplier?: number } = {}, ): Promise { const source = new SourceHttp(url, {}); - // Figure out optimal defaults in light of - // https://github.com/blacha/cogeotiff/issues/1431 - // Defaulting to 32KB chunks is too small for tile data. - // https://github.com/developmentseed/deck.gl-raster/issues/294 - - // read files in chunks - const chunk = new SourceChunk({ size: chunkSize }); - // 10MB cache for recently accessed chunks - const cache = new SourceCache({ size: cacheSize }); + const readahead = new SourceReadaheadCache({ + initial: prefetch, + multiplier, + }); - const view = new SourceView(source, [chunk, cache]); + const view = new SourceView(source, [readahead]); return await GeoTIFF.open({ // Use raw source for tile data to avoid unnecessary copying through the - // cache and chunk layers. + // read-ahead cache layer. dataSource: source, headerSource: view, prefetch, diff --git a/packages/geotiff/src/readahead-cache.ts b/packages/geotiff/src/readahead-cache.ts index 98d32b47..f68e850a 100644 --- a/packages/geotiff/src/readahead-cache.ts +++ b/packages/geotiff/src/readahead-cache.ts @@ -69,10 +69,9 @@ export class SequentialBlockCache { if (parts.length === 1) { const part = parts[0]!; - return part.buffer.slice( - part.byteOffset, - part.byteOffset + part.byteLength, - ); + const out = new Uint8Array(part.byteLength); + out.set(part); + return out.buffer; } const out = new Uint8Array(outLen); From 7fe029ef25c099d99e1151e57fe04b11ae93f6ba Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 10:58:41 -0400 Subject: [PATCH 06/14] test(geotiff): integration smoke test for read-ahead cache Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/integration-readahead.test.ts | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 packages/geotiff/tests/integration-readahead.test.ts diff --git a/packages/geotiff/tests/integration-readahead.test.ts b/packages/geotiff/tests/integration-readahead.test.ts new file mode 100644 index 00000000..110fea89 --- /dev/null +++ b/packages/geotiff/tests/integration-readahead.test.ts @@ -0,0 +1,68 @@ +import type { Source } from "@chunkd/source"; +import { SourceView } from "@chunkd/source"; +import { SourceFile } from "@chunkd/source-file"; +import { describe, expect, it } from "vitest"; +import { GeoTIFF } from "../src/geotiff.js"; +import { SourceReadaheadCache } from "../src/readahead-cache.js"; +import { fixturePath } from "./helpers.js"; + +/** + * Wrap a Source so we can count fetches that hit the underlying file. + */ +function counting(source: Source): { source: Source; count: () => number } { + let count = 0; + const wrapped: Source = { + type: source.type, + url: source.url, + metadata: source.metadata, + head: source.head.bind(source), + close: source.close?.bind(source), + fetch: async (offset, length, options) => { + count++; + return source.fetch(offset, length, options); + }, + }; + return { source: wrapped, count: () => count }; +} + +describe("SourceReadaheadCache integration", () => { + const path = fixturePath("uint8_rgb_deflate_block64_cog", "rasterio"); + + it("opens a fixture through the new middleware", async () => { + const file = new SourceFile(path); + const { source, count } = counting(file); + const view = new SourceView(source, [ + new SourceReadaheadCache({ initial: 32 * 1024, multiplier: 2 }), + ]); + + const tiff = await GeoTIFF.open({ + dataSource: file, + headerSource: view, + prefetch: 32 * 1024, + }); + + expect(tiff.width).toBeGreaterThan(0); + expect(tiff.height).toBeGreaterThan(0); + expect(count()).toBeGreaterThan(0); + }); + + it("opens with a tiny initial size and grows the cache as needed", async () => { + // Force multiple cache extensions by starting with a tiny initial size. + const file = new SourceFile(path); + const { source, count } = counting(file); + const view = new SourceView(source, [ + new SourceReadaheadCache({ initial: 256, multiplier: 2 }), + ]); + + const tiff = await GeoTIFF.open({ + dataSource: file, + headerSource: view, + prefetch: 256, + }); + + expect(tiff.width).toBeGreaterThan(0); + // With a 256-byte initial size, we expect more than one underlying fetch + // to read the IFD chain — proving the cache extends correctly. + expect(count()).toBeGreaterThan(1); + }); +}); From dcef212fc5adb7797da9d42d0d2cbb73c24b71a2 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 11:48:11 -0400 Subject: [PATCH 07/14] docs: revise spec to scope readahead cache to open phase After live-testing PR #509, the cache caused catastrophic over-fetching when zooming because cogeotiff lazily reads tile-offset/bytecount entries from the headerSource for previously-untouched IFDs. Spec now adds: - disable() on SourceReadaheadCache, called after open + prefetchTags - Lazy per-IFD bulk prefetch on first Overview.fetchTile - Default prefetch bumped 32 KiB -> 64 KiB Co-Authored-By: Claude Opus 4.7 (1M context) --- ...26-05-05-geotiff-readahead-cache-design.md | 80 ++++++++++++++++++- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md b/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md index c213ac71..e41783f7 100644 --- a/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md +++ b/dev-docs/specs/2026-05-05-geotiff-readahead-cache-design.md @@ -1,8 +1,8 @@ # GeoTIFF exponential read-ahead cache -**Date:** 2026-05-05 +**Date:** 2026-05-05 (revised 2026-05-06) **Issue:** [#500](https://github.com/developmentseed/deck.gl-raster/issues/500) -**Status:** Approved, ready for implementation plan +**Status:** Initial implementation in PR [#509](https://github.com/developmentseed/deck.gl-raster/pull/509); follow-up refinements specified below. ## Problem @@ -142,9 +142,85 @@ Mock a `Source.fetch` with a counter and assert that opening a real fixture take - No timeouts or cancellation in `mutex()`. - No per-source registry — one middleware instance is created per `fromUrl` call and tied to that source's lifetime, same as the existing `SourceChunk`/`SourceCache` lifecycle. +## Revision (2026-05-06): scope cache to the open phase + +After live-testing PR [#509](https://github.com/developmentseed/deck.gl-raster/pull/509) against the land-cover example, the readahead cache caused catastrophic over-fetching when zooming. The middleware was being consulted *throughout the lifetime* of the `Tiff` instance — not just during initial `Tiff.create()` — because cogeotiff/core lazily reads tile metadata from the `headerSource` whenever a tile from a previously-untouched IFD is requested. + +Concretely, with default settings, panning the land-cover fixture issued underlying fetches of 14 MiB, 42 MiB, 127 MiB, 382 MiB. Each new far-offset request landed past `cache.len`, and the exponential growth (`fetchSize = round(cache.len * multiplier)`) blew up. + +### Root causes + +1. **cogeotiff lazy IFD reads.** When `Overview.fetchTile()` runs against an IFD whose tag values weren't bulk-read at open time, cogeotiff issues per-entry 4–8 byte reads against the `headerSource` for `TileOffsets`/`TileByteCounts`. Today's [`prefetchTags`](../../packages/geotiff/src/ifd.ts) only runs on the *primary* image — overviews and masks are not prefetched. +2. **Cache strategy mismatch.** `SourceReadaheadCache` is a sequential-from-zero cache. It's right for the open phase (IFDs and tag values cluster near the start of the file), but wrong for arbitrary-offset reads after open. Each far-offset request pulls the cache forward exponentially. +3. **Initial size too small for some files.** Default `prefetch = 32 KiB` requires 3 underlying fetches even for moderate metadata. geotiff.js uses 65536 (64 KiB) blocks; async-tiff uses 32 KiB. + +### Decisions + +1. **The readahead cache runs *only during the open phase*.** After `Tiff.create()` and the initial `prefetchTags(primaryImage)` complete, `SourceReadaheadCache.disable()` is called. From that point on, every call to `SourceReadaheadCache.fetch(req, next)` short-circuits to `next(req)` — the cache is neither consulted nor extended. Cogeotiff still holds a reference to the wrapped `SourceView`, but the middleware becomes a no-op pass-through. We do not mutate the `Tiff` instance; we don't need to. +2. **Lazy per-IFD bulk prefetch on first tile request.** Each `Overview` lazily triggers a one-shot bulk read of `TileOffsets` + `TileByteCounts` (via cogeotiff's `image.fetch(TiffTag.…)` API) on its first `fetchTile`. The overview caches the resulting promise so concurrent first-tile requests share a single underlying fetch. Since these calls happen post-`disable()`, they bypass the readahead cache and go straight to raw HTTP — exactly one bulk request per array, per overview. +3. **Default `prefetch` bumped from 32 KiB to 64 KiB.** Aligns with geotiff.js's default block size; cuts one round trip on moderately-sized COGs without measurably penalizing tiny ones. The multiplier dominates open-time round-trip count, but a slightly larger initial moves us closer to one-shot opens for typical files. +4. **Background pre-warming is *not* in scope.** Hooking into `onTilesLoaded` to prefetch unvisited overviews is appealing but adds scheduling complexity. Track as a follow-up issue once the lazy mechanism above is in place — it can be layered on by calling the same per-IFD prefetch path opportunistically. + +### `SourceReadaheadCache.disable()` contract + +```ts +class SourceReadaheadCache implements SourceMiddleware { + // ... + /** + * Permanently bypass the cache. + * + * After this is called, every {@link fetch} returns `next(req)` immediately + * — no cache consultation, no cache extension. Existing in-flight requests + * complete normally (the mutex preserves serialization). + * + * Intended to be called once `GeoTIFF.fromUrl` has finished its open-phase + * reads (`Tiff.create` + `prefetchTags(primaryImage)`). At that point the + * readahead cache has done its job; subsequent reads from cogeotiff are at + * arbitrary offsets (lazy IFD lookups, GDAL ghost-header probes) and do + * not benefit from sequential-from-zero growth. + * + * Idempotent. One-way: there is no `enable()`. + */ + disable(): void; +} +``` + +### Per-overview lazy prefetch contract + +`Overview.fetchTile` becomes: + +```ts +async fetchTile(x, y, options): Promise { + await this.ensureTagsLoaded(); // memoized; resolves once + return /* existing fetch path */; +} + +private ensureTagsLoaded(): Promise { + if (!this._tagsPromise) { + this._tagsPromise = Promise.all([ + this.dataImage.fetch(TiffTag.TileOffsets), + this.dataImage.fetch(TiffTag.TileByteCounts), + this.maskImage?.fetch(TiffTag.TileOffsets) ?? Promise.resolve(null), + this.maskImage?.fetch(TiffTag.TileByteCounts) ?? Promise.resolve(null), + ]).then(() => undefined); + } + return this._tagsPromise; +} +``` + +`Overview.fetchTiles` calls `ensureTagsLoaded` once before launching parallel tile fetches. The primary image's `GeoTIFF.fetchTile` doesn't need this — `prefetchTags` has already loaded those tags during open. + +### Tests + +Update existing tests and add new ones: +- `readahead-cache.test.ts`: add a test for `disable()` — fetches before disable hit the cache normally; fetches after disable always pass through, never grow `cache.len`. +- `integration-readahead.test.ts`: add an assertion that after open, simulated tile-offset reads at far offsets do *not* trigger cache growth (count underlying fetches; there should be one fetch per requested tag, not exponential growth). +- New: `overview.test.ts` (or extend an existing test): assert that the second tile request from an overview does not trigger any new underlying tag fetches (i.e. `ensureTagsLoaded` is memoized). + ## References - Issue: [developmentseed/deck.gl-raster#500](https://github.com/developmentseed/deck.gl-raster/issues/500) +- Initial PR: [developmentseed/deck.gl-raster#509](https://github.com/developmentseed/deck.gl-raster/pull/509) - Reference implementation: [developmentseed/async-tiff PR #140](https://github.com/developmentseed/async-tiff/pull/140), file [`src/metadata/cache.rs`](https://github.com/developmentseed/async-tiff/blob/3dd77e3/src/metadata/cache.rs) - Existing source pipeline: [`packages/geotiff/src/geotiff.ts:233-262`](../../packages/geotiff/src/geotiff.ts#L233-L262) - chunkd `SourceMiddleware` interface: `@chunkd/source/build/src/middleware.d.ts` From ded30fbdc312ae166f3785cd67f1f67eaec7e22d Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 11:55:26 -0400 Subject: [PATCH 08/14] feat(geotiff): add SourceReadaheadCache.disable() for one-way bypass Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/readahead-cache.ts | 26 ++++++++ .../geotiff/tests/readahead-cache.test.ts | 65 +++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/packages/geotiff/src/readahead-cache.ts b/packages/geotiff/src/readahead-cache.ts index f68e850a..f3e83a15 100644 --- a/packages/geotiff/src/readahead-cache.ts +++ b/packages/geotiff/src/readahead-cache.ts @@ -116,13 +116,39 @@ export class SourceReadaheadCache implements SourceMiddleware { private readonly initial: number; private readonly multiplier: number; private readonly lock = mutex(); + private disabled = false; constructor(options: SourceReadaheadCacheOptions) { this.initial = options.initial; this.multiplier = options.multiplier; } + /** + * Permanently bypass the cache. + * + * After this is called, every {@link fetch} returns `next(req)` immediately + * — no cache consultation, no cache extension. The flag is checked before + * acquiring the serialization mutex, so post-disable requests do not queue + * behind in-flight work. + * + * Intended to be called once `GeoTIFF.fromUrl` has finished its open-phase + * reads (`Tiff.create` + `prefetchTags(primaryImage)`). At that point the + * sequential read-ahead cache has done its job; subsequent reads from + * cogeotiff are at arbitrary offsets (lazy IFD lookups, GDAL ghost-header + * probes) and do not benefit from sequential-from-zero growth — in fact + * they cause catastrophic over-fetching as the cache grows exponentially + * to encompass each new far-offset request. + * + * Idempotent. One-way: there is no `enable()`. + */ + disable(): void { + this.disabled = true; + } + async fetch(req: SourceRequest, next: SourceCallback): Promise { + if (this.disabled) { + return next(req); + } if (req.offset < 0 || req.length == null) { return next(req); } diff --git a/packages/geotiff/tests/readahead-cache.test.ts b/packages/geotiff/tests/readahead-cache.test.ts index f59849c1..7fb75776 100644 --- a/packages/geotiff/tests/readahead-cache.test.ts +++ b/packages/geotiff/tests/readahead-cache.test.ts @@ -254,3 +254,68 @@ describe("SourceReadaheadCache", () => { expect(buf.byteLength).toBeGreaterThanOrEqual(0); }); }); + +describe("SourceReadaheadCache.disable()", () => { + it("after disable, every fetch passes through to next without consulting cache", async () => { + const { next, count } = makeNext("abcdefghijklmnop"); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + + // Pre-disable fetch: cache extends. + await m.fetch(makeReq(0, 4), next); + expect(count()).toBe(1); + + m.disable(); + + // Post-disable: cache is bypassed. Each request hits next directly. + await m.fetch(makeReq(0, 4), next); + expect(count()).toBe(2); + await m.fetch(makeReq(0, 4), next); + expect(count()).toBe(3); + }); + + it("after disable, far-offset reads do not trigger cache growth", async () => { + const { next, count } = makeNext("a".repeat(1024)); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + + // Establish some cache. + await m.fetch(makeReq(0, 4), next); + expect(count()).toBe(1); + + m.disable(); + + // A far-offset read post-disable issues exactly one underlying fetch + // — no exponential growth. + await m.fetch(makeReq(512, 4), next); + expect(count()).toBe(2); + }); + + it("disable() is idempotent", () => { + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + m.disable(); + m.disable(); + m.disable(); + // No throw; still works as a pass-through. + expect(m.name).toBe("source:readahead-cache"); + }); + + it("preserves bypass behavior for negative offsets and undefined length after disable", async () => { + const { next, count } = makeNext("abcd"); + const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); + m.disable(); + + const negReq: SourceRequest = { + source: { metadata: undefined } as never, + offset: -2, + length: 2, + }; + const fullReq: SourceRequest = { + source: { metadata: undefined } as never, + offset: 0, + length: undefined, + }; + + await m.fetch(negReq, next); + await m.fetch(fullReq, next); + expect(count()).toBe(2); + }); +}); From 4bf6108d9e54365f763cf3228762f70e8757682a Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 6 May 2026 14:35:43 -0400 Subject: [PATCH 09/14] feat(geotiff): scope readahead cache to open phase; bump default prefetch to 64 KiB Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/geotiff.ts | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/packages/geotiff/src/geotiff.ts b/packages/geotiff/src/geotiff.ts index 34771085..86b6fcef 100644 --- a/packages/geotiff/src/geotiff.ts +++ b/packages/geotiff/src/geotiff.ts @@ -228,16 +228,31 @@ export class GeoTIFF { * and each subsequent fetch grows by `multiplier`. Tile data reads bypass * the cache and use the raw HTTP source directly. * + * The cache is **only active during the open phase**. Once `Tiff.create` + * and `prefetchTags(primaryImage)` finish, {@link SourceReadaheadCache.disable} + * is called: every subsequent fetch through the wrapped source becomes a + * pass-through to raw HTTP. This is intentional — cogeotiff/core lazily + * reads tile-offset/bytecount entries from the header source whenever a + * tile from a previously-untouched IFD is requested, and those reads are + * at arbitrary far offsets. With the cache active they would each pull + * the cache forward exponentially (e.g. a tile lookup at offset 8 MB with + * cache.len = 2 MB triggers a 4 MB underlying fetch). With it disabled, + * they go straight to raw HTTP and the cache stops mattering. + * + * Per-IFD bulk loading of `TileOffsets`/`TileByteCounts` happens lazily + * in {@link Overview.fetchTile} on first use — see {@link Overview} for + * details. + * * @param url The URL of the GeoTIFF to open. * @param options Optional parameters for the read-ahead cache. - * @param options.prefetch Initial fetch size in bytes for header/metadata reads. Defaults to 32KB, which covers most COGs in a single round trip. + * @param options.prefetch Initial fetch size in bytes for header/metadata reads. Defaults to 64KB, which covers most COGs in a single round trip. * @param options.multiplier Growth factor applied to the previous fetch size on each subsequent header read. Defaults to 2.0. * @returns A Promise that resolves to a GeoTIFF instance. */ static async fromUrl( url: string | URL, { - prefetch = 32 * 1024, + prefetch = 64 * 1024, multiplier = 2, }: { prefetch?: number; multiplier?: number } = {}, ): Promise { @@ -250,13 +265,20 @@ export class GeoTIFF { const view = new SourceView(source, [readahead]); - return await GeoTIFF.open({ + const geotiff = await GeoTIFF.open({ // Use raw source for tile data to avoid unnecessary copying through the // read-ahead cache layer. dataSource: source, headerSource: view, prefetch, }); + + // Open phase complete: scope the cache to the open phase only. From here + // on, all reads (lazy overview tag lookups, GDAL ghost-header probes, + // etc.) bypass the cache and go straight to raw HTTP. + readahead.disable(); + + return geotiff; } // ── Properties from the primary image ───────────────────────────────── From bb55cedfb88af9006a0555079cb938ccefebb358 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 12 May 2026 11:38:55 -0400 Subject: [PATCH 10/14] feat(geotiff): bulk-prefetch TileOffsets/TileByteCounts per overview on first tile request Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/overview.ts | 45 ++++++++++ packages/geotiff/tests/overview.test.ts | 106 ++++++++++++++++++++++++ 2 files changed, 151 insertions(+) create mode 100644 packages/geotiff/tests/overview.test.ts diff --git a/packages/geotiff/src/overview.ts b/packages/geotiff/src/overview.ts index 195d9229..e03ba1c1 100644 --- a/packages/geotiff/src/overview.ts +++ b/packages/geotiff/src/overview.ts @@ -1,4 +1,5 @@ import type { Source, TiffImage, TiffImageTileCount } from "@cogeotiff/core"; +import { TiffTag } from "@cogeotiff/core"; import type { Affine } from "@developmentseed/affine"; import { compose, scale } from "@developmentseed/affine"; import type { ProjJson } from "@developmentseed/proj"; @@ -32,6 +33,9 @@ export class Overview { /** The IFD for the mask associated with this overview level, if any. */ readonly maskImage: TiffImage | null = null; + /** Memoized promise from {@link ensureTagsLoaded}. */ + private _tagsPromise: Promise | null = null; + constructor( geotiff: GeoTIFF, gkd: GeoKeyDirectory, @@ -118,9 +122,49 @@ export class Overview { signal?: AbortSignal; } = {}, ): Promise { + await this.ensureTagsLoaded(); return await fetchTile(this, x, y, options); } + /** + * Bulk-load `TileOffsets` and `TileByteCounts` for this overview's data IFD + * (and mask IFD, if present) on first call. Subsequent calls return the + * same memoized promise — no additional underlying fetches. + * + * Why this exists: cogeotiff/core lazily reads individual entries from the + * tile-offset/bytecount arrays via the header source, one 4–8 byte entry + * per tile request. For overviews not pre-loaded by `prefetchTags` (i.e. + * everything except the primary image), this means many tiny per-tile + * range requests on every tile fetch. Calling + * `image.fetch(TiffTag.TileOffsets)` once forces cogeotiff to bulk-load + * the full array; thereafter all per-tile lookups are served from memory. + * + * The bulk fetch goes through the source originally passed to + * `Tiff.create` — for {@link GeoTIFF.fromUrl}, that's the wrapped header + * source. After {@link GeoTIFF.fromUrl} disables its read-ahead cache, + * the wrapper is a pass-through, so this read hits raw HTTP directly. + */ + ensureTagsLoaded(): Promise { + if (this._tagsPromise === null) { + this._tagsPromise = this.loadTags(); + } + return this._tagsPromise; + } + + private async loadTags(): Promise { + const dataPromises = [ + this.image.fetch(TiffTag.TileOffsets), + this.image.fetch(TiffTag.TileByteCounts), + ]; + const maskPromises = this.maskImage + ? [ + this.maskImage.fetch(TiffTag.TileOffsets), + this.maskImage.fetch(TiffTag.TileByteCounts), + ] + : []; + await Promise.all([...dataPromises, ...maskPromises]); + } + /** * Fetch multiple tiles in parallel. * @@ -141,6 +185,7 @@ export class Overview { signal?: AbortSignal; } = {}, ): Promise { + await this.ensureTagsLoaded(); return await fetchTiles(this, xy, options); } diff --git a/packages/geotiff/tests/overview.test.ts b/packages/geotiff/tests/overview.test.ts new file mode 100644 index 00000000..791def2f --- /dev/null +++ b/packages/geotiff/tests/overview.test.ts @@ -0,0 +1,106 @@ +import type { TiffImage } from "@cogeotiff/core"; +import { TiffTag } from "@cogeotiff/core"; +import { describe, expect, it } from "vitest"; +import { Overview } from "../src/overview.js"; + +/** + * A minimal fake TiffImage that counts calls to `fetch` per tag. + * + * Only the methods/fields actually used by `ensureTagsLoaded` need to work; + * everything else is stubbed since these tests don't call `fetchTile`. + */ +function makeFakeImage(): { + image: TiffImage; + fetchCalls: () => Map; +} { + const calls = new Map(); + const image = { + fetch: async (tag: number) => { + calls.set(tag, (calls.get(tag) ?? 0) + 1); + return null; + }, + } as unknown as TiffImage; + return { image, fetchCalls: () => calls }; +} + +describe("Overview.ensureTagsLoaded", () => { + it("bulk-fetches TileOffsets and TileByteCounts on first call", async () => { + const data = makeFakeImage(); + const overview = new Overview( + {} as never, // geotiff + {} as never, // gkd + data.image, + null, + {} as never, // cachedTags + { fetch: async () => new ArrayBuffer(0) }, // dataSource + ); + + await overview.ensureTagsLoaded(); + + const calls = data.fetchCalls(); + expect(calls.get(TiffTag.TileOffsets)).toBe(1); + expect(calls.get(TiffTag.TileByteCounts)).toBe(1); + }); + + it("memoizes: a second call does not refetch", async () => { + const data = makeFakeImage(); + const overview = new Overview( + {} as never, + {} as never, + data.image, + null, + {} as never, + { fetch: async () => new ArrayBuffer(0) }, + ); + + await overview.ensureTagsLoaded(); + await overview.ensureTagsLoaded(); + await overview.ensureTagsLoaded(); + + const calls = data.fetchCalls(); + expect(calls.get(TiffTag.TileOffsets)).toBe(1); + expect(calls.get(TiffTag.TileByteCounts)).toBe(1); + }); + + it("dedupes concurrent first-call invocations into one underlying load", async () => { + const data = makeFakeImage(); + const overview = new Overview( + {} as never, + {} as never, + data.image, + null, + {} as never, + { fetch: async () => new ArrayBuffer(0) }, + ); + + await Promise.all([ + overview.ensureTagsLoaded(), + overview.ensureTagsLoaded(), + overview.ensureTagsLoaded(), + ]); + + const calls = data.fetchCalls(); + expect(calls.get(TiffTag.TileOffsets)).toBe(1); + expect(calls.get(TiffTag.TileByteCounts)).toBe(1); + }); + + it("also fetches mask tags when a mask IFD is present", async () => { + const data = makeFakeImage(); + const mask = makeFakeImage(); + const overview = new Overview( + {} as never, + {} as never, + data.image, + mask.image, + {} as never, + { fetch: async () => new ArrayBuffer(0) }, + ); + + await overview.ensureTagsLoaded(); + + expect(data.fetchCalls().get(TiffTag.TileOffsets)).toBe(1); + expect(data.fetchCalls().get(TiffTag.TileByteCounts)).toBe(1); + expect(mask.fetchCalls().get(TiffTag.TileOffsets)).toBe(1); + expect(mask.fetchCalls().get(TiffTag.TileByteCounts)).toBe(1); + }); +}); From 272d996fb5b9d8f3735514da49f7e6aafb007fb5 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 12 May 2026 11:49:54 -0400 Subject: [PATCH 11/14] test(geotiff): assert disable() prevents post-open cache growth Co-Authored-By: Claude Opus 4.7 (1M context) --- .../tests/integration-readahead.test.ts | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/packages/geotiff/tests/integration-readahead.test.ts b/packages/geotiff/tests/integration-readahead.test.ts index 5a31be13..a599b3e9 100644 --- a/packages/geotiff/tests/integration-readahead.test.ts +++ b/packages/geotiff/tests/integration-readahead.test.ts @@ -64,4 +64,43 @@ describe("SourceReadaheadCache integration", () => { // to read the IFD chain — proving the cache extends correctly. expect(count()).toBeGreaterThan(1); }); + + it("disables the readahead cache after open completes (full fromUrl path)", async () => { + const file = new SourceFile(path); + const counter = counting(file); + + const readahead = new SourceReadaheadCache({ + initial: 32 * 1024, + multiplier: 2, + }); + const view = new SourceView(counter.source, [readahead]); + + // Mirror what GeoTIFF.fromUrl does internally: + await GeoTIFF.open({ + dataSource: file, + headerSource: view, + prefetch: 32 * 1024, + }); + readahead.disable(); + + const fetchesBeforePostOpenReads = counter.count(); + + // Issue some far-offset reads through the wrapped view — exactly what + // cogeotiff would do when lazy-loading an unfetched overview's tag. + // Each post-disable read must hit raw underlying source 1:1; the cache + // must not extend. + const head = await view.head(); + const fileSize = head.size!; + + await view.fetch(Math.floor(fileSize * 0.5), 8); + await view.fetch(Math.floor(fileSize * 0.7), 8); + await view.fetch(Math.floor(fileSize * 0.9), 8); + + const fetchesAfterPostOpenReads = counter.count(); + const postOpenFetchCount = + fetchesAfterPostOpenReads - fetchesBeforePostOpenReads; + + // Three direct reads → exactly three underlying fetches. No growth. + expect(postOpenFetchCount).toBe(3); + }); }); From 8d27c518d97238458d360db3a02c5a2528d5edc4 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 12 May 2026 11:56:46 -0400 Subject: [PATCH 12/14] refactor(geotiff): move source middleware into src/source/ Group SourceReadaheadCache and its mutex() helper under a source/ subdirectory to make the chunkd Source middleware layer explicit. No behavior change. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/geotiff.ts | 2 +- packages/geotiff/src/{ => source}/concurrency.ts | 0 packages/geotiff/src/{ => source}/readahead-cache.ts | 0 packages/geotiff/tests/integration-readahead.test.ts | 2 +- packages/geotiff/tests/{ => source}/concurrency.test.ts | 2 +- packages/geotiff/tests/{ => source}/readahead-cache.test.ts | 2 +- 6 files changed, 4 insertions(+), 4 deletions(-) rename packages/geotiff/src/{ => source}/concurrency.ts (100%) rename packages/geotiff/src/{ => source}/readahead-cache.ts (100%) rename packages/geotiff/tests/{ => source}/concurrency.test.ts (97%) rename packages/geotiff/tests/{ => source}/readahead-cache.test.ts (99%) diff --git a/packages/geotiff/src/geotiff.ts b/packages/geotiff/src/geotiff.ts index 3ba97522..03e49012 100644 --- a/packages/geotiff/src/geotiff.ts +++ b/packages/geotiff/src/geotiff.ts @@ -13,7 +13,7 @@ import type { CachedTags, GeoKeyDirectory } from "./ifd.js"; import { extractGeoKeyDirectory, prefetchTags } from "./ifd.js"; import { Overview } from "./overview.js"; import type { DecoderPool } from "./pool/pool.js"; -import { SourceReadaheadCache } from "./readahead-cache.js"; +import { SourceReadaheadCache } from "./source/readahead-cache.js"; import type { Tile } from "./tile.js"; import { createTransform, index, xy } from "./transform.js"; diff --git a/packages/geotiff/src/concurrency.ts b/packages/geotiff/src/source/concurrency.ts similarity index 100% rename from packages/geotiff/src/concurrency.ts rename to packages/geotiff/src/source/concurrency.ts diff --git a/packages/geotiff/src/readahead-cache.ts b/packages/geotiff/src/source/readahead-cache.ts similarity index 100% rename from packages/geotiff/src/readahead-cache.ts rename to packages/geotiff/src/source/readahead-cache.ts diff --git a/packages/geotiff/tests/integration-readahead.test.ts b/packages/geotiff/tests/integration-readahead.test.ts index a599b3e9..101b0664 100644 --- a/packages/geotiff/tests/integration-readahead.test.ts +++ b/packages/geotiff/tests/integration-readahead.test.ts @@ -3,7 +3,7 @@ import { SourceView } from "@chunkd/source"; import { SourceFile } from "@chunkd/source-file"; import { describe, expect, it } from "vitest"; import { GeoTIFF } from "../src/geotiff.js"; -import { SourceReadaheadCache } from "../src/readahead-cache.js"; +import { SourceReadaheadCache } from "../src/source/readahead-cache.js"; import { fixturePath } from "./helpers.js"; /** diff --git a/packages/geotiff/tests/concurrency.test.ts b/packages/geotiff/tests/source/concurrency.test.ts similarity index 97% rename from packages/geotiff/tests/concurrency.test.ts rename to packages/geotiff/tests/source/concurrency.test.ts index ef9bd487..ed525b01 100644 --- a/packages/geotiff/tests/concurrency.test.ts +++ b/packages/geotiff/tests/source/concurrency.test.ts @@ -1,5 +1,5 @@ import { describe, expect, it } from "vitest"; -import { mutex } from "../src/concurrency.js"; +import { mutex } from "../../src/source/concurrency.js"; describe("mutex", () => { it("runs tasks one at a time", async () => { diff --git a/packages/geotiff/tests/readahead-cache.test.ts b/packages/geotiff/tests/source/readahead-cache.test.ts similarity index 99% rename from packages/geotiff/tests/readahead-cache.test.ts rename to packages/geotiff/tests/source/readahead-cache.test.ts index 7fb75776..71496d7e 100644 --- a/packages/geotiff/tests/readahead-cache.test.ts +++ b/packages/geotiff/tests/source/readahead-cache.test.ts @@ -3,7 +3,7 @@ import { describe, expect, it } from "vitest"; import { SequentialBlockCache, SourceReadaheadCache, -} from "../src/readahead-cache.js"; +} from "../../src/source/readahead-cache.js"; const enc = new TextEncoder(); const dec = new TextDecoder(); From 631f5b7f6f8d441fe986076966270d79826349c8 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 12 May 2026 12:14:03 -0400 Subject: [PATCH 13/14] feat(geotiff): freeze() preserves cache hits, add maxExtension cap, default multiplier 4 Addresses PR review feedback on #509: - disable() renamed to freeze(): after the open phase finishes, the cache still serves hits from memory but stops extending. Misses bypass to the underlying source directly. Previously disable() bypassed entirely, which forced an extra HTTP request even when the data was already cached. - New maxExtension option (default 4 MiB): caps how much the cache can grow in a single underlying fetch. If satisfying a request would require pulling more than the cap, the middleware bypasses for that request instead. Bounds the worst case when cogeotiff reads at a far offset during the open phase (e.g. GDAL ghost-header probes at EOF). - Default multiplier bumped 2 -> 4: faster cache expansion during open means fewer round trips for files with larger metadata regions. The maxExtension cap keeps the worst case bounded. Co-Authored-By: Claude Opus 4.7 (1M context) --- packages/geotiff/src/geotiff.ts | 34 +++--- .../geotiff/src/source/readahead-cache.ts | 101 ++++++++++++++---- .../tests/integration-readahead.test.ts | 46 ++++++-- .../tests/source/readahead-cache.test.ts | 82 ++++++++++---- 4 files changed, 194 insertions(+), 69 deletions(-) diff --git a/packages/geotiff/src/geotiff.ts b/packages/geotiff/src/geotiff.ts index 03e49012..2152c3b8 100644 --- a/packages/geotiff/src/geotiff.ts +++ b/packages/geotiff/src/geotiff.ts @@ -234,16 +234,17 @@ export class GeoTIFF { * and each subsequent fetch grows by `multiplier`. Tile data reads bypass * the cache and use the raw HTTP source directly. * - * The cache is **only active during the open phase**. Once `Tiff.create` - * and `prefetchTags(primaryImage)` finish, {@link SourceReadaheadCache.disable} - * is called: every subsequent fetch through the wrapped source becomes a - * pass-through to raw HTTP. This is intentional — cogeotiff/core lazily - * reads tile-offset/bytecount entries from the header source whenever a - * tile from a previously-untouched IFD is requested, and those reads are - * at arbitrary far offsets. With the cache active they would each pull - * the cache forward exponentially (e.g. a tile lookup at offset 8 MB with - * cache.len = 2 MB triggers a 4 MB underlying fetch). With it disabled, - * they go straight to raw HTTP and the cache stops mattering. + * The cache is **frozen at the end of the open phase**. Once `Tiff.create` + * and `prefetchTags(primaryImage)` finish, {@link SourceReadaheadCache.freeze} + * is called. After that, cache hits are still served from memory, but + * misses bypass to raw HTTP — the cache never extends again. This is + * intentional: cogeotiff/core lazily reads tile-offset/bytecount entries + * from the header source whenever a tile from a previously-untouched IFD + * is requested, and those reads are at arbitrary far offsets. With the + * cache able to extend, each one would pull the cache forward + * exponentially (e.g. a tile lookup at offset 8 MB with cache.len = 2 MB + * triggers a 4 MB underlying fetch). With it frozen, those reads go + * straight to raw HTTP and the cache stops growing. * * Per-IFD bulk loading of `TileOffsets`/`TileByteCounts` happens lazily * in {@link Overview.fetchTile} on first use — see {@link Overview} for @@ -252,7 +253,7 @@ export class GeoTIFF { * @param url The URL of the GeoTIFF to open. * @param options Optional parameters for the read-ahead cache. * @param options.prefetch Initial fetch size in bytes for header/metadata reads. Defaults to 64KB, which covers most COGs in a single round trip. - * @param options.multiplier Growth factor applied to the previous fetch size on each subsequent header read. Defaults to 2.0. + * @param options.multiplier Growth factor applied to the previous fetch size on each subsequent header read. Defaults to 4.0. * @param options.signal An optional {@link AbortSignal} to cancel the header reads. * @returns A Promise that resolves to a GeoTIFF instance. */ @@ -260,7 +261,7 @@ export class GeoTIFF { url: string | URL, { prefetch = 64 * 1024, - multiplier = 2, + multiplier = 4, signal, }: { prefetch?: number; @@ -302,10 +303,11 @@ export class GeoTIFF { signal, }); - // Open phase complete: scope the cache to the open phase only. From here - // on, all reads (lazy overview tag lookups, GDAL ghost-header probes, - // etc.) bypass the cache and go straight to raw HTTP. - readahead.disable(); + // Open phase complete: freeze the cache so it stops extending. Subsequent + // reads (lazy overview tag lookups, GDAL ghost-header probes, etc.) are + // still served from the cache if covered, but misses go straight to raw + // HTTP instead of triggering exponential growth. + readahead.freeze(); return geotiff; } diff --git a/packages/geotiff/src/source/readahead-cache.ts b/packages/geotiff/src/source/readahead-cache.ts index f3e83a15..49f2c26c 100644 --- a/packages/geotiff/src/source/readahead-cache.ts +++ b/packages/geotiff/src/source/readahead-cache.ts @@ -92,8 +92,20 @@ export interface SourceReadaheadCacheOptions { initial: number; /** Multiplier applied to the previous fetch size on each subsequent read. */ multiplier: number; + /** + * Maximum bytes a single underlying fetch may add to the cache. If + * satisfying a request would require extending the cache by more than this, + * the middleware bypasses the cache entirely for that request and serves it + * with one direct fetch instead. Bounds the worst case when cogeotiff reads + * at a far offset during the open phase (e.g. GDAL ghost-header probes at + * EOF). Defaults to {@link DEFAULT_MAX_EXTENSION}. + */ + maxExtension?: number; } +/** Default cap on per-fetch cache extension: 4 MiB. */ +export const DEFAULT_MAX_EXTENSION = 4 * 1024 * 1024; + /** * A chunkd {@link SourceMiddleware} that caches sequential reads from offset 0 * and grows underlying fetch sizes exponentially. @@ -102,8 +114,34 @@ export interface SourceReadaheadCacheOptions { * file: an initial small fetch covers most files, and subsequent fetches grow * by `multiplier` to handle larger header structures with few round trips. * - * Bypasses requests with negative offsets or undefined length (full-file - * reads) — those go directly to the next layer. + * # Lifecycle + * + * The cache has two states: + * + * - **Active** (the default): on a miss, the cache extends by exponentially + * growing reads. On a hit (range fully covered by `[0, cache.len)`), it + * serves directly from the in-memory buffer. + * - **Frozen** (after {@link freeze} is called): cache hits are still served, + * but misses bypass to `next()` directly — the cache never extends again. + * + * `GeoTIFF.fromUrl` calls {@link freeze} once the open phase (`Tiff.create` + + * `prefetchTags(primaryImage)`) finishes. From that point on, the cache acts + * as a read-only in-memory index of the bytes already pulled during open. + * + * # Bounded extension + * + * Even while active, a single underlying fetch is capped at + * {@link SourceReadaheadCacheOptions.maxExtension} bytes. If satisfying a + * request would require pulling more than the cap in one extension — e.g. + * the request is at a far offset, or it is just very large — the middleware + * bypasses for that request and returns `next(req)` directly. The cache + * stays at its current size; the rare far-offset read pays one round trip + * but does not pollute the cache. + * + * # Bypass cases + * + * Requests with negative offsets, or with `length == null` (full-file + * reads), always bypass to the next layer regardless of state. * * Stateful per instance: pairs one-to-one with a single source's lifetime. * @@ -115,40 +153,34 @@ export class SourceReadaheadCache implements SourceMiddleware { private readonly cache = new SequentialBlockCache(); private readonly initial: number; private readonly multiplier: number; + private readonly maxExtension: number; private readonly lock = mutex(); - private disabled = false; + private frozen = false; constructor(options: SourceReadaheadCacheOptions) { this.initial = options.initial; this.multiplier = options.multiplier; + this.maxExtension = options.maxExtension ?? DEFAULT_MAX_EXTENSION; } /** - * Permanently bypass the cache. - * - * After this is called, every {@link fetch} returns `next(req)` immediately - * — no cache consultation, no cache extension. The flag is checked before - * acquiring the serialization mutex, so post-disable requests do not queue - * behind in-flight work. + * Stop extending the cache. Hits continue to be served from memory; misses + * bypass to the next layer. * * Intended to be called once `GeoTIFF.fromUrl` has finished its open-phase - * reads (`Tiff.create` + `prefetchTags(primaryImage)`). At that point the - * sequential read-ahead cache has done its job; subsequent reads from - * cogeotiff are at arbitrary offsets (lazy IFD lookups, GDAL ghost-header - * probes) and do not benefit from sequential-from-zero growth — in fact - * they cause catastrophic over-fetching as the cache grows exponentially - * to encompass each new far-offset request. + * reads. At that point cogeotiff's subsequent reads are at arbitrary + * offsets (lazy IFD lookups, GDAL ghost-header probes, per-tile lookups) + * and do not benefit from sequential-from-zero growth — and in fact would + * cause catastrophic over-fetching as the cache grows exponentially to + * encompass each new far-offset request. * - * Idempotent. One-way: there is no `enable()`. + * Idempotent. One-way: there is no `unfreeze()`. */ - disable(): void { - this.disabled = true; + freeze(): void { + this.frozen = true; } async fetch(req: SourceRequest, next: SourceCallback): Promise { - if (this.disabled) { - return next(req); - } if (req.offset < 0 || req.length == null) { return next(req); } @@ -157,10 +189,33 @@ export class SourceReadaheadCache implements SourceMiddleware { const sourceSize = req.source.metadata?.size; return this.lock(async () => { + // Cache hits are always served from memory, regardless of frozen state. + if (this.cache.contains(start, end)) { + return this.cache.slice(start, end); + } + + // On miss after freeze: never extend; serve with a direct fetch. + if (this.frozen) { + return next(req); + } + + // While active: if extending would exceed the per-fetch cap, bypass + // for this request instead of pulling a large chunk into the cache. + // This guards against pathological reads during the open phase (e.g. + // a tag value or ghost header at the end of the file). + const needed = end - this.cache.len; + const naiveFetchSize = Math.max( + this.nextFetchSize(this.cache.len), + needed, + ); + if (naiveFetchSize > this.maxExtension) { + return next(req); + } + while (!this.cache.contains(start, end)) { const cacheLen = this.cache.len; - const needed = end - cacheLen; - let fetchSize = Math.max(this.nextFetchSize(cacheLen), needed); + const stepNeeded = end - cacheLen; + let fetchSize = Math.max(this.nextFetchSize(cacheLen), stepNeeded); if (sourceSize != null) { const remaining = sourceSize - cacheLen; if (remaining <= 0) { diff --git a/packages/geotiff/tests/integration-readahead.test.ts b/packages/geotiff/tests/integration-readahead.test.ts index 101b0664..bae599ba 100644 --- a/packages/geotiff/tests/integration-readahead.test.ts +++ b/packages/geotiff/tests/integration-readahead.test.ts @@ -65,7 +65,7 @@ describe("SourceReadaheadCache integration", () => { expect(count()).toBeGreaterThan(1); }); - it("disables the readahead cache after open completes (full fromUrl path)", async () => { + it("freezes the readahead cache after open completes (full fromUrl path)", async () => { const file = new SourceFile(path); const counter = counting(file); @@ -81,26 +81,52 @@ describe("SourceReadaheadCache integration", () => { headerSource: view, prefetch: 32 * 1024, }); - readahead.disable(); + readahead.freeze(); const fetchesBeforePostOpenReads = counter.count(); - // Issue some far-offset reads through the wrapped view — exactly what - // cogeotiff would do when lazy-loading an unfetched overview's tag. - // Each post-disable read must hit raw underlying source 1:1; the cache - // must not extend. + // Issue reads at the very end of the file — definitely past whatever the + // cache extended to during open. Each post-freeze miss must hit raw + // underlying source 1:1; the cache must not extend. const head = await view.head(); const fileSize = head.size!; - await view.fetch(Math.floor(fileSize * 0.5), 8); - await view.fetch(Math.floor(fileSize * 0.7), 8); - await view.fetch(Math.floor(fileSize * 0.9), 8); + await view.fetch(fileSize - 24, 8); + await view.fetch(fileSize - 16, 8); + await view.fetch(fileSize - 8, 8); const fetchesAfterPostOpenReads = counter.count(); const postOpenFetchCount = fetchesAfterPostOpenReads - fetchesBeforePostOpenReads; - // Three direct reads → exactly three underlying fetches. No growth. + // Three end-of-file reads → exactly three underlying fetches. No growth. expect(postOpenFetchCount).toBe(3); }); + + it("after freeze, cached ranges are served from memory without an underlying fetch", async () => { + const file = new SourceFile(path); + const counter = counting(file); + + const readahead = new SourceReadaheadCache({ + initial: 32 * 1024, + multiplier: 2, + }); + const view = new SourceView(counter.source, [readahead]); + + await GeoTIFF.open({ + dataSource: file, + headerSource: view, + prefetch: 32 * 1024, + }); + readahead.freeze(); + + const fetchesBeforePostOpenReads = counter.count(); + + // Re-read a range near the start: definitely covered by the cache + // (open phase always pulled the first 32 KiB at least). + await view.fetch(0, 16); + await view.fetch(100, 16); + + expect(counter.count()).toBe(fetchesBeforePostOpenReads); + }); }); diff --git a/packages/geotiff/tests/source/readahead-cache.test.ts b/packages/geotiff/tests/source/readahead-cache.test.ts index 71496d7e..bdf9bbb7 100644 --- a/packages/geotiff/tests/source/readahead-cache.test.ts +++ b/packages/geotiff/tests/source/readahead-cache.test.ts @@ -255,25 +255,24 @@ describe("SourceReadaheadCache", () => { }); }); -describe("SourceReadaheadCache.disable()", () => { - it("after disable, every fetch passes through to next without consulting cache", async () => { +describe("SourceReadaheadCache.freeze()", () => { + it("after freeze, cache hits are still served from memory", async () => { const { next, count } = makeNext("abcdefghijklmnop"); const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); - // Pre-disable fetch: cache extends. + // Pre-freeze fetch: cache extends to 4 bytes. await m.fetch(makeReq(0, 4), next); expect(count()).toBe(1); - m.disable(); + m.freeze(); - // Post-disable: cache is bypassed. Each request hits next directly. - await m.fetch(makeReq(0, 4), next); - expect(count()).toBe(2); - await m.fetch(makeReq(0, 4), next); - expect(count()).toBe(3); + // Post-freeze: range fully within cache → served from memory, no fetch. + const buf = await m.fetch(makeReq(0, 4), next); + expect(asString(buf)).toBe("abcd"); + expect(count()).toBe(1); }); - it("after disable, far-offset reads do not trigger cache growth", async () => { + it("after freeze, cache misses bypass to next without extending", async () => { const { next, count } = makeNext("a".repeat(1024)); const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); @@ -281,27 +280,30 @@ describe("SourceReadaheadCache.disable()", () => { await m.fetch(makeReq(0, 4), next); expect(count()).toBe(1); - m.disable(); + m.freeze(); - // A far-offset read post-disable issues exactly one underlying fetch - // — no exponential growth. + // Far-offset read post-freeze: one direct fetch, no exponential growth. await m.fetch(makeReq(512, 4), next); expect(count()).toBe(2); + + // Another far-offset read: another single direct fetch. + await m.fetch(makeReq(800, 4), next); + expect(count()).toBe(3); }); - it("disable() is idempotent", () => { + it("freeze() is idempotent", () => { const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); - m.disable(); - m.disable(); - m.disable(); - // No throw; still works as a pass-through. + m.freeze(); + m.freeze(); + m.freeze(); + // No throw; still works. expect(m.name).toBe("source:readahead-cache"); }); - it("preserves bypass behavior for negative offsets and undefined length after disable", async () => { + it("preserves bypass behavior for negative offsets and undefined length after freeze", async () => { const { next, count } = makeNext("abcd"); const m = new SourceReadaheadCache({ initial: 4, multiplier: 2 }); - m.disable(); + m.freeze(); const negReq: SourceRequest = { source: { metadata: undefined } as never, @@ -319,3 +321,43 @@ describe("SourceReadaheadCache.disable()", () => { expect(count()).toBe(2); }); }); + +describe("SourceReadaheadCache maxExtension cap", () => { + it("bypasses single requests that would require more than maxExtension", async () => { + const { next, count } = makeNext("a".repeat(10000)); + const m = new SourceReadaheadCache({ + initial: 4, + multiplier: 2, + maxExtension: 100, + }); + + // Establish a small cache. + await m.fetch(makeReq(0, 4), next); + expect(count()).toBe(1); + + // Request at far offset would require extending the cache by ~9996 bytes. + // Cap is 100 → bypass, single direct fetch, cache stays at 4. + await m.fetch(makeReq(9000, 4), next); + expect(count()).toBe(2); + + // Cache is still small: next near-offset request fetches from cache.len, + // not from far offset. + await m.fetch(makeReq(4, 4), next); + expect(count()).toBe(3); + }); + + it("does not cap normal sequential growth", async () => { + const { next, count } = makeNext("a".repeat(100)); + const m = new SourceReadaheadCache({ + initial: 4, + multiplier: 2, + maxExtension: 100, + }); + + // 1st extension: 4 bytes — well under cap. + await m.fetch(makeReq(0, 4), next); + // 2nd extension (sequential): cache.len=4, needed=4, fetchSize=max(8, 4)=8 < cap. + await m.fetch(makeReq(4, 4), next); + expect(count()).toBe(2); + }); +}); From ab1787526157a70c7ee2d41e3798f5cb5b9f003d Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Tue, 12 May 2026 12:38:03 -0400 Subject: [PATCH 14/14] feat(geotiff): cap by request gap, not fetch size (default 128 MiB) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous maxExtension cap (4 MiB) capped the total per-fetch size, which includes nextFetchSize × multiplier. With multiplier=4, once cache.len exceeded ~1 MiB the exponential growth alone exceeded the cap, stalling the readahead cache even for fully sequential reads. This made the cache useless for files with large headers — e.g. a 200 GB COG with a 61 MiB header would need ~15 individual fetches instead of ~5 exponentially-growing ones. Replace with a maxGap cap: bypass only when a request *starts* more than maxGap bytes past cache.len. Sequential extension is unbounded; only far-offset probes (e.g. GDAL ghost-header reads near EOF) bypass. Default 128 MiB — larger than any realistic TIFF metadata region, small relative to large-COG file sizes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../geotiff/src/source/readahead-cache.ts | 69 +++++++++++-------- .../tests/source/readahead-cache.test.ts | 47 +++++++++---- 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/packages/geotiff/src/source/readahead-cache.ts b/packages/geotiff/src/source/readahead-cache.ts index 49f2c26c..1395517d 100644 --- a/packages/geotiff/src/source/readahead-cache.ts +++ b/packages/geotiff/src/source/readahead-cache.ts @@ -93,18 +93,32 @@ export interface SourceReadaheadCacheOptions { /** Multiplier applied to the previous fetch size on each subsequent read. */ multiplier: number; /** - * Maximum bytes a single underlying fetch may add to the cache. If - * satisfying a request would require extending the cache by more than this, - * the middleware bypasses the cache entirely for that request and serves it - * with one direct fetch instead. Bounds the worst case when cogeotiff reads - * at a far offset during the open phase (e.g. GDAL ghost-header probes at - * EOF). Defaults to {@link DEFAULT_MAX_EXTENSION}. + * Maximum bytes from the end of the cache to the start of a request before + * the middleware bypasses to next(). If a request starts more than this far + * past `cache.len`, it is served by one direct fetch instead of triggering + * a cache extension that spans the gap. + * + * Sequential reads (gap ≈ 0) are unaffected — the cache extends naturally + * by `nextFetchSize(cache.len)` regardless of how big that is, so even + * files with hundreds of megabytes of metadata can be opened in a few + * exponentially-growing fetches. The cap only kicks in for far-offset + * probes (e.g. GDAL ghost-header reads near EOF on a large file). + * + * Defaults to {@link DEFAULT_MAX_GAP}. */ - maxExtension?: number; + maxGap?: number; } -/** Default cap on per-fetch cache extension: 4 MiB. */ -export const DEFAULT_MAX_EXTENSION = 4 * 1024 * 1024; +/** + * Default cap on the distance from `cache.len` to a request's start before + * the middleware bypasses: 128 MiB. + * + * Chosen to be larger than any realistic TIFF metadata region (so sequential + * extension of the cache is never artificially stopped) while still small + * compared to typical large-COG file sizes (so a far-offset probe near EOF + * does not pull hundreds of MB of unused data into the cache). + */ +export const DEFAULT_MAX_GAP = 128 * 1024 * 1024; /** * A chunkd {@link SourceMiddleware} that caches sequential reads from offset 0 @@ -130,13 +144,15 @@ export const DEFAULT_MAX_EXTENSION = 4 * 1024 * 1024; * * # Bounded extension * - * Even while active, a single underlying fetch is capped at - * {@link SourceReadaheadCacheOptions.maxExtension} bytes. If satisfying a - * request would require pulling more than the cap in one extension — e.g. - * the request is at a far offset, or it is just very large — the middleware - * bypasses for that request and returns `next(req)` directly. The cache - * stays at its current size; the rare far-offset read pays one round trip - * but does not pollute the cache. + * Sequential extension is unbounded — the cache grows as far as cogeotiff's + * sequential reads require, even for files with very large metadata regions + * (a 200 GB COG can easily have a 60+ MB header). The bound is on the + * *gap* between `cache.len` and the start of a request: if a request lands + * more than {@link SourceReadaheadCacheOptions.maxGap} bytes past the + * cache, the middleware bypasses for that one request instead of pulling + * the entire gap into the cache. This protects against pathological probes + * (e.g. GDAL ghost-header reads near the end of the file) without + * artificially capping the legitimate sequential growth path. * * # Bypass cases * @@ -153,14 +169,14 @@ export class SourceReadaheadCache implements SourceMiddleware { private readonly cache = new SequentialBlockCache(); private readonly initial: number; private readonly multiplier: number; - private readonly maxExtension: number; + private readonly maxGap: number; private readonly lock = mutex(); private frozen = false; constructor(options: SourceReadaheadCacheOptions) { this.initial = options.initial; this.multiplier = options.multiplier; - this.maxExtension = options.maxExtension ?? DEFAULT_MAX_EXTENSION; + this.maxGap = options.maxGap ?? DEFAULT_MAX_GAP; } /** @@ -199,16 +215,13 @@ export class SourceReadaheadCache implements SourceMiddleware { return next(req); } - // While active: if extending would exceed the per-fetch cap, bypass - // for this request instead of pulling a large chunk into the cache. - // This guards against pathological reads during the open phase (e.g. - // a tag value or ghost header at the end of the file). - const needed = end - this.cache.len; - const naiveFetchSize = Math.max( - this.nextFetchSize(this.cache.len), - needed, - ); - if (naiveFetchSize > this.maxExtension) { + // While active: if the request starts too far past the cache, bypass + // and serve it with one direct fetch. Sequential extension is fine — + // even very large metadata regions are reached by exponential growth + // — but a far-offset probe (e.g. GDAL ghost header near EOF on a + // large file) shouldn't drag the cache through the gap. + const gap = start - this.cache.len; + if (gap > this.maxGap) { return next(req); } diff --git a/packages/geotiff/tests/source/readahead-cache.test.ts b/packages/geotiff/tests/source/readahead-cache.test.ts index bdf9bbb7..376910de 100644 --- a/packages/geotiff/tests/source/readahead-cache.test.ts +++ b/packages/geotiff/tests/source/readahead-cache.test.ts @@ -322,42 +322,63 @@ describe("SourceReadaheadCache.freeze()", () => { }); }); -describe("SourceReadaheadCache maxExtension cap", () => { - it("bypasses single requests that would require more than maxExtension", async () => { +describe("SourceReadaheadCache maxGap cap", () => { + it("bypasses a request that starts more than maxGap past cache.len", async () => { const { next, count } = makeNext("a".repeat(10000)); const m = new SourceReadaheadCache({ initial: 4, multiplier: 2, - maxExtension: 100, + maxGap: 100, }); - // Establish a small cache. + // Establish a small cache: cache.len = 4 after this. await m.fetch(makeReq(0, 4), next); expect(count()).toBe(1); - // Request at far offset would require extending the cache by ~9996 bytes. - // Cap is 100 → bypass, single direct fetch, cache stays at 4. + // Request starts at offset 9000 — gap = 9000 - 4 = 8996, way past the + // cap. Bypass: one direct fetch, cache stays at len 4. await m.fetch(makeReq(9000, 4), next); expect(count()).toBe(2); - // Cache is still small: next near-offset request fetches from cache.len, + // Cache is still small: next near-offset request extends from cache.len, // not from far offset. await m.fetch(makeReq(4, 4), next); expect(count()).toBe(3); }); - it("does not cap normal sequential growth", async () => { - const { next, count } = makeNext("a".repeat(100)); + it("does not cap sequential extension by total fetch size", async () => { + // multiplier = 4: each fetch is 4× the previous cache.len. With initial + // of 4, the second extension would be 16 bytes; absolute cap of 8 wouldn't + // stop it — only the *gap* matters now. + const { next, count } = makeNext("a".repeat(1024)); const m = new SourceReadaheadCache({ initial: 4, - multiplier: 2, - maxExtension: 100, + multiplier: 4, + maxGap: 8, }); - // 1st extension: 4 bytes — well under cap. + // 1st extension: cache.len 0 → 4. Gap 0. Extend. await m.fetch(makeReq(0, 4), next); - // 2nd extension (sequential): cache.len=4, needed=4, fetchSize=max(8, 4)=8 < cap. + // 2nd: sequential at offset 4. Gap = 0. Extends by nextFetchSize=16. + // cache.len → 20. await m.fetch(makeReq(4, 4), next); + // 3rd: sequential at offset 20. Gap = 0. Extends by nextFetchSize=80. + // cache.len → 100. + await m.fetch(makeReq(20, 4), next); + expect(count()).toBe(3); + }); + + it("allows extending when the gap is exactly at the cap", async () => { + const { next, count } = makeNext("a".repeat(1024)); + const m = new SourceReadaheadCache({ + initial: 4, + multiplier: 2, + maxGap: 50, + }); + + await m.fetch(makeReq(0, 4), next); + // Gap = 50 - 4 = 46. At cap. Extend. + await m.fetch(makeReq(50, 4), next); expect(count()).toBe(2); }); });