Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ The accessor discovers the `MultiBandStacBackendArray` from the DataArray's lazy
- `summary()` — multi-line human-readable report with COG-read distribution histogram
- `to_dataframe()` — one row per `(chunk, COG file)` for further analysis in pandas

With `fetch_headers=True`, each matched COG header is fetched (a small HTTP range request to read the IFD block) and `CogRead.overview_level`, `window_col_off`, `window_row_off`, `window_width`, and `window_height` are populated.
With `fetch_headers=True`, each matched COG header is fetched (a small HTTP range request to read the IFD block) and `CogRead.overview_level`, `window_col_off`, `window_row_off`, `window_width`, and `window_height` are populated. This path shares `_chunk_reader._open_and_window()` with real reads through a lightweight window context, but does not construct the full read context or resolve dtype/nodata pixel contracts.

## Phase 1 in detail

Expand Down Expand Up @@ -110,7 +110,7 @@ A `rasterix.RasterIndex` is attached to every DataArray returned by `open()`. Th

## Per-chunk read and resample pipeline

Each call to `_read_item_band()` in `_chunk_reader.py` first validates the source asset against the inferred output contract, then follows a four-step pipeline to turn a remote COG into a correctly-sized, correctly-projected numpy array for one destination chunk.
Each call to `_read_item_band()` in `_chunk_reader.py` first validates the source asset against the inferred output contract, then follows a four-step pipeline to turn a remote COG into a correctly-sized, correctly-projected numpy array for one destination chunk. Header/window helpers consume the smaller `_WindowContext`; `_ChunkContext` extends it with read-only fields such as nodata, output dtype, explicitness flags, and the warp cache.

### 1. Overview selection

Expand Down
27 changes: 19 additions & 8 deletions src/lazycogs/_chunk_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,24 +33,35 @@


@dataclass(frozen=True)
class _ChunkContext:
"""Immutable per-chunk parameters shared across all item reads.
class _WindowContext:
"""Immutable parameters needed to open a COG and compute a read window.

Built once per chunk in async_mosaic_chunk and passed through to all
internal helpers. Frozen to prevent accidental mutation across concurrent
coroutines.
This header/window-only context is shared by dry-run explain and real chunk
reads. It intentionally excludes pixel-read contract fields such as dtype,
nodata, and warp caches.
"""

chunk_affine: Affine
dst_crs: CRS
chunk_width: int
chunk_height: int
store: Store | None
path_fn: Callable[[str], str] | None


@dataclass(frozen=True)
class _ChunkContext(_WindowContext):
"""Immutable per-chunk parameters shared across all item reads.

Built once per chunk in async_mosaic_chunk and passed through to all
internal helpers. Frozen to prevent accidental mutation across concurrent
coroutines.
"""

nodata: float | int | None
out_dtype: np.dtype
dtype_was_explicit: bool
nodata_was_explicit: bool
store: Store | None
path_fn: Callable[[str], str] | None
warp_cache: dict[tuple[tuple[float, ...], CRS], WarpMap] | None


Expand Down Expand Up @@ -313,7 +324,7 @@ def _native_window(
async def _open_and_window(
item: dict,
band: str,
ctx: _ChunkContext,
ctx: _WindowContext,
) -> tuple[GeoTIFF, GeoTIFF | Overview, Window | None, str] | None:
"""Open a COG asset and compute the pixel window covering the chunk.

Expand Down
6 changes: 2 additions & 4 deletions src/lazycogs/_explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from xarray.core import indexing

from lazycogs._backend import MultiBandStacBackendArray
from lazycogs._chunk_reader import _ChunkContext, _open_and_window
from lazycogs._chunk_reader import _open_and_window, _WindowContext
from lazycogs._executor import run_duckdb, run_on_loop

if TYPE_CHECKING:
Expand Down Expand Up @@ -588,15 +588,13 @@ async def _inspect_item_async(
the item has no matching asset or the chunk does not overlap.

"""
ctx = _ChunkContext(
ctx = _WindowContext(
chunk_affine=chunk_affine,
dst_crs=dst_crs,
chunk_width=chunk_width,
chunk_height=chunk_height,
nodata=None,
store=store,
path_fn=None,
warp_cache=None,
)
opened = await _open_and_window(item, band, ctx)
if opened is None:
Expand Down
16 changes: 16 additions & 0 deletions tests/benchmarks/bench_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,22 @@ def run() -> object:
benchmark(run)


@pytest.mark.benchmark
def test_explain_fetch_headers(benchmark, benchmark_parquet: str) -> None:
"""Explain plan with COG header opens: open + explain(fetch_headers=True)."""

def run() -> object:
da = lazycogs.open(
benchmark_parquet,
bbox=BENCHMARK_BBOX,
crs=BENCHMARK_CRS,
resolution=60.0,
)
return da.lazycogs.explain(fetch_headers=True)

benchmark(run)


@pytest.mark.benchmark
@pytest.mark.parametrize("method", [FirstMethod, MedianMethod], ids=["first", "median"])
def test_mosaic_method(
Expand Down
22 changes: 22 additions & 0 deletions tests/benchmarks/test_regressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,28 @@ def test_open_explicit_overrides_win_over_benchmark_inference(
assert da.encoding["_FillValue"] == -9999


def test_explain_fetch_headers_uses_local_benchmark_data(
benchmark_parquet: str,
) -> None:
"""Header-fetch explain works on real local COG metadata without pixel reads."""
da = lazycogs.open(
benchmark_parquet,
bbox=BENCHMARK_BBOX,
crs=BENCHMARK_CRS,
resolution=60.0,
bands=BENCHMARK_SINGLE_BAND,
)

plan = da.lazycogs.explain(fetch_headers=True)
df = plan.to_dataframe()

assert plan.fetch_headers is True
assert plan.total_cog_reads > 0
assert df["window_width"].notna().any()
assert df["window_height"].notna().any()
assert df["overview_resolution"].notna().any()


def test_open_rejects_conflicting_sampled_nodata_on_local_benchmark_copy(
tmp_path: Path,
benchmark_items: list[dict[str, Any]],
Expand Down
79 changes: 78 additions & 1 deletion tests/test_chunk_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
from __future__ import annotations

import asyncio
from unittest.mock import MagicMock, patch
from types import SimpleNamespace
from unittest.mock import AsyncMock, MagicMock, patch

import numpy as np
import pytest
Expand All @@ -12,9 +13,12 @@

from lazycogs._chunk_reader import (
_apply_bands_with_warp_cache,
_ChunkContext,
_drain_in_order,
_native_window,
_open_and_window,
_select_overview,
_WindowContext,
read_chunk_async,
)
from lazycogs._executor import get_reproject_pool
Expand Down Expand Up @@ -163,6 +167,79 @@ def test_native_window_clamped_to_image_bounds():
assert win.row_off + win.height <= 4 # clamped at image height


# ---------------------------------------------------------------------------
# _open_and_window context boundary
# ---------------------------------------------------------------------------


def test_chunk_context_is_a_window_context():
"""Read context is accepted anywhere header/window helpers expect context."""
ctx = _ChunkContext(
chunk_affine=Affine(1.0, 0.0, 0.0, 0.0, -1.0, 4.0),
dst_crs=CRS.from_epsg(4326),
chunk_width=4,
chunk_height=4,
store=None,
path_fn=None,
nodata=None,
out_dtype=np.dtype("float32"),
dtype_was_explicit=False,
nodata_was_explicit=False,
warp_cache=None,
)

assert isinstance(ctx, _WindowContext)
assert ctx.chunk_width == 4
assert ctx.chunk_height == 4


def test_open_and_window_accepts_chunk_context():
"""_open_and_window uses only the stable header/window context fields."""
ctx = _ChunkContext(
chunk_affine=Affine(1.0, 0.0, 0.0, 0.0, -1.0, 4.0),
dst_crs=CRS.from_epsg(4326),
chunk_width=4,
chunk_height=4,
store=None,
path_fn=None,
nodata=None,
out_dtype=np.dtype("float32"),
dtype_was_explicit=False,
nodata_was_explicit=False,
warp_cache=None,
)
geotiff = SimpleNamespace(
crs=ctx.dst_crs,
overviews=[],
transform=ctx.chunk_affine,
width=4,
height=4,
)
item = {"id": "item-0", "assets": {"red": {"href": "file:///tmp/red.tif"}}}

with (
patch(
"lazycogs._chunk_reader._resolve_store",
return_value=(None, "/tmp/red.tif"),
),
patch(
"lazycogs._chunk_reader.GeoTIFF.open",
new_callable=AsyncMock,
return_value=geotiff,
),
):
opened = asyncio.run(_open_and_window(item, "red", ctx))

assert opened is not None
opened_geotiff, reader, window, path = opened
assert opened_geotiff is geotiff
assert reader is geotiff
assert window is not None
assert window.width == 4
assert window.height == 4
assert path == "/tmp/red.tif"


# ---------------------------------------------------------------------------
# read_chunk_async concurrency
# ---------------------------------------------------------------------------
Expand Down
47 changes: 47 additions & 0 deletions tests/test_explain.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

from __future__ import annotations

import asyncio
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch

import numpy as np
Expand All @@ -13,13 +15,15 @@
from xarray.core import indexing

from lazycogs._backend import MultiBandStacBackendArray
from lazycogs._chunk_reader import _ChunkContext, _WindowContext
from lazycogs._explain import (
ChunkRead,
CogRead,
ExplainPlan,
_compute_chunk_bbox_4326,
_find_backend_array,
_infer_chunk_sizes,
_inspect_item_async,
_iter_spatial_chunks,
_roi_pixel_offsets,
)
Expand Down Expand Up @@ -458,6 +462,49 @@ def _fake_items(band: str, n: int) -> list[dict]:
]


def test_inspect_item_async_builds_window_context_for_header_fetch(wgs84):
"""Header fetch explain path does not require pixel-read contract fields."""
chunk_affine = Affine(1.0, 0.0, 0.0, 0.0, -1.0, 10.0)
store = object()
geotiff = SimpleNamespace(overviews=[], transform=chunk_affine)
reader = geotiff
window = SimpleNamespace(col_off=1, row_off=2, width=3, height=4)

with patch(
"lazycogs._explain._open_and_window",
new_callable=AsyncMock,
return_value=(geotiff, reader, window, "s3://bucket/item-0.tif"),
) as open_and_window:
read = asyncio.run(
_inspect_item_async(
_fake_items("red", 1)[0],
"red",
chunk_affine,
wgs84,
10,
10,
store,
),
)

_, _, ctx = open_and_window.call_args.args
assert isinstance(ctx, _WindowContext)
assert not isinstance(ctx, _ChunkContext)
assert ctx.chunk_affine == chunk_affine
assert ctx.dst_crs == wgs84
assert ctx.chunk_width == 10
assert ctx.chunk_height == 10
assert ctx.store is store
assert ctx.path_fn is None
assert not hasattr(ctx, "out_dtype")
assert not hasattr(ctx, "nodata")
assert read is not None
assert read.window_col_off == 1
assert read.window_row_off == 2
assert read.window_width == 3
assert read.window_height == 4


def test_accessor_raises_on_non_stac_da():
"""explain() raises ValueError when the array is not lazycogs-backed."""
da = xr.DataArray(np.zeros((3, 3)))
Expand Down
Loading